perfect-postcode/pipeline/transform/transform_poi.py

import argparse
from pathlib import Path

import numpy as np
import polars as pl

from pipeline.utils.england_geometry import in_england_mask
from pipeline.utils.normalize import strip_or_empty

# Raw OSM "key/value" tags that are deliberately excluded from the POI output,
# grouped by the reason for exclusion. Membership is all that matters (it is a
# set), so the grouping/ordering below is purely for human readers.
# NOTE(review): the code that applies this set lives elsewhere in the module —
# confirm it is checked before the CATEGORY_MAP lookup.
DROP_CATEGORIES: set[str] = {
    # GEOLYTIX Grocery Retail Points is the authoritative supermarket source
    # (transform_grocery_retail_points), so drop OSM supermarkets to avoid
    # double-counting each store as both a GEOLYTIX brand and an OSM "Supermarket".
    "shop/supermarket",
    # Street furniture & infrastructure
    "amenity/advice",
    "amenity/atm",
    "amenity/bbq",
    "amenity/bench",
    "amenity/bicycle_parking",
    "amenity/binoculars",
    "amenity/boot_scraper",
    "amenity/bus_garage",
    "amenity/check_in",
    "amenity/clock",
    "amenity/clothes_dryer",
    "amenity/coast_guard",
    "amenity/coffin_rest",
    "amenity/compressed_air",
    "amenity/court_yard",
    "amenity/donation_box",
    "amenity/dressing_room",
    "amenity/drinking_water",
    "emergency/water_tank",
    "leisure/bleachers",
    "leisure/schoolyard",
    # Park "furniture" / incidental features — not parks; they massively
    # inflated the Park count (picnic_table ~15k, outdoor_seating ~5.8k).
    "leisure/bandstand",
    "leisure/bird_hide",
    "leisure/firepit",
    "leisure/outdoor_seating",
    "leisure/picnic_table",
    "leisure/wildlife_hide",
    # Street furniture & infrastructure (continued), plus a few stray
    # non-amenity tags that are dropped for the same reason.
    "public_transport/pay_scale_area",
    "shop/taxi",
    "amenity/feeding_place",
    "amenity/fixme",
    "amenity/grit_bin",
    "amenity/hunting_stand",
    "amenity/letter_box",
    "amenity/loading_dock",
    "amenity/lounge",
    "tourism/preserved_railway",
    "amenity/lounger",
    "leisure/sport",
    "amenity/motorcycle_parking",
    "amenity/mounting_block",
    "amenity/notice_board",
    "amenity/parcel_locker",
    "amenity/parking",
    "amenity/parking_entrance",
    "amenity/parking_space",
    "amenity/payment_terminal",
    "amenity/photo_booth",
    "amenity/piano",
    "amenity/post_box",
    "amenity/public_bookcase",
    "amenity/reception_desk",
    "amenity/sanitary_dump_station",
    "amenity/shelter",
    "amenity/shower",
    "amenity/smoking_area",
    "amenity/table",
    "amenity/telephone",
    "amenity/telescope",
    "amenity/ticket_validator",
    "amenity/toilets",
    "amenity/trolley_bay",
    "amenity/vacuum_cleaner",
    "amenity/vending_machine",
    "amenity/washing_machine",
    "amenity/washingline",
    "amenity/waste_basket",
    "amenity/waste_disposal",
    "amenity/waste_transfer_station",
    "amenity/water_point",
    "amenity/watering_place",
    "amenity/weighbridge",
    # Boating/cycle-hire infrastructure formerly miscategorised as
    # "Entertainment" (46% of the bucket): cycle-hire dock stations, boat
    # ramps and moorings are not entertainment venues.
    "amenity/bicycle_rental",
    "amenity/boat_rental",
    "leisure/marina",
    "leisure/slipway",
    # Public art (statues, murals, village signs) formerly 93% of "Gallery".
    "tourism/artwork",
    # Outdoor exercise apparatus (pull-up bars, trim trails) formerly inflating
    # "Gym & Fitness".
    "leisure/fitness_station",
    # Untyped healthcare rows and non-pharmacy health shops formerly bucketed
    # under "Hospital & Clinic" / "Pharmacy".
    "healthcare/yes",
    "healthcare/alternative",
    "shop/herbalist",
    "shop/health",
    # Street fountains and courthouses formerly bucketed as
    # "Tourist Attraction".
    "amenity/fountain",
    "amenity/courthouse",
    # Niche amenities not useful for home buyers
    "amenity/animal_boarding",
    "amenity/animal_breeding",
    "amenity/animal_shelter",
    "amenity/boat_storage",
    "amenity/bureau_de_change",
    "amenity/bus_station",
    "amenity/beachhut",
    "amenity/canteen",
    "amenity/conference_centre",
    "amenity/crematorium",
    "amenity/disused",
    "amenity/driver_training",
    "amenity/driving_school",
    "amenity/escooter_rental",
    "amenity/ferry_terminal",
    "amenity/grave_yard",
    "amenity/hall",
    "shop/funeral_directors",
    "amenity/kick-scooter_rental",
    "amenity/money_transfer",
    "amenity/post_depot",
    "amenity/prison",
    "amenity/public_building",
    "amenity/recycling",
    "amenity/scout_hut",
    "amenity/social_facility",
    "amenity/studio",
    "amenity/student_accommodation",
    "amenity/taxi",
    "amenity/telephone_exchange",
    "amenity/training",
    "amenity/vehicle_inspection",
    "amenity/waiting_room",
    "amenity/yes",
    "shop/disused",
    "shop/no",
    # Buildings. The only building/* tag that is mapped is building/church
    # ("Place of Worship" in _CATEGORIES); building/university is dropped with
    # the other education tags at the end of this set.
    "building/air_shaft",
    "building/apartments",
    "building/barn",
    "building/bunker",
    "building/chapel",
    "building/commercial",
    "building/construction",
    "building/detached",
    "building/entrance",
    "building/entry",
    "building/farm",
    "building/farm_auxiliary",
    "building/garage",
    "building/garages",
    "building/greenhouse",
    "building/house",
    "building/hut",
    "building/industrial",
    "building/kiosk",
    "building/no",
    "building/office",
    "building/public",
    "building/residential",
    "building/retail",
    "building/roof",
    "building/ruins",
    "building/school",
    "building/semidetached_house",
    "building/service",
    "building/shed",
    "building/terrace",
    "building/warehouse",
    "building/yes",
    # Emergency features. emergency/ambulance_station is the exception: it is
    # mapped to "Ambulance Station" in _CATEGORIES and therefore not listed here.
    "emergency/access_point",
    "emergency/assembly_point",
    "emergency/bleed_control_kit",
    "emergency/defibrillator",
    "emergency/designated",
    "emergency/dry_riser_inlet",
    "emergency/emergency_ward_entrance",
    "emergency/fire_alarm_box",
    "emergency/fire_extinguisher",
    "emergency/fire_hydrant",
    "emergency/fire_service_inlet",
    "emergency/first_aid_kit",
    "emergency/life_ring",
    "emergency/lifeguard",
    "emergency/no",
    "emergency/phone",
    "emergency/rescue_equipment",
    "emergency/siren",
    "emergency/throw_bag",
    "emergency/water_rescue",
    "emergency/yes",
    # Tourism tags that are dropped rather than mapped to a category.
    "tourism/apartment",
    "tourism/apartments",
    "tourism/alpine_hut",
    "tourism/camp_pitch",
    "tourism/caravan_site",
    "tourism/information",
    "tourism/picnic_site",
    "tourism/viewpoint",
    "tourism/village_sign",
    "tourism/wilderness_hut",
    "tourism/yes",
    # Public transport (from NaPTAN instead). public_transport/platform is the
    # EXCEPTION: it is mapped to "Bus stop" (see _CATEGORIES) to fill NaPTAN's
    # authority-level bus-stop gaps (e.g. West Cumbria, North Norfolk, where
    # NaPTAN has zero stops), then deduped against NaPTAN so covered areas keep
    # a single stop. stop_position is left dropped to avoid double-counting the
    # same stop (platform + stop_position).
    "public_transport/entrance",
    "public_transport/station",
    "public_transport/stop_position",
    # Education amenities — schools come from GIAS instead. OSM coverage for
    # tertiary education, tutoring, and childcare is too noisy/incomplete to be
    # useful on a property-search map.
    "amenity/school",
    "amenity/prep_school",
    "amenity/language_school",
    "amenity/music_school",
    "amenity/university",
    "amenity/college",
    "building/university",
    "amenity/kindergarten",
    "amenity/childcare",
    "office/tutoring",
}


# Each output category is defined exactly once as a 4-tuple:
#     (group, friendly_name, emoji, [osm_keys...])
# where each osm_key is a raw OSM "key/value" tag. The flat CATEGORY_MAP lookup
# dict is built from this list further down (after REQUIRE_NAME_CATEGORIES).
# An osm_key must appear in only ONE entry: CATEGORY_MAP is a plain dict, so a
# tag repeated in a later entry would silently override the earlier mapping.
_CATEGORIES: list[tuple[str, str, str, list[str]]] = [
    # ── Leisure ──────────────────────────────────────────────
    (
        "Leisure",
        "Café",
        "☕",
        [
            "amenity/cafe",
            "amenity/ice_cream",
            "amenity/internet_cafe",
        ],
    ),
    (
        "Leisure",
        "Restaurant",
        "🍽️",
        [
            "amenity/restaurant",
            "amenity/food_court",
        ],
    ),
    (
        "Leisure",
        "Pub",
        "🍺",
        [
            "amenity/pub",
            "amenity/beer_garden",
            "amenity/biergarten",
            "amenity/social_club",
            "amenity/club",
            "leisure/social_club",
            "craft/brewery",
            "craft/distillery",
            "craft/winery",
        ],
    ),
    (
        "Leisure",
        "Bar",
        "🍸",
        [
            "amenity/bar",
            "amenity/hookah_lounge",
        ],
    ),
    (
        "Leisure",
        "Fast Food",
        "🍔",
        [
            "amenity/fast_food",
        ],
    ),
    (
        "Leisure",
        "Nightclub",
        "🪩",
        [
            "amenity/nightclub",
            "amenity/stripclub",
            "amenity/casino",
            "amenity/gambling",
        ],
    ),
    (
        "Leisure",
        "Cinema",
        "🎬",
        [
            "amenity/cinema",
        ],
    ),
    (
        "Leisure",
        "Theatre",
        "🎭",
        [
            "amenity/theatre",
        ],
    ),
    (
        "Leisure",
        "Live Music & Events",
        "🎶",
        [
            "amenity/music_venue",
            "amenity/events_venue",
            "leisure/dance",
        ],
    ),
    (
        "Leisure",
        "Park",
        "🌳",
        [
            "leisure/park",
            # leisure/garden is dominated by private residential gardens (98%+
            # unnamed); it is name-gated in transform() via REQUIRE_NAME_CATEGORIES
            # so only named (public/notable) gardens count as a Park.
            "leisure/garden",
            "leisure/common",
            "leisure/nature_reserve",
            "leisure/dog_park",
        ],
    ),
    (
        "Leisure",
        "Playground",
        "🛝",
        [
            "leisure/playground",
            "leisure/indoor_play",
        ],
    ),
    (
        "Leisure",
        "Sports Centre",
        "🏟️",
        [
            "leisure/sports_centre",
            "leisure/sports_hall",
            # Several tags in this bucket are name-gated in transform() via
            # REQUIRE_NAME_CATEGORIES so only named public facilities count:
            # leisure/pitch (73% of the old bucket), leisure/swimming_pool
            # (98% unnamed = private/garden pools), plus practice_pitch,
            # paddling_pool, track, horse_riding and fishing.
            "leisure/pitch",
            "leisure/track",
            "leisure/golf_course",
            "leisure/miniature_golf",
            "leisure/horse_riding",
            "leisure/fishing",
            "leisure/ice_rink",
            "leisure/paddling_pool",
            "leisure/practice_pitch",
            "leisure/shooting_ground",
            "leisure/stadium",
            "leisure/swimming_pool",
            "leisure/swimming_area",
            "leisure/water_park",
            "leisure/bathing_place",
        ],
    ),
    (
        "Leisure",
        "Entertainment",
        "🎳",
        [
            "leisure/bowling_alley",
            "leisure/amusement_arcade",
            "leisure/adult_gaming_centre",
            "leisure/escape_game",
            "leisure/maze",
            "leisure/trampoline_park",
            "leisure/sauna",
            "leisure/tanning_salon",
            "shop/amusements",
            "tourism/theme_park",
            # bicycle_rental/boat_rental/marina/slipway used to live here and
            # made up ~46% of the bucket (cycle-hire docks, boat ramps); they
            # are infrastructure, not entertainment venues — see DROP_CATEGORIES.
            "leisure/hackerspace",
            "leisure/yes",
        ],
    ),
    # ── Groceries ────────────────────────────────────────────
    (
        "Groceries",
        "Convenience Store",
        "🏪",
        [
            "shop/convenience",
            "shop/general",
            "shop/kiosk",
            "shop/grocery",
        ],
    ),
    (
        "Groceries",
        "Bakery",
        "🥐",
        [
            "shop/bakery",
            "shop/pastry",
            "craft/bakery",
            "craft/confectionery",
        ],
    ),
    (
        "Groceries",
        "Butcher & Fishmonger",
        "🥩",
        [
            "shop/butcher",
            "shop/seafood",
        ],
    ),
    (
        "Groceries",
        "Greengrocer",
        "🥬",
        [
            "shop/greengrocer",
            "shop/farm",
            "shop/market",
            "amenity/marketplace",
        ],
    ),
    (
        "Groceries",
        "Off-Licence",
        "🍷",
        [
            "shop/alcohol",
            "shop/wine",
            "shop/beverages",
        ],
    ),
    (
        "Groceries",
        "Deli & Specialty",
        "🧆",
        [
            "shop/deli",
            "shop/cheese",
            "shop/chocolate",
            "shop/coffee",
            "shop/confectionery",
            "shop/dairy",
            "shop/food",
            "shop/frozen_food",
            "shop/health_food",
            "shop/ice_cream",
            "shop/nutrition_supplements",
            "shop/tea",
        ],
    ),
    # ── Shops ────────────────────────────────────────────────
    (
        "Shops",
        "Fashion & Clothing",
        "👕",
        [
            "shop/clothes",
            "shop/boutique",
            "shop/shoes",
            "shop/accessories",
            "shop/bag",
            "shop/fashion_accessories",
            "shop/jewelry",
            "shop/leather",
            "shop/watches",
        ],
    ),
    (
        "Shops",
        "Electronics",
        "📱",
        [
            "shop/electronics",
            "shop/mobile_phone",
            "shop/mobile_phone_accessories",
            "shop/computer",
            "shop/appliance",
            "shop/electrical",
            "shop/hifi",
            "shop/vacuum_cleaner",
            "shop/video_games",
            "shop/games",
        ],
    ),
    (
        "Shops",
        "Charity Shop",
        "❤️",
        [
            "shop/charity",
            "shop/second_hand",
        ],
    ),
    (
        "Shops",
        "DIY & Hardware",
        "🔨",
        [
            "shop/doityourself",
            "shop/hardware",
            "shop/builders_merchant",
            "shop/paint",
            "shop/plumbing",
        ],
    ),
    (
        "Shops",
        "Home & Garden",
        "🪑",
        [
            "shop/furniture",
            "shop/garden_centre",
            "shop/garden_machinery",
            "shop/kitchen",
            "shop/bathroom",
            "shop/bathroom_furnishing",
            "shop/bed",
            "shop/carpet",
            "shop/curtain",
            "shop/flooring",
            "shop/fireplace",
            "shop/garden_furniture",
            "shop/groundskeeping",
            "shop/household",
            "shop/household_linen",
            "shop/houseware",
            "shop/homeware",
            "shop/interior_decoration",
            "shop/lighting",
            "shop/kitchenware",
            "shop/window_blind",
        ],
    ),
    (
        "Shops",
        "Bookshop",
        "📚",
        [
            "shop/books",
            "shop/stationery",
        ],
    ),
    (
        "Shops",
        "Pet Shop",
        "🐾",
        [
            "shop/pet",
        ],
    ),
    (
        "Shops",
        "Sports & Outdoor",
        "🏕️",
        [
            "shop/sports",
            "shop/angling",
            "shop/outdoor",
            "shop/bicycle",
            "shop/equestrian",
            "shop/surf",
        ],
    ),
    (
        "Shops",
        "Newsagent",
        "📰",
        [
            "shop/newsagent",
            "shop/tobacco",
        ],
    ),
    (
        "Shops",
        "Department Store",
        "🏬",
        [
            "shop/department_store",
            "shop/mall",
            "shop/variety_store",
            "shop/discount",
        ],
    ),
    (
        "Shops",
        "Gift & Hobby",
        "🎁",
        [
            "shop/gift",
            "shop/florist",
            "shop/toys",
            "shop/craft",
            "shop/candles",
            "shop/party",
            "shop/art",
            "shop/music",
            "shop/musical_instrument",
            "shop/antiques",
            "shop/anime",
            "shop/baby_goods",
            "shop/fabric",
            "shop/haberdashery",
            "shop/hobby",
            "shop/wool",
            "shop/pottery",
        ],
    ),
    (
        "Shops",
        "Specialist Shop",
        "🏪",
        [
            "shop/agrarian",
            "shop/boat",
            "shop/bookmaker",
            "shop/building_materials",
            "shop/camera",
            "shop/cannabis",
            "shop/car",
            "shop/caravan",
            "shop/catalogue",
            "shop/auction",
            "shop/auction_house",
            "shop/chandler",
            "shop/collector",
            "shop/copyshop",
            "shop/country_store",
            "shop/doors",
            "shop/e-cigarette",
            "shop/erotic",
            "shop/esoteric",
            "shop/fan",
            "shop/fireworks",
            "shop/fishing",
            "shop/frame",
            "shop/fuel",
            "shop/gas",
            "shop/hairdresser_supply",
            "shop/military_surplus",
            "shop/model",
            "shop/money_lender",
            "shop/motorcycle",
            "shop/outpost",
            "shop/pawnbroker",
            "shop/photo",
            "shop/photo_studio",
            "shop/plant_hire",
            "shop/printer_ink",
            "shop/printing",
            "shop/psychic",
            "shop/pyrotechnics",
            "shop/religion",
            "shop/rental",
            "shop/scuba_diving",
            "shop/security",
            "shop/sewing",
            "shop/ship_chandler",
            "shop/signs",
            "shop/storage_rental",
            "shop/swimming_pool",
            "shop/telecommunication",
            "shop/ticket",
            "shop/tiles",
            "shop/tool_hire",
            "shop/trade",
            "shop/trophy",
            "shop/truck",
            "shop/vacant",
            "shop/van",
            "shop/video",
            "shop/water_sports",
            "shop/weapons",
            "shop/wedding",
            "shop/wholesale",
            "shop/wigs",
            "shop/yes",
        ],
    ),
    # ── Services ─────────────────────────────────────────────
    (
        "Services",
        "Hairdresser & Beauty",
        "💇",
        [
            "shop/hairdresser",
            "shop/beauty",
            "shop/cosmetics",
            "shop/massage",
            "shop/perfumery",
            "leisure/spa",
        ],
    ),
    (
        "Services",
        "Gym & Fitness",
        "🏋️",
        [
            "leisure/fitness_centre",
            # leisure/fitness_station (outdoor pull-up bars / trim-trail
            # apparatus, ~2.5k) is not a gym — see DROP_CATEGORIES.
            "amenity/dojo",
            "amenity/dancing_school",
        ],
    ),
    (
        "Services",
        "Dry Cleaner & Laundry",
        "👔",
        [
            "shop/dry_cleaning",
            "shop/laundry",
            "shop/tailor",
            "shop/shoe_repair",
            "shop/repair",
            "craft/cleaning",
            "craft/dressmaker",
            "craft/shoemaker",
            "craft/tailor",
        ],
    ),
    (
        "Services",
        "Car Services",
        "🔧",
        [
            "shop/car_repair",
            "shop/car;car_repair",
            "shop/car_parts",
            "shop/motorcycle_repair",
            "shop/tyres",
            "amenity/car_wash",
            "amenity/car_rental",
            "amenity/car_sharing",
            "amenity/bicycle_repair_station",
        ],
    ),
    (
        "Services",
        "Post Office",
        "🏤",
        [
            "amenity/post_office",
        ],
    ),
    (
        "Services",
        "Vet & Pet Care",
        "🐕",
        [
            "amenity/veterinary",
            "shop/pet_grooming",
        ],
    ),
    (
        "Services",
        "Bank",
        "🏦",
        [
            "amenity/bank",
        ],
    ),
    (
        "Services",
        "Travel Agent",
        "✈️",
        [
            "shop/travel_agency",
            "office/travel_agent",
        ],
    ),
    (
        "Services",
        "Other",
        "🛎️",
        [
            "shop/tattoo",
            "shop/piercing",
            "shop/locksmith",
            "craft/key_cutter",
        ],
    ),
    # ── Emergency Services ───────────────────────────────────
    (
        "Emergency Services",
        "Police",
        "👮",
        ["amenity/police"],
    ),
    (
        "Emergency Services",
        "Fire Station",
        "🚒",
        ["amenity/fire_station"],
    ),
    (
        "Emergency Services",
        "Ambulance Station",
        "🚑",
        ["emergency/ambulance_station"],
    ),
    # ── Health ───────────────────────────────────────────────
    (
        "Health",
        "GP Surgery",
        "👨‍⚕️",
        [
            "amenity/doctors",
            "healthcare/doctor",
        ],
    ),
    (
        "Health",
        "Dentist",
        "🦷",
        [
            "amenity/dentist",
            "healthcare/dentist",
        ],
    ),
    (
        "Health",
        "Pharmacy",
        "💊",
        [
            "amenity/pharmacy",
            "healthcare/pharmacy",
            "shop/chemist",
            # healthcare/alternative, shop/herbalist and shop/health (homeopaths,
            # herbalists, generic "health" shops) are not dispensing pharmacies
            # — see DROP_CATEGORIES.
        ],
    ),
    # "Hospital & Clinic" used to be one bucket; an actual hospital and a small
    # clinic are very different amenities for a homebuyer, so they are split.
    (
        "Health",
        "Hospital",
        "🏥",
        [
            "amenity/hospital",
            "healthcare/hospital",
        ],
    ),
    (
        "Health",
        "Clinic",
        "🩺",
        [
            "amenity/clinic",
            "amenity/health_centre",
            "healthcare/blood_donation",
            "healthcare/centre",
            "healthcare/clinic",
            "office/healthcare",
            "healthcare/laboratory",
            "healthcare/rehabilitation",
            "healthcare/vaccination_centre",
            # healthcare/yes (untyped junk rows) is dropped — see DROP_CATEGORIES.
        ],
    ),
    (
        "Health",
        "Optician",
        "👓",
        [
            "shop/optician",
            "healthcare/optometrist",
            "shop/hearing_aids",
            "healthcare/audiologist",
        ],
    ),
    (
        "Health",
        "Physiotherapy",
        "🏃",
        [
            "healthcare/physiotherapist",
            "healthcare/podiatrist",
            "healthcare/occupational_therapist",
        ],
    ),
    (
        "Health",
        "Counselling & Therapy",
        "🧠",
        [
            "healthcare/counselling",
            "healthcare/psychotherapist",
            "office/therapist",
        ],
    ),
    (
        "Health",
        "Care Home",
        "🏠",
        [
            "amenity/care_home",
            "amenity/nursing_home",
            "amenity/retirement_home",
            "healthcare/hospice",
            "healthcare/nursing_home",
            "office/home_care",
        ],
    ),
    (
        "Health",
        "Medical & Mobility",
        "♿",
        [
            "shop/medical_supply",
            "shop/mobility",
            "shop/mobility_scooter",
        ],
    ),
    # ── Culture ──────────────────────────────────────────────
    (
        "Culture",
        "Museum",
        "🏛️",
        [
            "tourism/museum",
        ],
    ),
    (
        "Culture",
        "Gallery",
        "🖼️",
        [
            "tourism/gallery",
            # tourism/artwork (statues, murals, village signs) was 93% of this
            # bucket and is not a visitable gallery — see DROP_CATEGORIES.
        ],
    ),
    (
        "Culture",
        "Library",
        "📚",
        [
            "amenity/library",
        ],
    ),
    (
        "Culture",
        "Place of Worship",
        "⛪",
        [
            "amenity/place_of_worship",
            "amenity/monastery",
            "building/church",
        ],
    ),
    (
        "Culture",
        "Arts Centre",
        "🎨",
        [
            "amenity/arts_centre",
        ],
    ),
    (
        "Culture",
        "Zoo",
        "🦁",
        [
            "tourism/zoo",
        ],
    ),
    (
        "Culture",
        "Tourist Attraction",
        "📸",
        [
            "tourism/attraction",
            "tourism/aquarium",
            # amenity/fountain (street furniture) and amenity/courthouse are
            # dropped; tourism/chalet (holiday lets) moved to "Hotel".
        ],
    ),
    # Note: schools come from the GIAS register (see transform_gias_schools).
    # Niche/tertiary education amenities that GIAS does not cover are dropped
    # rather than mixed in with state-funded schools.
    # ── Local Businesses ─────────────────────────────────────
    (
        "Local Businesses",
        "Hotel",
        "🏨",
        [
            "tourism/hotel",
            "tourism/hostel",
            "tourism/guest_house",
            "tourism/motel",
            "tourism/camp_site",
            "leisure/resort",
            "tourism/holiday_park",
            "tourism/self_catering",
            # Holiday-let chalets are accommodation, not tourist attractions
            # (where they previously sat).
            "tourism/chalet",
        ],
    ),
    (
        "Local Businesses",
        "Local Business",
        "🛠️",
        [
            # Tradespeople
            "craft/builder",
            "craft/carpenter",
            "craft/electrician",
            "craft/electronics_repair",
            "craft/floorer",
            "craft/gardener",
            "craft/glaziery",
            "craft/hvac",
            "craft/joiner",
            "craft/locksmith",
            "craft/painter",
            "craft/plumber",
            "craft/roofer",
            "craft/window_construction",
            "craft/agricultural_engines",
            "craft/atelier",
            "craft/beekeeper",
            "craft/blacksmith",
            "craft/bookbinder",
            "craft/boatbuilder",
            "craft/caterer",
            "craft/carpet_layer",
            "craft/clockmaker",
            "craft/handicraft",
            "craft/jeweller",
            "craft/metal_construction",
            "craft/photographer",
            "craft/photographic_laboratory",
            "craft/plasterer",
            "craft/pottery",
            "craft/printer",
            "craft/sawmill",
            "craft/scaffolder",
            "craft/sculptor",
            "craft/signmaker",
            "craft/stonemason",
            "craft/upholsterer",
            "craft/watchmaker",
            "craft/yes",
            "amenity/workshop",
            "shop/glaziery",
            "shop/windows",
            # Professional offices & estate agents
            "shop/estate_agent",
            "office/accountant",
            "office/architect",
            "office/auctioneer",
            "office/builder",
            "office/construction",
            "office/construction_company",
            "office/engineer",
            "office/estate_agent",
            "office/financial",
            "office/financial_advisor",
            "office/financial_services",
            "office/insurance",
            "office/lawyer",
            "office/mortgage",
            "office/property_management",
            "office/solicitor",
            "office/solicitors",
            "office/surveyor",
            "office/tax_advisor",
        ],
    ),
    (
        "Local Businesses",
        "Offices",
        "🏢",
        [
            "amenity/coworking_space",
            "amenity/research_institute",
            "office/administrative",
            "office/advertising_agency",
            "office/association",
            "office/charity",
            "office/company",
            "office/consulting",
            "office/courier",
            "office/coworking",
            "office/design",
            "office/diplomatic",
            "office/educational_institution",
            "office/employment_agency",
            "office/energy_supplier",
            "office/foundation",
            "office/government",
            "office/graphic_design",
            "office/interior_design",
            "office/it",
            "office/logistics",
            "office/marketing",
            "office/moving_company",
            "office/newspaper",
            "office/ngo",
            "office/notary",
            "office/political_party",
            "office/politician",
            "office/publisher",
            "office/quango",
            "office/recruitment",
            "office/religion",
            "office/research",
            "office/security",
            "office/taxi",
            "office/telecommunication",
            "office/transport",
            "office/union",
            "office/university",
            "office/vacant",
            "office/web_design",
            "office/yes",
        ],
    ),
    # ── Other ────────────────────────────────────────────────
    (
        "Other",
        "EV Charging",
        "🔌",
        [
            "amenity/charging_station",
        ],
    ),
    (
        "Other",
        "Fuel Station",
        "⛽",
        [
            "amenity/fuel",
        ],
    ),
    (
        "Other",
        "Community Centre",
        "🤝",
        [
            "amenity/church_hall",
            "amenity/clubhouse",
            "amenity/community_centre",
            "amenity/community_hall",
            "amenity/scout_hall",
            "amenity/social_centre",
            "amenity/townhall",
        ],
    ),
    # ── Public transport (OSM supplement to NaPTAN) ──────────
    # OSM bus platforms fill NaPTAN's authority-level coverage gaps. Same group
    # / friendly name / emoji as the NaPTAN "Bus stop" rows so they merge into
    # one metric; OSM platforms that duplicate a NaPTAN stop are deduped in
    # transform() (osm_stops_near_naptan).
    (
        "Public Transport",
        "Bus stop",
        "🚏",
        [
            "public_transport/platform",
        ],
    ),
]

# Raw OSM tags whose UNNAMED instances are dropped before category mapping.
# These tags are overwhelmingly private/incidental when unnamed: a nameless
# `leisure/garden` is a private residential garden (not a public park), and a
# nameless `leisure/pitch`/`swimming_pool` is a school cage or back-garden pool.
# Keeping only named instances stops them inflating Park / Sports Centre counts
# while preserving genuinely public, notable facilities (which carry a name).
# Every tag listed here is also mapped in _CATEGORIES (leisure/garden under
# "Park", the rest under "Sports Centre"), so named instances still get a
# category.
REQUIRE_NAME_CATEGORIES: set[str] = {
    "leisure/garden",
    "leisure/pitch",
    "leisure/practice_pitch",
    "leisure/swimming_pool",
    "leisure/paddling_pool",
    # 83-84% unnamed: anonymous running tracks, private gallops/paddocks and
    # fishing spots; only named public facilities count as a Sports Centre.
    "leisure/track",
    "leisure/horse_riding",
    "leisure/fishing",
}


# Flatten _CATEGORIES into a direct lookup keyed by raw OSM tag:
#     "key/value" -> (group, friendly_name, emoji)
# Pairs are consumed in declaration order, so if a tag were ever listed under
# two categories the later entry would win.
CATEGORY_MAP: dict[str, tuple[str, str, str]] = dict(
    (osm_tag, (group_label, display_name, icon))
    for group_label, display_name, icon, osm_tags in _CATEGORIES
    for osm_tag in osm_tags
)


# Emoji for each NaPTAN stop-type friendly name. "Bus stop" deliberately uses
# the same emoji as the OSM-derived "Bus stop" entry in _CATEGORIES.
# NOTE(review): the code that reads this mapping is elsewhere in the module —
# confirm the keys match the friendly names it produces.
NAPTAN_EMOJIS: dict[str, str] = {
    "Airport": "✈️",
    "Ferry": "⛴️",
    "Rail station": "🚆",
    "Bus stop": "🚏",
    "Bus station": "🚌",
    "Taxi rank": "🚕",
    "Tube station": "🚇",
    "Tram & Metro stop": "🚊",
}


# Retailer names of the individual co-operative societies.
# normalize_grocery_retailer() collapses every member of this set into the
# single display brand "Co-op".
COOP_RETAILERS: set[str] = {
    "Allendale Co-operative Society",
    "Central England Co-operative",
    "Channel Islands Co-operative Society",
    "Chelmsford Star Co-operative Society",
    "Clydebank Co-operative",
    "Coniston Co-operative Society",
    "East of England Co-operative",
    "Heart of England Co-operative",
    "Langdale Co-operative Society",
    "Lincolnshire Co-operative",
    "Midcounties Co-operative",
    "Scottish Midland Co-operative",
    "Tamworth Co-operative Society",
    "The Co-operative Group",
    "The Radstock Co-operative Society",
    "The Southern Co-operative",
}

# Default for transform_grocery_retail_points(min_chain_locations=...).
# NOTE(review): presumably the minimum number of stores a display brand needs
# to be treated as a chain — confirm against the function body.
MIN_GROCERY_CHAIN_LOCATIONS: int = 5

# Raw retailer name -> display name. Applied by normalize_grocery_retailer()
# after the Co-op check; retailers not listed here keep the value returned by
# strip_or_empty().
GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES: dict[str, str] = {
    "Cook": "COOK",
    "Heron": "Heron Foods",
    "Marks and Spencer": "M&S",
    "Sainsburys": "Sainsbury's",
}


# Store fascia -> icon/category name. normalize_grocery_icon_category() looks
# up the whitespace-stripped fascia here first; a fascia that is not listed
# falls back to the normalised retailer name. Several fascias intentionally
# share one icon name (e.g. the petrol-station "PFS" variants map to the parent
# brand).
GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
    "Aldi": "Aldi",
    "Aldi Local": "Aldi",
    "Asda": "Asda",
    "Asda Express": "Asda Express",
    "Asda Living": "Asda Living",
    "Asda PFS": "Asda",
    "Asda Supercentre": "Asda Supercentre",
    "Asda Supermarket": "Asda Supermarket",
    "Asda Superstore": "Asda Superstore",
    "Booths": "Booths",
    "Budgens": "Budgens",
    "Centra": "Centra",
    "Cooltrader": "Heron Foods",
    "Co-op Food": "Co-op",
    "Cook": "COOK",
    "Costco": "Costco",
    "Dunnes Stores": "Dunnes Stores",
    "Eurospar": "Spar",
    "Eurospar PFS": "Spar",
    "Farmfoods": "Farmfoods",
    "Heron": "Heron Foods",
    "Iceland": "Iceland",
    "Lidl": "Lidl",
    "Little Waitrose": "Little Waitrose",
    "Little Waitrose Shell": "Little Waitrose",
    "Makro": "Makro",
    "Marks and Spencer": "M&S",
    "Marks and Spencer BP": "M&S Food",
    "Marks and Spencer Clothing": "M&S Clothing",
    "Marks and Spencer Food To Go": "M&S Food",
    "Marks and Spencer Food Outlet": "M&S Outlet",
    "Marks and Spencer Foodhall": "M&S Food",
    "Marks and Spencer Hospital": "M&S Hospital",
    "Marks and Spencer MSA": "M&S MSA",
    "Marks and Spencer Outlet": "M&S Outlet",
    "Marks and Spencer Simply Food": "M&S Food",
    "Marks and Spencer Travel SF": "M&S Food",
    "Morrisons Daily": "Morrisons Daily",
    "Morrisons Select": "Morrisons",
    "Planet Organic": "Planet Organic",
    "Sainsbury's Local": "Sainsbury's Local",
    "Sainsburys": "Sainsbury's",
    "Sainsburys Local": "Sainsbury's Local",
    "Spar": "Spar",
    "Spar PFS": "Spar",
    "Tesco": "Tesco",
    "Tesco Express": "Tesco Express",
    "Tesco Express Esso": "Tesco Express",
    "Tesco Extra": "Tesco Extra",
    "The Co-operative Food": "Co-op",
    "The Co-operative Food PFS": "Co-op",
    "The Food Warehouse": "The Food Warehouse",
    "Waitrose": "Waitrose",
    "Waitrose MSA": "Waitrose",
    "Whole Foods Market": "Whole Foods Market",
}


def normalize_grocery_retailer(retailer: str | None) -> str:
    """Return the display brand for a raw GEOLYTIX retailer name.

    The name is first cleaned with ``strip_or_empty``. Any name listed in
    ``COOP_RETAILERS`` becomes "Co-op"; every other name is looked up in
    ``GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES`` and returned as cleaned when
    no override exists.
    """
    cleaned = strip_or_empty(retailer)
    return (
        "Co-op"
        if cleaned in COOP_RETAILERS
        else GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES.get(cleaned, cleaned)
    )


def normalize_grocery_icon_category(fascia: str | None, retailer: str | None) -> str:
    """Pick the icon category for a store, preferring its fascia.

    A non-empty ``fascia`` is stripped and looked up in
    ``GROCERY_FASCIA_ICON_NAMES``; when it is empty/None or has no entry, the
    normalised retailer brand is used instead.
    """
    fascia_icon = GROCERY_FASCIA_ICON_NAMES.get(fascia.strip()) if fascia else None
    if fascia_icon is None:
        return normalize_grocery_retailer(retailer)
    return fascia_icon


def transform_grocery_retail_points(
    grocery_df: pl.DataFrame,
    boundary_path: Path | None = None,
    min_chain_locations: int = MIN_GROCERY_CHAIN_LOCATIONS,
) -> pl.DataFrame:
    """Convert GEOLYTIX Grocery Retail Points into the POI parquet schema.

    Args:
        grocery_df: Raw GEOLYTIX rows; must contain ``id``, ``retailer``,
            ``fascia``, ``store_name``, ``long_wgs`` and ``lat_wgs``.
        boundary_path: Optional boundary file handed to ``in_england_mask``;
            when given, rows outside the mask are dropped.
        min_chain_locations: Minimum number of (post-clip) rows a normalised
            brand needs for its stores to be kept.

    Returns:
        A frame with columns ``id`` (prefixed ``glx-``), ``name``,
        ``category``, ``icon_category``, ``group``, ``lat``, ``lng`` and
        ``emoji``.

    Raises:
        ValueError: If any required column is missing from ``grocery_df``.
    """
    required = {"id", "retailer", "fascia", "store_name", "long_wgs", "lat_wgs"}
    missing = required - set(grocery_df.columns)
    if missing:
        raise ValueError(f"GEOLYTIX retail points missing columns: {sorted(missing)}")

    df = (
        grocery_df.select(
            pl.col("id").cast(pl.String),
            pl.col("retailer").cast(pl.String),
            pl.col("fascia").cast(pl.String),
            pl.col("store_name").cast(pl.String),
            pl.col("lat_wgs").cast(pl.Float64).alias("lat"),
            pl.col("long_wgs").cast(pl.Float64).alias("lng"),
        )
        .with_columns(
            pl.col("retailer").str.strip_chars(),
            pl.col("fascia").str.strip_chars(),
            pl.col("store_name").str.strip_chars(),
        )
        .drop_nulls(["id", "retailer", "lat", "lng"])
        .filter(pl.col("retailer").str.len_chars() > 0)
    )

    if boundary_path is not None and len(df) > 0:
        mask = in_england_mask(
            boundary_path,
            df["lat"].to_numpy(),
            df["lng"].to_numpy(),
        )
        df = df.filter(pl.Series(mask))

    # Normalise to the display brand FIRST so the ~16 Co-op society retailer
    # names pool into one "Co-op" before the chain-eligibility cutoff; otherwise
    # small societies (<MIN_GROCERY_CHAIN_LOCATIONS stores each) get dropped.
    df = df.with_columns(
        pl.col("retailer")
        .map_elements(normalize_grocery_retailer, return_dtype=pl.String)
        .alias("category")
    )
    eligible_categories = (
        df.group_by("category")
        .len()
        .filter(pl.col("len") >= min_chain_locations)
        .select("category")
    )
    df = df.join(eligible_categories, on="category", how="semi")

    def _non_blank(column: str) -> pl.Expr:
        # Stripping leaves blank/whitespace-only values as "" rather than null,
        # and coalesce only skips nulls, so a blank store_name would otherwise
        # win and yield an empty POI name instead of falling back.
        value = pl.col(column)
        return pl.when(value.str.len_chars() > 0).then(value).otherwise(None)

    return df.with_columns(
        pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
        # retailer is already guaranteed non-empty by the filter above.
        pl.coalesce(
            [_non_blank("store_name"), _non_blank("fascia"), pl.col("retailer")]
        )
        .str.replace_all("''", "'")
        .alias("name"),
        pl.struct(["fascia", "retailer"])
        .map_elements(
            lambda row: normalize_grocery_icon_category(row["fascia"], row["retailer"]),
            return_dtype=pl.String,
        )
        .alias("icon_category"),
        pl.lit("Groceries").alias("group"),
        pl.lit("🛒").alias("emoji"),
    ).select("id", "name", "category", "icon_category", "group", "lat", "lng", "emoji")


# School icon category -> emoji. The keys are exactly the labels produced by
# _school_icon_category_expr (including its "School" catch-all);
# transform_gias_schools maps each label to its emoji via replace_strict, so a
# new label added there needs an entry here too.
SCHOOL_ICON_CATEGORIES: dict[str, str] = {
    "Nursery school": "🧸",
    "Primary school": "🎒",
    "Secondary school": "🏫",
    "All-through school": "🏫",
    "Sixth form": "📚",
    "Further education college": "📚",
    "University": "🎓",
    "Special school": "🤝",
    "School": "🏫",
}


def _school_icon_category_expr() -> pl.Expr:
    """Build the expression that buckets a GIAS row into an icon category.

    Rules are tried top to bottom and the first match wins:

    1. ``type_group`` decides universities, special schools and colleges,
       since those span several phases.
    2. Otherwise the (lower-cased) ``phase`` decides.
    3. Rows still unmatched (e.g. independents whose phase GIAS leaves null)
       are split by the numeric bounds parsed out of ``age_range``.
    4. Anything left is the generic "School".
    """
    type_group = pl.col("type_group")
    # GIAS phase casing is inconsistent ("Middle deemed Primary" vs "Middle
    # deemed primary"), hence the lower-casing before comparison.
    phase = pl.col("phase").str.to_lowercase()

    # age_range comes in three shapes: "<low><dash><high>", "up to <high>" and
    # "<low>+". The first number in the string is the candidate lower bound
    # and the last number the candidate upper bound; each one-sided shape then
    # has its missing side forced to null.
    age_range = pl.col("age_range")
    first_number = age_range.str.extract(r"^\s*(\d+)", 1).cast(pl.Int32, strict=False)
    last_number = age_range.str.extract(r"(\d+)\s*$", 1).cast(pl.Int32, strict=False)
    min_age = (
        pl.when(age_range.str.starts_with("up to")).then(None).otherwise(first_number)
    )
    max_age = (
        pl.when(age_range.str.ends_with("+")).then(None).otherwise(last_number)
    )

    # Ordered (condition, label) pairs; order is significant.
    rules = [
        (type_group == "Universities", "University"),
        (type_group == "Special schools", "Special school"),
        (type_group == "Colleges", "Further education college"),
        (phase == "nursery", "Nursery school"),
        (phase.is_in(["primary", "middle deemed primary"]), "Primary school"),
        (phase.is_in(["secondary", "middle deemed secondary"]), "Secondary school"),
        (phase == "all-through", "All-through school"),
        (phase.is_in(["16 plus", "sixth form"]), "Sixth form"),
        # Age-range fallback for rows no phase rule matched (≈3k Independents
        # + Academies GIAS doesn't classify by phase).
        (max_age <= 5, "Nursery school"),
        (min_age >= 16, "Sixth form"),
        ((min_age <= 6) & (max_age >= 16), "All-through school"),
        (max_age <= 11, "Primary school"),
        (min_age >= 10, "Secondary school"),
    ]

    (head_condition, head_label), *remaining = rules
    chain = pl.when(head_condition).then(pl.lit(head_label))
    for condition, label in remaining:
        chain = chain.when(condition).then(pl.lit(label))
    return chain.otherwise(pl.lit("School"))


# Ofsted OEIF overall-effectiveness grade code (compared as a string by
# _load_ofsted_ratings) -> the human-readable label stored on each school.
OFSTED_OEIF_LABELS = {
    "1": "Outstanding",
    "2": "Good",
    "3": "Requires improvement",
    "4": "Inadequate",
}


def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
    """Lazily load one readable Ofsted rating per row, keyed by ``urn``.

    The label is resolved in this order:

    * graded result "1".."4" in "Latest OEIF overall effectiveness" ->
      the matching ``OFSTED_OEIF_LABELS`` entry;
    * otherwise an "Ungraded inspection overall outcome" starting with
      "School remains Outstanding"/"School remains Good" -> Outstanding/Good
      (mirrors school_catchments.classify_good_plus_schools, so schools last
      seen under the ungraded framework keep their standing);
    * otherwise a graded value of "Not judged" is kept as "Not judged";
    * anything else is null and the row is filtered out.

    Returns a LazyFrame with columns ``urn`` (Int64) and ``ofsted_rating``.
    """
    graded = pl.col("Latest OEIF overall effectiveness")
    # The ungraded outcome may carry "(Concerns)"/"(Improving)" suffixes, so it
    # is matched by prefix rather than equality.
    ungraded_outcome = pl.col("Ungraded inspection overall outcome").cast(
        pl.Utf8, strict=False
    )

    rules = [(graded == code, OFSTED_OEIF_LABELS[code]) for code in ("1", "2", "3", "4")]
    rules += [
        (
            ungraded_outcome.str.starts_with("School remains Outstanding"),
            OFSTED_OEIF_LABELS["1"],
        ),
        (
            ungraded_outcome.str.starts_with("School remains Good"),
            OFSTED_OEIF_LABELS["2"],
        ),
        (graded == "Not judged", "Not judged"),
    ]

    (head_condition, head_label), *remaining = rules
    chain = pl.when(head_condition).then(pl.lit(head_label))
    for condition, text in remaining:
        chain = chain.when(condition).then(pl.lit(text))
    rating = chain.otherwise(None)

    ratings = pl.scan_parquet(ofsted_path).select(
        pl.col("URN").cast(pl.Int64).alias("urn"),
        rating.alias("ofsted_rating"),
    )
    return ratings.filter(pl.col("ofsted_rating").is_not_null())


def transform_gias_schools(
    gias_path: Path, ofsted_path: Path, boundary_path: Path
) -> pl.LazyFrame:
    """Turn the GIAS register parquet into Education POI rows.

    Each school gets the shared POI columns plus ``school_*`` metadata. Ofsted
    ratings are left-joined on ``urn`` so the latest rating (Outstanding/Good/
    Requires improvement/Inadequate/Not judged) is available to the map popup.

    The result is clipped with ``in_england_mask`` (like NaPTAN/GEOLYTIX):
    the GIAS register is GB-wide, so ~1,400 Welsh/Scottish/IoM schools would
    otherwise leak into the England-only Education layer and depress apparent
    Ofsted coverage (Wales is inspected by Estyn, not Ofsted).
    """
    school_kind = _school_icon_category_expr()
    ratings = _load_ofsted_ratings(ofsted_path)

    # category deliberately equals icon_category so the dashboard shows one
    # toggle per school type rather than a single "School" pill.
    core_columns = [
        pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
        pl.col("name"),
        school_kind.alias("category"),
        school_kind.alias("icon_category"),
        pl.lit("Education").alias("group"),
        pl.col("lat").cast(pl.Float64),
        pl.col("lng").cast(pl.Float64),
        school_kind.replace_strict(SCHOOL_ICON_CATEGORIES).alias("emoji"),
    ]

    # (source column, target dtype or None for pass-through). Every entry is
    # emitted as "school_<source>", in this order; typed entries are cast
    # non-strictly.
    metadata_spec = (
        ("phase", None),
        ("type", None),
        ("type_group", None),
        ("age_range", None),
        ("gender", None),
        ("religious_character", None),
        ("admissions_policy", None),
        ("nursery_provision", None),
        ("sixth_form", None),
        ("capacity", pl.Int32),
        ("pupils", pl.Int32),
        ("fsm_percent", pl.Float32),
        ("trust", None),
        ("address", None),
        ("postcode", None),
        ("local_authority", None),
        ("website", None),
        ("telephone", pl.String),
        ("head_name", None),
        ("ofsted_rating", None),
    )
    metadata_columns = [
        (
            pl.col(source)
            if dtype is None
            else pl.col(source).cast(dtype, strict=False)
        ).alias(f"school_{source}")
        for source, dtype in metadata_spec
    ]

    schools = (
        pl.scan_parquet(gias_path)
        .join(ratings, on="urn", how="left")
        .select(core_columns + metadata_columns)
        .collect()
    )

    inside_england = in_england_mask(
        boundary_path,
        schools["lat"].to_numpy(),
        schools["lng"].to_numpy(),
    )
    return schools.filter(pl.Series(inside_england)).lazy()


# OSM convenience-format stores that GEOLYTIX also covers (Tesco Express,
# Sainsbury's Local, Co-op Food, Morrisons Daily, Spar, ...) would otherwise be
# counted twice: once as a GEOLYTIX brand row and once as an OSM "Convenience
# Store". GEOLYTIX is authoritative for its chains, so an OSM grocery row that
# sits on top of a GEOLYTIX point AND carries that point's brand name is the
# same physical store and is dropped. Independent corner shops never carry a
# chain brand, so they are kept.
# This is the "sits on top of" distance in metres: the default ``radius_m`` of
# osm_groceries_colocated_with_geolytix.
GROCERY_DEDUP_RADIUS_M = 50.0

# Brand-token aliases so an OSM name spelt differently from the GEOLYTIX brand
# still matches. GEOLYTIX's "Co-op" tokenises to "coop", but OSM frequently
# spells it "The Co-operative Food" -> "cooperative"; without this, ~300+ genuine
# Co-op duplicates would survive. Keys/values are post-strip (alnum-only) tokens.
# Applied by _significant_tokens to every token it keeps.
_GROCERY_TOKEN_ALIASES = {
    "cooperative": "coop",
    "cooperatives": "coop",
}


def _significant_tokens(name: str | None) -> set[str]:
    """Lower-case alphanumeric tokens of length >= 3 from a POI name (aliased)."""
    if not name:
        return set()
    tokens: set[str] = set()
    for raw in str(name).lower().split():
        token = "".join(ch for ch in raw if ch.isalnum())
        if len(token) >= 3:
            tokens.add(_GROCERY_TOKEN_ALIASES.get(token, token))
    return tokens


# OSM bus platforms are added to "Bus stop" to fill NaPTAN's authority-level
# gaps. Where NaPTAN already has a stop within this radius the area is covered,
# so the colocated OSM platform is dropped to avoid double-counting; OSM
# platforms with no nearby NaPTAN stop (the gaps) are kept.
# Distance in metres: the default ``radius_m`` of osm_stops_near_naptan.
BUS_STOP_DEDUP_RADIUS_M = 50.0


def osm_stops_near_naptan(
    osm_stops: pl.DataFrame,
    naptan_stops: pl.DataFrame,
    radius_m: float = BUS_STOP_DEDUP_RADIUS_M,
) -> list[str]:
    """List the ids of OSM bus stops lying within ``radius_m`` of a NaPTAN stop.

    The test is purely spatial (names are ignored): an OSM platform whose
    nearest NaPTAN stop is at most ``radius_m`` metres away counts as already
    covered, so only platforms in NaPTAN gaps are left out of the result.
    Both frames must provide ``id``, ``lat`` and ``lng``. Returns ``[]`` when
    either frame is empty.
    """
    if osm_stops.is_empty() or naptan_stops.is_empty():
        return []

    from scipy.spatial import cKDTree

    naptan_lat = naptan_stops["lat"].to_numpy().astype(float)
    naptan_lng = naptan_stops["lng"].to_numpy().astype(float)
    osm_lat = osm_stops["lat"].to_numpy().astype(float)
    osm_lng = osm_stops["lng"].to_numpy().astype(float)
    osm_ids = osm_stops["id"].to_list()

    # Equirectangular projection to metres about the mean latitude of all
    # points in both frames.
    mean_lat = float(np.mean(np.concatenate([naptan_lat, osm_lat])))
    cos_lat = float(np.cos(np.radians(mean_lat)))

    def _to_metres(lat: np.ndarray, lng: np.ndarray) -> np.ndarray:
        return np.column_stack([lng * cos_lat * 111_320.0, lat * 110_540.0])

    nearest_distance, _ = cKDTree(_to_metres(naptan_lat, naptan_lng)).query(
        _to_metres(osm_lat, osm_lng), k=1
    )
    return [
        stop_id
        for stop_id, distance in zip(osm_ids, nearest_distance)
        if distance <= radius_m
    ]


def osm_groceries_colocated_with_geolytix(
    osm_groceries: pl.DataFrame,
    geolytix: pl.DataFrame,
    radius_m: float = GROCERY_DEDUP_RADIUS_M,
) -> list[str]:
    """List the ids of OSM grocery rows that are the same store as a GEOLYTIX row.

    An OSM row is reported when at least one GEOLYTIX point within
    ``radius_m`` metres has a brand (its ``category``, e.g. "Tesco", "Co-op")
    whose significant tokens all occur in the OSM row's name. A brand with no
    token of three or more characters (e.g. "M&S") can never match, so such
    stores are conservatively kept rather than risk a false drop; OSM rows
    whose name yields no tokens are likewise never reported.

    ``osm_groceries`` must provide ``id``, ``name``, ``lat``, ``lng``;
    ``geolytix`` must provide ``category``, ``lat``, ``lng``. Returns ``[]``
    when either frame is empty.
    """
    if osm_groceries.is_empty() or geolytix.is_empty():
        return []

    from scipy.spatial import cKDTree

    store_lat = geolytix["lat"].to_numpy().astype(float)
    store_lng = geolytix["lng"].to_numpy().astype(float)
    store_brand_tokens = [
        _significant_tokens(brand) for brand in geolytix["category"].to_list()
    ]

    poi_lat = osm_groceries["lat"].to_numpy().astype(float)
    poi_lng = osm_groceries["lng"].to_numpy().astype(float)
    poi_ids = osm_groceries["id"].to_list()
    poi_name_tokens = [
        _significant_tokens(poi_name) for poi_name in osm_groceries["name"].to_list()
    ]

    # Equirectangular projection to metres around the shared mean latitude — at
    # England's scale this is accurate to well under the dedup radius.
    mean_lat = float(np.mean(np.concatenate([store_lat, poi_lat])))
    cos_lat = float(np.cos(np.radians(mean_lat)))

    def _to_metres(lat: np.ndarray, lng: np.ndarray) -> np.ndarray:
        return np.column_stack([lng * cos_lat * 111_320.0, lat * 110_540.0])

    nearby_stores = cKDTree(_to_metres(store_lat, store_lng)).query_ball_point(
        _to_metres(poi_lat, poi_lng), r=radius_m
    )

    def _matches_a_store(name_tokens: set[str], store_indices) -> bool:
        # True when any nearby store has a non-empty brand fully contained in
        # the OSM name.
        return any(
            store_brand_tokens[idx] and store_brand_tokens[idx].issubset(name_tokens)
            for idx in store_indices
        )

    return [
        poi_id
        for poi_id, name_tokens, store_indices in zip(
            poi_ids, poi_name_tokens, nearby_stores
        )
        if name_tokens and _matches_a_store(name_tokens, store_indices)
    ]


def transform(
    input_path: Path,
    naptan_path: Path,
    boundary_path: Path,
    grocery_retail_points_path: Path,
    gias_path: Path,
    ofsted_path: Path,
) -> pl.LazyFrame:
    """Assemble the final POI frame from OSM, NaPTAN, GEOLYTIX and GIAS.

    Steps, in order:

    1. Filter raw OSM rows: drop ``DROP_CATEGORIES``, categories without a
       ``CATEGORY_MAP`` entry (reported on stdout), and unnamed rows of
       ``REQUIRE_NAME_CATEGORIES``.
    2. Replace the raw category with its friendly group/name/emoji and
       collapse rows repeating the same ``(id, category)``.
    3. Load NaPTAN clipped to England and the GEOLYTIX grocery POIs.
    4. Remove OSM grocery rows duplicating a GEOLYTIX store and OSM bus
       stops already covered by a NaPTAN bus stop.
    5. Concatenate OSM, NaPTAN, GEOLYTIX and GIAS school rows diagonally.

    Raises:
        ValueError: If a ``CATEGORY_MAP`` entry has an empty friendly name or
            an empty emoji.
    """
    lf = pl.scan_parquet(input_path)

    # Distinct raw categories actually present in this extract.
    present_categories = (
        lf.select("category").unique().collect(engine="streaming").to_series().to_list()
    )

    # Categories with no mapping are reported and then ignored.
    unmapped = [
        cat
        for cat in present_categories
        if cat not in DROP_CATEGORIES and cat not in CATEGORY_MAP
    ]
    if unmapped:
        print(f"Ignoring categories missing from CATEGORY_MAP: {sorted(unmapped)}")

    # Mapped categories missing from the data are only reported (regional
    # extracts may legitimately lack them).
    present_lookup = set(present_categories)
    mapped_but_absent = [cat for cat in CATEGORY_MAP if cat not in present_lookup]
    if mapped_but_absent:
        print(
            f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}"
        )

    excluded_categories = [*DROP_CATEGORIES, *unmapped]
    lf = lf.filter(~pl.col("category").is_in(excluded_categories))

    # Unnamed instances of private-dominated tags (gardens, pitches, pools)
    # would inflate Park / Sports Centre proximity counts. This must run while
    # `category` still holds the raw OSM key, i.e. before the friendly mapping.
    name_required = pl.col("category").is_in(list(REQUIRE_NAME_CATEGORIES))
    name_is_blank = pl.col("name").is_null() | (
        pl.col("name").cast(pl.String).str.strip_chars() == ""
    )
    lf = lf.filter(~(name_required & name_is_blank))

    # CATEGORY_MAP values are (group, friendly name, emoji) triples.
    group_mapping = {raw: triple[0] for raw, triple in CATEGORY_MAP.items()}
    name_mapping = {raw: triple[1] for raw, triple in CATEGORY_MAP.items()}
    emoji_mapping = {raw: triple[2] for raw, triple in CATEGORY_MAP.items()}

    # Defensive: refuse a mapping with empty friendly names or emojis.
    missing_names = [raw for raw, friendly in name_mapping.items() if not friendly]
    if missing_names:
        raise ValueError(f"Empty friendly names for: {missing_names}")
    missing_emojis = [raw for raw, emoji in emoji_mapping.items() if not emoji]
    if missing_emojis:
        raise ValueError(f"Empty emojis for: {missing_emojis}")

    # All four expressions read the raw category from the incoming frame.
    raw_category = pl.col("category")
    lf = lf.with_columns(
        raw_category.replace_strict(group_mapping).alias("group"),
        raw_category.replace_strict(name_mapping).alias("category"),
        raw_category.replace_strict(name_mapping).alias("icon_category"),
        raw_category.replace_strict(emoji_mapping).alias("emoji"),
    )

    # One OSM object can carry several tag keys mapping to the same friendly
    # category (e.g. amenity/pharmacy + shop/chemist -> "Pharmacy"), which
    # pois.py emits as several raw rows sharing one id. Collapse those so they
    # don't inflate proximity counts; rows sharing an id with DIFFERENT
    # categories survive. Other sources are pre-deduplicated.
    lf = lf.unique(subset=["id", "category"], keep="first", maintain_order=True)

    # --- NaPTAN, clipped to England ---
    naptan_df = pl.scan_parquet(naptan_path).collect()
    naptan_in_england = in_england_mask(
        boundary_path,
        naptan_df["lat"].to_numpy(),
        naptan_df["lng"].to_numpy(),
    )
    naptan_df = naptan_df.filter(pl.Series(naptan_in_england))
    naptan = naptan_df.lazy().with_columns(
        pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
        pl.lit("Public Transport").alias("group"),
        pl.col("category").alias("icon_category"),
    )

    # --- GEOLYTIX groceries ---
    grocery_pois = transform_grocery_retail_points(
        pl.read_parquet(grocery_retail_points_path), boundary_path
    )

    # Drop OSM grocery rows duplicating a GEOLYTIX store (same brand,
    # colocated) so a Tesco Express / Co-op / Spar isn't counted twice.
    is_grocery = pl.col("group") == "Groceries"
    osm_groceries = (
        lf.filter(is_grocery)
        .select("id", "name", "lat", "lng")
        .collect(engine="streaming")
    )
    duplicate_ids = osm_groceries_colocated_with_geolytix(osm_groceries, grocery_pois)
    if duplicate_ids:
        print(
            f"Dropping {len(duplicate_ids):,} OSM grocery POIs that duplicate a "
            "GEOLYTIX store"
        )
        # Only the Groceries row goes: the same OSM object may also have a
        # non-grocery aspect (e.g. a convenience store that is also a Post
        # Office), which must survive.
        lf = lf.filter(~(is_grocery & pl.col("id").is_in(duplicate_ids)))

    # Drop OSM bus platforms duplicating a NaPTAN bus stop, so the OSM
    # supplement only adds stops in NaPTAN's coverage gaps. OSM bus stops carry
    # id-prefix "n"/"a" so they never clash with NaPTAN ATCO ids.
    is_osm_bus_stop = (pl.col("group") == "Public Transport") & (
        pl.col("category") == "Bus stop"
    )
    osm_bus_stops = (
        lf.filter(is_osm_bus_stop)
        .select("id", "lat", "lng")
        .collect(engine="streaming")
    )
    naptan_bus_stops = naptan_df.filter(pl.col("category") == "Bus stop")
    covered_bus_ids = osm_stops_near_naptan(osm_bus_stops, naptan_bus_stops)
    kept_osm = osm_bus_stops.height - len(covered_bus_ids)
    print(
        f"OSM bus platforms: {osm_bus_stops.height:,} total, dropping "
        f"{len(covered_bus_ids):,} that duplicate a NaPTAN stop, keeping "
        f"{kept_osm:,} to fill NaPTAN gaps"
    )
    if covered_bus_ids:
        lf = lf.filter(~(is_osm_bus_stop & pl.col("id").is_in(covered_bus_ids)))

    school_pois = transform_gias_schools(gias_path, ofsted_path, boundary_path)
    return pl.concat(
        [lf, naptan, grocery_pois.lazy(), school_pois],
        how="diagonal_relaxed",
    )


def main():
    """CLI entry point: run ``transform``, write the parquet, print a summary."""
    parser = argparse.ArgumentParser(
        description="Transform raw POIs to filtered version with friendly names"
    )
    # Every option is a required filesystem path; only flag and help differ.
    path_options = (
        ("--input", "Raw POIs parquet file"),
        ("--naptan", "NaPTAN stations parquet file"),
        ("--boundary", "England boundary GeoJSON file"),
        ("--grocery-retail-points", "GEOLYTIX Grocery Retail Points parquet"),
        ("--gias", "GIAS schools register parquet (replaces OSM schools)"),
        ("--ofsted", "Ofsted latest-inspections parquet (provides per-URN ratings)"),
        ("--output", "Output filtered POIs parquet file"),
    )
    for flag, help_text in path_options:
        parser.add_argument(flag, type=Path, required=True, help=help_text)
    args = parser.parse_args()

    lazy_pois = transform(
        input_path=args.input,
        naptan_path=args.naptan,
        boundary_path=args.boundary,
        grocery_retail_points_path=args.grocery_retail_points,
        gias_path=args.gias,
        ofsted_path=args.ofsted,
    )
    df = lazy_pois.collect(engine="streaming")
    df.write_parquet(args.output)

    # Summary: file size, row count, then a per-category breakdown with the
    # largest categories first.
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB, {len(df):,} POIs)")
    print(f"\nCategories ({df['category'].n_unique()}):")
    per_category = df.group_by("group", "category", "emoji").len()
    for row in per_category.sort("len", descending=True).iter_rows(named=True):
        print(f"  [{row['group']}] {row['emoji']} {row['category']}: {row['len']:,}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()