perfect-postcode/pipeline/transform/transform_poi.py
2026-05-26 19:45:13 +01:00

1601 lines
43 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import argparse
from pathlib import Path
import polars as pl
from pipeline.utils.england_geometry import in_england_mask
# Raw OSM "<key>/<value>" categories that are filtered out of the POI output.
# Membership is all that matters (the set is only used for `in` / `is_in`
# checks), so entries are grouped by theme purely for readability.
DROP_CATEGORIES = {
    # Amenity: street furniture, fixtures & infrastructure
    "amenity/advice",
    "amenity/atm",
    "amenity/bbq",
    "amenity/bench",
    "amenity/bicycle_parking",
    "amenity/binoculars",
    "amenity/boot_scraper",
    "amenity/bus_garage",
    "amenity/check_in",
    "amenity/clock",
    "amenity/clothes_dryer",
    "amenity/coast_guard",
    "amenity/coffin_rest",
    "amenity/compressed_air",
    "amenity/court_yard",
    "amenity/donation_box",
    "amenity/dressing_room",
    "amenity/drinking_water",
    "amenity/feeding_place",
    "amenity/fixme",
    "amenity/grit_bin",
    "amenity/hunting_stand",
    "amenity/letter_box",
    "amenity/loading_dock",
    "amenity/lounge",
    "amenity/lounger",
    "amenity/motorcycle_parking",
    "amenity/mounting_block",
    "amenity/notice_board",
    "amenity/parcel_locker",
    "amenity/parking",
    "amenity/parking_entrance",
    "amenity/parking_space",
    "amenity/payment_terminal",
    "amenity/photo_booth",
    "amenity/piano",
    "amenity/post_box",
    "amenity/public_bookcase",
    "amenity/reception_desk",
    "amenity/sanitary_dump_station",
    "amenity/shelter",
    "amenity/shower",
    "amenity/smoking_area",
    "amenity/table",
    "amenity/telephone",
    "amenity/telescope",
    "amenity/ticket_validator",
    "amenity/toilets",
    "amenity/trolley_bay",
    "amenity/vacuum_cleaner",
    "amenity/vending_machine",
    "amenity/washing_machine",
    "amenity/washingline",
    "amenity/waste_basket",
    "amenity/waste_disposal",
    "amenity/waste_transfer_station",
    "amenity/water_point",
    "amenity/watering_place",
    "amenity/weighbridge",
    # Amenity: niche facilities not useful for home buyers
    "amenity/animal_boarding",
    "amenity/animal_breeding",
    "amenity/animal_shelter",
    "amenity/boat_storage",
    "amenity/bureau_de_change",
    "amenity/bus_station",
    "amenity/beachhut",
    "amenity/canteen",
    "amenity/conference_centre",
    "amenity/crematorium",
    "amenity/disused",
    "amenity/driver_training",
    "amenity/driving_school",
    "amenity/escooter_rental",
    "amenity/ferry_terminal",
    "amenity/grave_yard",
    "amenity/hall",
    "amenity/kick-scooter_rental",
    "amenity/money_transfer",
    "amenity/post_depot",
    "amenity/prison",
    "amenity/public_building",
    "amenity/recycling",
    "amenity/scout_hut",
    "amenity/social_facility",
    "amenity/studio",
    "amenity/student_accommodation",
    "amenity/taxi",
    "amenity/telephone_exchange",
    "amenity/training",
    "amenity/vehicle_inspection",
    "amenity/waiting_room",
    "amenity/yes",
    # Leisure & shop odds and ends (placeholder values and fixtures)
    "leisure/bleachers",
    "leisure/schoolyard",
    "leisure/sport",
    "shop/taxi",
    "shop/funeral_directors",
    "shop/disused",
    "shop/no",
    # Buildings. building/church is the one building key that is mapped (see
    # "Place of Worship" in _CATEGORIES); building/university is dropped with
    # the other education keys at the end of this set.
    "building/air_shaft",
    "building/apartments",
    "building/barn",
    "building/bunker",
    "building/chapel",
    "building/commercial",
    "building/construction",
    "building/detached",
    "building/entrance",
    "building/entry",
    "building/farm",
    "building/farm_auxiliary",
    "building/garage",
    "building/garages",
    "building/greenhouse",
    "building/house",
    "building/hut",
    "building/industrial",
    "building/kiosk",
    "building/no",
    "building/office",
    "building/public",
    "building/residential",
    "building/retail",
    "building/roof",
    "building/ruins",
    "building/school",
    "building/semidetached_house",
    "building/service",
    "building/shed",
    "building/terrace",
    "building/warehouse",
    "building/yes",
    # Emergency equipment & access points (emergency/ambulance_station is the
    # only emergency/* key that is mapped in _CATEGORIES)
    "emergency/water_tank",
    "emergency/access_point",
    "emergency/assembly_point",
    "emergency/bleed_control_kit",
    "emergency/defibrillator",
    "emergency/designated",
    "emergency/dry_riser_inlet",
    "emergency/emergency_ward_entrance",
    "emergency/fire_alarm_box",
    "emergency/fire_extinguisher",
    "emergency/fire_hydrant",
    "emergency/fire_service_inlet",
    "emergency/first_aid_kit",
    "emergency/life_ring",
    "emergency/lifeguard",
    "emergency/no",
    "emergency/phone",
    "emergency/rescue_equipment",
    "emergency/siren",
    "emergency/throw_bag",
    "emergency/water_rescue",
    "emergency/yes",
    # Tourism
    "tourism/preserved_railway",
    "tourism/apartment",
    "tourism/apartments",
    "tourism/alpine_hut",
    "tourism/camp_pitch",
    "tourism/caravan_site",
    "tourism/information",
    "tourism/picnic_site",
    "tourism/viewpoint",
    "tourism/village_sign",
    "tourism/wilderness_hut",
    "tourism/yes",
    # Public transport (from NaPTAN instead)
    "public_transport/pay_scale_area",
    "public_transport/entrance",
    "public_transport/platform",
    "public_transport/station",
    "public_transport/stop_position",
    # Education amenities — schools come from GIAS instead. OSM coverage for
    # tertiary education, tutoring, and childcare is too noisy/incomplete to be
    # useful on a property-search map.
    "amenity/school",
    "amenity/prep_school",
    "amenity/language_school",
    "amenity/music_school",
    "amenity/university",
    "amenity/college",
    "building/university",
    "amenity/kindergarten",
    "amenity/childcare",
    "office/tutoring",
}
# Each output category defined once: (group, friendly_name, emoji, [osm_keys...])
# The flat CATEGORY_MAP lookup dict is built from this at the bottom.
_CATEGORIES: list[tuple[str, str, str, list[str]]] = [
(
"Leisure",
"Café",
"",
[
"amenity/cafe",
"amenity/ice_cream",
"amenity/internet_cafe",
],
),
(
"Leisure",
"Restaurant",
"🍽️",
[
"amenity/restaurant",
"amenity/food_court",
],
),
(
"Leisure",
"Pub",
"🍺",
[
"amenity/pub",
"amenity/beer_garden",
"amenity/biergarten",
"amenity/social_club",
"amenity/club",
"leisure/social_club",
"craft/brewery",
"craft/distillery",
"craft/winery",
],
),
(
"Leisure",
"Bar",
"🍸",
[
"amenity/bar",
"amenity/hookah_lounge",
],
),
(
"Leisure",
"Fast Food",
"🍔",
[
"amenity/fast_food",
],
),
(
"Leisure",
"Nightclub",
"🪩",
[
"amenity/nightclub",
"amenity/stripclub",
"amenity/casino",
"amenity/gambling",
],
),
(
"Leisure",
"Cinema",
"🎬",
[
"amenity/cinema",
],
),
(
"Leisure",
"Theatre",
"🎭",
[
"amenity/theatre",
],
),
(
"Leisure",
"Live Music & Events",
"🎶",
[
"amenity/music_venue",
"amenity/events_venue",
"leisure/dance",
],
),
(
"Leisure",
"Park",
"🌳",
[
"leisure/park",
"leisure/garden",
"leisure/common",
"leisure/nature_reserve",
"leisure/dog_park",
"leisure/bandstand",
"leisure/bird_hide",
"leisure/firepit",
"leisure/outdoor_seating",
"leisure/picnic_table",
"leisure/wildlife_hide",
],
),
(
"Leisure",
"Playground",
"🛝",
[
"leisure/playground",
"leisure/indoor_play",
],
),
(
"Leisure",
"Sports Centre",
"🏟️",
[
"leisure/sports_centre",
"leisure/sports_hall",
"leisure/pitch",
"leisure/track",
"leisure/golf_course",
"leisure/miniature_golf",
"leisure/horse_riding",
"leisure/fishing",
"leisure/ice_rink",
"leisure/paddling_pool",
"leisure/practice_pitch",
"leisure/shooting_ground",
"leisure/stadium",
"leisure/swimming_pool",
"leisure/swimming_area",
"leisure/water_park",
"leisure/bathing_place",
],
),
(
"Leisure",
"Entertainment",
"🎳",
[
"leisure/bowling_alley",
"leisure/amusement_arcade",
"leisure/adult_gaming_centre",
"leisure/escape_game",
"leisure/maze",
"leisure/trampoline_park",
"leisure/sauna",
"leisure/tanning_salon",
"shop/amusements",
"tourism/theme_park",
"amenity/bicycle_rental",
"amenity/boat_rental",
"leisure/marina",
"leisure/slipway",
"leisure/hackerspace",
"leisure/yes",
],
),
(
"Groceries",
"Supermarket",
"🛒",
[
"shop/supermarket",
],
),
(
"Groceries",
"Convenience Store",
"🏪",
[
"shop/convenience",
"shop/general",
"shop/kiosk",
"shop/grocery",
],
),
(
"Groceries",
"Bakery",
"🥐",
[
"shop/bakery",
"shop/pastry",
"craft/bakery",
"craft/confectionery",
],
),
(
"Groceries",
"Butcher & Fishmonger",
"🥩",
[
"shop/butcher",
"shop/seafood",
],
),
(
"Groceries",
"Greengrocer",
"🥬",
[
"shop/greengrocer",
"shop/farm",
"shop/market",
"amenity/marketplace",
],
),
(
"Groceries",
"Off-Licence",
"🍷",
[
"shop/alcohol",
"shop/wine",
"shop/beverages",
],
),
(
"Groceries",
"Deli & Specialty",
"🧆",
[
"shop/deli",
"shop/cheese",
"shop/chocolate",
"shop/coffee",
"shop/confectionery",
"shop/dairy",
"shop/food",
"shop/frozen_food",
"shop/health_food",
"shop/ice_cream",
"shop/nutrition_supplements",
"shop/tea",
],
),
(
"Shops",
"Fashion & Clothing",
"👕",
[
"shop/clothes",
"shop/boutique",
"shop/shoes",
"shop/accessories",
"shop/bag",
"shop/fashion_accessories",
"shop/jewelry",
"shop/leather",
"shop/watches",
],
),
(
"Shops",
"Electronics",
"📱",
[
"shop/electronics",
"shop/mobile_phone",
"shop/mobile_phone_accessories",
"shop/computer",
"shop/appliance",
"shop/electrical",
"shop/hifi",
"shop/vacuum_cleaner",
"shop/video_games",
"shop/games",
],
),
(
"Shops",
"Charity Shop",
"❤️",
[
"shop/charity",
"shop/second_hand",
],
),
(
"Shops",
"DIY & Hardware",
"🔨",
[
"shop/doityourself",
"shop/hardware",
"shop/builders_merchant",
"shop/paint",
"shop/plumbing",
],
),
(
"Shops",
"Home & Garden",
"🪑",
[
"shop/furniture",
"shop/garden_centre",
"shop/kitchen",
"shop/bathroom",
"shop/bathroom_furnishing",
"shop/bed",
"shop/carpet",
"shop/curtain",
"shop/flooring",
"shop/fireplace",
"shop/garden_furniture",
"shop/groundskeeping",
"shop/household",
"shop/household_linen",
"shop/houseware",
"shop/homeware",
"shop/interior_decoration",
"shop/lighting",
"shop/kitchenware",
"shop/window_blind",
],
),
(
"Shops",
"Bookshop",
"📚",
[
"shop/books",
"shop/stationery",
],
),
(
"Shops",
"Pet Shop",
"🐾",
[
"shop/pet",
],
),
(
"Shops",
"Sports & Outdoor",
"🏕️",
[
"shop/sports",
"shop/angling",
"shop/outdoor",
"shop/bicycle",
"shop/equestrian",
"shop/surf",
],
),
(
"Shops",
"Newsagent",
"📰",
[
"shop/newsagent",
"shop/tobacco",
],
),
(
"Shops",
"Department Store",
"🏬",
[
"shop/department_store",
"shop/mall",
"shop/variety_store",
"shop/discount",
],
),
(
"Shops",
"Gift & Hobby",
"🎁",
[
"shop/gift",
"shop/florist",
"shop/toys",
"shop/craft",
"shop/candles",
"shop/party",
"shop/art",
"shop/music",
"shop/musical_instrument",
"shop/antiques",
"shop/anime",
"shop/baby_goods",
"shop/fabric",
"shop/haberdashery",
"shop/hobby",
"shop/wool",
"shop/pottery",
],
),
(
"Shops",
"Specialist Shop",
"🏪",
[
"shop/agrarian",
"shop/boat",
"shop/bookmaker",
"shop/building_materials",
"shop/camera",
"shop/cannabis",
"shop/car",
"shop/caravan",
"shop/catalogue",
"shop/auction",
"shop/auction_house",
"shop/chandler",
"shop/collector",
"shop/copyshop",
"shop/country_store",
"shop/doors",
"shop/e-cigarette",
"shop/erotic",
"shop/esoteric",
"shop/fan",
"shop/fireworks",
"shop/fishing",
"shop/frame",
"shop/fuel",
"shop/gas",
"shop/hairdresser_supply",
"shop/military_surplus",
"shop/model",
"shop/money_lender",
"shop/motorcycle",
"shop/outpost",
"shop/pawnbroker",
"shop/photo",
"shop/photo_studio",
"shop/plant_hire",
"shop/printer_ink",
"shop/printing",
"shop/psychic",
"shop/pyrotechnics",
"shop/religion",
"shop/rental",
"shop/scuba_diving",
"shop/security",
"shop/sewing",
"shop/ship_chandler",
"shop/signs",
"shop/storage_rental",
"shop/swimming_pool",
"shop/telecommunication",
"shop/ticket",
"shop/tiles",
"shop/tool_hire",
"shop/trade",
"shop/trophy",
"shop/truck",
"shop/vacant",
"shop/van",
"shop/video",
"shop/water_sports",
"shop/weapons",
"shop/wedding",
"shop/wholesale",
"shop/wigs",
"shop/yes",
],
),
# ── Services ─────────────────────────────────────────────
(
"Services",
"Hairdresser & Beauty",
"💇",
[
"shop/hairdresser",
"shop/beauty",
"shop/cosmetics",
"shop/massage",
"shop/perfumery",
"leisure/spa",
],
),
(
"Services",
"Gym & Fitness",
"🏋️",
[
"leisure/fitness_centre",
"leisure/fitness_station",
"amenity/dojo",
"amenity/dancing_school",
],
),
(
"Services",
"Dry Cleaner & Laundry",
"👔",
[
"shop/dry_cleaning",
"shop/laundry",
"shop/tailor",
"shop/shoe_repair",
"shop/repair",
"craft/cleaning",
"craft/dressmaker",
"craft/shoemaker",
"craft/tailor",
],
),
(
"Services",
"Car Services",
"🔧",
[
"shop/car_repair",
"shop/car;car_repair",
"shop/car_parts",
"shop/motorcycle_repair",
"shop/tyres",
"amenity/car_wash",
"amenity/car_rental",
"amenity/car_sharing",
"amenity/bicycle_repair_station",
],
),
(
"Services",
"Post Office",
"🏤",
[
"amenity/post_office",
],
),
(
"Services",
"Vet & Pet Care",
"🐕",
[
"amenity/veterinary",
"shop/pet_grooming",
],
),
(
"Services",
"Bank",
"🏦",
[
"amenity/bank",
],
),
(
"Services",
"Travel Agent",
"✈️",
[
"shop/travel_agency",
"office/travel_agent",
],
),
(
"Services",
"Other",
"🛎️",
[
"shop/tattoo",
"shop/piercing",
"shop/locksmith",
"craft/key_cutter",
],
),
(
"Emergency Services",
"Police",
"👮",
["amenity/police"],
),
(
"Emergency Services",
"Fire Station",
"🚒",
["amenity/fire_station"],
),
(
"Emergency Services",
"Ambulance Station",
"🚑",
["emergency/ambulance_station"],
),
(
"Health",
"GP Surgery",
"👨‍⚕️",
[
"amenity/doctors",
"healthcare/doctor",
],
),
(
"Health",
"Dentist",
"🦷",
[
"amenity/dentist",
"healthcare/dentist",
],
),
(
"Health",
"Pharmacy",
"💊",
[
"amenity/pharmacy",
"healthcare/pharmacy",
"shop/chemist",
"shop/herbalist",
"shop/health",
"healthcare/alternative",
],
),
(
"Health",
"Hospital & Clinic",
"🏥",
[
"amenity/hospital",
"amenity/clinic",
"amenity/health_centre",
"healthcare/blood_donation",
"healthcare/hospital",
"healthcare/centre",
"healthcare/clinic",
"office/healthcare",
"healthcare/laboratory",
"healthcare/rehabilitation",
"healthcare/vaccination_centre",
"healthcare/yes",
],
),
(
"Health",
"Optician",
"👓",
[
"shop/optician",
"healthcare/optometrist",
"shop/hearing_aids",
"healthcare/audiologist",
],
),
(
"Health",
"Physiotherapy",
"🏃",
[
"healthcare/physiotherapist",
"healthcare/podiatrist",
"healthcare/occupational_therapist",
],
),
(
"Health",
"Counselling & Therapy",
"🧠",
[
"healthcare/counselling",
"healthcare/psychotherapist",
"office/therapist",
],
),
(
"Health",
"Care Home",
"🏠",
[
"amenity/care_home",
"amenity/nursing_home",
"amenity/retirement_home",
"healthcare/hospice",
"healthcare/nursing_home",
"office/home_care",
],
),
(
"Health",
"Medical & Mobility",
"",
[
"shop/medical_supply",
"shop/mobility",
"shop/mobility_scooter",
],
),
(
"Culture",
"Museum",
"🏛️",
[
"tourism/museum",
],
),
(
"Culture",
"Gallery",
"🖼️",
[
"tourism/gallery",
"tourism/artwork",
],
),
(
"Culture",
"Library",
"📚",
[
"amenity/library",
],
),
(
"Culture",
"Place of Worship",
"",
[
"amenity/place_of_worship",
"amenity/monastery",
"building/church",
],
),
(
"Culture",
"Arts Centre",
"🎨",
[
"amenity/arts_centre",
],
),
(
"Culture",
"Zoo",
"🦁",
[
"tourism/zoo",
],
),
(
"Culture",
"Tourist Attraction",
"📸",
[
"tourism/attraction",
"tourism/aquarium",
"amenity/fountain",
"amenity/courthouse",
"tourism/chalet",
],
),
# Note: schools come from the GIAS register (see transform_gias_schools).
# Niche/tertiary education amenities that GIAS does not cover are dropped
# rather than mixed in with state-funded schools.
(
"Local Businesses",
"Hotel",
"🏨",
[
"tourism/hotel",
"tourism/hostel",
"tourism/guest_house",
"tourism/motel",
"tourism/camp_site",
"leisure/resort",
"tourism/holiday_park",
"tourism/self_catering",
],
),
(
"Local Businesses",
"Local Business",
"🛠️",
[
# Tradespeople
"craft/builder",
"craft/carpenter",
"craft/electrician",
"craft/electronics_repair",
"craft/floorer",
"craft/gardener",
"craft/glaziery",
"craft/hvac",
"craft/joiner",
"craft/locksmith",
"craft/painter",
"craft/plumber",
"craft/roofer",
"craft/window_construction",
"craft/agricultural_engines",
"craft/atelier",
"craft/beekeeper",
"craft/blacksmith",
"craft/bookbinder",
"craft/boatbuilder",
"craft/caterer",
"craft/carpet_layer",
"craft/clockmaker",
"craft/handicraft",
"craft/jeweller",
"craft/metal_construction",
"craft/photographer",
"craft/photographic_laboratory",
"craft/plasterer",
"craft/pottery",
"craft/printer",
"craft/sawmill",
"craft/scaffolder",
"craft/sculptor",
"craft/signmaker",
"craft/stonemason",
"craft/upholsterer",
"craft/watchmaker",
"craft/yes",
"amenity/workshop",
"shop/glaziery",
"shop/windows",
# Professional offices & estate agents
"shop/estate_agent",
"office/accountant",
"office/architect",
"office/auctioneer",
"office/builder",
"office/construction",
"office/construction_company",
"office/engineer",
"office/estate_agent",
"office/financial",
"office/financial_advisor",
"office/financial_services",
"office/insurance",
"office/lawyer",
"office/mortgage",
"office/property_management",
"office/solicitor",
"office/solicitors",
"office/surveyor",
"office/tax_advisor",
],
),
(
"Local Businesses",
"Offices",
"🏢",
[
"amenity/coworking_space",
"amenity/research_institute",
"office/administrative",
"office/advertising_agency",
"office/association",
"office/charity",
"office/company",
"office/consulting",
"office/courier",
"office/coworking",
"office/design",
"office/diplomatic",
"office/educational_institution",
"office/employment_agency",
"office/energy_supplier",
"office/foundation",
"office/government",
"office/graphic_design",
"office/interior_design",
"office/it",
"office/logistics",
"office/marketing",
"office/moving_company",
"office/newspaper",
"office/ngo",
"office/notary",
"office/political_party",
"office/politician",
"office/publisher",
"office/quango",
"office/recruitment",
"office/religion",
"office/research",
"office/security",
"office/taxi",
"office/telecommunication",
"office/transport",
"office/union",
"office/university",
"office/vacant",
"office/web_design",
"office/yes",
],
),
# ── Other ────────────────────────────────────────────────
(
"Other",
"EV Charging",
"🔌",
[
"amenity/charging_station",
],
),
(
"Other",
"Fuel Station",
"",
[
"amenity/fuel",
],
),
(
"Other",
"Community Centre",
"🤝",
[
"amenity/church_hall",
"amenity/clubhouse",
"amenity/community_centre",
"amenity/community_hall",
"amenity/scout_hall",
"amenity/social_centre",
"amenity/townhall",
],
),
]
# Flat lookup derived from _CATEGORIES: raw OSM category → (group,
# friendly_name, emoji). If a raw key were listed under two entries, the later
# entry wins, exactly as with a dict comprehension.
CATEGORY_MAP: dict[str, tuple[str, str, str]] = {}
for _entry in _CATEGORIES:
    # _entry is (group, name, emoji, osm_keys); every key shares the 3-tuple.
    CATEGORY_MAP.update(dict.fromkeys(_entry[3], _entry[:3]))
# Emoji per NaPTAN stop category. transform() applies this with
# replace_strict, so a NaPTAN category missing from this table raises instead
# of passing through without an emoji.
NAPTAN_EMOJIS: dict[str, str] = {
    # Road
    "Bus stop": "🚏",
    "Bus station": "🚌",
    "Taxi rank": "🚕",
    # Rail
    "Rail station": "🚆",
    "Tube station": "🚇",
    # Air & water
    "Airport": "✈️",
    "Ferry": "⛴️",
}
COOP_RETAILERS = {
"Allendale Co-operative Society",
"Central England Co-operative",
"Channel Islands Co-operative Society",
"Chelmsford Star Co-operative Society",
"Clydebank Co-operative",
"Coniston Co-operative Society",
"East of England Co-operative",
"Heart of England Co-operative",
"Langdale Co-operative Society",
"Lincolnshire Co-operative",
"Midcounties Co-operative",
"Scottish Midland Co-operative",
"Tamworth Co-operative Society",
"The Co-operative Group",
"The Radstock Co-operative Society",
"The Southern Co-operative",
}
MIN_GROCERY_CHAIN_LOCATIONS = 5
GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES: dict[str, str] = {
"Cook": "COOK",
"Heron": "Heron Foods",
"Marks and Spencer": "M&S",
"Sainsburys": "Sainsbury's",
}
GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
"Aldi": "Aldi",
"Aldi Local": "Aldi",
"Asda": "Asda",
"Asda Express": "Asda Express",
"Asda Living": "Asda Living",
"Asda PFS": "Asda",
"Asda Supercentre": "Asda Supercentre",
"Asda Supermarket": "Asda Supermarket",
"Asda Superstore": "Asda Superstore",
"Booths": "Booths",
"Budgens": "Budgens",
"Centra": "Centra",
"Cooltrader": "Heron Foods",
"Co-op Food": "Co-op",
"Cook": "COOK",
"Costco": "Costco",
"Dunnes Stores": "Dunnes Stores",
"Eurospar": "Spar",
"Eurospar PFS": "Spar",
"Farmfoods": "Farmfoods",
"Heron": "Heron Foods",
"Iceland": "Iceland",
"Lidl": "Lidl",
"Little Waitrose": "Little Waitrose",
"Little Waitrose Shell": "Little Waitrose",
"Makro": "Makro",
"Marks and Spencer": "M&S",
"Marks and Spencer BP": "M&S Food",
"Marks and Spencer Clothing": "M&S Clothing",
"Marks and Spencer Food To Go": "M&S Food",
"Marks and Spencer Food Outlet": "M&S Outlet",
"Marks and Spencer Foodhall": "M&S Food",
"Marks and Spencer Hospital": "M&S Hospital",
"Marks and Spencer MSA": "M&S MSA",
"Marks and Spencer Outlet": "M&S Outlet",
"Marks and Spencer Simply Food": "M&S Food",
"Marks and Spencer Travel SF": "M&S Food",
"Morrisons Daily": "Morrisons Daily",
"Morrisons Select": "Morrisons",
"Planet Organic": "Planet Organic",
"Sainsbury's Local": "Sainsbury's Local",
"Sainsburys": "Sainsbury's",
"Sainsburys Local": "Sainsbury's Local",
"Spar": "Spar",
"Spar PFS": "Spar",
"Tesco": "Tesco",
"Tesco Express": "Tesco Express",
"Tesco Express Esso": "Tesco Express",
"Tesco Extra": "Tesco Extra",
"The Co-operative Food": "Co-op",
"The Co-operative Food PFS": "Co-op",
"The Food Warehouse": "The Food Warehouse",
"Waitrose": "Waitrose",
"Waitrose MSA": "Waitrose",
"Whole Foods Market": "Whole Foods Market",
}
def normalize_grocery_retailer(retailer: str | None) -> str:
if retailer is None:
return ""
retailer = retailer.strip()
if retailer in COOP_RETAILERS:
return "Co-op"
return GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES.get(retailer, retailer)
def normalize_grocery_icon_category(fascia: str | None, retailer: str | None) -> str:
if fascia:
icon_name = GROCERY_FASCIA_ICON_NAMES.get(fascia.strip())
if icon_name is not None:
return icon_name
return normalize_grocery_retailer(retailer)
def transform_grocery_retail_points(
    grocery_df: pl.DataFrame,
    boundary_path: Path | None = None,
    min_chain_locations: int = MIN_GROCERY_CHAIN_LOCATIONS,
) -> pl.DataFrame:
    """Convert GEOLYTIX Grocery Retail Points into the POI parquet schema.

    Args:
        grocery_df: Raw retail points; must contain the columns ``id``,
            ``retailer``, ``fascia``, ``store_name``, ``long_wgs`` and
            ``lat_wgs``.
        boundary_path: Optional boundary file passed to ``in_england_mask``;
            when given, points for which the mask is false are removed.
        min_chain_locations: Minimum number of remaining rows a raw retailer
            name needs for its stores to be kept.

    Returns:
        A frame with the columns ``id``, ``name``, ``category``,
        ``icon_category``, ``group``, ``lat``, ``lng`` and ``emoji``.

    Raises:
        ValueError: If any required column is missing from ``grocery_df``.
    """
    expected_columns = {"id", "retailer", "fascia", "store_name", "long_wgs", "lat_wgs"}
    missing = expected_columns.difference(grocery_df.columns)
    if missing:
        raise ValueError(f"GEOLYTIX retail points missing columns: {sorted(missing)}")

    text_columns = ("retailer", "fascia", "store_name")

    # Normalise dtypes and column names, then trim whitespace on text fields.
    typed = grocery_df.select(
        pl.col("id").cast(pl.String),
        pl.col("retailer").cast(pl.String),
        pl.col("fascia").cast(pl.String),
        pl.col("store_name").cast(pl.String),
        pl.col("lat_wgs").cast(pl.Float64).alias("lat"),
        pl.col("long_wgs").cast(pl.Float64).alias("lng"),
    )
    trimmed = typed.with_columns(
        *(pl.col(column).str.strip_chars() for column in text_columns)
    )

    # A usable row needs an id, coordinates and a non-blank retailer.
    df = trimmed.drop_nulls(["id", "retailer", "lat", "lng"]).filter(
        pl.col("retailer").str.len_chars() > 0
    )

    if boundary_path is not None and len(df) > 0:
        inside = in_england_mask(
            boundary_path,
            df["lat"].to_numpy(),
            df["lng"].to_numpy(),
        )
        df = df.filter(pl.Series(inside))

    # Chain size is counted per raw retailer name, before display-name
    # normalisation and after the boundary filter.
    retailer_sizes = df.group_by("retailer").len()
    chains = retailer_sizes.filter(pl.col("len") >= min_chain_locations).select(
        "retailer"
    )
    df = df.join(chains, on="retailer", how="semi")

    poi_id = pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id")
    # Prefer the most specific label available; collapse doubled apostrophes.
    poi_name = (
        pl.coalesce(["store_name", "fascia", "retailer"])
        .str.replace_all("''", "'")
        .alias("name")
    )
    category = (
        pl.col("retailer")
        .map_elements(normalize_grocery_retailer, return_dtype=pl.String)
        .alias("category")
    )
    icon_category = (
        pl.struct(["fascia", "retailer"])
        .map_elements(
            lambda row: normalize_grocery_icon_category(row["fascia"], row["retailer"]),
            return_dtype=pl.String,
        )
        .alias("icon_category")
    )

    enriched = df.with_columns(
        poi_id,
        poi_name,
        category,
        icon_category,
        pl.lit("Groceries").alias("group"),
        pl.lit("🛒").alias("emoji"),
    )
    return enriched.select(
        "id", "name", "category", "icon_category", "group", "lat", "lng", "emoji"
    )
# School icon category → emoji. The keys are exactly the labels produced by
# _school_icon_category_expr(); transform_gias_schools() applies this table
# with replace_strict, so every label produced there needs an entry here.
SCHOOL_ICON_CATEGORIES: dict[str, str] = {
    # Phase-based buckets
    "Nursery school": "🧸",
    "Primary school": "🎒",
    "Secondary school": "🏫",
    "All-through school": "🏫",
    "Sixth form": "📚",
    # Type-group buckets
    "Further education college": "📚",
    "University": "🎓",
    "Special school": "🤝",
    # Catch-all when neither phase nor age range decides
    "School": "🏫",
}
def _school_icon_category_expr() -> pl.Expr:
    """Build the expression that picks a school icon category.

    The expression reads the GIAS columns ``phase``, ``type_group`` and
    ``age_range``. ``type_group`` wins for universities, FE colleges and
    special schools (which span multiple phases); otherwise ``phase``
    determines the bucket. For rows where ``phase`` is null (independent and
    other non-statutory schools), the ``age_range`` bounds are used so they
    still split into the right pill. Rows that match nothing become
    ``"School"``.

    Returns:
        A polars expression yielding one of the keys of
        ``SCHOOL_ICON_CATEGORIES``.
    """
    # GIAS phase mixes casing ("Middle deemed Primary" vs "Middle deemed
    # primary") so we normalise before matching.
    phase = pl.col("phase").str.to_lowercase()

    # age_range is "<min><dash><max>"; both ends may be missing. The separator
    # is matched as "any common dash" (hyphen-minus, hyphen, non-breaking
    # hyphen, figure dash, en dash, em dash, minus sign) and spelled with
    # escapes, so the parse neither depends on one exact glyph nor breaks if
    # that glyph is lost from this file. Splitting on an empty separator can
    # never recover the two bounds.
    dash = "[-\u2010\u2011\u2012\u2013\u2014\u2212]"
    age_range = pl.col("age_range")
    # Lower bound: leading digits followed by a dash or the end of the string.
    min_age = age_range.str.extract(rf"^\s*(\d+)\s*(?:{dash}|$)", 1).cast(
        pl.Int32, strict=False
    )
    # Upper bound: trailing digits that directly follow a dash.
    max_age = age_range.str.extract(rf"{dash}\s*(\d+)\s*$", 1).cast(
        pl.Int32, strict=False
    )

    return (
        pl.when(pl.col("type_group") == "Universities")
        .then(pl.lit("University"))
        .when(pl.col("type_group") == "Special schools")
        .then(pl.lit("Special school"))
        .when(pl.col("type_group") == "Colleges")
        .then(pl.lit("Further education college"))
        .when(phase == "nursery")
        .then(pl.lit("Nursery school"))
        .when(phase.is_in(["primary", "middle deemed primary"]))
        .then(pl.lit("Primary school"))
        .when(phase.is_in(["secondary", "middle deemed secondary"]))
        .then(pl.lit("Secondary school"))
        .when(phase == "all-through")
        .then(pl.lit("All-through school"))
        .when(phase.is_in(["16 plus", "sixth form"]))
        .then(pl.lit("Sixth form"))
        # Age-range fallback for null-phase rows (≈3k Independents + Academies
        # GIAS doesn't classify by phase). A missing bound is null, which makes
        # its comparison null and lets the row fall through to the next branch.
        .when(max_age <= 5)
        .then(pl.lit("Nursery school"))
        .when(min_age >= 16)
        .then(pl.lit("Sixth form"))
        .when((min_age <= 6) & (max_age >= 16))
        .then(pl.lit("All-through school"))
        .when(max_age <= 11)
        .then(pl.lit("Primary school"))
        .when(min_age >= 10)
        .then(pl.lit("Secondary school"))
        .otherwise(pl.lit("School"))
    )
# Ofsted OEIF overall-effectiveness grade (stored as a string) → label.
OFSTED_OEIF_LABELS = dict(
    zip(
        ("1", "2", "3", "4"),
        ("Outstanding", "Good", "Requires improvement", "Inadequate"),
    )
)
def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
    """Lazily load per-URN Ofsted ratings as human-readable labels.

    Reads the ``URN`` and ``Latest OEIF overall effectiveness`` columns of the
    parquet at ``ofsted_path``. Grades "1"–"4" are translated through
    ``OFSTED_OEIF_LABELS``; "Not judged" (post-2025 reform schools that only
    have a report card) is kept verbatim; any other value, including null,
    yields no rating and the row is dropped.

    Returns:
        A LazyFrame with the columns ``urn`` (Int64) and ``ofsted_rating``,
        ready to be joined onto the GIAS register.
    """
    grade = pl.col("Latest OEIF overall effectiveness")

    # Build the when/then chain grade by grade, in the order 1, 2, 3, 4.
    grades = ("1", "2", "3", "4")
    rating = pl.when(grade == grades[0]).then(pl.lit(OFSTED_OEIF_LABELS[grades[0]]))
    for code in grades[1:]:
        rating = rating.when(grade == code).then(pl.lit(OFSTED_OEIF_LABELS[code]))
    rating = (
        rating.when(grade == "Not judged").then(pl.lit("Not judged")).otherwise(None)
    )

    ratings = pl.scan_parquet(ofsted_path).select(
        pl.col("URN").cast(pl.Int64).alias("urn"),
        rating.alias("ofsted_rating"),
    )
    return ratings.filter(pl.col("ofsted_rating").is_not_null())
def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
    """Turn the GIAS register parquet into POI rows carrying school metadata.

    Ofsted ratings are left-joined by ``urn`` so each school carries its
    latest OEIF overall effectiveness grade (Outstanding/Good/Requires
    improvement/Inadequate/Not judged) where one exists, surfaced in the map
    popup.

    Args:
        gias_path: GIAS register parquet.
        ofsted_path: Ofsted parquet read by ``_load_ofsted_ratings``.

    Returns:
        A LazyFrame with the common POI columns followed by ``school_*``
        metadata columns.
    """
    school_kind = _school_icon_category_expr()
    ratings = _load_ofsted_ratings(ofsted_path)

    # category mirrors icon_category so the dashboard renders one toggle per
    # school type (Nursery / Primary / Secondary / Sixth form / University /…)
    # instead of bundling every GIAS row under a single "School" pill.
    poi_columns = [
        pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
        pl.col("name"),
        school_kind.alias("category"),
        school_kind.alias("icon_category"),
        pl.lit("Education").alias("group"),
        pl.col("lat").cast(pl.Float64),
        pl.col("lng").cast(pl.Float64),
        school_kind.replace_strict(SCHOOL_ICON_CATEGORIES).alias("emoji"),
    ]

    # Source column → optional non-strict cast. Each becomes "school_<source>";
    # the order here is the output column order.
    metadata_spec = (
        ("phase", None),
        ("type", None),
        ("type_group", None),
        ("age_range", None),
        ("gender", None),
        ("religious_character", None),
        ("admissions_policy", None),
        ("nursery_provision", None),
        ("sixth_form", None),
        ("capacity", pl.Int32),
        ("pupils", pl.Int32),
        ("fsm_percent", pl.Float32),
        ("trust", None),
        ("address", None),
        ("postcode", None),
        ("local_authority", None),
        ("website", None),
        ("telephone", pl.String),
        ("head_name", None),
        ("ofsted_rating", None),
    )
    metadata_columns = []
    for source, dtype in metadata_spec:
        column = pl.col(source)
        if dtype is not None:
            column = column.cast(dtype, strict=False)
        metadata_columns.append(column.alias(f"school_{source}"))

    schools = pl.scan_parquet(gias_path).join(ratings, on="urn", how="left")
    return schools.select(*poi_columns, *metadata_columns)
def transform(
    input_path: Path,
    naptan_path: Path,
    boundary_path: Path,
    grocery_retail_points_path: Path,
    gias_path: Path,
    ofsted_path: Path,
) -> pl.LazyFrame:
    """Build the combined POI frame from OSM, NaPTAN, GEOLYTIX and GIAS data.

    Args:
        input_path: Raw OSM POI parquet with a ``category`` column.
        naptan_path: NaPTAN parquet with ``lat``, ``lng`` and ``category``.
        boundary_path: Boundary file passed to ``in_england_mask``.
        grocery_retail_points_path: GEOLYTIX Grocery Retail Points parquet.
        gias_path: GIAS schools register parquet.
        ofsted_path: Ofsted parquet providing per-URN ratings.

    Returns:
        A LazyFrame concatenating all four sources (``diagonal_relaxed``, so
        columns missing from a source are filled with nulls).

    Raises:
        ValueError: If CATEGORY_MAP has an empty friendly name or emoji, or if
            the input holds a category that is neither dropped nor mapped.
    """
    # Sanity-check the static mapping first so a broken table fails before any
    # data is scanned.
    missing_names = [k for k, v in CATEGORY_MAP.items() if not v[1]]
    if missing_names:
        raise ValueError(f"Empty friendly names for: {missing_names}")
    missing_emojis = [k for k, v in CATEGORY_MAP.items() if not v[2]]
    if missing_emojis:
        raise ValueError(f"Empty emojis for: {missing_emojis}")

    lf = pl.scan_parquet(input_path)

    # All distinct categories present in the data.
    all_categories = set(
        lf.select("category").unique().collect(engine="streaming").to_series().to_list()
    )

    # Every non-dropped category must have a mapping. Sorting uses key=str so
    # a null category (None) is reported in the ValueError rather than
    # crashing sorted() with a TypeError when mixed with strings.
    unmapped = [
        cat
        for cat in all_categories
        if cat not in DROP_CATEGORIES and cat not in CATEGORY_MAP
    ]
    if unmapped:
        raise ValueError(
            f"Categories missing from CATEGORY_MAP: {sorted(unmapped, key=str)}"
        )

    # Warn about CATEGORY_MAP keys not in data (may be absent in regional extracts)
    mapped_but_absent = sorted(set(CATEGORY_MAP) - all_categories)
    if mapped_but_absent:
        print(f"CATEGORY_MAP categories not in data (skipped): {mapped_but_absent}")

    # Drop unwanted categories
    lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))

    # Split the 3-tuple mapping into one lookup per output column.
    group_mapping = {k: v[0] for k, v in CATEGORY_MAP.items()}
    name_mapping = {k: v[1] for k, v in CATEGORY_MAP.items()}
    emoji_mapping = {k: v[2] for k, v in CATEGORY_MAP.items()}

    # All four expressions read the raw OSM "category"; expressions inside one
    # with_columns call are evaluated against the input frame, so overwriting
    # "category" here does not affect the other three lookups.
    lf = lf.with_columns(
        pl.col("category").replace_strict(group_mapping).alias("group"),
        pl.col("category").replace_strict(name_mapping).alias("category"),
        pl.col("category").replace_strict(name_mapping).alias("icon_category"),
        pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
    )

    # NaPTAN is collected eagerly because the boundary mask works on numpy
    # coordinate arrays.
    naptan_df = pl.scan_parquet(naptan_path).collect()
    mask = in_england_mask(
        boundary_path,
        naptan_df["lat"].to_numpy(),
        naptan_df["lng"].to_numpy(),
    )
    naptan_df = naptan_df.filter(pl.Series(mask))
    naptan = naptan_df.lazy().with_columns(
        pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
        pl.lit("Public Transport").alias("group"),
        pl.col("category").alias("icon_category"),
    )

    grocery_df = pl.read_parquet(grocery_retail_points_path)
    grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)

    frames = [
        lf,
        naptan,
        grocery_pois.lazy(),
        transform_gias_schools(gias_path, ofsted_path),
    ]
    return pl.concat(frames, how="diagonal_relaxed")
def main():
    """Command-line entry point: build the POI parquet and print a summary.

    Parses the input/output paths, runs ``transform`` with the streaming
    engine, writes the result to ``--output`` and prints the file size, the
    POI count and a per-category breakdown sorted by descending count.
    """
    parser = argparse.ArgumentParser(
        description="Transform raw POIs to filtered version with friendly names"
    )
    # Every option is a required filesystem path; only flag and help differ.
    path_options = (
        ("--input", "Raw POIs parquet file"),
        ("--naptan", "NaPTAN stations parquet file"),
        ("--boundary", "England boundary GeoJSON file"),
        ("--grocery-retail-points", "GEOLYTIX Grocery Retail Points parquet"),
        ("--gias", "GIAS schools register parquet (replaces OSM schools)"),
        ("--ofsted", "Ofsted latest-inspections parquet (provides per-URN ratings)"),
        ("--output", "Output filtered POIs parquet file"),
    )
    for flag, description in path_options:
        parser.add_argument(flag, type=Path, required=True, help=description)
    args = parser.parse_args()

    pois = transform(
        args.input,
        args.naptan,
        args.boundary,
        args.grocery_retail_points,
        args.gias,
        args.ofsted,
    )
    df = pois.collect(engine="streaming")
    df.write_parquet(args.output)

    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB, {len(df):,} POIs)")

    print(f"\nCategories ({df['category'].n_unique()}):")
    per_category = df.group_by("group", "category", "emoji").len()
    for row in per_category.sort("len", descending=True).iter_rows(named=True):
        print(f" [{row['group']}] {row['emoji']} {row['category']}: {row['len']:,}")


if __name__ == "__main__":
    main()