perfect-postcode/pipeline/transform/transform_poi.py
2026-06-05 10:31:00 +01:00

1739 lines
50 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import argparse
from pathlib import Path
import numpy as np
import polars as pl
from pipeline.utils.england_geometry import in_england_mask
# OSM "key/value" categories that are deliberately excluded from the POI output.
# transform() treats a category as handled when it appears either here or in
# CATEGORY_MAP; anything in neither raises a ValueError there.
DROP_CATEGORIES: set[str] = {
    # GEOLYTIX Grocery Retail Points is the authoritative supermarket source
    # (transform_grocery_retail_points), so drop OSM supermarkets to avoid
    # double-counting each store as both a GEOLYTIX brand and an OSM "Supermarket".
    "shop/supermarket",
    # Street furniture & infrastructure (a few non-amenity keys are interleaved
    # in this run; set membership does not depend on ordering)
    "amenity/advice",
    "amenity/atm",
    "amenity/bbq",
    "amenity/bench",
    "amenity/bicycle_parking",
    "amenity/binoculars",
    "amenity/boot_scraper",
    "amenity/bus_garage",
    "amenity/check_in",
    "amenity/clock",
    "amenity/clothes_dryer",
    "amenity/coast_guard",
    "amenity/coffin_rest",
    "amenity/compressed_air",
    "amenity/court_yard",
    "amenity/donation_box",
    "amenity/dressing_room",
    "amenity/drinking_water",
    "emergency/water_tank",
    "leisure/bleachers",
    "leisure/schoolyard",
    "public_transport/pay_scale_area",
    "shop/taxi",
    "amenity/feeding_place",
    "amenity/fixme",
    "amenity/grit_bin",
    "amenity/hunting_stand",
    "amenity/letter_box",
    "amenity/loading_dock",
    "amenity/lounge",
    "tourism/preserved_railway",
    "amenity/lounger",
    "leisure/sport",
    "amenity/motorcycle_parking",
    "amenity/mounting_block",
    "amenity/notice_board",
    "amenity/parcel_locker",
    "amenity/parking",
    "amenity/parking_entrance",
    "amenity/parking_space",
    "amenity/payment_terminal",
    "amenity/photo_booth",
    "amenity/piano",
    "amenity/post_box",
    "amenity/public_bookcase",
    "amenity/reception_desk",
    "amenity/sanitary_dump_station",
    "amenity/shelter",
    "amenity/shower",
    "amenity/smoking_area",
    "amenity/table",
    "amenity/telephone",
    "amenity/telescope",
    "amenity/ticket_validator",
    "amenity/toilets",
    "amenity/trolley_bay",
    "amenity/vacuum_cleaner",
    "amenity/vending_machine",
    "amenity/washing_machine",
    "amenity/washingline",
    "amenity/waste_basket",
    "amenity/waste_disposal",
    "amenity/waste_transfer_station",
    "amenity/water_point",
    "amenity/watering_place",
    "amenity/weighbridge",
    # Niche amenities not useful for home buyers
    "amenity/animal_boarding",
    "amenity/animal_breeding",
    "amenity/animal_shelter",
    "amenity/boat_storage",
    "amenity/bureau_de_change",
    "amenity/bus_station",
    "amenity/beachhut",
    "amenity/canteen",
    "amenity/conference_centre",
    "amenity/crematorium",
    "amenity/disused",
    "amenity/driver_training",
    "amenity/driving_school",
    "amenity/escooter_rental",
    "amenity/ferry_terminal",
    "amenity/grave_yard",
    "amenity/hall",
    "shop/funeral_directors",
    "amenity/kick-scooter_rental",
    "amenity/money_transfer",
    "amenity/post_depot",
    "amenity/prison",
    "amenity/public_building",
    "amenity/recycling",
    "amenity/scout_hut",
    "amenity/social_facility",
    "amenity/studio",
    "amenity/student_accommodation",
    "amenity/taxi",
    "amenity/telephone_exchange",
    "amenity/training",
    "amenity/vehicle_inspection",
    "amenity/waiting_room",
    "amenity/yes",
    "shop/disused",
    "shop/no",
    # Buildings. "building/church" is not listed because it is mapped to
    # "Place of Worship" in _CATEGORIES; "building/university" is dropped with
    # the education entries further down.
    "building/air_shaft",
    "building/apartments",
    "building/barn",
    "building/bunker",
    "building/chapel",
    "building/commercial",
    "building/construction",
    "building/detached",
    "building/entrance",
    "building/entry",
    "building/farm",
    "building/farm_auxiliary",
    "building/garage",
    "building/garages",
    "building/greenhouse",
    "building/house",
    "building/hut",
    "building/industrial",
    "building/kiosk",
    "building/no",
    "building/office",
    "building/public",
    "building/residential",
    "building/retail",
    "building/roof",
    "building/ruins",
    "building/school",
    "building/semidetached_house",
    "building/service",
    "building/shed",
    "building/terrace",
    "building/warehouse",
    "building/yes",
    # Emergency equipment and access features ("emergency/ambulance_station"
    # is mapped in _CATEGORIES instead)
    "emergency/access_point",
    "emergency/assembly_point",
    "emergency/bleed_control_kit",
    "emergency/defibrillator",
    "emergency/designated",
    "emergency/dry_riser_inlet",
    "emergency/emergency_ward_entrance",
    "emergency/fire_alarm_box",
    "emergency/fire_extinguisher",
    "emergency/fire_hydrant",
    "emergency/fire_service_inlet",
    "emergency/first_aid_kit",
    "emergency/life_ring",
    "emergency/lifeguard",
    "emergency/no",
    "emergency/phone",
    "emergency/rescue_equipment",
    "emergency/siren",
    "emergency/throw_bag",
    "emergency/water_rescue",
    "emergency/yes",
    # Tourism values that are dropped (other tourism/* values are mapped in
    # _CATEGORIES)
    "tourism/apartment",
    "tourism/apartments",
    "tourism/alpine_hut",
    "tourism/camp_pitch",
    "tourism/caravan_site",
    "tourism/information",
    "tourism/picnic_site",
    "tourism/viewpoint",
    "tourism/village_sign",
    "tourism/wilderness_hut",
    "tourism/yes",
    # Public transport (from NaPTAN instead)
    "public_transport/entrance",
    "public_transport/platform",
    "public_transport/station",
    "public_transport/stop_position",
    # Education amenities — schools come from GIAS instead. OSM coverage for
    # tertiary education, tutoring, and childcare is too noisy/incomplete to be
    # useful on a property-search map.
    "amenity/school",
    "amenity/prep_school",
    "amenity/language_school",
    "amenity/music_school",
    "amenity/university",
    "amenity/college",
    "building/university",
    "amenity/kindergarten",
    "amenity/childcare",
    "office/tutoring",
}
# Each output category defined once: (group, friendly_name, emoji, [osm_keys...])
# The flat CATEGORY_MAP lookup dict is built from this list immediately after it.
# Several OSM keys may share one output category; each OSM key should appear in
# at most one entry, because CATEGORY_MAP keeps only the last mapping for a
# repeated key.
_CATEGORIES: list[tuple[str, str, str, list[str]]] = [
    # ── Leisure ──────────────────────────────────────────────
    (
        "Leisure",
        "Café",
        # NOTE(review): emoji is an empty string here — possibly lost to an
        # encoding/extraction issue; confirm the intended glyph.
        "",
        [
            "amenity/cafe",
            "amenity/ice_cream",
            "amenity/internet_cafe",
        ],
    ),
    (
        "Leisure",
        "Restaurant",
        "🍽️",
        [
            "amenity/restaurant",
            "amenity/food_court",
        ],
    ),
    (
        "Leisure",
        "Pub",
        "🍺",
        [
            "amenity/pub",
            "amenity/beer_garden",
            "amenity/biergarten",
            "amenity/social_club",
            "amenity/club",
            "leisure/social_club",
            "craft/brewery",
            "craft/distillery",
            "craft/winery",
        ],
    ),
    (
        "Leisure",
        "Bar",
        "🍸",
        [
            "amenity/bar",
            "amenity/hookah_lounge",
        ],
    ),
    (
        "Leisure",
        "Fast Food",
        "🍔",
        [
            "amenity/fast_food",
        ],
    ),
    (
        "Leisure",
        "Nightclub",
        "🪩",
        [
            "amenity/nightclub",
            "amenity/stripclub",
            "amenity/casino",
            "amenity/gambling",
        ],
    ),
    (
        "Leisure",
        "Cinema",
        "🎬",
        [
            "amenity/cinema",
        ],
    ),
    (
        "Leisure",
        "Theatre",
        "🎭",
        [
            "amenity/theatre",
        ],
    ),
    (
        "Leisure",
        "Live Music & Events",
        "🎶",
        [
            "amenity/music_venue",
            "amenity/events_venue",
            "leisure/dance",
        ],
    ),
    (
        "Leisure",
        "Park",
        "🌳",
        [
            "leisure/park",
            "leisure/garden",
            "leisure/common",
            "leisure/nature_reserve",
            "leisure/dog_park",
            "leisure/bandstand",
            "leisure/bird_hide",
            "leisure/firepit",
            "leisure/outdoor_seating",
            "leisure/picnic_table",
            "leisure/wildlife_hide",
        ],
    ),
    (
        "Leisure",
        "Playground",
        "🛝",
        [
            "leisure/playground",
            "leisure/indoor_play",
        ],
    ),
    (
        "Leisure",
        "Sports Centre",
        "🏟️",
        [
            "leisure/sports_centre",
            "leisure/sports_hall",
            "leisure/pitch",
            "leisure/track",
            "leisure/golf_course",
            "leisure/miniature_golf",
            "leisure/horse_riding",
            "leisure/fishing",
            "leisure/ice_rink",
            "leisure/paddling_pool",
            "leisure/practice_pitch",
            "leisure/shooting_ground",
            "leisure/stadium",
            "leisure/swimming_pool",
            "leisure/swimming_area",
            "leisure/water_park",
            "leisure/bathing_place",
        ],
    ),
    (
        "Leisure",
        "Entertainment",
        "🎳",
        [
            "leisure/bowling_alley",
            "leisure/amusement_arcade",
            "leisure/adult_gaming_centre",
            "leisure/escape_game",
            "leisure/maze",
            "leisure/trampoline_park",
            "leisure/sauna",
            "leisure/tanning_salon",
            "shop/amusements",
            "tourism/theme_park",
            "amenity/bicycle_rental",
            "amenity/boat_rental",
            "leisure/marina",
            "leisure/slipway",
            "leisure/hackerspace",
            "leisure/yes",
        ],
    ),
    # ── Groceries ────────────────────────────────────────────
    (
        "Groceries",
        "Convenience Store",
        "🏪",
        [
            "shop/convenience",
            "shop/general",
            "shop/kiosk",
            "shop/grocery",
        ],
    ),
    (
        "Groceries",
        "Bakery",
        "🥐",
        [
            "shop/bakery",
            "shop/pastry",
            "craft/bakery",
            "craft/confectionery",
        ],
    ),
    (
        "Groceries",
        "Butcher & Fishmonger",
        "🥩",
        [
            "shop/butcher",
            "shop/seafood",
        ],
    ),
    (
        "Groceries",
        "Greengrocer",
        "🥬",
        [
            "shop/greengrocer",
            "shop/farm",
            "shop/market",
            "amenity/marketplace",
        ],
    ),
    (
        "Groceries",
        "Off-Licence",
        "🍷",
        [
            "shop/alcohol",
            "shop/wine",
            "shop/beverages",
        ],
    ),
    (
        "Groceries",
        "Deli & Specialty",
        "🧆",
        [
            "shop/deli",
            "shop/cheese",
            "shop/chocolate",
            "shop/coffee",
            "shop/confectionery",
            "shop/dairy",
            "shop/food",
            "shop/frozen_food",
            "shop/health_food",
            "shop/ice_cream",
            "shop/nutrition_supplements",
            "shop/tea",
        ],
    ),
    # ── Shops ────────────────────────────────────────────────
    (
        "Shops",
        "Fashion & Clothing",
        "👕",
        [
            "shop/clothes",
            "shop/boutique",
            "shop/shoes",
            "shop/accessories",
            "shop/bag",
            "shop/fashion_accessories",
            "shop/jewelry",
            "shop/leather",
            "shop/watches",
        ],
    ),
    (
        "Shops",
        "Electronics",
        "📱",
        [
            "shop/electronics",
            "shop/mobile_phone",
            "shop/mobile_phone_accessories",
            "shop/computer",
            "shop/appliance",
            "shop/electrical",
            "shop/hifi",
            "shop/vacuum_cleaner",
            "shop/video_games",
            "shop/games",
        ],
    ),
    (
        "Shops",
        "Charity Shop",
        "❤️",
        [
            "shop/charity",
            "shop/second_hand",
        ],
    ),
    (
        "Shops",
        "DIY & Hardware",
        "🔨",
        [
            "shop/doityourself",
            "shop/hardware",
            "shop/builders_merchant",
            "shop/paint",
            "shop/plumbing",
        ],
    ),
    (
        "Shops",
        "Home & Garden",
        "🪑",
        [
            "shop/furniture",
            "shop/garden_centre",
            "shop/garden_machinery",
            "shop/kitchen",
            "shop/bathroom",
            "shop/bathroom_furnishing",
            "shop/bed",
            "shop/carpet",
            "shop/curtain",
            "shop/flooring",
            "shop/fireplace",
            "shop/garden_furniture",
            "shop/groundskeeping",
            "shop/household",
            "shop/household_linen",
            "shop/houseware",
            "shop/homeware",
            "shop/interior_decoration",
            "shop/lighting",
            "shop/kitchenware",
            "shop/window_blind",
        ],
    ),
    (
        "Shops",
        "Bookshop",
        "📚",
        [
            "shop/books",
            "shop/stationery",
        ],
    ),
    (
        "Shops",
        "Pet Shop",
        "🐾",
        [
            "shop/pet",
        ],
    ),
    (
        "Shops",
        "Sports & Outdoor",
        "🏕️",
        [
            "shop/sports",
            "shop/angling",
            "shop/outdoor",
            "shop/bicycle",
            "shop/equestrian",
            "shop/surf",
        ],
    ),
    (
        "Shops",
        "Newsagent",
        "📰",
        [
            "shop/newsagent",
            "shop/tobacco",
        ],
    ),
    (
        "Shops",
        "Department Store",
        "🏬",
        [
            "shop/department_store",
            "shop/mall",
            "shop/variety_store",
            "shop/discount",
        ],
    ),
    (
        "Shops",
        "Gift & Hobby",
        "🎁",
        [
            "shop/gift",
            "shop/florist",
            "shop/toys",
            "shop/craft",
            "shop/candles",
            "shop/party",
            "shop/art",
            "shop/music",
            "shop/musical_instrument",
            "shop/antiques",
            "shop/anime",
            "shop/baby_goods",
            "shop/fabric",
            "shop/haberdashery",
            "shop/hobby",
            "shop/wool",
            "shop/pottery",
        ],
    ),
    (
        # Catch-all bucket for shop/* values with no more specific category
        # above, including the generic "shop/yes" and "shop/vacant".
        "Shops",
        "Specialist Shop",
        "🏪",
        [
            "shop/agrarian",
            "shop/boat",
            "shop/bookmaker",
            "shop/building_materials",
            "shop/camera",
            "shop/cannabis",
            "shop/car",
            "shop/caravan",
            "shop/catalogue",
            "shop/auction",
            "shop/auction_house",
            "shop/chandler",
            "shop/collector",
            "shop/copyshop",
            "shop/country_store",
            "shop/doors",
            "shop/e-cigarette",
            "shop/erotic",
            "shop/esoteric",
            "shop/fan",
            "shop/fireworks",
            "shop/fishing",
            "shop/frame",
            "shop/fuel",
            "shop/gas",
            "shop/hairdresser_supply",
            "shop/military_surplus",
            "shop/model",
            "shop/money_lender",
            "shop/motorcycle",
            "shop/outpost",
            "shop/pawnbroker",
            "shop/photo",
            "shop/photo_studio",
            "shop/plant_hire",
            "shop/printer_ink",
            "shop/printing",
            "shop/psychic",
            "shop/pyrotechnics",
            "shop/religion",
            "shop/rental",
            "shop/scuba_diving",
            "shop/security",
            "shop/sewing",
            "shop/ship_chandler",
            "shop/signs",
            "shop/storage_rental",
            "shop/swimming_pool",
            "shop/telecommunication",
            "shop/ticket",
            "shop/tiles",
            "shop/tool_hire",
            "shop/trade",
            "shop/trophy",
            "shop/truck",
            "shop/vacant",
            "shop/van",
            "shop/video",
            "shop/water_sports",
            "shop/weapons",
            "shop/wedding",
            "shop/wholesale",
            "shop/wigs",
            "shop/yes",
        ],
    ),
    # ── Services ─────────────────────────────────────────────
    (
        "Services",
        "Hairdresser & Beauty",
        "💇",
        [
            "shop/hairdresser",
            "shop/beauty",
            "shop/cosmetics",
            "shop/massage",
            "shop/perfumery",
            "leisure/spa",
        ],
    ),
    (
        "Services",
        "Gym & Fitness",
        "🏋️",
        [
            "leisure/fitness_centre",
            "leisure/fitness_station",
            "amenity/dojo",
            "amenity/dancing_school",
        ],
    ),
    (
        "Services",
        "Dry Cleaner & Laundry",
        "👔",
        [
            "shop/dry_cleaning",
            "shop/laundry",
            "shop/tailor",
            "shop/shoe_repair",
            "shop/repair",
            "craft/cleaning",
            "craft/dressmaker",
            "craft/shoemaker",
            "craft/tailor",
        ],
    ),
    (
        "Services",
        "Car Services",
        "🔧",
        [
            "shop/car_repair",
            # Multi-valued OSM tag kept verbatim as its own key.
            "shop/car;car_repair",
            "shop/car_parts",
            "shop/motorcycle_repair",
            "shop/tyres",
            "amenity/car_wash",
            "amenity/car_rental",
            "amenity/car_sharing",
            "amenity/bicycle_repair_station",
        ],
    ),
    (
        "Services",
        "Post Office",
        "🏤",
        [
            "amenity/post_office",
        ],
    ),
    (
        "Services",
        "Vet & Pet Care",
        "🐕",
        [
            "amenity/veterinary",
            "shop/pet_grooming",
        ],
    ),
    (
        "Services",
        "Bank",
        "🏦",
        [
            "amenity/bank",
        ],
    ),
    (
        "Services",
        "Travel Agent",
        "✈️",
        [
            "shop/travel_agency",
            "office/travel_agent",
        ],
    ),
    (
        "Services",
        "Other",
        "🛎️",
        [
            "shop/tattoo",
            "shop/piercing",
            "shop/locksmith",
            "craft/key_cutter",
        ],
    ),
    # ── Emergency Services ───────────────────────────────────
    (
        "Emergency Services",
        "Police",
        "👮",
        ["amenity/police"],
    ),
    (
        "Emergency Services",
        "Fire Station",
        "🚒",
        ["amenity/fire_station"],
    ),
    (
        "Emergency Services",
        "Ambulance Station",
        "🚑",
        ["emergency/ambulance_station"],
    ),
    # ── Health ───────────────────────────────────────────────
    (
        "Health",
        "GP Surgery",
        "👨‍⚕️",
        [
            "amenity/doctors",
            "healthcare/doctor",
        ],
    ),
    (
        "Health",
        "Dentist",
        "🦷",
        [
            "amenity/dentist",
            "healthcare/dentist",
        ],
    ),
    (
        "Health",
        "Pharmacy",
        "💊",
        [
            "amenity/pharmacy",
            "healthcare/pharmacy",
            "shop/chemist",
            "shop/herbalist",
            "shop/health",
            "healthcare/alternative",
        ],
    ),
    (
        "Health",
        "Hospital & Clinic",
        "🏥",
        [
            "amenity/hospital",
            "amenity/clinic",
            "amenity/health_centre",
            "healthcare/blood_donation",
            "healthcare/hospital",
            "healthcare/centre",
            "healthcare/clinic",
            "office/healthcare",
            "healthcare/laboratory",
            "healthcare/rehabilitation",
            "healthcare/vaccination_centre",
            "healthcare/yes",
        ],
    ),
    (
        "Health",
        "Optician",
        "👓",
        [
            "shop/optician",
            "healthcare/optometrist",
            "shop/hearing_aids",
            "healthcare/audiologist",
        ],
    ),
    (
        "Health",
        "Physiotherapy",
        "🏃",
        [
            "healthcare/physiotherapist",
            "healthcare/podiatrist",
            "healthcare/occupational_therapist",
        ],
    ),
    (
        "Health",
        "Counselling & Therapy",
        "🧠",
        [
            "healthcare/counselling",
            "healthcare/psychotherapist",
            "office/therapist",
        ],
    ),
    (
        "Health",
        "Care Home",
        "🏠",
        [
            "amenity/care_home",
            "amenity/nursing_home",
            "amenity/retirement_home",
            "healthcare/hospice",
            "healthcare/nursing_home",
            "office/home_care",
        ],
    ),
    (
        "Health",
        "Medical & Mobility",
        # NOTE(review): emoji is an empty string here — possibly lost to an
        # encoding/extraction issue; confirm the intended glyph.
        "",
        [
            "shop/medical_supply",
            "shop/mobility",
            "shop/mobility_scooter",
        ],
    ),
    # ── Culture ──────────────────────────────────────────────
    (
        "Culture",
        "Museum",
        "🏛️",
        [
            "tourism/museum",
        ],
    ),
    (
        "Culture",
        "Gallery",
        "🖼️",
        [
            "tourism/gallery",
            "tourism/artwork",
        ],
    ),
    (
        "Culture",
        "Library",
        "📚",
        [
            "amenity/library",
        ],
    ),
    (
        "Culture",
        "Place of Worship",
        # NOTE(review): emoji is an empty string here — possibly lost to an
        # encoding/extraction issue; confirm the intended glyph.
        "",
        [
            "amenity/place_of_worship",
            "amenity/monastery",
            "building/church",
        ],
    ),
    (
        "Culture",
        "Arts Centre",
        "🎨",
        [
            "amenity/arts_centre",
        ],
    ),
    (
        "Culture",
        "Zoo",
        "🦁",
        [
            "tourism/zoo",
        ],
    ),
    (
        "Culture",
        "Tourist Attraction",
        "📸",
        [
            "tourism/attraction",
            "tourism/aquarium",
            "amenity/fountain",
            "amenity/courthouse",
            "tourism/chalet",
        ],
    ),
    # Note: schools come from the GIAS register (see transform_gias_schools).
    # Niche/tertiary education amenities that GIAS does not cover are dropped
    # rather than mixed in with state-funded schools.
    # ── Local Businesses ─────────────────────────────────────
    (
        "Local Businesses",
        "Hotel",
        "🏨",
        [
            "tourism/hotel",
            "tourism/hostel",
            "tourism/guest_house",
            "tourism/motel",
            "tourism/camp_site",
            "leisure/resort",
            "tourism/holiday_park",
            "tourism/self_catering",
        ],
    ),
    (
        "Local Businesses",
        "Local Business",
        "🛠️",
        [
            # Tradespeople
            "craft/builder",
            "craft/carpenter",
            "craft/electrician",
            "craft/electronics_repair",
            "craft/floorer",
            "craft/gardener",
            "craft/glaziery",
            "craft/hvac",
            "craft/joiner",
            "craft/locksmith",
            "craft/painter",
            "craft/plumber",
            "craft/roofer",
            "craft/window_construction",
            "craft/agricultural_engines",
            "craft/atelier",
            "craft/beekeeper",
            "craft/blacksmith",
            "craft/bookbinder",
            "craft/boatbuilder",
            "craft/caterer",
            "craft/carpet_layer",
            "craft/clockmaker",
            "craft/handicraft",
            "craft/jeweller",
            "craft/metal_construction",
            "craft/photographer",
            "craft/photographic_laboratory",
            "craft/plasterer",
            "craft/pottery",
            "craft/printer",
            "craft/sawmill",
            "craft/scaffolder",
            "craft/sculptor",
            "craft/signmaker",
            "craft/stonemason",
            "craft/upholsterer",
            "craft/watchmaker",
            "craft/yes",
            "amenity/workshop",
            "shop/glaziery",
            "shop/windows",
            # Professional offices & estate agents
            "shop/estate_agent",
            "office/accountant",
            "office/architect",
            "office/auctioneer",
            "office/builder",
            "office/construction",
            "office/construction_company",
            "office/engineer",
            "office/estate_agent",
            "office/financial",
            "office/financial_advisor",
            "office/financial_services",
            "office/insurance",
            "office/lawyer",
            "office/mortgage",
            "office/property_management",
            "office/solicitor",
            "office/solicitors",
            "office/surveyor",
            "office/tax_advisor",
        ],
    ),
    (
        "Local Businesses",
        "Offices",
        "🏢",
        [
            "amenity/coworking_space",
            "amenity/research_institute",
            "office/administrative",
            "office/advertising_agency",
            "office/association",
            "office/charity",
            "office/company",
            "office/consulting",
            "office/courier",
            "office/coworking",
            "office/design",
            "office/diplomatic",
            "office/educational_institution",
            "office/employment_agency",
            "office/energy_supplier",
            "office/foundation",
            "office/government",
            "office/graphic_design",
            "office/interior_design",
            "office/it",
            "office/logistics",
            "office/marketing",
            "office/moving_company",
            "office/newspaper",
            "office/ngo",
            "office/notary",
            "office/political_party",
            "office/politician",
            "office/publisher",
            "office/quango",
            "office/recruitment",
            "office/religion",
            "office/research",
            "office/security",
            "office/taxi",
            "office/telecommunication",
            "office/transport",
            "office/union",
            "office/university",
            "office/vacant",
            "office/web_design",
            "office/yes",
        ],
    ),
    # ── Other ────────────────────────────────────────────────
    (
        "Other",
        "EV Charging",
        "🔌",
        [
            "amenity/charging_station",
        ],
    ),
    (
        "Other",
        "Fuel Station",
        # NOTE(review): emoji is an empty string here — possibly lost to an
        # encoding/extraction issue; confirm the intended glyph.
        "",
        [
            "amenity/fuel",
        ],
    ),
    (
        "Other",
        "Community Centre",
        "🤝",
        [
            "amenity/church_hall",
            "amenity/clubhouse",
            "amenity/community_centre",
            "amenity/community_hall",
            "amenity/scout_hall",
            "amenity/social_centre",
            "amenity/townhall",
        ],
    ),
]
# Build flat lookup: OSM category → (group, friendly_name, emoji)
# Flattens _CATEGORIES so each OSM "key/value" string resolves in one dict
# lookup. If an OSM key were listed under two entries, the later entry would
# silently win (plain dict-comprehension semantics). A comprehension is used so
# no loop variables are left behind as module-level names.
CATEGORY_MAP: dict[str, tuple[str, str, str]] = {
    osm_key: (group, name, emoji)
    for group, name, emoji, osm_keys in _CATEGORIES
    for osm_key in osm_keys
}
# Emoji per public-transport stop label. The consumer of this table is not
# visible in this part of the file — presumably the NaPTAN branch of
# transform(); verify there before renaming any key.
NAPTAN_EMOJIS: dict[str, str] = {
    "Airport": "✈️",
    "Ferry": "⛴️",
    "Rail station": "🚆",
    "Bus stop": "🚏",
    "Bus station": "🚌",
    "Taxi rank": "🚕",
    "Tube station": "🚇",
}
# Retailer names that normalize_grocery_retailer() collapses into the single
# display brand "Co-op". Matching is exact (after whitespace stripping), so a
# spelling that differs from these strings is not pooled.
COOP_RETAILERS: set[str] = {
    "Allendale Co-operative Society",
    "Central England Co-operative",
    "Channel Islands Co-operative Society",
    "Chelmsford Star Co-operative Society",
    "Clydebank Co-operative",
    "Coniston Co-operative Society",
    "East of England Co-operative",
    "Heart of England Co-operative",
    "Langdale Co-operative Society",
    "Lincolnshire Co-operative",
    "Midcounties Co-operative",
    "Scottish Midland Co-operative",
    "Tamworth Co-operative Society",
    "The Co-operative Group",
    "The Radstock Co-operative Society",
    "The Southern Co-operative",
}
# Default for transform_grocery_retail_points(min_chain_locations=...): a
# normalised brand needs at least this many rows to be kept in the output.
MIN_GROCERY_CHAIN_LOCATIONS: int = 5
# Exact-match retailer name → display name, applied by
# normalize_grocery_retailer() after the Co-op check. Retailers not listed here
# keep their (stripped) source name.
GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES: dict[str, str] = {
    "Cook": "COOK",
    "Heron": "Heron Foods",
    "Marks and Spencer": "M&S",
    "Sainsburys": "Sainsbury's",
}
# Exact-match fascia (store format) name → icon category name.
# normalize_grocery_icon_category() looks the stripped fascia up here first and
# only falls back to the normalised retailer name when the fascia is missing or
# not listed. Several fascias deliberately share one icon name (e.g. the
# petrol-station "PFS" variants map to the parent brand).
GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
    "Aldi": "Aldi",
    "Aldi Local": "Aldi",
    "Asda": "Asda",
    "Asda Express": "Asda Express",
    "Asda Living": "Asda Living",
    "Asda PFS": "Asda",
    "Asda Supercentre": "Asda Supercentre",
    "Asda Supermarket": "Asda Supermarket",
    "Asda Superstore": "Asda Superstore",
    "Booths": "Booths",
    "Budgens": "Budgens",
    "Centra": "Centra",
    "Cooltrader": "Heron Foods",
    "Co-op Food": "Co-op",
    "Cook": "COOK",
    "Costco": "Costco",
    "Dunnes Stores": "Dunnes Stores",
    "Eurospar": "Spar",
    "Eurospar PFS": "Spar",
    "Farmfoods": "Farmfoods",
    "Heron": "Heron Foods",
    "Iceland": "Iceland",
    "Lidl": "Lidl",
    "Little Waitrose": "Little Waitrose",
    "Little Waitrose Shell": "Little Waitrose",
    "Makro": "Makro",
    "Marks and Spencer": "M&S",
    "Marks and Spencer BP": "M&S Food",
    "Marks and Spencer Clothing": "M&S Clothing",
    "Marks and Spencer Food To Go": "M&S Food",
    "Marks and Spencer Food Outlet": "M&S Outlet",
    "Marks and Spencer Foodhall": "M&S Food",
    "Marks and Spencer Hospital": "M&S Hospital",
    "Marks and Spencer MSA": "M&S MSA",
    "Marks and Spencer Outlet": "M&S Outlet",
    "Marks and Spencer Simply Food": "M&S Food",
    "Marks and Spencer Travel SF": "M&S Food",
    "Morrisons Daily": "Morrisons Daily",
    "Morrisons Select": "Morrisons",
    "Planet Organic": "Planet Organic",
    # Both the apostrophised and plain spellings are listed for the Local format.
    "Sainsbury's Local": "Sainsbury's Local",
    "Sainsburys": "Sainsbury's",
    "Sainsburys Local": "Sainsbury's Local",
    "Spar": "Spar",
    "Spar PFS": "Spar",
    "Tesco": "Tesco",
    "Tesco Express": "Tesco Express",
    "Tesco Express Esso": "Tesco Express",
    "Tesco Extra": "Tesco Extra",
    "The Co-operative Food": "Co-op",
    "The Co-operative Food PFS": "Co-op",
    "The Food Warehouse": "The Food Warehouse",
    "Waitrose": "Waitrose",
    "Waitrose MSA": "Waitrose",
    "Whole Foods Market": "Whole Foods Market",
}
def normalize_grocery_retailer(retailer: str | None) -> str:
if retailer is None:
return ""
retailer = retailer.strip()
if retailer in COOP_RETAILERS:
return "Co-op"
return GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES.get(retailer, retailer)
def normalize_grocery_icon_category(fascia: str | None, retailer: str | None) -> str:
    """Return the icon category for a store, preferring its fascia.

    A non-empty ``fascia`` is stripped and looked up in
    ``GROCERY_FASCIA_ICON_NAMES``. When the fascia is missing, empty, or not
    listed there, the result is ``normalize_grocery_retailer(retailer)``.
    """
    mapped = GROCERY_FASCIA_ICON_NAMES.get(fascia.strip()) if fascia else None
    if mapped is None:
        # No fascia-specific icon: fall back to the retailer's display brand.
        return normalize_grocery_retailer(retailer)
    return mapped
def transform_grocery_retail_points(
    grocery_df: pl.DataFrame,
    boundary_path: Path | None = None,
    min_chain_locations: int = MIN_GROCERY_CHAIN_LOCATIONS,
) -> pl.DataFrame:
    """Convert GEOLYTIX Grocery Retail Points into the POI parquet schema.

    Args:
        grocery_df: Raw GEOLYTIX rows. Must contain the columns ``id``,
            ``retailer``, ``fascia``, ``store_name``, ``long_wgs`` and
            ``lat_wgs``.
        boundary_path: Optional boundary file handed to ``in_england_mask``;
            when given, rows the mask rejects are removed.
        min_chain_locations: Minimum number of rows a normalised brand needs
            to be kept.

    Returns:
        A DataFrame with columns ``id`` (prefixed ``glx-``), ``name``,
        ``category`` (normalised brand), ``icon_category``, ``group``
        (always ``"Groceries"``), ``lat``, ``lng`` and ``emoji``.

    Raises:
        ValueError: If any required column is missing from ``grocery_df``.
    """
    required = {"id", "retailer", "fascia", "store_name", "long_wgs", "lat_wgs"}
    missing = required - set(grocery_df.columns)
    if missing:
        raise ValueError(f"GEOLYTIX retail points missing columns: {sorted(missing)}")

    # Optional text columns where a blank value means "not provided".
    optional_text = ["fascia", "store_name"]

    df = (
        grocery_df.select(
            pl.col("id").cast(pl.String),
            pl.col("retailer").cast(pl.String),
            pl.col("fascia").cast(pl.String),
            pl.col("store_name").cast(pl.String),
            pl.col("lat_wgs").cast(pl.Float64).alias("lat"),
            pl.col("long_wgs").cast(pl.Float64).alias("lng"),
        )
        .with_columns(
            pl.col("retailer").str.strip_chars(),
            pl.col("fascia").str.strip_chars(),
            pl.col("store_name").str.strip_chars(),
        )
        .with_columns(
            # A blank (empty or whitespace-only) store_name/fascia is not null,
            # so pl.coalesce below would pick it and emit an empty POI name.
            # Turn blanks into nulls so the fallback chain actually falls back.
            [
                pl.when(pl.col(column).str.len_chars() > 0)
                .then(pl.col(column))
                .otherwise(None)
                .alias(column)
                for column in optional_text
            ]
        )
        .drop_nulls(["id", "retailer", "lat", "lng"])
        .filter(pl.col("retailer").str.len_chars() > 0)
    )

    if boundary_path is not None and len(df) > 0:
        mask = in_england_mask(
            boundary_path,
            df["lat"].to_numpy(),
            df["lng"].to_numpy(),
        )
        df = df.filter(pl.Series(mask))

    # Normalise to the display brand FIRST so the ~16 Co-op society retailer
    # names pool into one "Co-op" before the chain-eligibility cutoff; otherwise
    # small societies (<MIN_GROCERY_CHAIN_LOCATIONS stores each) get dropped.
    df = df.with_columns(
        pl.col("retailer")
        .map_elements(normalize_grocery_retailer, return_dtype=pl.String)
        .alias("category")
    )
    eligible_categories = (
        df.group_by("category")
        .len()
        .filter(pl.col("len") >= min_chain_locations)
        .select("category")
    )
    # Semi-join keeps only rows whose brand met the threshold, without adding
    # any columns from the eligibility frame.
    df = df.join(eligible_categories, on="category", how="semi")

    return df.with_columns(
        pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
        # Most specific available label wins; doubled apostrophes in the source
        # text are collapsed to a single one.
        pl.coalesce(["store_name", "fascia", "retailer"])
        .str.replace_all("''", "'")
        .alias("name"),
        pl.struct(["fascia", "retailer"])
        .map_elements(
            lambda row: normalize_grocery_icon_category(row["fascia"], row["retailer"]),
            return_dtype=pl.String,
        )
        .alias("icon_category"),
        pl.lit("Groceries").alias("group"),
        pl.lit("🛒").alias("emoji"),
    ).select("id", "name", "category", "icon_category", "group", "lat", "lng", "emoji")
# School icon category → emoji. transform_gias_schools() applies this with
# replace_strict(), so every label _school_icon_category_expr() can produce
# (including the "School" fallback) must have an entry here.
SCHOOL_ICON_CATEGORIES: dict[str, str] = {
    "Nursery school": "🧸",
    "Primary school": "🎒",
    "Secondary school": "🏫",
    "All-through school": "🏫",
    "Sixth form": "📚",
    "Further education college": "📚",
    "University": "🎓",
    "Special school": "🤝",
    "School": "🏫",
}
def _school_icon_category_expr() -> pl.Expr:
    """Build the expression that buckets a GIAS row into an icon category.

    Rules are tried in order and the first match wins:

    1. ``type_group`` decides universities, special schools and colleges.
    2. Otherwise the lower-cased ``phase`` decides.
    3. Otherwise (phase did not match, e.g. it is null) the numeric bounds
       parsed from ``age_range`` decide.
    4. Anything still unmatched becomes ``"School"``.
    """
    type_group = pl.col("type_group")
    # Phase casing is inconsistent in the data ("Middle deemed Primary" vs
    # "Middle deemed primary"), hence the lower-casing before comparison.
    phase_lc = pl.col("phase").str.to_lowercase()

    # age_range is expected in one of three shapes: a two-sided
    # "<low><dash><high>" range, "up to <high>", or "<low>+". The leading
    # integer is the lower bound and the trailing integer the upper bound; the
    # one-sided shapes have the bound they do not carry forced to null.
    age_range = pl.col("age_range")
    first_number = age_range.str.extract(r"^\s*(\d+)", 1).cast(pl.Int32, strict=False)
    last_number = age_range.str.extract(r"(\d+)\s*$", 1).cast(pl.Int32, strict=False)
    low = pl.when(age_range.str.starts_with("up to")).then(None).otherwise(first_number)
    high = pl.when(age_range.str.ends_with("+")).then(None).otherwise(last_number)

    # (condition, label) pairs in priority order.
    rules = [
        (type_group == "Universities", "University"),
        (type_group == "Special schools", "Special school"),
        (type_group == "Colleges", "Further education college"),
        (phase_lc == "nursery", "Nursery school"),
        (phase_lc.is_in(["primary", "middle deemed primary"]), "Primary school"),
        (phase_lc.is_in(["secondary", "middle deemed secondary"]), "Secondary school"),
        (phase_lc == "all-through", "All-through school"),
        (phase_lc.is_in(["16 plus", "sixth form"]), "Sixth form"),
        # Age-range fallback for rows the phase rules did not classify.
        (high <= 5, "Nursery school"),
        (low >= 16, "Sixth form"),
        ((low <= 6) & (high >= 16), "All-through school"),
        (high <= 11, "Primary school"),
        (low >= 10, "Secondary school"),
    ]

    (first_condition, first_label), *remaining = rules
    chain = pl.when(first_condition).then(pl.lit(first_label))
    for condition, label in remaining:
        chain = chain.when(condition).then(pl.lit(label))
    return chain.otherwise(pl.lit("School"))
# Ofsted OEIF overall-effectiveness grade code (as a string) → display label.
# Used by _load_ofsted_ratings(), which also reuses the "1" and "2" labels for
# ungraded "School remains Outstanding/Good" outcomes.
OFSTED_OEIF_LABELS: dict[str, str] = {
    "1": "Outstanding",
    "2": "Good",
    "3": "Requires improvement",
    "4": "Inadequate",
}
def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
    """Return a lazy ``urn`` → ``ofsted_rating`` table from the Ofsted parquet.

    The rating label is chosen by the first matching rule:

    * graded result "1".."4" → the matching ``OFSTED_OEIF_LABELS`` entry;
    * otherwise an ungraded outcome starting with "School remains
      Outstanding" / "School remains Good" → "Outstanding" / "Good" (so
      schools without a usable graded result are not lost; the original note
      says this mirrors ``school_proximity.classify_good_plus_schools``);
    * otherwise a graded value of "Not judged" → "Not judged";
    * anything else is null and is filtered out of the result.
    """
    graded = pl.col("Latest OEIF overall effectiveness")
    # Cast leniently so prefix matching works whatever dtype the column has.
    ungraded_outcome = pl.col("Ungraded inspection overall outcome").cast(
        pl.Utf8, strict=False
    )

    # (condition, label) pairs in priority order.
    rules = [
        *((graded == code, OFSTED_OEIF_LABELS[code]) for code in ("1", "2", "3", "4")),
        (
            ungraded_outcome.str.starts_with("School remains Outstanding"),
            OFSTED_OEIF_LABELS["1"],
        ),
        (
            ungraded_outcome.str.starts_with("School remains Good"),
            OFSTED_OEIF_LABELS["2"],
        ),
        (graded == "Not judged", "Not judged"),
    ]
    (first_condition, first_label), *remaining = rules
    rating = pl.when(first_condition).then(pl.lit(first_label))
    for condition, text in remaining:
        rating = rating.when(condition).then(pl.lit(text))
    rating = rating.otherwise(None)

    ratings = pl.scan_parquet(ofsted_path).select(
        pl.col("URN").cast(pl.Int64).alias("urn"),
        rating.alias("ofsted_rating"),
    )
    return ratings.filter(pl.col("ofsted_rating").is_not_null())
def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
    """Turn the GIAS register parquet into lazy POI rows with school metadata.

    Ofsted ratings from ``_load_ofsted_ratings`` are left-joined on ``urn``, so
    schools without a rating are kept with a null ``school_ofsted_rating``.
    ``category`` and ``icon_category`` carry the same school-type label, giving
    one category per school type instead of a single "School" bucket.
    """
    school_kind = _school_icon_category_expr()
    ratings = _load_ofsted_ratings(ofsted_path)

    # Generic POI columns shared with the other sources.
    poi_columns = [
        pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
        pl.col("name"),
        school_kind.alias("category"),
        school_kind.alias("icon_category"),
        pl.lit("Education").alias("group"),
        pl.col("lat").cast(pl.Float64),
        pl.col("lng").cast(pl.Float64),
        school_kind.replace_strict(SCHOOL_ICON_CATEGORIES).alias("emoji"),
    ]

    # School metadata: (source column, dtype for a non-strict cast or None).
    # Every column is emitted as "school_<source column>", in this order.
    metadata_spec = [
        ("phase", None),
        ("type", None),
        ("type_group", None),
        ("age_range", None),
        ("gender", None),
        ("religious_character", None),
        ("admissions_policy", None),
        ("nursery_provision", None),
        ("sixth_form", None),
        ("capacity", pl.Int32),
        ("pupils", pl.Int32),
        ("fsm_percent", pl.Float32),
        ("trust", None),
        ("address", None),
        ("postcode", None),
        ("local_authority", None),
        ("website", None),
        ("telephone", pl.String),
        ("head_name", None),
        ("ofsted_rating", None),
    ]
    metadata_columns = []
    for source_column, lenient_dtype in metadata_spec:
        column_expr = pl.col(source_column)
        if lenient_dtype is not None:
            # Non-strict: values that cannot be converted become null.
            column_expr = column_expr.cast(lenient_dtype, strict=False)
        metadata_columns.append(column_expr.alias(f"school_{source_column}"))

    joined = pl.scan_parquet(gias_path).join(ratings, on="urn", how="left")
    return joined.select(*poi_columns, *metadata_columns)
# OSM convenience-format stores that GEOLYTIX also covers (Tesco Express,
# Sainsbury's Local, Co-op Food, Morrisons Daily, Spar, ...) would otherwise be
# counted twice: once as a GEOLYTIX brand row and once as an OSM "Convenience
# Store". GEOLYTIX is authoritative for its chains, so an OSM grocery row that
# sits on top of a GEOLYTIX point AND carries that point's brand name is the
# same physical store and is dropped. Independent corner shops never carry a
# chain brand, so they are kept.
# Default search radius, in metres, for osm_groceries_colocated_with_geolytix().
GROCERY_DEDUP_RADIUS_M: float = 50.0
# Brand-token aliases so an OSM name spelt differently from the GEOLYTIX brand
# still matches. GEOLYTIX's "Co-op" tokenises to "coop", but OSM frequently
# spells it "The Co-operative Food" -> "cooperative"; without this, ~300+ genuine
# Co-op duplicates would survive. Keys/values are post-strip (alnum-only) tokens,
# i.e. the form _significant_tokens() produces before the alias lookup.
_GROCERY_TOKEN_ALIASES: dict[str, str] = {
    "cooperative": "coop",
    "cooperatives": "coop",
}
def _significant_tokens(name: str | None) -> set[str]:
"""Lower-case alphanumeric tokens of length >= 3 from a POI name (aliased)."""
if not name:
return set()
tokens: set[str] = set()
for raw in str(name).lower().split():
token = "".join(ch for ch in raw if ch.isalnum())
if len(token) >= 3:
tokens.add(_GROCERY_TOKEN_ALIASES.get(token, token))
return tokens
def osm_groceries_colocated_with_geolytix(
    osm_groceries: pl.DataFrame,
    geolytix: pl.DataFrame,
    radius_m: float = GROCERY_DEDUP_RADIUS_M,
) -> list[str]:
    """Return the ids of OSM grocery rows that duplicate a GEOLYTIX store.

    An OSM row counts as a duplicate when some GEOLYTIX point lies within
    ``radius_m`` metres of it AND every significant token of that point's brand
    (its ``category``, e.g. "Tesco", "Co-op") appears in the OSM row's name.
    Brands that yield no token of three or more characters (e.g. "M&S") never
    match, and OSM rows whose name yields no tokens are never dropped, so
    doubtful cases are kept.

    Args:
        osm_groceries: Needs columns ``id``, ``name``, ``lat``, ``lng``.
        geolytix: Needs columns ``category`` (the brand), ``lat``, ``lng``.
        radius_m: Match radius in metres.

    Returns:
        Matching OSM ids in input row order; empty when either frame is empty.
    """
    if osm_groceries.is_empty() or geolytix.is_empty():
        return []

    from scipy.spatial import cKDTree

    def lat_lng(frame: pl.DataFrame) -> tuple[np.ndarray, np.ndarray]:
        return (
            frame["lat"].to_numpy().astype(float),
            frame["lng"].to_numpy().astype(float),
        )

    store_lat, store_lng = lat_lng(geolytix)
    shop_lat, shop_lng = lat_lng(osm_groceries)
    brand_tokens = [_significant_tokens(brand) for brand in geolytix["category"].to_list()]
    shop_ids = osm_groceries["id"].to_list()
    shop_tokens = [_significant_tokens(label) for label in osm_groceries["name"].to_list()]

    # Equirectangular projection to metres, scaling longitude by the cosine of
    # the mean latitude across both inputs. The source comment states this is
    # accurate to well under the dedup radius at England's scale.
    mean_lat = float(np.mean(np.concatenate([store_lat, shop_lat])))
    cos_lat = float(np.cos(np.radians(mean_lat)))

    def to_metres(lat: np.ndarray, lng: np.ndarray) -> np.ndarray:
        return np.column_stack([lng * cos_lat * 111_320.0, lat * 110_540.0])

    # For each OSM point: indices of the GEOLYTIX points within radius_m.
    nearby_stores = cKDTree(to_metres(store_lat, store_lng)).query_ball_point(
        to_metres(shop_lat, shop_lng), r=radius_m
    )

    return [
        shop_id
        for shop_id, name_tokens, store_indices in zip(
            shop_ids, shop_tokens, nearby_stores
        )
        if name_tokens
        and any(
            brand_tokens[index] and brand_tokens[index].issubset(name_tokens)
            for index in store_indices
        )
    ]
def transform(
    input_path: Path,
    naptan_path: Path,
    boundary_path: Path,
    grocery_retail_points_path: Path,
    gias_path: Path,
    ofsted_path: Path,
) -> pl.LazyFrame:
    """Build the combined POI frame from the raw OSM, NaPTAN, GEOLYTIX and GIAS inputs.

    The raw POIs at ``input_path`` are checked against ``DROP_CATEGORIES`` and
    ``CATEGORY_MAP``, filtered, relabelled with a group, friendly category and
    emoji, and collapsed to one row per (id, category). NaPTAN rows are
    filtered with ``in_england_mask``; OSM "Groceries" rows that duplicate a
    GEOLYTIX store are removed; finally all sources are concatenated.

    Args:
        input_path: raw POIs parquet.
        naptan_path: NaPTAN parquet.
        boundary_path: boundary file handed to ``in_england_mask`` and
            ``transform_grocery_retail_points``.
        grocery_retail_points_path: GEOLYTIX Grocery Retail Points parquet.
        gias_path: passed to ``transform_gias_schools``.
        ofsted_path: passed to ``transform_gias_schools``.

    Returns:
        A lazy frame concatenating all sources with ``how="diagonal_relaxed"``.

    Raises:
        ValueError: if the data contains a category that is neither dropped
            nor mapped, or if a ``CATEGORY_MAP`` entry has an empty friendly
            name or emoji.
    """
    lf = pl.scan_parquet(input_path)

    # Every category present in the data must be either dropped or mapped.
    all_categories = (
        lf.select("category").unique().collect(engine="streaming").to_series().to_list()
    )
    unmapped = [
        cat
        for cat in all_categories
        if cat not in DROP_CATEGORIES and cat not in CATEGORY_MAP
    ]
    if unmapped:
        raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}")

    # The reverse case (mapped but not in the data) is only reported, since a
    # regional extract may legitimately lack some categories.
    present = set(all_categories)
    mapped_but_absent = [cat for cat in CATEGORY_MAP if cat not in present]
    if mapped_but_absent:
        print(
            f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}"
        )

    raw_category = pl.col("category")

    # Remove the unwanted categories.
    lf = lf.filter(~raw_category.is_in(list(DROP_CATEGORIES)))

    # Split the (group, friendly name, emoji) tuples into one lookup per field.
    group_mapping, name_mapping, emoji_mapping = (
        {key: entry[field] for key, entry in CATEGORY_MAP.items()}
        for field in range(3)
    )

    # Defensive: a blank friendly name or emoji is a mapping mistake.
    missing_names = [key for key, label in name_mapping.items() if not label]
    if missing_names:
        raise ValueError(f"Empty friendly names for: {missing_names}")
    missing_emojis = [key for key, emoji in emoji_mapping.items() if not emoji]
    if missing_emojis:
        raise ValueError(f"Empty emojis for: {missing_emojis}")

    # All four expressions read the raw category; "category" itself is
    # overwritten with the friendly name.
    lf = lf.with_columns(
        raw_category.replace_strict(group_mapping).alias("group"),
        raw_category.replace_strict(name_mapping).alias("category"),
        raw_category.replace_strict(name_mapping).alias("icon_category"),
        raw_category.replace_strict(emoji_mapping).alias("emoji"),
    )

    # One OSM object can carry several tag keys that resolve to the same
    # friendly category (e.g. amenity/pharmacy + shop/chemist -> "Pharmacy"),
    # and pois.py emits those as multiple raw rows sharing one id. Collapse
    # them so downstream proximity counts are not inflated. Rows that share an
    # id but have DIFFERENT categories are kept. Other sources are
    # pre-deduplicated.
    lf = lf.unique(subset=["id", "category"], keep="first", maintain_order=True)

    # NaPTAN: keep only the rows selected by the England mask, then decorate.
    naptan_df = pl.scan_parquet(naptan_path).collect()
    england = in_england_mask(
        boundary_path,
        naptan_df["lat"].to_numpy(),
        naptan_df["lng"].to_numpy(),
    )
    naptan = (
        naptan_df.filter(pl.Series(england))
        .lazy()
        .with_columns(
            pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
            pl.lit("Public Transport").alias("group"),
            pl.col("category").alias("icon_category"),
        )
    )

    grocery_pois = transform_grocery_retail_points(
        pl.read_parquet(grocery_retail_points_path), boundary_path
    )

    # Remove OSM grocery rows that duplicate a GEOLYTIX store (same brand,
    # colocated) so a Tesco Express / Co-op / Spar is not counted twice.
    is_grocery = pl.col("group") == "Groceries"
    osm_groceries = (
        lf.filter(is_grocery)
        .select("id", "name", "lat", "lng")
        .collect(engine="streaming")
    )
    duplicate_ids = osm_groceries_colocated_with_geolytix(osm_groceries, grocery_pois)
    if duplicate_ids:
        print(
            f"Dropping {len(duplicate_ids):,} OSM grocery POIs that duplicate a "
            "GEOLYTIX store"
        )
        # The drop is limited to the Groceries group: the same OSM object may
        # also carry a non-grocery aspect (e.g. a convenience store that is
        # also a Post Office), and that row must survive.
        lf = lf.filter(~(is_grocery & pl.col("id").is_in(duplicate_ids)))

    return pl.concat(
        [
            lf,
            naptan,
            grocery_pois.lazy(),
            transform_gias_schools(gias_path, ofsted_path),
        ],
        how="diagonal_relaxed",
    )
def main():
    """Command-line entry point: run ``transform`` and write the result.

    Parses the required path options, collects the transformed frame, writes
    it to ``--output`` as parquet, then prints the file size, the row count
    and a per-(group, category, emoji) row count ordered by descending count.
    """
    parser = argparse.ArgumentParser(
        description="Transform raw POIs to filtered version with friendly names"
    )
    # Every option is a required path; the order here is the order argparse
    # lists them in its help output.
    path_options = (
        ("--input", "Raw POIs parquet file"),
        ("--naptan", "NaPTAN stations parquet file"),
        ("--boundary", "England boundary GeoJSON file"),
        ("--grocery-retail-points", "GEOLYTIX Grocery Retail Points parquet"),
        ("--gias", "GIAS schools register parquet (replaces OSM schools)"),
        ("--ofsted", "Ofsted latest-inspections parquet (provides per-URN ratings)"),
        ("--output", "Output filtered POIs parquet file"),
    )
    for flag, help_text in path_options:
        parser.add_argument(flag, type=Path, required=True, help=help_text)
    args = parser.parse_args()

    pois = transform(
        args.input,
        args.naptan,
        args.boundary,
        args.grocery_retail_points,
        args.gias,
        args.ofsted,
    ).collect(engine="streaming")
    pois.write_parquet(args.output)

    size_mb = args.output.stat().st_size / 2**20
    print(f"Wrote {args.output} ({size_mb:.1f} MB, {pois.height:,} POIs)")

    print(f"\nCategories ({pois['category'].n_unique()}):")
    per_category = (
        pois.group_by("group", "category", "emoji")
        .len()
        .sort("len", descending=True)
        .select("group", "category", "emoji", "len")
    )
    for group, category, emoji, count in per_category.iter_rows():
        print(f"  [{group}] {emoji} {category}: {count:,}")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()