perfect-postcode/pipeline/transform/transform_poi.py
Andras Schmelczer f59d01227b
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 15s
CI / Check (push) Failing after 1m58s
SPlit up
2026-06-12 21:51:37 +01:00

1910 lines
58 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import argparse
from pathlib import Path
import numpy as np
import polars as pl
from pipeline.utils.england_geometry import in_england_mask
from pipeline.utils.normalize import strip_or_empty
# Raw OSM "key/value" tags that are discarded rather than mapped to an output
# category. Entries are grouped by the reason they are dropped; the comments
# inside the literal record that reasoning.
# NOTE(review): the code that applies this set is not visible in this part of
# the file — presumably transform() filters on it before the CATEGORY_MAP
# lookup; confirm against the caller.
DROP_CATEGORIES = {
# GEOLYTIX Grocery Retail Points is the authoritative supermarket source
# (transform_grocery_retail_points), so drop OSM supermarkets to avoid
# double-counting each store as both a GEOLYTIX brand and an OSM "Supermarket".
"shop/supermarket",
# Street furniture & infrastructure
"amenity/advice",
"amenity/atm",
"amenity/bbq",
"amenity/bench",
"amenity/bicycle_parking",
"amenity/binoculars",
"amenity/boot_scraper",
"amenity/bus_garage",
"amenity/check_in",
"amenity/clock",
"amenity/clothes_dryer",
"amenity/coast_guard",
"amenity/coffin_rest",
"amenity/compressed_air",
"amenity/court_yard",
"amenity/donation_box",
"amenity/dressing_room",
"amenity/drinking_water",
"emergency/water_tank",
"leisure/bleachers",
"leisure/schoolyard",
# Park "furniture" / incidental features — not parks; they massively
# inflated the Park count (picnic_table ~15k, outdoor_seating ~5.8k).
"leisure/bandstand",
"leisure/bird_hide",
"leisure/firepit",
"leisure/outdoor_seating",
"leisure/picnic_table",
"leisure/wildlife_hide",
"public_transport/pay_scale_area",
"shop/taxi",
"amenity/feeding_place",
"amenity/fixme",
"amenity/grit_bin",
"amenity/hunting_stand",
"amenity/letter_box",
"amenity/loading_dock",
"amenity/lounge",
"tourism/preserved_railway",
"amenity/lounger",
"leisure/sport",
"amenity/motorcycle_parking",
"amenity/mounting_block",
"amenity/notice_board",
"amenity/parcel_locker",
"amenity/parking",
"amenity/parking_entrance",
"amenity/parking_space",
"amenity/payment_terminal",
"amenity/photo_booth",
"amenity/piano",
"amenity/post_box",
"amenity/public_bookcase",
"amenity/reception_desk",
"amenity/sanitary_dump_station",
"amenity/shelter",
"amenity/shower",
"amenity/smoking_area",
"amenity/table",
"amenity/telephone",
"amenity/telescope",
"amenity/ticket_validator",
"amenity/toilets",
"amenity/trolley_bay",
"amenity/vacuum_cleaner",
"amenity/vending_machine",
"amenity/washing_machine",
"amenity/washingline",
"amenity/waste_basket",
"amenity/waste_disposal",
"amenity/waste_transfer_station",
"amenity/water_point",
"amenity/watering_place",
"amenity/weighbridge",
# Boating/cycle-hire infrastructure formerly miscategorised as
# "Entertainment" (46% of the bucket): cycle-hire dock stations, boat
# ramps and moorings are not entertainment venues.
"amenity/bicycle_rental",
"amenity/boat_rental",
"leisure/marina",
"leisure/slipway",
# Public art (statues, murals, village signs) formerly 93% of "Gallery".
"tourism/artwork",
# Outdoor exercise apparatus (pull-up bars, trim trails) formerly inflating
# "Gym & Fitness".
"leisure/fitness_station",
# Untyped healthcare rows and non-pharmacy health shops formerly bucketed
# under "Hospital & Clinic" / "Pharmacy".
"healthcare/yes",
"healthcare/alternative",
"shop/herbalist",
"shop/health",
# Street fountains and courthouses formerly bucketed as
# "Tourist Attraction".
"amenity/fountain",
"amenity/courthouse",
# Niche amenities not useful for home buyers
"amenity/animal_boarding",
"amenity/animal_breeding",
"amenity/animal_shelter",
"amenity/boat_storage",
"amenity/bureau_de_change",
"amenity/bus_station",
"amenity/beachhut",
"amenity/canteen",
"amenity/conference_centre",
"amenity/crematorium",
"amenity/disused",
"amenity/driver_training",
"amenity/driving_school",
"amenity/escooter_rental",
"amenity/ferry_terminal",
"amenity/grave_yard",
"amenity/hall",
"shop/funeral_directors",
"amenity/kick-scooter_rental",
"amenity/money_transfer",
"amenity/post_depot",
"amenity/prison",
"amenity/public_building",
"amenity/recycling",
"amenity/scout_hut",
"amenity/social_facility",
"amenity/studio",
"amenity/student_accommodation",
"amenity/taxi",
"amenity/telephone_exchange",
"amenity/training",
"amenity/vehicle_inspection",
"amenity/waiting_room",
"amenity/yes",
"shop/disused",
"shop/no",
# Buildings. building/church is the exception: it is mapped to
# "Place of Worship" in _CATEGORIES. building/university is NOT mapped — it
# is dropped together with the education tags at the end of this set.
"building/air_shaft",
"building/apartments",
"building/barn",
"building/bunker",
"building/chapel",
"building/commercial",
"building/construction",
"building/detached",
"building/entrance",
"building/entry",
"building/farm",
"building/farm_auxiliary",
"building/garage",
"building/garages",
"building/greenhouse",
"building/house",
"building/hut",
"building/industrial",
"building/kiosk",
"building/no",
"building/office",
"building/public",
"building/residential",
"building/retail",
"building/roof",
"building/ruins",
"building/school",
"building/semidetached_house",
"building/service",
"building/shed",
"building/terrace",
"building/warehouse",
"building/yes",
# Emergency features. emergency/ambulance_station is the exception: it is
# mapped to "Ambulance Station" in _CATEGORIES (emergency/water_tank is
# listed with the street furniture above).
"emergency/access_point",
"emergency/assembly_point",
"emergency/bleed_control_kit",
"emergency/defibrillator",
"emergency/designated",
"emergency/dry_riser_inlet",
"emergency/emergency_ward_entrance",
"emergency/fire_alarm_box",
"emergency/fire_extinguisher",
"emergency/fire_hydrant",
"emergency/fire_service_inlet",
"emergency/first_aid_kit",
"emergency/life_ring",
"emergency/lifeguard",
"emergency/no",
"emergency/phone",
"emergency/rescue_equipment",
"emergency/siren",
"emergency/throw_bag",
"emergency/water_rescue",
"emergency/yes",
# Tourism tags that are dropped (the remaining tourism/* tags are mapped in
# _CATEGORIES, e.g. hotels, museums and attractions).
"tourism/apartment",
"tourism/apartments",
"tourism/alpine_hut",
"tourism/camp_pitch",
"tourism/caravan_site",
"tourism/information",
"tourism/picnic_site",
"tourism/viewpoint",
"tourism/village_sign",
"tourism/wilderness_hut",
"tourism/yes",
# Public transport (from NaPTAN instead). public_transport/platform is the
# EXCEPTION: it is mapped to "Bus stop" (see _CATEGORIES) to fill NaPTAN's
# authority-level bus-stop gaps (e.g. West Cumbria, North Norfolk, where
# NaPTAN has zero stops), then deduped against NaPTAN so covered areas keep
# a single stop. stop_position is left dropped to avoid double-counting the
# same stop (platform + stop_position).
"public_transport/entrance",
"public_transport/station",
"public_transport/stop_position",
# Education amenities — schools come from GIAS instead. OSM coverage for
# tertiary education, tutoring, and childcare is too noisy/incomplete to be
# useful on a property-search map.
"amenity/school",
"amenity/prep_school",
"amenity/language_school",
"amenity/music_school",
"amenity/university",
"amenity/college",
"building/university",
"amenity/kindergarten",
"amenity/childcare",
"office/tutoring",
}
# Each output category defined once: (group, friendly_name, emoji, [osm_keys...])
# The flat CATEGORY_MAP lookup dict is built from this at the bottom.
_CATEGORIES: list[tuple[str, str, str, list[str]]] = [
(
"Leisure",
"Café",
"",
[
"amenity/cafe",
"amenity/ice_cream",
"amenity/internet_cafe",
],
),
(
"Leisure",
"Restaurant",
"🍽️",
[
"amenity/restaurant",
"amenity/food_court",
],
),
(
"Leisure",
"Pub",
"🍺",
[
"amenity/pub",
"amenity/beer_garden",
"amenity/biergarten",
"amenity/social_club",
"amenity/club",
"leisure/social_club",
"craft/brewery",
"craft/distillery",
"craft/winery",
],
),
(
"Leisure",
"Bar",
"🍸",
[
"amenity/bar",
"amenity/hookah_lounge",
],
),
(
"Leisure",
"Fast Food",
"🍔",
[
"amenity/fast_food",
],
),
(
"Leisure",
"Nightclub",
"🪩",
[
"amenity/nightclub",
"amenity/stripclub",
"amenity/casino",
"amenity/gambling",
],
),
(
"Leisure",
"Cinema",
"🎬",
[
"amenity/cinema",
],
),
(
"Leisure",
"Theatre",
"🎭",
[
"amenity/theatre",
],
),
(
"Leisure",
"Live Music & Events",
"🎶",
[
"amenity/music_venue",
"amenity/events_venue",
"leisure/dance",
],
),
(
"Leisure",
"Park",
"🌳",
[
"leisure/park",
# leisure/garden is dominated by private residential gardens (98%+
# unnamed); it is name-gated in transform() via REQUIRE_NAME_CATEGORIES
# so only named (public/notable) gardens count as a Park.
"leisure/garden",
"leisure/common",
"leisure/nature_reserve",
"leisure/dog_park",
],
),
(
"Leisure",
"Playground",
"🛝",
[
"leisure/playground",
"leisure/indoor_play",
],
),
(
"Leisure",
"Sports Centre",
"🏟️",
[
"leisure/sports_centre",
"leisure/sports_hall",
# leisure/pitch (73% of the old bucket) and leisure/swimming_pool
# (98% unnamed = private/garden pools) are name-gated in transform()
# via REQUIRE_NAME_CATEGORIES so only named public facilities count.
"leisure/pitch",
"leisure/track",
"leisure/golf_course",
"leisure/miniature_golf",
"leisure/horse_riding",
"leisure/fishing",
"leisure/ice_rink",
"leisure/paddling_pool",
"leisure/practice_pitch",
"leisure/shooting_ground",
"leisure/stadium",
"leisure/swimming_pool",
"leisure/swimming_area",
"leisure/water_park",
"leisure/bathing_place",
],
),
(
"Leisure",
"Entertainment",
"🎳",
[
"leisure/bowling_alley",
"leisure/amusement_arcade",
"leisure/adult_gaming_centre",
"leisure/escape_game",
"leisure/maze",
"leisure/trampoline_park",
"leisure/sauna",
"leisure/tanning_salon",
"shop/amusements",
"tourism/theme_park",
# bicycle_rental/boat_rental/marina/slipway used to live here and
# made up ~46% of the bucket (cycle-hire docks, boat ramps); they
# are infrastructure, not entertainment venues — see DROP_CATEGORIES.
"leisure/hackerspace",
"leisure/yes",
],
),
(
"Groceries",
"Convenience Store",
"🏪",
[
"shop/convenience",
"shop/general",
"shop/kiosk",
"shop/grocery",
],
),
(
"Groceries",
"Bakery",
"🥐",
[
"shop/bakery",
"shop/pastry",
"craft/bakery",
"craft/confectionery",
],
),
(
"Groceries",
"Butcher & Fishmonger",
"🥩",
[
"shop/butcher",
"shop/seafood",
],
),
(
"Groceries",
"Greengrocer",
"🥬",
[
"shop/greengrocer",
"shop/farm",
"shop/market",
"amenity/marketplace",
],
),
(
"Groceries",
"Off-Licence",
"🍷",
[
"shop/alcohol",
"shop/wine",
"shop/beverages",
],
),
(
"Groceries",
"Deli & Specialty",
"🧆",
[
"shop/deli",
"shop/cheese",
"shop/chocolate",
"shop/coffee",
"shop/confectionery",
"shop/dairy",
"shop/food",
"shop/frozen_food",
"shop/health_food",
"shop/ice_cream",
"shop/nutrition_supplements",
"shop/tea",
],
),
(
"Shops",
"Fashion & Clothing",
"👕",
[
"shop/clothes",
"shop/boutique",
"shop/shoes",
"shop/accessories",
"shop/bag",
"shop/fashion_accessories",
"shop/jewelry",
"shop/leather",
"shop/watches",
],
),
(
"Shops",
"Electronics",
"📱",
[
"shop/electronics",
"shop/mobile_phone",
"shop/mobile_phone_accessories",
"shop/computer",
"shop/appliance",
"shop/electrical",
"shop/hifi",
"shop/vacuum_cleaner",
"shop/video_games",
"shop/games",
],
),
(
"Shops",
"Charity Shop",
"❤️",
[
"shop/charity",
"shop/second_hand",
],
),
(
"Shops",
"DIY & Hardware",
"🔨",
[
"shop/doityourself",
"shop/hardware",
"shop/builders_merchant",
"shop/paint",
"shop/plumbing",
],
),
(
"Shops",
"Home & Garden",
"🪑",
[
"shop/furniture",
"shop/garden_centre",
"shop/garden_machinery",
"shop/kitchen",
"shop/bathroom",
"shop/bathroom_furnishing",
"shop/bed",
"shop/carpet",
"shop/curtain",
"shop/flooring",
"shop/fireplace",
"shop/garden_furniture",
"shop/groundskeeping",
"shop/household",
"shop/household_linen",
"shop/houseware",
"shop/homeware",
"shop/interior_decoration",
"shop/lighting",
"shop/kitchenware",
"shop/window_blind",
],
),
(
"Shops",
"Bookshop",
"📚",
[
"shop/books",
"shop/stationery",
],
),
(
"Shops",
"Pet Shop",
"🐾",
[
"shop/pet",
],
),
(
"Shops",
"Sports & Outdoor",
"🏕️",
[
"shop/sports",
"shop/angling",
"shop/outdoor",
"shop/bicycle",
"shop/equestrian",
"shop/surf",
],
),
(
"Shops",
"Newsagent",
"📰",
[
"shop/newsagent",
"shop/tobacco",
],
),
(
"Shops",
"Department Store",
"🏬",
[
"shop/department_store",
"shop/mall",
"shop/variety_store",
"shop/discount",
],
),
(
"Shops",
"Gift & Hobby",
"🎁",
[
"shop/gift",
"shop/florist",
"shop/toys",
"shop/craft",
"shop/candles",
"shop/party",
"shop/art",
"shop/music",
"shop/musical_instrument",
"shop/antiques",
"shop/anime",
"shop/baby_goods",
"shop/fabric",
"shop/haberdashery",
"shop/hobby",
"shop/wool",
"shop/pottery",
],
),
(
"Shops",
"Specialist Shop",
"🏪",
[
"shop/agrarian",
"shop/boat",
"shop/bookmaker",
"shop/building_materials",
"shop/camera",
"shop/cannabis",
"shop/car",
"shop/caravan",
"shop/catalogue",
"shop/auction",
"shop/auction_house",
"shop/chandler",
"shop/collector",
"shop/copyshop",
"shop/country_store",
"shop/doors",
"shop/e-cigarette",
"shop/erotic",
"shop/esoteric",
"shop/fan",
"shop/fireworks",
"shop/fishing",
"shop/frame",
"shop/fuel",
"shop/gas",
"shop/hairdresser_supply",
"shop/military_surplus",
"shop/model",
"shop/money_lender",
"shop/motorcycle",
"shop/outpost",
"shop/pawnbroker",
"shop/photo",
"shop/photo_studio",
"shop/plant_hire",
"shop/printer_ink",
"shop/printing",
"shop/psychic",
"shop/pyrotechnics",
"shop/religion",
"shop/rental",
"shop/scuba_diving",
"shop/security",
"shop/sewing",
"shop/ship_chandler",
"shop/signs",
"shop/storage_rental",
"shop/swimming_pool",
"shop/telecommunication",
"shop/ticket",
"shop/tiles",
"shop/tool_hire",
"shop/trade",
"shop/trophy",
"shop/truck",
"shop/vacant",
"shop/van",
"shop/video",
"shop/water_sports",
"shop/weapons",
"shop/wedding",
"shop/wholesale",
"shop/wigs",
"shop/yes",
],
),
# ── Services ─────────────────────────────────────────────
(
"Services",
"Hairdresser & Beauty",
"💇",
[
"shop/hairdresser",
"shop/beauty",
"shop/cosmetics",
"shop/massage",
"shop/perfumery",
"leisure/spa",
],
),
(
"Services",
"Gym & Fitness",
"🏋️",
[
"leisure/fitness_centre",
# leisure/fitness_station (outdoor pull-up bars / trim-trail
# apparatus, ~2.5k) is not a gym — see DROP_CATEGORIES.
"amenity/dojo",
"amenity/dancing_school",
],
),
(
"Services",
"Dry Cleaner & Laundry",
"👔",
[
"shop/dry_cleaning",
"shop/laundry",
"shop/tailor",
"shop/shoe_repair",
"shop/repair",
"craft/cleaning",
"craft/dressmaker",
"craft/shoemaker",
"craft/tailor",
],
),
(
"Services",
"Car Services",
"🔧",
[
"shop/car_repair",
"shop/car;car_repair",
"shop/car_parts",
"shop/motorcycle_repair",
"shop/tyres",
"amenity/car_wash",
"amenity/car_rental",
"amenity/car_sharing",
"amenity/bicycle_repair_station",
],
),
(
"Services",
"Post Office",
"🏤",
[
"amenity/post_office",
],
),
(
"Services",
"Vet & Pet Care",
"🐕",
[
"amenity/veterinary",
"shop/pet_grooming",
],
),
(
"Services",
"Bank",
"🏦",
[
"amenity/bank",
],
),
(
"Services",
"Travel Agent",
"✈️",
[
"shop/travel_agency",
"office/travel_agent",
],
),
(
"Services",
"Other",
"🛎️",
[
"shop/tattoo",
"shop/piercing",
"shop/locksmith",
"craft/key_cutter",
],
),
(
"Emergency Services",
"Police",
"👮",
["amenity/police"],
),
(
"Emergency Services",
"Fire Station",
"🚒",
["amenity/fire_station"],
),
(
"Emergency Services",
"Ambulance Station",
"🚑",
["emergency/ambulance_station"],
),
(
"Health",
"GP Surgery",
"👨‍⚕️",
[
"amenity/doctors",
"healthcare/doctor",
],
),
(
"Health",
"Dentist",
"🦷",
[
"amenity/dentist",
"healthcare/dentist",
],
),
(
"Health",
"Pharmacy",
"💊",
[
"amenity/pharmacy",
"healthcare/pharmacy",
"shop/chemist",
# healthcare/alternative, shop/herbalist and shop/health (homeopaths,
# herbalists, generic "health" shops) are not dispensing pharmacies
# — see DROP_CATEGORIES.
],
),
# "Hospital & Clinic" used to be one bucket; an actual hospital and a small
# clinic are very different amenities for a homebuyer, so they are split.
(
"Health",
"Hospital",
"🏥",
[
"amenity/hospital",
"healthcare/hospital",
],
),
(
"Health",
"Clinic",
"🩺",
[
"amenity/clinic",
"amenity/health_centre",
"healthcare/blood_donation",
"healthcare/centre",
"healthcare/clinic",
"office/healthcare",
"healthcare/laboratory",
"healthcare/rehabilitation",
"healthcare/vaccination_centre",
# healthcare/yes (untyped junk rows) is dropped — see DROP_CATEGORIES.
],
),
(
"Health",
"Optician",
"👓",
[
"shop/optician",
"healthcare/optometrist",
"shop/hearing_aids",
"healthcare/audiologist",
],
),
(
"Health",
"Physiotherapy",
"🏃",
[
"healthcare/physiotherapist",
"healthcare/podiatrist",
"healthcare/occupational_therapist",
],
),
(
"Health",
"Counselling & Therapy",
"🧠",
[
"healthcare/counselling",
"healthcare/psychotherapist",
"office/therapist",
],
),
(
"Health",
"Care Home",
"🏠",
[
"amenity/care_home",
"amenity/nursing_home",
"amenity/retirement_home",
"healthcare/hospice",
"healthcare/nursing_home",
"office/home_care",
],
),
(
"Health",
"Medical & Mobility",
"",
[
"shop/medical_supply",
"shop/mobility",
"shop/mobility_scooter",
],
),
(
"Culture",
"Museum",
"🏛️",
[
"tourism/museum",
],
),
(
"Culture",
"Gallery",
"🖼️",
[
"tourism/gallery",
# tourism/artwork (statues, murals, village signs) was 93% of this
# bucket and is not a visitable gallery — see DROP_CATEGORIES.
],
),
(
"Culture",
"Library",
"📚",
[
"amenity/library",
],
),
(
"Culture",
"Place of Worship",
"",
[
"amenity/place_of_worship",
"amenity/monastery",
"building/church",
],
),
(
"Culture",
"Arts Centre",
"🎨",
[
"amenity/arts_centre",
],
),
(
"Culture",
"Zoo",
"🦁",
[
"tourism/zoo",
],
),
(
"Culture",
"Tourist Attraction",
"📸",
[
"tourism/attraction",
"tourism/aquarium",
# amenity/fountain (street furniture) and amenity/courthouse are
# dropped; tourism/chalet (holiday lets) moved to "Hotel".
],
),
# Note: schools come from the GIAS register (see transform_gias_schools).
# Niche/tertiary education amenities that GIAS does not cover are dropped
# rather than mixed in with state-funded schools.
(
"Local Businesses",
"Hotel",
"🏨",
[
"tourism/hotel",
"tourism/hostel",
"tourism/guest_house",
"tourism/motel",
"tourism/camp_site",
"leisure/resort",
"tourism/holiday_park",
"tourism/self_catering",
# Holiday-let chalets are accommodation, not tourist attractions
# (where they previously sat).
"tourism/chalet",
],
),
(
"Local Businesses",
"Local Business",
"🛠️",
[
# Tradespeople
"craft/builder",
"craft/carpenter",
"craft/electrician",
"craft/electronics_repair",
"craft/floorer",
"craft/gardener",
"craft/glaziery",
"craft/hvac",
"craft/joiner",
"craft/locksmith",
"craft/painter",
"craft/plumber",
"craft/roofer",
"craft/window_construction",
"craft/agricultural_engines",
"craft/atelier",
"craft/beekeeper",
"craft/blacksmith",
"craft/bookbinder",
"craft/boatbuilder",
"craft/caterer",
"craft/carpet_layer",
"craft/clockmaker",
"craft/handicraft",
"craft/jeweller",
"craft/metal_construction",
"craft/photographer",
"craft/photographic_laboratory",
"craft/plasterer",
"craft/pottery",
"craft/printer",
"craft/sawmill",
"craft/scaffolder",
"craft/sculptor",
"craft/signmaker",
"craft/stonemason",
"craft/upholsterer",
"craft/watchmaker",
"craft/yes",
"amenity/workshop",
"shop/glaziery",
"shop/windows",
# Professional offices & estate agents
"shop/estate_agent",
"office/accountant",
"office/architect",
"office/auctioneer",
"office/builder",
"office/construction",
"office/construction_company",
"office/engineer",
"office/estate_agent",
"office/financial",
"office/financial_advisor",
"office/financial_services",
"office/insurance",
"office/lawyer",
"office/mortgage",
"office/property_management",
"office/solicitor",
"office/solicitors",
"office/surveyor",
"office/tax_advisor",
],
),
(
"Local Businesses",
"Offices",
"🏢",
[
"amenity/coworking_space",
"amenity/research_institute",
"office/administrative",
"office/advertising_agency",
"office/association",
"office/charity",
"office/company",
"office/consulting",
"office/courier",
"office/coworking",
"office/design",
"office/diplomatic",
"office/educational_institution",
"office/employment_agency",
"office/energy_supplier",
"office/foundation",
"office/government",
"office/graphic_design",
"office/interior_design",
"office/it",
"office/logistics",
"office/marketing",
"office/moving_company",
"office/newspaper",
"office/ngo",
"office/notary",
"office/political_party",
"office/politician",
"office/publisher",
"office/quango",
"office/recruitment",
"office/religion",
"office/research",
"office/security",
"office/taxi",
"office/telecommunication",
"office/transport",
"office/union",
"office/university",
"office/vacant",
"office/web_design",
"office/yes",
],
),
# ── Other ────────────────────────────────────────────────
(
"Other",
"EV Charging",
"🔌",
[
"amenity/charging_station",
],
),
(
"Other",
"Fuel Station",
"",
[
"amenity/fuel",
],
),
(
"Other",
"Community Centre",
"🤝",
[
"amenity/church_hall",
"amenity/clubhouse",
"amenity/community_centre",
"amenity/community_hall",
"amenity/scout_hall",
"amenity/social_centre",
"amenity/townhall",
],
),
# ── Public transport (OSM supplement to NaPTAN) ──────────
# OSM bus platforms fill NaPTAN's authority-level coverage gaps. Same group
# / friendly name / emoji as the NaPTAN "Bus stop" rows so they merge into
# one metric; OSM platforms that duplicate a NaPTAN stop are deduped in
# transform() (osm_stops_near_naptan).
(
"Public Transport",
"Bus stop",
"🚏",
[
"public_transport/platform",
],
),
]
# Raw OSM tags whose UNNAMED instances are dropped before category mapping.
# These tags are overwhelmingly private/incidental when unnamed: a nameless
# `leisure/garden` is a private residential garden (not a public park), and a
# nameless `leisure/pitch`/`swimming_pool` is a school cage or back-garden pool.
# Keeping only named instances stops them inflating Park / Sports Centre counts
# while preserving genuinely public, notable facilities (which carry a name).
# Every tag listed here is also mapped in _CATEGORIES (under "Park" or
# "Sports Centre"), so a named instance still receives a category.
# NOTE(review): the name gate itself is applied in transform(), which is not
# visible in this part of the file.
REQUIRE_NAME_CATEGORIES = {
"leisure/garden",
"leisure/pitch",
"leisure/practice_pitch",
"leisure/swimming_pool",
"leisure/paddling_pool",
# 83-84% unnamed: anonymous running tracks, private gallops/paddocks and
# fishing spots; only named public facilities count as a Sports Centre.
"leisure/track",
"leisure/horse_riding",
"leisure/fishing",
}
# Flat lookup derived from _CATEGORIES: raw OSM "key/value" tag →
# (group, friendly_name, emoji). Each _CATEGORIES entry is a 4-tuple whose last
# element is the list of OSM tags, so the first three elements form the value.
# If a tag were listed under two categories, the later entry would win.
CATEGORY_MAP: dict[str, tuple[str, str, str]] = {
    osm_tag: entry[:3] for entry in _CATEGORIES for osm_tag in entry[3]
}
# Emoji for each public-transport stop type, keyed by its display name.
# The "Bus stop" emoji matches the OSM `public_transport/platform` entry in
# _CATEGORIES, so that both sources present identically.
# NOTE(review): the NaPTAN transform that reads this table is not visible in
# this part of the file — presumably the keys are its output category names.
NAPTAN_EMOJIS: dict[str, str] = {
"Airport": "✈️",
"Ferry": "⛴️",
"Rail station": "🚆",
"Bus stop": "🚏",
"Bus station": "🚌",
"Taxi rank": "🚕",
"Tube station": "🚇",
"Tram & Metro stop": "🚊",
}
# GEOLYTIX `retailer` values that are individual co-operative societies.
# normalize_grocery_retailer() collapses every one of them to the single
# display brand "Co-op", so they are pooled before the chain-size cutoff in
# transform_grocery_retail_points(). Matching is by exact string.
COOP_RETAILERS = {
"Allendale Co-operative Society",
"Central England Co-operative",
"Channel Islands Co-operative Society",
"Chelmsford Star Co-operative Society",
"Clydebank Co-operative",
"Coniston Co-operative Society",
"East of England Co-operative",
"Heart of England Co-operative",
"Langdale Co-operative Society",
"Lincolnshire Co-operative",
"Midcounties Co-operative",
"Scottish Midland Co-operative",
"Tamworth Co-operative Society",
"The Co-operative Group",
"The Radstock Co-operative Society",
"The Southern Co-operative",
}
# Default for transform_grocery_retail_points(min_chain_locations=...): a
# normalised retailer brand is kept only if it has at least this many rows.
MIN_GROCERY_CHAIN_LOCATIONS = 5
# Raw GEOLYTIX `retailer` value → display brand name. Consulted by
# normalize_grocery_retailer() for retailers that are not in COOP_RETAILERS;
# retailers absent from this table keep their raw name.
GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES: dict[str, str] = {
"Cook": "COOK",
"Heron": "Heron Foods",
"Marks and Spencer": "M&S",
"Sainsburys": "Sainsbury's",
}
# GEOLYTIX `fascia` (store format) → icon category name. Looked up by
# normalize_grocery_icon_category() using the whitespace-stripped fascia; a
# fascia that is missing from this table falls back to the normalised retailer
# name. Several fascias deliberately share one icon (e.g. forecourt "PFS"
# variants map to the parent brand).
GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
"Aldi": "Aldi",
"Aldi Local": "Aldi",
"Asda": "Asda",
"Asda Express": "Asda Express",
"Asda Living": "Asda Living",
"Asda PFS": "Asda",
"Asda Supercentre": "Asda Supercentre",
"Asda Supermarket": "Asda Supermarket",
"Asda Superstore": "Asda Superstore",
"Booths": "Booths",
"Budgens": "Budgens",
"Centra": "Centra",
"Cooltrader": "Heron Foods",
"Co-op Food": "Co-op",
"Cook": "COOK",
"Costco": "Costco",
"Dunnes Stores": "Dunnes Stores",
"Eurospar": "Spar",
"Eurospar PFS": "Spar",
"Farmfoods": "Farmfoods",
"Heron": "Heron Foods",
"Iceland": "Iceland",
"Lidl": "Lidl",
"Little Waitrose": "Little Waitrose",
"Little Waitrose Shell": "Little Waitrose",
"Makro": "Makro",
"Marks and Spencer": "M&S",
"Marks and Spencer BP": "M&S Food",
"Marks and Spencer Clothing": "M&S Clothing",
"Marks and Spencer Food To Go": "M&S Food",
"Marks and Spencer Food Outlet": "M&S Outlet",
"Marks and Spencer Foodhall": "M&S Food",
"Marks and Spencer Hospital": "M&S Hospital",
"Marks and Spencer MSA": "M&S MSA",
"Marks and Spencer Outlet": "M&S Outlet",
"Marks and Spencer Simply Food": "M&S Food",
"Marks and Spencer Travel SF": "M&S Food",
"Morrisons Daily": "Morrisons Daily",
"Morrisons Select": "Morrisons",
"Planet Organic": "Planet Organic",
"Sainsbury's Local": "Sainsbury's Local",
"Sainsburys": "Sainsbury's",
"Sainsburys Local": "Sainsbury's Local",
"Spar": "Spar",
"Spar PFS": "Spar",
"Tesco": "Tesco",
"Tesco Express": "Tesco Express",
"Tesco Express Esso": "Tesco Express",
"Tesco Extra": "Tesco Extra",
"The Co-operative Food": "Co-op",
"The Co-operative Food PFS": "Co-op",
"The Food Warehouse": "The Food Warehouse",
"Waitrose": "Waitrose",
"Waitrose MSA": "Waitrose",
"Whole Foods Market": "Whole Foods Market",
}
def normalize_grocery_retailer(retailer: str | None) -> str:
    """Return the display brand for a raw GEOLYTIX retailer value.

    The value is first cleaned with ``strip_or_empty`` (project helper;
    presumably trims whitespace and turns ``None`` into ``""`` — confirm).
    Any member of ``COOP_RETAILERS`` collapses to ``"Co-op"``; every other
    name is looked up in ``GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES`` and
    returned unchanged when no override exists.
    """
    cleaned = strip_or_empty(retailer)
    return (
        "Co-op"
        if cleaned in COOP_RETAILERS
        else GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES.get(cleaned, cleaned)
    )
def normalize_grocery_icon_category(fascia: str | None, retailer: str | None) -> str:
    """Return the icon category for a store.

    A non-empty ``fascia`` is stripped and looked up in
    ``GROCERY_FASCIA_ICON_NAMES``. When the fascia is missing, empty or not in
    that table, the result is ``normalize_grocery_retailer(retailer)``.
    """
    icon_name = GROCERY_FASCIA_ICON_NAMES.get(fascia.strip()) if fascia else None
    if icon_name is None:
        # No fascia-specific icon: fall back to the retailer's display brand.
        return normalize_grocery_retailer(retailer)
    return icon_name
def _stripped_or_null(column: str) -> pl.Expr:
    """Strip whitespace from a string column and turn blank values into null."""
    stripped = pl.col(column).str.strip_chars()
    # when/then without otherwise yields null where the condition is not true.
    return pl.when(stripped.str.len_chars() > 0).then(stripped).alias(column)


def transform_grocery_retail_points(
    grocery_df: pl.DataFrame,
    boundary_path: Path | None = None,
    min_chain_locations: int = MIN_GROCERY_CHAIN_LOCATIONS,
) -> pl.DataFrame:
    """Convert GEOLYTIX Grocery Retail Points into the POI parquet schema.

    Args:
        grocery_df: Raw GEOLYTIX rows. Must contain the columns ``id``,
            ``retailer``, ``fascia``, ``store_name``, ``long_wgs`` and
            ``lat_wgs``.
        boundary_path: Optional boundary file passed to ``in_england_mask``;
            when given, rows for which the mask is false are removed.
        min_chain_locations: Minimum number of rows a normalised retailer
            brand needs in order to be kept.

    Returns:
        A frame with the columns ``id`` (prefixed ``glx-``), ``name``,
        ``category`` (normalised retailer brand), ``icon_category``,
        ``group`` (always ``"Groceries"``), ``lat``, ``lng`` and ``emoji``.

    Raises:
        ValueError: If any required input column is missing.
    """
    required = {"id", "retailer", "fascia", "store_name", "long_wgs", "lat_wgs"}
    missing = required - set(grocery_df.columns)
    if missing:
        raise ValueError(f"GEOLYTIX retail points missing columns: {sorted(missing)}")

    df = (
        grocery_df.select(
            pl.col("id").cast(pl.String),
            pl.col("retailer").cast(pl.String),
            pl.col("fascia").cast(pl.String),
            pl.col("store_name").cast(pl.String),
            pl.col("lat_wgs").cast(pl.Float64).alias("lat"),
            pl.col("long_wgs").cast(pl.Float64).alias("lng"),
        )
        # Blank text becomes null so that (a) blank retailers are removed by
        # drop_nulls below and (b) a whitespace-only store_name/fascia no
        # longer wins the coalesce for `name` and produces an empty POI name.
        .with_columns(
            _stripped_or_null("retailer"),
            _stripped_or_null("fascia"),
            _stripped_or_null("store_name"),
        )
        .drop_nulls(["id", "retailer", "lat", "lng"])
    )

    if boundary_path is not None and len(df) > 0:
        mask = in_england_mask(
            boundary_path,
            df["lat"].to_numpy(),
            df["lng"].to_numpy(),
        )
        df = df.filter(pl.Series(mask))

    # Normalise to the display brand FIRST so the ~16 Co-op society retailer
    # names pool into one "Co-op" before the chain-eligibility cutoff; otherwise
    # small societies (<MIN_GROCERY_CHAIN_LOCATIONS stores each) get dropped.
    df = df.with_columns(
        pl.col("retailer")
        .map_elements(normalize_grocery_retailer, return_dtype=pl.String)
        .alias("category")
    )
    eligible_categories = (
        df.group_by("category")
        .len()
        .filter(pl.col("len") >= min_chain_locations)
        .select("category")
    )
    # Semi join keeps only rows whose brand passed the cutoff, adding no columns.
    df = df.join(eligible_categories, on="category", how="semi")

    return df.with_columns(
        pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
        # First non-null of store name, fascia, retailer; doubled apostrophes
        # are collapsed to a single one (literal match, not a regex).
        pl.coalesce(["store_name", "fascia", "retailer"])
        .str.replace_all("''", "'", literal=True)
        .alias("name"),
        pl.struct(["fascia", "retailer"])
        .map_elements(
            lambda row: normalize_grocery_icon_category(row["fascia"], row["retailer"]),
            return_dtype=pl.String,
        )
        .alias("icon_category"),
        pl.lit("Groceries").alias("group"),
        pl.lit("🛒").alias("emoji"),
    ).select("id", "name", "category", "icon_category", "group", "lat", "lng", "emoji")
# School icon category → emoji. The keys are exactly the labels that
# _school_icon_category_expr() can emit (including its "School" fallback);
# transform_gias_schools() maps them with replace_strict, so a label missing
# from this table would fail rather than silently yield null.
SCHOOL_ICON_CATEGORIES: dict[str, str] = {
"Nursery school": "🧸",
"Primary school": "🎒",
"Secondary school": "🏫",
"All-through school": "🏫",
"Sixth form": "📚",
"Further education college": "📚",
"University": "🎓",
"Special school": "🤝",
"School": "🏫",
}
def _school_icon_category_expr() -> pl.Expr:
    """Build the expression that classifies a GIAS row into an icon category.

    Reads the ``type_group``, ``phase`` and ``age_range`` columns. Rules are
    evaluated in order and the first match wins:

    1. ``type_group`` decides universities, special schools and FE colleges
       (which span multiple phases).
    2. Otherwise the lower-cased ``phase`` picks the bucket.
    3. Rows matched by neither (e.g. independent schools where GIAS leaves
       ``phase`` null) fall back to the bounds parsed from ``age_range``.
    4. Anything still unmatched becomes ``"School"``.
    """
    type_group = pl.col("type_group")
    # GIAS phase mixes casing ("Middle deemed Primary" vs "Middle deemed
    # primary"), so it is lower-cased before matching.
    phase_lc = pl.col("phase").str.to_lowercase()

    # gias._format_age_range emits three shapes: "<low>—<high>" (em-dash
    # separated), "up to <high>" (high-only) and "<low>+" (low-only). The
    # leading integer is the low bound and the trailing integer the high
    # bound; for the one-sided shapes the missing end is forced to null so a
    # single number is not read as both bounds.
    age_text = pl.col("age_range")
    first_int = age_text.str.extract(r"^\s*(\d+)", 1).cast(pl.Int32, strict=False)
    last_int = age_text.str.extract(r"(\d+)\s*$", 1).cast(pl.Int32, strict=False)
    low = pl.when(age_text.str.starts_with("up to")).then(None).otherwise(first_int)
    high = pl.when(age_text.str.ends_with("+")).then(None).otherwise(last_int)

    # Ordered (condition, label) rules; order is significant.
    rules = [
        (type_group == "Universities", "University"),
        (type_group == "Special schools", "Special school"),
        (type_group == "Colleges", "Further education college"),
        (phase_lc == "nursery", "Nursery school"),
        (phase_lc.is_in(["primary", "middle deemed primary"]), "Primary school"),
        (phase_lc.is_in(["secondary", "middle deemed secondary"]), "Secondary school"),
        (phase_lc == "all-through", "All-through school"),
        (phase_lc.is_in(["16 plus", "sixth form"]), "Sixth form"),
        # Age-range fallback for null-phase rows (≈3k Independents + Academies
        # GIAS doesn't classify by phase).
        (high <= 5, "Nursery school"),
        (low >= 16, "Sixth form"),
        ((low <= 6) & (high >= 16), "All-through school"),
        (high <= 11, "Primary school"),
        (low >= 10, "Secondary school"),
    ]

    (first_condition, first_label), *remaining = rules
    chain = pl.when(first_condition).then(pl.lit(first_label))
    for condition, label in remaining:
        chain = chain.when(condition).then(pl.lit(label))
    return chain.otherwise(pl.lit("School"))
# Ofsted OEIF overall-effectiveness grade (as the string "1".."4") → the
# human-readable label used by _load_ofsted_ratings().
OFSTED_OEIF_LABELS = {
"1": "Outstanding",
"2": "Good",
"3": "Requires improvement",
"4": "Inadequate",
}
def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
    """Lazily load one human-readable Ofsted rating per URN.

    Returns a LazyFrame with the columns ``urn`` (Int64) and
    ``ofsted_rating``, ready to be joined onto the GIAS register. The rating
    is resolved in this order, first match wins:

    1. "Latest OEIF overall effectiveness" of "1".."4" maps to the
       conventional labels in ``OFSTED_OEIF_LABELS``.
    2. Otherwise, an "Ungraded inspection overall outcome" that starts with
       "School remains Outstanding" / "School remains Good" maps to
       Outstanding / Good. Prefix matching tolerates the optional
       "(Concerns)" / "(Improving)" suffixes, so schools without a usable
       graded result are not lost — mirroring
       school_catchments.classify_good_plus_schools.
    3. Otherwise, a graded value of "Not judged" is kept as "Not judged".

    Rows that match none of these have a null rating and are filtered out.
    """
    graded = pl.col("Latest OEIF overall effectiveness")
    ungraded_outcome = pl.col("Ungraded inspection overall outcome").cast(
        pl.Utf8, strict=False
    )

    # Ordered (condition, label) rules; graded results take precedence.
    rules = [
        (graded == code, OFSTED_OEIF_LABELS[code]) for code in ("1", "2", "3", "4")
    ]
    rules += [
        (
            ungraded_outcome.str.starts_with("School remains Outstanding"),
            OFSTED_OEIF_LABELS["1"],
        ),
        (
            ungraded_outcome.str.starts_with("School remains Good"),
            OFSTED_OEIF_LABELS["2"],
        ),
        (graded == "Not judged", "Not judged"),
    ]

    (first_condition, first_label), *remaining = rules
    chain = pl.when(first_condition).then(pl.lit(first_label))
    for condition, label in remaining:
        chain = chain.when(condition).then(pl.lit(label))
    rating = chain.otherwise(None).alias("ofsted_rating")

    urn = pl.col("URN").cast(pl.Int64).alias("urn")
    ratings = pl.scan_parquet(ofsted_path).select(urn, rating)
    return ratings.filter(pl.col("ofsted_rating").is_not_null())
def transform_gias_schools(
    gias_path: Path, ofsted_path: Path, boundary_path: Path
) -> pl.LazyFrame:
    """Build Education POI rows (with school metadata) from the GIAS register.

    Ofsted ratings are left-joined on ``urn`` so each school carries its latest
    rating label (Outstanding / Good / Requires improvement / Inadequate /
    Not judged) for the map popup; schools without a rating keep a null.

    The result is clipped to England (like NaPTAN/GEOLYTIX): the GIAS register
    is GB-wide, so ~1,400 Welsh/Scottish/IoM schools would otherwise leak into
    the England-only Education layer (and depress apparent Ofsted coverage,
    since Wales is inspected by Estyn, not Ofsted).

    The frame is collected eagerly because the England mask is computed from
    numpy coordinate arrays; it is handed back as a LazyFrame.
    """
    school_kind = _school_icon_category_expr()
    kind_emoji = school_kind.replace_strict(SCHOOL_ICON_CATEGORIES)
    ofsted = _load_ofsted_ratings(ofsted_path)

    def passthrough(*columns: str) -> list[pl.Expr]:
        # Columns copied verbatim, renamed to `school_<column>`.
        return [pl.col(column).alias(f"school_{column}") for column in columns]

    def lenient(column: str, dtype) -> pl.Expr:
        # Non-strict cast: unparseable values become null rather than raising.
        return pl.col(column).cast(dtype, strict=False).alias(f"school_{column}")

    # `category` mirrors `icon_category` so the dashboard renders one toggle
    # per school type instead of bundling every GIAS row under a single
    # "School" pill. The list order below is the output column order.
    poi_columns = [
        pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
        pl.col("name"),
        school_kind.alias("category"),
        school_kind.alias("icon_category"),
        pl.lit("Education").alias("group"),
        pl.col("lat").cast(pl.Float64),
        pl.col("lng").cast(pl.Float64),
        kind_emoji.alias("emoji"),
        *passthrough(
            "phase",
            "type",
            "type_group",
            "age_range",
            "gender",
            "religious_character",
            "admissions_policy",
            "nursery_provision",
            "sixth_form",
        ),
        lenient("capacity", pl.Int32),
        lenient("pupils", pl.Int32),
        lenient("fsm_percent", pl.Float32),
        *passthrough("trust", "address", "postcode", "local_authority", "website"),
        lenient("telephone", pl.String),
        *passthrough("head_name", "ofsted_rating"),
    ]

    schools = (
        pl.scan_parquet(gias_path)
        .join(ofsted, on="urn", how="left")
        .select(*poi_columns)
        .collect()
    )

    inside_england = in_england_mask(
        boundary_path,
        schools["lat"].to_numpy(),
        schools["lng"].to_numpy(),
    )
    return schools.filter(pl.Series(inside_england)).lazy()
# OSM convenience-format stores that GEOLYTIX also covers (Tesco Express,
# Sainsbury's Local, Co-op Food, Morrisons Daily, Spar, ...) would otherwise be
# counted twice: once as a GEOLYTIX brand row and once as an OSM "Convenience
# Store". GEOLYTIX is authoritative for its chains, so an OSM grocery row that
# sits on top of a GEOLYTIX point AND carries that point's brand name is the
# same physical store and is dropped. Independent corner shops never carry a
# chain brand, so they are kept.
#
# Maximum distance, in metres, between an OSM grocery row and a GEOLYTIX point
# for the two to be treated as colocated; used as the default ``radius_m`` of
# osm_groceries_colocated_with_geolytix.
GROCERY_DEDUP_RADIUS_M = 50.0
# Brand-token aliases so an OSM name spelt differently from the GEOLYTIX brand
# still matches. GEOLYTIX's "Co-op" tokenises to "coop", but OSM frequently
# spells it "The Co-operative Food" -> "cooperative"; without this, ~300+ genuine
# Co-op duplicates would survive. Keys/values are post-strip (alnum-only) tokens.
_GROCERY_TOKEN_ALIASES = {
"cooperative": "coop",
"cooperatives": "coop",
}
def _significant_tokens(name: str | None) -> set[str]:
"""Lower-case alphanumeric tokens of length >= 3 from a POI name (aliased)."""
if not name:
return set()
tokens: set[str] = set()
for raw in str(name).lower().split():
token = "".join(ch for ch in raw if ch.isalnum())
if len(token) >= 3:
tokens.add(_GROCERY_TOKEN_ALIASES.get(token, token))
return tokens
# OSM bus platforms are added to "Bus stop" to fill NaPTAN's authority-level
# gaps. Where NaPTAN already has a stop within this radius the area is covered,
# so the colocated OSM platform is dropped to avoid double-counting; OSM
# platforms with no nearby NaPTAN stop (the gaps) are kept.
#
# Radius in metres; used as the default ``radius_m`` of osm_stops_near_naptan.
BUS_STOP_DEDUP_RADIUS_M = 50.0
def osm_stops_near_naptan(
    osm_stops: pl.DataFrame,
    naptan_stops: pl.DataFrame,
    radius_m: float = BUS_STOP_DEDUP_RADIUS_M,
) -> list[str]:
    """List the OSM bus-stop ids lying within ``radius_m`` of a NaPTAN stop.

    The test is purely spatial (names are not compared): an OSM platform whose
    nearest NaPTAN stop is at most ``radius_m`` metres away counts as a
    duplicate, while platforms in NaPTAN gaps are left out of the result.

    Both frames must provide ``id``, ``lat`` and ``lng`` columns. Ids are
    returned in ``osm_stops`` row order; the list is empty when either frame
    has no rows.
    """
    if osm_stops.is_empty() or naptan_stops.is_empty():
        return []

    from scipy.spatial import cKDTree

    def _lat_lng(frame: pl.DataFrame):
        return (
            frame["lat"].to_numpy().astype(float),
            frame["lng"].to_numpy().astype(float),
        )

    naptan_lat, naptan_lng = _lat_lng(naptan_stops)
    osm_lat, osm_lng = _lat_lng(osm_stops)

    # Equirectangular projection to metres, scaling longitude by the cosine of
    # the mean latitude of both point sets.
    mean_lat = float(np.mean(np.concatenate([naptan_lat, osm_lat])))
    cos_lat = float(np.cos(np.radians(mean_lat)))
    naptan_xy = np.column_stack(
        [naptan_lng * cos_lat * 111_320.0, naptan_lat * 110_540.0]
    )
    osm_xy = np.column_stack([osm_lng * cos_lat * 111_320.0, osm_lat * 110_540.0])

    nearest_m, _ = cKDTree(naptan_xy).query(osm_xy, k=1)
    return [
        stop_id
        for stop_id, metres in zip(osm_stops["id"].to_list(), nearest_m)
        if metres <= radius_m
    ]
def osm_groceries_colocated_with_geolytix(
    osm_groceries: pl.DataFrame,
    geolytix: pl.DataFrame,
    radius_m: float = GROCERY_DEDUP_RADIUS_M,
) -> list[str]:
    """List the OSM grocery ids that describe the same store as a GEOLYTIX row.

    An OSM row is a duplicate when some GEOLYTIX point within ``radius_m``
    metres has brand tokens (taken from its ``category``, e.g. "Tesco",
    "Co-op") that ALL occur in the OSM row's name tokens. Brands that yield no
    token of >= 3 characters (e.g. "M&S") never match, and OSM rows without
    name tokens are never dropped, so the check errs towards keeping rows.

    ``osm_groceries`` needs ``id``, ``name``, ``lat``, ``lng``; ``geolytix``
    needs ``category`` (the brand), ``lat``, ``lng``. Ids are returned in
    ``osm_groceries`` row order; the list is empty when either frame has no
    rows.
    """
    if osm_groceries.is_empty() or geolytix.is_empty():
        return []

    from scipy.spatial import cKDTree

    store_lat = geolytix["lat"].to_numpy().astype(float)
    store_lng = geolytix["lng"].to_numpy().astype(float)
    store_brands = [_significant_tokens(brand) for brand in geolytix["category"].to_list()]

    shop_lat = osm_groceries["lat"].to_numpy().astype(float)
    shop_lng = osm_groceries["lng"].to_numpy().astype(float)
    shop_ids = osm_groceries["id"].to_list()
    shop_names = [_significant_tokens(name) for name in osm_groceries["name"].to_list()]

    # Equirectangular projection to metres around the shared mean latitude — at
    # England's scale this is accurate to well under the dedup radius.
    mean_lat = float(np.mean(np.concatenate([store_lat, shop_lat])))
    cos_lat = float(np.cos(np.radians(mean_lat)))

    def _to_metres(lat, lng):
        return np.column_stack([lng * cos_lat * 111_320.0, lat * 110_540.0])

    store_tree = cKDTree(_to_metres(store_lat, store_lng))
    nearby_stores = store_tree.query_ball_point(_to_metres(shop_lat, shop_lng), r=radius_m)

    def _same_brand_nearby(name_tokens: set[str], store_indices) -> bool:
        # True when any store in range has a non-empty brand fully contained
        # in the shop's name tokens.
        return any(
            bool(store_brands[idx]) and store_brands[idx] <= name_tokens
            for idx in store_indices
        )

    return [
        shop_id
        for shop_id, name_tokens, store_indices in zip(shop_ids, shop_names, nearby_stores)
        if name_tokens and _same_brand_nearby(name_tokens, store_indices)
    ]
def transform(
    input_path: Path,
    naptan_path: Path,
    boundary_path: Path,
    grocery_retail_points_path: Path,
    gias_path: Path,
    ofsted_path: Path,
) -> pl.LazyFrame:
    """Assemble the filtered, friendly-named POI table from every source.

    Steps, in order:

    1. Audit raw OSM categories against ``DROP_CATEGORIES`` / ``CATEGORY_MAP``
       (printing what is ignored or absent) and drop unwanted/unmapped rows.
    2. Drop unnamed rows of ``REQUIRE_NAME_CATEGORIES``.
    3. Map raw categories to group / friendly name / icon category / emoji.
    4. Collapse rows sharing both ``id`` and friendly ``category``.
    5. Load NaPTAN (clipped to England) and the GEOLYTIX grocery points.
    6. Remove OSM grocery rows duplicating a GEOLYTIX store and OSM bus stops
       duplicating a NaPTAN bus stop.
    7. Concatenate OSM, NaPTAN, GEOLYTIX and GIAS school rows.

    Raises:
        ValueError: if any ``CATEGORY_MAP`` entry has an empty friendly name
            or emoji.
    """
    lf = pl.scan_parquet(input_path)

    # Every distinct raw category present in the input.
    all_categories = (
        lf.select("category").unique().collect(engine="streaming").to_series().to_list()
    )

    # Categories with neither a drop rule nor a mapping: warn, then ignore.
    unmapped = [
        cat
        for cat in all_categories
        if cat not in DROP_CATEGORIES and cat not in CATEGORY_MAP
    ]
    if unmapped:
        print(f"Ignoring categories missing from CATEGORY_MAP: {sorted(unmapped)}")

    # Mapped categories the data never uses (may be absent in regional extracts).
    present = set(all_categories)
    mapped_but_absent = [cat for cat in CATEGORY_MAP if cat not in present]
    if mapped_but_absent:
        print(
            f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}"
        )

    # Drop unwanted and unmapped categories.
    excluded = [*DROP_CATEGORIES, *unmapped]
    lf = lf.filter(~pl.col("category").is_in(excluded))

    # Drop UNNAMED instances of private-dominated tags (gardens, pitches,
    # pools) so they don't inflate Park / Sports Centre proximity counts. Done
    # while `category` still holds the raw OSM key, before the friendly mapping.
    needs_name = pl.col("category").is_in(list(REQUIRE_NAME_CATEGORIES))
    name_is_blank = pl.col("name").is_null() | (
        pl.col("name").cast(pl.String).str.strip_chars() == ""
    )
    lf = lf.filter(~(needs_name & name_is_blank))

    # Split the (group, friendly name, emoji) 3-tuples into per-field lookups.
    group_mapping, name_mapping, emoji_mapping = (
        {raw: fields[position] for raw, fields in CATEGORY_MAP.items()}
        for position in range(3)
    )

    # Defensive: no friendly name or emoji may be empty.
    missing_names = [raw for raw, fields in CATEGORY_MAP.items() if not fields[1]]
    if missing_names:
        raise ValueError(f"Empty friendly names for: {missing_names}")
    missing_emojis = [raw for raw, fields in CATEGORY_MAP.items() if not fields[2]]
    if missing_emojis:
        raise ValueError(f"Empty emojis for: {missing_emojis}")

    # All four expressions read the raw `category` of the incoming frame.
    raw_category = pl.col("category")
    lf = lf.with_columns(
        raw_category.replace_strict(group_mapping).alias("group"),
        raw_category.replace_strict(name_mapping).alias("category"),
        raw_category.replace_strict(name_mapping).alias("icon_category"),
        raw_category.replace_strict(emoji_mapping).alias("emoji"),
    )

    # A single OSM object can carry several tag keys that map to the same
    # friendly category (e.g. amenity/pharmacy + shop/chemist -> "Pharmacy"),
    # which pois.py emits as multiple raw rows sharing one id. Collapse those
    # duplicates so they don't inflate downstream proximity counts; rows sharing
    # an id with DIFFERENT categories are preserved. Other sources are
    # pre-deduplicated.
    lf = lf.unique(subset=["id", "category"], keep="first", maintain_order=True)

    # NaPTAN, clipped to England.
    naptan_df = pl.scan_parquet(naptan_path).collect()
    naptan_in_england = in_england_mask(
        boundary_path,
        naptan_df["lat"].to_numpy(),
        naptan_df["lng"].to_numpy(),
    )
    naptan_df = naptan_df.filter(pl.Series(naptan_in_england))
    naptan = naptan_df.lazy().with_columns(
        pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
        pl.lit("Public Transport").alias("group"),
        pl.col("category").alias("icon_category"),
    )

    grocery_df = pl.read_parquet(grocery_retail_points_path)
    grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)

    # Drop OSM grocery rows that duplicate a GEOLYTIX store (same brand,
    # colocated) so a Tesco Express / Co-op / Spar isn't counted twice.
    is_grocery = pl.col("group") == "Groceries"
    osm_groceries = (
        lf.filter(is_grocery)
        .select("id", "name", "lat", "lng")
        .collect(engine="streaming")
    )
    duplicate_ids = osm_groceries_colocated_with_geolytix(osm_groceries, grocery_pois)
    if duplicate_ids:
        print(
            f"Dropping {len(duplicate_ids):,} OSM grocery POIs that duplicate a "
            "GEOLYTIX store"
        )
        # Scope the drop to the Groceries group: a single OSM object can also
        # carry a non-grocery aspect (e.g. a convenience store that is also a
        # Post Office), which must survive — only its duplicate grocery row goes.
        lf = lf.filter(~(is_grocery & pl.col("id").is_in(duplicate_ids)))

    # Drop OSM bus platforms that duplicate a NaPTAN bus stop, so the OSM
    # supplement only adds stops in NaPTAN's coverage gaps (no double-count in
    # covered areas). OSM bus stops carry id-prefix "n"/"a" so they never clash
    # with NaPTAN ATCO ids.
    is_osm_bus_stop = (pl.col("group") == "Public Transport") & (
        pl.col("category") == "Bus stop"
    )
    osm_bus_stops = (
        lf.filter(is_osm_bus_stop).select("id", "lat", "lng").collect(engine="streaming")
    )
    naptan_bus_stops = naptan_df.filter(pl.col("category") == "Bus stop")
    covered_bus_ids = osm_stops_near_naptan(osm_bus_stops, naptan_bus_stops)
    kept_osm = osm_bus_stops.height - len(covered_bus_ids)
    print(
        f"OSM bus platforms: {osm_bus_stops.height:,} total, dropping "
        f"{len(covered_bus_ids):,} that duplicate a NaPTAN stop, keeping "
        f"{kept_osm:,} to fill NaPTAN gaps"
    )
    if covered_bus_ids:
        lf = lf.filter(~(is_osm_bus_stop & pl.col("id").is_in(covered_bus_ids)))

    school_pois = transform_gias_schools(gias_path, ofsted_path, boundary_path)
    return pl.concat(
        [lf, naptan, grocery_pois.lazy(), school_pois],
        how="diagonal_relaxed",
    )
def main():
    """Command-line entry point: run ``transform`` and write the POI parquet.

    Every option is a required filesystem path. After writing the output, a
    summary is printed: file size, row count, and a per-(group, category,
    emoji) breakdown sorted by descending count.
    """
    parser = argparse.ArgumentParser(
        description="Transform raw POIs to filtered version with friendly names"
    )
    # (flag, help text); all options share type=Path and required=True.
    path_options = (
        ("--input", "Raw POIs parquet file"),
        ("--naptan", "NaPTAN stations parquet file"),
        ("--boundary", "England boundary GeoJSON file"),
        ("--grocery-retail-points", "GEOLYTIX Grocery Retail Points parquet"),
        ("--gias", "GIAS schools register parquet (replaces OSM schools)"),
        ("--ofsted", "Ofsted latest-inspections parquet (provides per-URN ratings)"),
        ("--output", "Output filtered POIs parquet file"),
    )
    for flag, help_text in path_options:
        parser.add_argument(flag, type=Path, required=True, help=help_text)
    args = parser.parse_args()

    pois = transform(
        input_path=args.input,
        naptan_path=args.naptan,
        boundary_path=args.boundary,
        grocery_retail_points_path=args.grocery_retail_points,
        gias_path=args.gias,
        ofsted_path=args.ofsted,
    )
    df = pois.collect(engine="streaming")
    df.write_parquet(args.output)

    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB, {len(df):,} POIs)")

    print(f"\nCategories ({df['category'].n_unique()}):")
    counts = (
        df.group_by("group", "category", "emoji").len().sort("len", descending=True)
    )
    # Row layout follows the group_by keys plus the `len` count column.
    for group, category, emoji, count in counts.iter_rows():
        print(f" [{group}] {emoji} {category}: {count:,}")


if __name__ == "__main__":
    main()