1601 lines
43 KiB
Python
1601 lines
43 KiB
Python
import argparse
|
||
from pathlib import Path
|
||
|
||
import polars as pl
|
||
|
||
from pipeline.utils.england_geometry import in_england_mask
|
||
|
||
# OSM "key/value" categories that are deliberately left out of the POI output.
# NOTE(review): the code that consumes this set is outside this view —
# presumably the OSM transform filters rows whose category is listed here.
DROP_CATEGORIES = {
    # Street furniture & infrastructure
    "amenity/advice",
    "amenity/atm",
    "amenity/bbq",
    "amenity/bench",
    "amenity/bicycle_parking",
    "amenity/binoculars",
    "amenity/boot_scraper",
    "amenity/bus_garage",
    "amenity/check_in",
    "amenity/clock",
    "amenity/clothes_dryer",
    "amenity/coast_guard",
    "amenity/coffin_rest",
    "amenity/compressed_air",
    "amenity/court_yard",
    "amenity/donation_box",
    "amenity/dressing_room",
    "amenity/drinking_water",
    "emergency/water_tank",
    "leisure/bleachers",
    "leisure/schoolyard",
    "public_transport/pay_scale_area",
    "shop/taxi",
    "amenity/feeding_place",
    "amenity/fixme",
    "amenity/grit_bin",
    "amenity/hunting_stand",
    "amenity/letter_box",
    "amenity/loading_dock",
    "amenity/lounge",
    "tourism/preserved_railway",
    "amenity/lounger",
    "leisure/sport",
    "amenity/motorcycle_parking",
    "amenity/mounting_block",
    "amenity/notice_board",
    "amenity/parcel_locker",
    "amenity/parking",
    "amenity/parking_entrance",
    "amenity/parking_space",
    "amenity/payment_terminal",
    "amenity/photo_booth",
    "amenity/piano",
    "amenity/post_box",
    "amenity/public_bookcase",
    "amenity/reception_desk",
    "amenity/sanitary_dump_station",
    "amenity/shelter",
    "amenity/shower",
    "amenity/smoking_area",
    "amenity/table",
    "amenity/telephone",
    "amenity/telescope",
    "amenity/ticket_validator",
    "amenity/toilets",
    "amenity/trolley_bay",
    "amenity/vacuum_cleaner",
    "amenity/vending_machine",
    "amenity/washing_machine",
    "amenity/washingline",
    "amenity/waste_basket",
    "amenity/waste_disposal",
    "amenity/waste_transfer_station",
    "amenity/water_point",
    "amenity/watering_place",
    "amenity/weighbridge",
    # Niche amenities not useful for home buyers
    "amenity/animal_boarding",
    "amenity/animal_breeding",
    "amenity/animal_shelter",
    "amenity/boat_storage",
    "amenity/bureau_de_change",
    "amenity/bus_station",
    "amenity/beachhut",
    "amenity/canteen",
    "amenity/conference_centre",
    "amenity/crematorium",
    "amenity/disused",
    "amenity/driver_training",
    "amenity/driving_school",
    "amenity/escooter_rental",
    "amenity/ferry_terminal",
    "amenity/grave_yard",
    "amenity/hall",
    "shop/funeral_directors",
    "amenity/kick-scooter_rental",
    "amenity/money_transfer",
    "amenity/post_depot",
    "amenity/prison",
    "amenity/public_building",
    "amenity/recycling",
    "amenity/scout_hut",
    "amenity/social_facility",
    "amenity/studio",
    "amenity/student_accommodation",
    "amenity/taxi",
    "amenity/telephone_exchange",
    "amenity/training",
    "amenity/vehicle_inspection",
    "amenity/waiting_room",
    "amenity/yes",
    "shop/disused",
    "shop/no",
    # Buildings. building/church is the exception: it is mapped to
    # "Place of Worship" in _CATEGORIES. building/university is dropped with
    # the education amenities at the end of this set.
    "building/air_shaft",
    "building/apartments",
    "building/barn",
    "building/bunker",
    "building/chapel",
    "building/commercial",
    "building/construction",
    "building/detached",
    "building/entrance",
    "building/entry",
    "building/farm",
    "building/farm_auxiliary",
    "building/garage",
    "building/garages",
    "building/greenhouse",
    "building/house",
    "building/hut",
    "building/industrial",
    "building/kiosk",
    "building/no",
    "building/office",
    "building/public",
    "building/residential",
    "building/retail",
    "building/roof",
    "building/ruins",
    "building/school",
    "building/semidetached_house",
    "building/service",
    "building/shed",
    "building/terrace",
    "building/warehouse",
    "building/yes",
    # Emergency features. emergency/ambulance_station is the exception: it is
    # mapped to "Ambulance Station" in _CATEGORIES.
    "emergency/access_point",
    "emergency/assembly_point",
    "emergency/bleed_control_kit",
    "emergency/defibrillator",
    "emergency/designated",
    "emergency/dry_riser_inlet",
    "emergency/emergency_ward_entrance",
    "emergency/fire_alarm_box",
    "emergency/fire_extinguisher",
    "emergency/fire_hydrant",
    "emergency/fire_service_inlet",
    "emergency/first_aid_kit",
    "emergency/life_ring",
    "emergency/lifeguard",
    "emergency/no",
    "emergency/phone",
    "emergency/rescue_equipment",
    "emergency/siren",
    "emergency/throw_bag",
    "emergency/water_rescue",
    "emergency/yes",
    # Tourism values that are dropped (other tourism/* values are mapped in
    # _CATEGORIES)
    "tourism/apartment",
    "tourism/apartments",
    "tourism/alpine_hut",
    "tourism/camp_pitch",
    "tourism/caravan_site",
    "tourism/information",
    "tourism/picnic_site",
    "tourism/viewpoint",
    "tourism/village_sign",
    "tourism/wilderness_hut",
    "tourism/yes",
    # Public transport (from NaPTAN instead)
    "public_transport/entrance",
    "public_transport/platform",
    "public_transport/station",
    "public_transport/stop_position",
    # Education amenities — schools come from GIAS instead. OSM coverage for
    # tertiary education, tutoring, and childcare is too noisy/incomplete to be
    # useful on a property-search map.
    "amenity/school",
    "amenity/prep_school",
    "amenity/language_school",
    "amenity/music_school",
    "amenity/university",
    "amenity/college",
    "building/university",
    "amenity/kindergarten",
    "amenity/childcare",
    "office/tutoring",
}
|
||
|
||
|
||
# Each output category is defined exactly once as a 4-tuple:
#   (group, friendly_name, emoji, [osm_keys...])
# where every osm_key is an OSM "key/value" string. The flat CATEGORY_MAP
# lookup dict is built from this list immediately after it.
_CATEGORIES: list[tuple[str, str, str, list[str]]] = [
    # ── Leisure ──────────────────────────────────────────────
    (
        "Leisure",
        "Café",
        "☕",
        [
            "amenity/cafe",
            "amenity/ice_cream",
            "amenity/internet_cafe",
        ],
    ),
    (
        "Leisure",
        "Restaurant",
        "🍽️",
        [
            "amenity/restaurant",
            "amenity/food_court",
        ],
    ),
    (
        "Leisure",
        "Pub",
        "🍺",
        [
            "amenity/pub",
            "amenity/beer_garden",
            "amenity/biergarten",
            "amenity/social_club",
            "amenity/club",
            "leisure/social_club",
            "craft/brewery",
            "craft/distillery",
            "craft/winery",
        ],
    ),
    (
        "Leisure",
        "Bar",
        "🍸",
        [
            "amenity/bar",
            "amenity/hookah_lounge",
        ],
    ),
    (
        "Leisure",
        "Fast Food",
        "🍔",
        [
            "amenity/fast_food",
        ],
    ),
    (
        "Leisure",
        "Nightclub",
        "🪩",
        [
            "amenity/nightclub",
            "amenity/stripclub",
            "amenity/casino",
            "amenity/gambling",
        ],
    ),
    (
        "Leisure",
        "Cinema",
        "🎬",
        [
            "amenity/cinema",
        ],
    ),
    (
        "Leisure",
        "Theatre",
        "🎭",
        [
            "amenity/theatre",
        ],
    ),
    (
        "Leisure",
        "Live Music & Events",
        "🎶",
        [
            "amenity/music_venue",
            "amenity/events_venue",
            "leisure/dance",
        ],
    ),
    (
        "Leisure",
        "Park",
        "🌳",
        [
            "leisure/park",
            "leisure/garden",
            "leisure/common",
            "leisure/nature_reserve",
            "leisure/dog_park",
            "leisure/bandstand",
            "leisure/bird_hide",
            "leisure/firepit",
            "leisure/outdoor_seating",
            "leisure/picnic_table",
            "leisure/wildlife_hide",
        ],
    ),
    (
        "Leisure",
        "Playground",
        "🛝",
        [
            "leisure/playground",
            "leisure/indoor_play",
        ],
    ),
    (
        "Leisure",
        "Sports Centre",
        "🏟️",
        [
            "leisure/sports_centre",
            "leisure/sports_hall",
            "leisure/pitch",
            "leisure/track",
            "leisure/golf_course",
            "leisure/miniature_golf",
            "leisure/horse_riding",
            "leisure/fishing",
            "leisure/ice_rink",
            "leisure/paddling_pool",
            "leisure/practice_pitch",
            "leisure/shooting_ground",
            "leisure/stadium",
            "leisure/swimming_pool",
            "leisure/swimming_area",
            "leisure/water_park",
            "leisure/bathing_place",
        ],
    ),
    (
        "Leisure",
        "Entertainment",
        "🎳",
        [
            "leisure/bowling_alley",
            "leisure/amusement_arcade",
            "leisure/adult_gaming_centre",
            "leisure/escape_game",
            "leisure/maze",
            "leisure/trampoline_park",
            "leisure/sauna",
            "leisure/tanning_salon",
            "shop/amusements",
            "tourism/theme_park",
            "amenity/bicycle_rental",
            "amenity/boat_rental",
            "leisure/marina",
            "leisure/slipway",
            "leisure/hackerspace",
            "leisure/yes",
        ],
    ),
    # ── Groceries ────────────────────────────────────────────
    (
        "Groceries",
        "Supermarket",
        "🛒",
        [
            "shop/supermarket",
        ],
    ),
    (
        "Groceries",
        "Convenience Store",
        "🏪",
        [
            "shop/convenience",
            "shop/general",
            "shop/kiosk",
            "shop/grocery",
        ],
    ),
    (
        "Groceries",
        "Bakery",
        "🥐",
        [
            "shop/bakery",
            "shop/pastry",
            "craft/bakery",
            "craft/confectionery",
        ],
    ),
    (
        "Groceries",
        "Butcher & Fishmonger",
        "🥩",
        [
            "shop/butcher",
            "shop/seafood",
        ],
    ),
    (
        "Groceries",
        "Greengrocer",
        "🥬",
        [
            "shop/greengrocer",
            "shop/farm",
            "shop/market",
            "amenity/marketplace",
        ],
    ),
    (
        "Groceries",
        "Off-Licence",
        "🍷",
        [
            "shop/alcohol",
            "shop/wine",
            "shop/beverages",
        ],
    ),
    (
        "Groceries",
        "Deli & Specialty",
        "🧆",
        [
            "shop/deli",
            "shop/cheese",
            "shop/chocolate",
            "shop/coffee",
            "shop/confectionery",
            "shop/dairy",
            "shop/food",
            "shop/frozen_food",
            "shop/health_food",
            "shop/ice_cream",
            "shop/nutrition_supplements",
            "shop/tea",
        ],
    ),
    # ── Shops ────────────────────────────────────────────────
    (
        "Shops",
        "Fashion & Clothing",
        "👕",
        [
            "shop/clothes",
            "shop/boutique",
            "shop/shoes",
            "shop/accessories",
            "shop/bag",
            "shop/fashion_accessories",
            "shop/jewelry",
            "shop/leather",
            "shop/watches",
        ],
    ),
    (
        "Shops",
        "Electronics",
        "📱",
        [
            "shop/electronics",
            "shop/mobile_phone",
            "shop/mobile_phone_accessories",
            "shop/computer",
            "shop/appliance",
            "shop/electrical",
            "shop/hifi",
            "shop/vacuum_cleaner",
            "shop/video_games",
            "shop/games",
        ],
    ),
    (
        "Shops",
        "Charity Shop",
        "❤️",
        [
            "shop/charity",
            "shop/second_hand",
        ],
    ),
    (
        "Shops",
        "DIY & Hardware",
        "🔨",
        [
            "shop/doityourself",
            "shop/hardware",
            "shop/builders_merchant",
            "shop/paint",
            "shop/plumbing",
        ],
    ),
    (
        "Shops",
        "Home & Garden",
        "🪑",
        [
            "shop/furniture",
            "shop/garden_centre",
            "shop/kitchen",
            "shop/bathroom",
            "shop/bathroom_furnishing",
            "shop/bed",
            "shop/carpet",
            "shop/curtain",
            "shop/flooring",
            "shop/fireplace",
            "shop/garden_furniture",
            "shop/groundskeeping",
            "shop/household",
            "shop/household_linen",
            "shop/houseware",
            "shop/homeware",
            "shop/interior_decoration",
            "shop/lighting",
            "shop/kitchenware",
            "shop/window_blind",
        ],
    ),
    (
        "Shops",
        "Bookshop",
        "📚",
        [
            "shop/books",
            "shop/stationery",
        ],
    ),
    (
        "Shops",
        "Pet Shop",
        "🐾",
        [
            "shop/pet",
        ],
    ),
    (
        "Shops",
        "Sports & Outdoor",
        "🏕️",
        [
            "shop/sports",
            "shop/angling",
            "shop/outdoor",
            "shop/bicycle",
            "shop/equestrian",
            "shop/surf",
        ],
    ),
    (
        "Shops",
        "Newsagent",
        "📰",
        [
            "shop/newsagent",
            "shop/tobacco",
        ],
    ),
    (
        "Shops",
        "Department Store",
        "🏬",
        [
            "shop/department_store",
            "shop/mall",
            "shop/variety_store",
            "shop/discount",
        ],
    ),
    (
        "Shops",
        "Gift & Hobby",
        "🎁",
        [
            "shop/gift",
            "shop/florist",
            "shop/toys",
            "shop/craft",
            "shop/candles",
            "shop/party",
            "shop/art",
            "shop/music",
            "shop/musical_instrument",
            "shop/antiques",
            "shop/anime",
            "shop/baby_goods",
            "shop/fabric",
            "shop/haberdashery",
            "shop/hobby",
            "shop/wool",
            "shop/pottery",
        ],
    ),
    (
        "Shops",
        "Specialist Shop",
        "🏪",
        [
            "shop/agrarian",
            "shop/boat",
            "shop/bookmaker",
            "shop/building_materials",
            "shop/camera",
            "shop/cannabis",
            "shop/car",
            "shop/caravan",
            "shop/catalogue",
            "shop/auction",
            "shop/auction_house",
            "shop/chandler",
            "shop/collector",
            "shop/copyshop",
            "shop/country_store",
            "shop/doors",
            "shop/e-cigarette",
            "shop/erotic",
            "shop/esoteric",
            "shop/fan",
            "shop/fireworks",
            "shop/fishing",
            "shop/frame",
            "shop/fuel",
            "shop/gas",
            "shop/hairdresser_supply",
            "shop/military_surplus",
            "shop/model",
            "shop/money_lender",
            "shop/motorcycle",
            "shop/outpost",
            "shop/pawnbroker",
            "shop/photo",
            "shop/photo_studio",
            "shop/plant_hire",
            "shop/printer_ink",
            "shop/printing",
            "shop/psychic",
            "shop/pyrotechnics",
            "shop/religion",
            "shop/rental",
            "shop/scuba_diving",
            "shop/security",
            "shop/sewing",
            "shop/ship_chandler",
            "shop/signs",
            "shop/storage_rental",
            "shop/swimming_pool",
            "shop/telecommunication",
            "shop/ticket",
            "shop/tiles",
            "shop/tool_hire",
            "shop/trade",
            "shop/trophy",
            "shop/truck",
            "shop/vacant",
            "shop/van",
            "shop/video",
            "shop/water_sports",
            "shop/weapons",
            "shop/wedding",
            "shop/wholesale",
            "shop/wigs",
            "shop/yes",
        ],
    ),
    # ── Services ─────────────────────────────────────────────
    (
        "Services",
        "Hairdresser & Beauty",
        "💇",
        [
            "shop/hairdresser",
            "shop/beauty",
            "shop/cosmetics",
            "shop/massage",
            "shop/perfumery",
            "leisure/spa",
        ],
    ),
    (
        "Services",
        "Gym & Fitness",
        "🏋️",
        [
            "leisure/fitness_centre",
            "leisure/fitness_station",
            "amenity/dojo",
            "amenity/dancing_school",
        ],
    ),
    (
        "Services",
        "Dry Cleaner & Laundry",
        "👔",
        [
            "shop/dry_cleaning",
            "shop/laundry",
            "shop/tailor",
            "shop/shoe_repair",
            "shop/repair",
            "craft/cleaning",
            "craft/dressmaker",
            "craft/shoemaker",
            "craft/tailor",
        ],
    ),
    (
        "Services",
        "Car Services",
        "🔧",
        [
            "shop/car_repair",
            "shop/car;car_repair",
            "shop/car_parts",
            "shop/motorcycle_repair",
            "shop/tyres",
            "amenity/car_wash",
            "amenity/car_rental",
            "amenity/car_sharing",
            "amenity/bicycle_repair_station",
        ],
    ),
    (
        "Services",
        "Post Office",
        "🏤",
        [
            "amenity/post_office",
        ],
    ),
    (
        "Services",
        "Vet & Pet Care",
        "🐕",
        [
            "amenity/veterinary",
            "shop/pet_grooming",
        ],
    ),
    (
        "Services",
        "Bank",
        "🏦",
        [
            "amenity/bank",
        ],
    ),
    (
        "Services",
        "Travel Agent",
        "✈️",
        [
            "shop/travel_agency",
            "office/travel_agent",
        ],
    ),
    (
        "Services",
        "Other",
        "🛎️",
        [
            "shop/tattoo",
            "shop/piercing",
            "shop/locksmith",
            "craft/key_cutter",
        ],
    ),
    # ── Emergency Services ───────────────────────────────────
    (
        "Emergency Services",
        "Police",
        "👮",
        ["amenity/police"],
    ),
    (
        "Emergency Services",
        "Fire Station",
        "🚒",
        ["amenity/fire_station"],
    ),
    (
        "Emergency Services",
        "Ambulance Station",
        "🚑",
        ["emergency/ambulance_station"],
    ),
    # ── Health ───────────────────────────────────────────────
    (
        "Health",
        "GP Surgery",
        # NOTE(review): confirm this literal is the single joined
        # "man health worker" emoji sequence and not two separate glyphs.
        "👨⚕️",
        [
            "amenity/doctors",
            "healthcare/doctor",
        ],
    ),
    (
        "Health",
        "Dentist",
        "🦷",
        [
            "amenity/dentist",
            "healthcare/dentist",
        ],
    ),
    (
        "Health",
        "Pharmacy",
        "💊",
        [
            "amenity/pharmacy",
            "healthcare/pharmacy",
            "shop/chemist",
            "shop/herbalist",
            "shop/health",
            "healthcare/alternative",
        ],
    ),
    (
        "Health",
        "Hospital & Clinic",
        "🏥",
        [
            "amenity/hospital",
            "amenity/clinic",
            "amenity/health_centre",
            "healthcare/blood_donation",
            "healthcare/hospital",
            "healthcare/centre",
            "healthcare/clinic",
            "office/healthcare",
            "healthcare/laboratory",
            "healthcare/rehabilitation",
            "healthcare/vaccination_centre",
            "healthcare/yes",
        ],
    ),
    (
        "Health",
        "Optician",
        "👓",
        [
            "shop/optician",
            "healthcare/optometrist",
            "shop/hearing_aids",
            "healthcare/audiologist",
        ],
    ),
    (
        "Health",
        "Physiotherapy",
        "🏃",
        [
            "healthcare/physiotherapist",
            "healthcare/podiatrist",
            "healthcare/occupational_therapist",
        ],
    ),
    (
        "Health",
        "Counselling & Therapy",
        "🧠",
        [
            "healthcare/counselling",
            "healthcare/psychotherapist",
            "office/therapist",
        ],
    ),
    (
        "Health",
        "Care Home",
        "🏠",
        [
            "amenity/care_home",
            "amenity/nursing_home",
            "amenity/retirement_home",
            "healthcare/hospice",
            "healthcare/nursing_home",
            "office/home_care",
        ],
    ),
    (
        "Health",
        "Medical & Mobility",
        "♿",
        [
            "shop/medical_supply",
            "shop/mobility",
            "shop/mobility_scooter",
        ],
    ),
    # ── Culture ──────────────────────────────────────────────
    (
        "Culture",
        "Museum",
        "🏛️",
        [
            "tourism/museum",
        ],
    ),
    (
        "Culture",
        "Gallery",
        "🖼️",
        [
            "tourism/gallery",
            "tourism/artwork",
        ],
    ),
    (
        "Culture",
        "Library",
        "📚",
        [
            "amenity/library",
        ],
    ),
    (
        "Culture",
        "Place of Worship",
        "⛪",
        [
            "amenity/place_of_worship",
            "amenity/monastery",
            "building/church",
        ],
    ),
    (
        "Culture",
        "Arts Centre",
        "🎨",
        [
            "amenity/arts_centre",
        ],
    ),
    (
        "Culture",
        "Zoo",
        "🦁",
        [
            "tourism/zoo",
        ],
    ),
    (
        "Culture",
        "Tourist Attraction",
        "📸",
        [
            "tourism/attraction",
            "tourism/aquarium",
            "amenity/fountain",
            "amenity/courthouse",
            "tourism/chalet",
        ],
    ),
    # Note: schools come from the GIAS register (see transform_gias_schools).
    # Niche/tertiary education amenities that GIAS does not cover are dropped
    # rather than mixed in with state-funded schools.

    # ── Local Businesses ─────────────────────────────────────
    (
        "Local Businesses",
        "Hotel",
        "🏨",
        [
            "tourism/hotel",
            "tourism/hostel",
            "tourism/guest_house",
            "tourism/motel",
            "tourism/camp_site",
            "leisure/resort",
            "tourism/holiday_park",
            "tourism/self_catering",
        ],
    ),
    (
        "Local Businesses",
        "Local Business",
        "🛠️",
        [
            # Tradespeople
            "craft/builder",
            "craft/carpenter",
            "craft/electrician",
            "craft/electronics_repair",
            "craft/floorer",
            "craft/gardener",
            "craft/glaziery",
            "craft/hvac",
            "craft/joiner",
            "craft/locksmith",
            "craft/painter",
            "craft/plumber",
            "craft/roofer",
            "craft/window_construction",
            "craft/agricultural_engines",
            "craft/atelier",
            "craft/beekeeper",
            "craft/blacksmith",
            "craft/bookbinder",
            "craft/boatbuilder",
            "craft/caterer",
            "craft/carpet_layer",
            "craft/clockmaker",
            "craft/handicraft",
            "craft/jeweller",
            "craft/metal_construction",
            "craft/photographer",
            "craft/photographic_laboratory",
            "craft/plasterer",
            "craft/pottery",
            "craft/printer",
            "craft/sawmill",
            "craft/scaffolder",
            "craft/sculptor",
            "craft/signmaker",
            "craft/stonemason",
            "craft/upholsterer",
            "craft/watchmaker",
            "craft/yes",
            "amenity/workshop",
            "shop/glaziery",
            "shop/windows",
            # Professional offices & estate agents
            "shop/estate_agent",
            "office/accountant",
            "office/architect",
            "office/auctioneer",
            "office/builder",
            "office/construction",
            "office/construction_company",
            "office/engineer",
            "office/estate_agent",
            "office/financial",
            "office/financial_advisor",
            "office/financial_services",
            "office/insurance",
            "office/lawyer",
            "office/mortgage",
            "office/property_management",
            "office/solicitor",
            "office/solicitors",
            "office/surveyor",
            "office/tax_advisor",
        ],
    ),
    (
        "Local Businesses",
        "Offices",
        "🏢",
        [
            "amenity/coworking_space",
            "amenity/research_institute",
            "office/administrative",
            "office/advertising_agency",
            "office/association",
            "office/charity",
            "office/company",
            "office/consulting",
            "office/courier",
            "office/coworking",
            "office/design",
            "office/diplomatic",
            "office/educational_institution",
            "office/employment_agency",
            "office/energy_supplier",
            "office/foundation",
            "office/government",
            "office/graphic_design",
            "office/interior_design",
            "office/it",
            "office/logistics",
            "office/marketing",
            "office/moving_company",
            "office/newspaper",
            "office/ngo",
            "office/notary",
            "office/political_party",
            "office/politician",
            "office/publisher",
            "office/quango",
            "office/recruitment",
            "office/religion",
            "office/research",
            "office/security",
            "office/taxi",
            "office/telecommunication",
            "office/transport",
            "office/union",
            "office/university",
            "office/vacant",
            "office/web_design",
            "office/yes",
        ],
    ),
    # ── Other ────────────────────────────────────────────────
    (
        "Other",
        "EV Charging",
        "🔌",
        [
            "amenity/charging_station",
        ],
    ),
    (
        "Other",
        "Fuel Station",
        "⛽",
        [
            "amenity/fuel",
        ],
    ),
    (
        "Other",
        "Community Centre",
        "🤝",
        [
            "amenity/church_hall",
            "amenity/clubhouse",
            "amenity/community_centre",
            "amenity/community_hall",
            "amenity/scout_hall",
            "amenity/social_centre",
            "amenity/townhall",
        ],
    ),
]
|
||
|
||
def _build_category_map() -> dict[str, tuple[str, str, str]]:
    """Flatten _CATEGORIES into an OSM category → (group, name, emoji) dict.

    If an OSM key were listed under more than one category, the entry that
    appears later in _CATEGORIES would win.
    """
    lookup: dict[str, tuple[str, str, str]] = {}
    for group, name, emoji, osm_keys in _CATEGORIES:
        display = (group, name, emoji)
        for osm_key in osm_keys:
            lookup[osm_key] = display
    return lookup


# Flat lookup: OSM category → (group, friendly_name, emoji)
CATEGORY_MAP: dict[str, tuple[str, str, str]] = _build_category_map()
|
||
|
||
|
||
# Emoji shown for each NaPTAN public-transport category, keyed by the
# human-readable category label.
# NOTE(review): the code that reads this mapping is outside this view.
NAPTAN_EMOJIS: dict[str, str] = {
    "Airport": "✈️",
    "Ferry": "⛴️",
    "Rail station": "🚆",
    "Bus stop": "🚏",
    "Bus station": "🚌",
    "Taxi rank": "🚕",
    "Tube station": "🚇",
}
|
||
|
||
|
||
# Retailer names that normalize_grocery_retailer collapses into the single
# "Co-op" category. Matching is exact (after stripping whitespace).
COOP_RETAILERS = {
    "Allendale Co-operative Society",
    "Central England Co-operative",
    "Channel Islands Co-operative Society",
    "Chelmsford Star Co-operative Society",
    "Clydebank Co-operative",
    "Coniston Co-operative Society",
    "East of England Co-operative",
    "Heart of England Co-operative",
    "Langdale Co-operative Society",
    "Lincolnshire Co-operative",
    "Midcounties Co-operative",
    "Scottish Midland Co-operative",
    "Tamworth Co-operative Society",
    "The Co-operative Group",
    "The Radstock Co-operative Society",
    "The Southern Co-operative",
}
|
||
|
||
# Default for transform_grocery_retail_points(min_chain_locations=...):
# retailers with fewer rows than this are removed from the output.
MIN_GROCERY_CHAIN_LOCATIONS = 5
|
||
|
||
# Source retailer name → display name, applied by normalize_grocery_retailer
# to retailers that are not in COOP_RETAILERS. Names without an entry are
# used as-is.
GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES: dict[str, str] = {
    "Cook": "COOK",
    "Heron": "Heron Foods",
    "Marks and Spencer": "M&S",
    "Sainsburys": "Sainsbury's",
}
|
||
|
||
|
||
# Store fascia → icon category name, looked up by
# normalize_grocery_icon_category. Several fascias share one icon (for
# example the "PFS" and "MSA" variants); a fascia without an entry falls back
# to the normalised retailer name.
GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
    "Aldi": "Aldi",
    "Aldi Local": "Aldi",
    "Asda": "Asda",
    "Asda Express": "Asda Express",
    "Asda Living": "Asda Living",
    "Asda PFS": "Asda",
    "Asda Supercentre": "Asda Supercentre",
    "Asda Supermarket": "Asda Supermarket",
    "Asda Superstore": "Asda Superstore",
    "Booths": "Booths",
    "Budgens": "Budgens",
    "Centra": "Centra",
    "Cooltrader": "Heron Foods",
    "Co-op Food": "Co-op",
    "Cook": "COOK",
    "Costco": "Costco",
    "Dunnes Stores": "Dunnes Stores",
    "Eurospar": "Spar",
    "Eurospar PFS": "Spar",
    "Farmfoods": "Farmfoods",
    "Heron": "Heron Foods",
    "Iceland": "Iceland",
    "Lidl": "Lidl",
    "Little Waitrose": "Little Waitrose",
    "Little Waitrose Shell": "Little Waitrose",
    "Makro": "Makro",
    "Marks and Spencer": "M&S",
    "Marks and Spencer BP": "M&S Food",
    "Marks and Spencer Clothing": "M&S Clothing",
    "Marks and Spencer Food To Go": "M&S Food",
    "Marks and Spencer Food Outlet": "M&S Outlet",
    "Marks and Spencer Foodhall": "M&S Food",
    "Marks and Spencer Hospital": "M&S Hospital",
    "Marks and Spencer MSA": "M&S MSA",
    "Marks and Spencer Outlet": "M&S Outlet",
    "Marks and Spencer Simply Food": "M&S Food",
    "Marks and Spencer Travel SF": "M&S Food",
    "Morrisons Daily": "Morrisons Daily",
    "Morrisons Select": "Morrisons",
    "Planet Organic": "Planet Organic",
    "Sainsbury's Local": "Sainsbury's Local",
    "Sainsburys": "Sainsbury's",
    "Sainsburys Local": "Sainsbury's Local",
    "Spar": "Spar",
    "Spar PFS": "Spar",
    "Tesco": "Tesco",
    "Tesco Express": "Tesco Express",
    "Tesco Express Esso": "Tesco Express",
    "Tesco Extra": "Tesco Extra",
    "The Co-operative Food": "Co-op",
    "The Co-operative Food PFS": "Co-op",
    "The Food Warehouse": "The Food Warehouse",
    "Waitrose": "Waitrose",
    "Waitrose MSA": "Waitrose",
    "Whole Foods Market": "Whole Foods Market",
}
|
||
|
||
|
||
def normalize_grocery_retailer(retailer: str | None) -> str:
    """Return the display name for a grocery retailer.

    ``None`` becomes the empty string. Otherwise surrounding whitespace is
    stripped, every name in COOP_RETAILERS collapses to ``"Co-op"``, and any
    other name is passed through GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES
    (names without an override are returned as stripped).
    """
    if retailer is None:
        return ""
    cleaned = retailer.strip()
    return (
        "Co-op"
        if cleaned in COOP_RETAILERS
        else GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES.get(cleaned, cleaned)
    )
|
||
|
||
|
||
def normalize_grocery_icon_category(fascia: str | None, retailer: str | None) -> str:
    """Return the icon category for a store.

    A non-empty fascia that has an entry in GROCERY_FASCIA_ICON_NAMES (looked
    up after stripping whitespace) decides the icon. Otherwise the result is
    the normalised retailer name.
    """
    mapped = GROCERY_FASCIA_ICON_NAMES.get(fascia.strip()) if fascia else None
    if mapped is None:
        # Missing, empty or unknown fascia: fall back to the retailer.
        return normalize_grocery_retailer(retailer)
    return mapped
|
||
|
||
|
||
def _blank_to_null(column: str) -> pl.Expr:
    """Strip whitespace from a string column and turn blank values into nulls."""
    stripped = pl.col(column).str.strip_chars()
    return (
        pl.when(stripped.str.len_chars() > 0)
        .then(stripped)
        .otherwise(None)
        .alias(column)
    )


def transform_grocery_retail_points(
    grocery_df: pl.DataFrame,
    boundary_path: Path | None = None,
    min_chain_locations: int = MIN_GROCERY_CHAIN_LOCATIONS,
) -> pl.DataFrame:
    """Convert GEOLYTIX Grocery Retail Points into the POI parquet schema.

    Args:
        grocery_df: Raw retail points. Must contain the columns ``id``,
            ``retailer``, ``fascia``, ``store_name``, ``long_wgs`` and
            ``lat_wgs``.
        boundary_path: Optional boundary file. When given (and any rows
            remain), only rows for which ``in_england_mask`` returns true
            are kept.
        min_chain_locations: Retailers with fewer remaining rows than this
            are removed.

    Returns:
        A frame with the columns ``id`` (prefixed with ``glx-``), ``name``,
        ``category``, ``icon_category``, ``group``, ``lat``, ``lng`` and
        ``emoji``.

    Raises:
        ValueError: If any required column is missing from ``grocery_df``.
    """
    required = {"id", "retailer", "fascia", "store_name", "long_wgs", "lat_wgs"}
    missing = required - set(grocery_df.columns)
    if missing:
        raise ValueError(f"GEOLYTIX retail points missing columns: {sorted(missing)}")

    df = (
        grocery_df.select(
            pl.col("id").cast(pl.String),
            pl.col("retailer").cast(pl.String),
            pl.col("fascia").cast(pl.String),
            pl.col("store_name").cast(pl.String),
            pl.col("lat_wgs").cast(pl.Float64).alias("lat"),
            pl.col("long_wgs").cast(pl.Float64).alias("lng"),
        )
        # Blank strings become nulls so that (a) rows with an empty retailer
        # are dropped below and (b) the name coalesce further down skips an
        # empty store_name/fascia instead of emitting "" as the POI name.
        .with_columns(
            _blank_to_null("retailer"),
            _blank_to_null("fascia"),
            _blank_to_null("store_name"),
        )
        .drop_nulls(["id", "retailer", "lat", "lng"])
    )

    if boundary_path is not None and len(df) > 0:
        mask = in_england_mask(
            boundary_path,
            df["lat"].to_numpy(),
            df["lng"].to_numpy(),
        )
        df = df.filter(pl.Series(mask))

    # NOTE(review): chain size is counted on the raw retailer name, i.e.
    # before the co-operative societies are merged into "Co-op". A society
    # with fewer than min_chain_locations stores is therefore dropped even
    # though it is listed in COOP_RETAILERS — confirm this is intended.
    eligible_retailers = (
        df.group_by("retailer")
        .len()
        .filter(pl.col("len") >= min_chain_locations)
        .select("retailer")
    )
    df = df.join(eligible_retailers, on="retailer", how="semi")

    return df.with_columns(
        pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
        # The doubled apostrophe is plain text, not a regex, so match it
        # literally.
        pl.coalesce(["store_name", "fascia", "retailer"])
        .str.replace_all("''", "'", literal=True)
        .alias("name"),
        pl.col("retailer")
        .map_elements(normalize_grocery_retailer, return_dtype=pl.String)
        .alias("category"),
        pl.struct(["fascia", "retailer"])
        .map_elements(
            lambda row: normalize_grocery_icon_category(row["fascia"], row["retailer"]),
            return_dtype=pl.String,
        )
        .alias("icon_category"),
        pl.lit("Groceries").alias("group"),
        pl.lit("🛒").alias("emoji"),
    ).select("id", "name", "category", "icon_category", "group", "lat", "lng", "emoji")
|
||
|
||
|
||
# School icon category → emoji. transform_gias_schools feeds the labels
# produced by _school_icon_category_expr through replace_strict with this
# mapping, so every label that expression can emit needs an entry here.
SCHOOL_ICON_CATEGORIES: dict[str, str] = {
    "Nursery school": "🧸",
    "Primary school": "🎒",
    "Secondary school": "🏫",
    "All-through school": "🏫",
    "Sixth form": "📚",
    "Further education college": "📚",
    "University": "🎓",
    "Special school": "🤝",
    "School": "🏫",
}
|
||
|
||
|
||
def _school_icon_category_expr() -> pl.Expr:
    """Build the expression that labels a GIAS row with its icon category.

    Rules are tried in order and the first match wins:

    1. ``type_group`` decides universities, special schools and colleges.
    2. Otherwise the lower-cased ``phase`` decides the bucket.
    3. Otherwise the numeric bounds parsed from ``age_range`` are used, which
       covers rows whose phase is null.
    4. Anything left over is labelled ``"School"``.
    """
    type_group = pl.col("type_group")
    # GIAS phase mixes casing ("Middle deemed Primary" vs "Middle deemed
    # primary"), so compare against a lower-cased copy.
    phase = pl.col("phase").str.to_lowercase()
    # age_range looks like "<min>–<max>" with an en dash (not a hyphen) as the
    # separator. Either bound may be absent; the non-strict cast turns
    # anything unparsable into null, which simply fails the comparisons below.
    bounds = pl.col("age_range").str.split_exact("–", 1)
    youngest = bounds.struct.field("field_0").cast(pl.Int32, strict=False)
    oldest = bounds.struct.field("field_1").cast(pl.Int32, strict=False)

    rules: list[tuple[pl.Expr, str]] = [
        # type_group overrides phase.
        (type_group == "Universities", "University"),
        (type_group == "Special schools", "Special school"),
        (type_group == "Colleges", "Further education college"),
        # Phase-based buckets.
        (phase == "nursery", "Nursery school"),
        (phase.is_in(["primary", "middle deemed primary"]), "Primary school"),
        (phase.is_in(["secondary", "middle deemed secondary"]), "Secondary school"),
        (phase == "all-through", "All-through school"),
        (phase.is_in(["16 plus", "sixth form"]), "Sixth form"),
        # Age-range fallback for rows that matched no phase rule.
        (oldest <= 5, "Nursery school"),
        (youngest >= 16, "Sixth form"),
        ((youngest <= 6) & (oldest >= 16), "All-through school"),
        (oldest <= 11, "Primary school"),
        (youngest >= 10, "Secondary school"),
    ]

    (first_test, first_label), *later_rules = rules
    chain = pl.when(first_test).then(pl.lit(first_label))
    for test, label in later_rules:
        chain = chain.when(test).then(pl.lit(label))
    return chain.otherwise(pl.lit("School"))
|
||
|
||
|
||
# Ofsted OEIF overall-effectiveness grade (as a string) → human-readable
# label, used by _load_ofsted_ratings.
OFSTED_OEIF_LABELS = {
    "1": "Outstanding",
    "2": "Good",
    "3": "Requires improvement",
    "4": "Inadequate",
}
|
||
|
||
|
||
def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
    """Build a lazy URN -> Ofsted rating lookup from the inspections parquet.

    The "Latest OEIF overall effectiveness" column is translated into a
    human-readable label so it can be joined onto the GIAS register by URN:
    grades "1"-"4" map to the conventional labels in OFSTED_OEIF_LABELS, and
    "Not judged" (post-2025 reform schools that only have a report card) is
    kept verbatim. Any other value, including null, produces a null label and
    the row is filtered out.

    Returns:
        A LazyFrame with two columns: ``urn`` (the "URN" column cast to
        Int64) and ``ofsted_rating`` (never null).
    """
    grade = pl.col("Latest OEIF overall effectiveness")

    # (raw value, label) pairs in the order the conditions are evaluated:
    # the four numeric grades first, then the verbatim pass-through.
    branches = [(code, OFSTED_OEIF_LABELS[code]) for code in ("1", "2", "3", "4")]
    branches.append(("Not judged", "Not judged"))

    (first_code, first_label), *remaining = branches
    rating = pl.when(grade == first_code).then(pl.lit(first_label))
    for code, text in remaining:
        rating = rating.when(grade == code).then(pl.lit(text))
    # Unrecognised or missing grades become null so they can be dropped below.
    rating = rating.otherwise(None)

    urn = pl.col("URN").cast(pl.Int64).alias("urn")
    lookup = pl.scan_parquet(ofsted_path).select(urn, rating.alias("ofsted_rating"))
    return lookup.filter(pl.col("ofsted_rating").is_not_null())
|
||
|
||
|
||
def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
    """Turn the GIAS register parquet into POI rows carrying school metadata.

    Each school is left-joined by ``urn`` to its Ofsted rating (Outstanding /
    Good / Requires improvement / Inadequate / Not judged) so the latest OEIF
    overall effectiveness grade can be surfaced in the map popup. Schools
    without a rating keep a null ``school_ofsted_rating``.

    Returns:
        A LazyFrame with the common POI columns (``id``, ``name``,
        ``category``, ``icon_category``, ``group``, ``lat``, ``lng``,
        ``emoji``) followed by the ``school_*`` metadata columns.
    """
    school_kind = _school_icon_category_expr()
    emoji = school_kind.replace_strict(SCHOOL_ICON_CATEGORIES)
    ratings = _load_ofsted_ratings(ofsted_path)

    def school_column(source, dtype=None):
        # Expose `source` as "school_<source>"; when a dtype is given the cast
        # is lenient (strict=False), so unparseable values become null.
        column = pl.col(source)
        if dtype is not None:
            column = column.cast(dtype, strict=False)
        return column.alias(f"school_{source}")

    # (source column, optional lenient cast) in output order.
    metadata_spec = (
        ("phase", None),
        ("type", None),
        ("type_group", None),
        ("age_range", None),
        ("gender", None),
        ("religious_character", None),
        ("admissions_policy", None),
        ("nursery_provision", None),
        ("sixth_form", None),
        ("capacity", pl.Int32),
        ("pupils", pl.Int32),
        ("fsm_percent", pl.Float32),
        ("trust", None),
        ("address", None),
        ("postcode", None),
        ("local_authority", None),
        ("website", None),
        ("telephone", pl.String),
        ("head_name", None),
        ("ofsted_rating", None),
    )

    # category mirrors icon_category so the dashboard renders one toggle per
    # school type (Nursery / Primary / Secondary / Sixth form / University /…)
    # instead of bundling every GIAS row under a single "School" pill.
    poi_columns = [
        pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
        pl.col("name"),
        school_kind.alias("category"),
        school_kind.alias("icon_category"),
        pl.lit("Education").alias("group"),
        pl.col("lat").cast(pl.Float64),
        pl.col("lng").cast(pl.Float64),
        emoji.alias("emoji"),
    ]
    metadata_columns = [school_column(source, dtype) for source, dtype in metadata_spec]

    joined = pl.scan_parquet(gias_path).join(ratings, on="urn", how="left")
    return joined.select(*poi_columns, *metadata_columns)
|
||
|
||
|
||
def transform(
    input_path: Path,
    naptan_path: Path,
    boundary_path: Path,
    grocery_retail_points_path: Path,
    gias_path: Path,
    ofsted_path: Path,
) -> pl.LazyFrame:
    """Assemble the combined POI LazyFrame from every source.

    Steps:
      1. Scan the raw POIs, check every category present is either in
         DROP_CATEGORIES or CATEGORY_MAP, drop the unwanted ones and map the
         rest to a group, friendly name and emoji.
      2. Load NaPTAN rows, keep those inside the England boundary (via
         ``in_england_mask``) and tag them as "Public Transport".
      3. Load grocery retail points through ``transform_grocery_retail_points``.
      4. Load GIAS schools (with Ofsted ratings) through
         ``transform_gias_schools``.

    The four frames are concatenated with ``how="diagonal_relaxed"``. Note
    that the category scan, the NaPTAN load and the grocery load execute
    eagerly inside this function; only the returned frame is lazy.

    Raises:
        ValueError: if the data contains a category that is in neither
            DROP_CATEGORIES nor CATEGORY_MAP, or if a CATEGORY_MAP entry has
            an empty friendly name or emoji.
    """
    osm = pl.scan_parquet(input_path)

    # Discover which raw categories this extract actually contains.
    present = (
        osm.select("category").unique().collect(engine="streaming").to_series().to_list()
    )

    # Every category we are not dropping must have a mapping.
    unmapped = [
        raw for raw in present if raw not in DROP_CATEGORIES and raw not in CATEGORY_MAP
    ]
    if unmapped:
        raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}")

    # Mapped-but-absent categories are only reported: regional extracts may
    # legitimately lack some of them.
    present_set = set(present)
    absent = [raw for raw in CATEGORY_MAP if raw not in present_set]
    if absent:
        print(f"CATEGORY_MAP categories not in data (skipped): {sorted(absent)}")

    raw_category = pl.col("category")
    osm = osm.filter(~raw_category.is_in(list(DROP_CATEGORIES)))

    # Split the (group, friendly name, emoji) tuples into one lookup per field.
    group_of, label_of, emoji_of = {}, {}, {}
    for raw, spec in CATEGORY_MAP.items():
        group_of[raw] = spec[0]
        label_of[raw] = spec[1]
        emoji_of[raw] = spec[2]

    # Defensive: a blank friendly name or emoji would render badly downstream.
    nameless = [raw for raw, label in label_of.items() if not label]
    if nameless:
        raise ValueError(f"Empty friendly names for: {nameless}")
    emojiless = [raw for raw, emoji in emoji_of.items() if not emoji]
    if emojiless:
        raise ValueError(f"Empty emojis for: {emojiless}")

    # All four expressions read the original raw "category" column, even
    # though one of them overwrites it with the friendly name.
    osm = osm.with_columns(
        raw_category.replace_strict(group_of).alias("group"),
        raw_category.replace_strict(label_of).alias("category"),
        raw_category.replace_strict(label_of).alias("icon_category"),
        raw_category.replace_strict(emoji_of).alias("emoji"),
    )

    # NaPTAN is materialised because the boundary mask works on numpy arrays.
    stops = pl.scan_parquet(naptan_path).collect()
    inside_england = in_england_mask(
        boundary_path,
        stops["lat"].to_numpy(),
        stops["lng"].to_numpy(),
    )
    stops = stops.filter(pl.Series(inside_england))
    naptan = stops.lazy().with_columns(
        pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
        pl.lit("Public Transport").alias("group"),
        pl.col("category").alias("icon_category"),
    )

    grocery_raw = pl.read_parquet(grocery_retail_points_path)
    grocery = transform_grocery_retail_points(grocery_raw, boundary_path)

    sources = [
        osm,
        naptan,
        grocery.lazy(),
        transform_gias_schools(gias_path, ofsted_path),
    ]
    return pl.concat(sources, how="diagonal_relaxed")
|
||
|
||
|
||
def main():
    """CLI entry point: parse arguments, build the POI table, write it out.

    After writing the parquet file, prints its size and row count followed by
    a per-(group, category, emoji) breakdown sorted by descending count.
    """
    parser = argparse.ArgumentParser(
        description="Transform raw POIs to filtered version with friendly names"
    )
    # Every option is a required path; registration order drives --help order.
    path_options = (
        ("--input", "Raw POIs parquet file"),
        ("--naptan", "NaPTAN stations parquet file"),
        ("--boundary", "England boundary GeoJSON file"),
        ("--grocery-retail-points", "GEOLYTIX Grocery Retail Points parquet"),
        ("--gias", "GIAS schools register parquet (replaces OSM schools)"),
        ("--ofsted", "Ofsted latest-inspections parquet (provides per-URN ratings)"),
        ("--output", "Output filtered POIs parquet file"),
    )
    for flag, help_text in path_options:
        parser.add_argument(flag, type=Path, required=True, help=help_text)
    args = parser.parse_args()

    lazy_pois = transform(
        args.input,
        args.naptan,
        args.boundary,
        args.grocery_retail_points,
        args.gias,
        args.ofsted,
    )
    pois = lazy_pois.collect(engine="streaming")
    pois.write_parquet(args.output)

    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB, {len(pois):,} POIs)")
    print(f"\nCategories ({pois['category'].n_unique()}):")

    breakdown = pois.group_by("group", "category", "emoji").len()
    breakdown = breakdown.sort("len", descending=True)
    # Columns come back as the three group keys followed by the "len" count.
    for group, category, emoji, count in breakdown.iter_rows():
        print(f" [{group}] {emoji} {category}: {count:,}")
|
||
|
||
|
||
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|