"""Filter and relabel raw POIs, then merge them with NaPTAN stops, GEOLYTIX
grocery retail points and GIAS schools into a single POI frame."""

import argparse
from pathlib import Path

import polars as pl

from pipeline.utils.england_geometry import in_england_mask

# Raw "key/value" categories that are removed from the output entirely.
DROP_CATEGORIES = {
    # Street furniture & infrastructure
    "amenity/advice", "amenity/atm", "amenity/bbq", "amenity/bench",
    "amenity/bicycle_parking", "amenity/binoculars", "amenity/boot_scraper",
    "amenity/bus_garage", "amenity/check_in", "amenity/clock",
    "amenity/clothes_dryer", "amenity/coast_guard", "amenity/coffin_rest",
    "amenity/compressed_air", "amenity/court_yard", "amenity/donation_box",
    "amenity/dressing_room", "amenity/drinking_water", "emergency/water_tank",
    "leisure/bleachers", "leisure/schoolyard", "public_transport/pay_scale_area",
    "shop/taxi", "amenity/feeding_place", "amenity/fixme", "amenity/grit_bin",
    "amenity/hunting_stand", "amenity/letter_box", "amenity/loading_dock",
    "amenity/lounge", "tourism/preserved_railway", "amenity/lounger",
    "leisure/sport", "amenity/motorcycle_parking", "amenity/mounting_block",
    "amenity/notice_board", "amenity/parcel_locker", "amenity/parking",
    "amenity/parking_entrance", "amenity/parking_space",
    "amenity/payment_terminal", "amenity/photo_booth", "amenity/piano",
    "amenity/post_box", "amenity/public_bookcase", "amenity/reception_desk",
    "amenity/sanitary_dump_station", "amenity/shelter", "amenity/shower",
    "amenity/smoking_area", "amenity/table", "amenity/telephone",
    "amenity/telescope", "amenity/ticket_validator", "amenity/toilets",
    "amenity/trolley_bay", "amenity/vacuum_cleaner", "amenity/vending_machine",
    "amenity/washing_machine", "amenity/washingline", "amenity/waste_basket",
    "amenity/waste_disposal", "amenity/waste_transfer_station",
    "amenity/water_point", "amenity/watering_place", "amenity/weighbridge",
    # Niche amenities not useful for home buyers
    "amenity/animal_boarding", "amenity/animal_breeding", "amenity/animal_shelter",
    "amenity/boat_storage", "amenity/bureau_de_change", "amenity/bus_station",
    "amenity/beachhut", "amenity/canteen", "amenity/conference_centre",
    "amenity/crematorium", "amenity/disused", "amenity/driver_training",
    "amenity/driving_school", "amenity/escooter_rental", "amenity/ferry_terminal",
    "amenity/grave_yard", "amenity/hall", "shop/funeral_directors",
    "amenity/kick-scooter_rental", "amenity/money_transfer", "amenity/post_depot",
    "amenity/prison", "amenity/public_building", "amenity/recycling",
    "amenity/scout_hut", "amenity/social_facility", "amenity/studio",
    "amenity/student_accommodation", "amenity/taxi", "amenity/telephone_exchange",
    "amenity/training", "amenity/vehicle_inspection", "amenity/waiting_room",
    "amenity/yes", "shop/disused", "shop/no",
    # Buildings (except building/church, which is mapped; building/university
    # is dropped with the education entries further down)
    "building/air_shaft", "building/apartments", "building/barn",
    "building/bunker", "building/chapel", "building/commercial",
    "building/construction", "building/detached", "building/entrance",
    "building/entry", "building/farm", "building/farm_auxiliary",
    "building/garage", "building/garages", "building/greenhouse",
    "building/house", "building/hut", "building/industrial", "building/kiosk",
    "building/no", "building/office", "building/public", "building/residential",
    "building/retail", "building/roof", "building/ruins", "building/school",
    "building/semidetached_house", "building/service", "building/shed",
    "building/terrace", "building/warehouse", "building/yes",
    # Emergency features (emergency/ambulance_station is mapped, not dropped)
    "emergency/access_point", "emergency/assembly_point",
    "emergency/bleed_control_kit", "emergency/defibrillator",
    "emergency/designated", "emergency/dry_riser_inlet",
    "emergency/emergency_ward_entrance", "emergency/fire_alarm_box",
    "emergency/fire_extinguisher", "emergency/fire_hydrant",
    "emergency/fire_service_inlet", "emergency/first_aid_kit",
    "emergency/life_ring", "emergency/lifeguard", "emergency/no",
    "emergency/phone", "emergency/rescue_equipment", "emergency/siren",
    "emergency/throw_bag", "emergency/water_rescue", "emergency/yes",
    "tourism/apartment", "tourism/apartments", "tourism/alpine_hut",
    "tourism/camp_pitch", "tourism/caravan_site", "tourism/information",
    "tourism/picnic_site", "tourism/viewpoint", "tourism/village_sign",
    "tourism/wilderness_hut", "tourism/yes",
    # Public transport (from NaPTAN instead)
    "public_transport/entrance", "public_transport/platform",
    "public_transport/station", "public_transport/stop_position",
    # Education amenities — schools come from GIAS instead. OSM coverage for
    # tertiary education, tutoring, and childcare is too noisy/incomplete to be
    # useful on a property-search map.
    "amenity/school", "amenity/prep_school", "amenity/language_school",
    "amenity/music_school", "amenity/university", "amenity/college",
    "building/university", "amenity/kindergarten", "amenity/childcare",
    "office/tutoring",
}

# Each output category defined once: (group, friendly_name, emoji, [osm_keys...])
# The flat CATEGORY_MAP lookup dict is built from this directly below the list.
_CATEGORIES: list[tuple[str, str, str, list[str]]] = [
    (
        "Leisure", "Café", "☕",
        ["amenity/cafe", "amenity/ice_cream", "amenity/internet_cafe"],
    ),
    (
        "Leisure", "Restaurant", "🍽️",
        ["amenity/restaurant", "amenity/food_court"],
    ),
    (
        "Leisure", "Pub", "🍺",
        [
            "amenity/pub", "amenity/beer_garden", "amenity/biergarten",
            "amenity/social_club", "amenity/club", "leisure/social_club",
            "craft/brewery", "craft/distillery", "craft/winery",
        ],
    ),
    (
        "Leisure", "Bar", "🍸",
        ["amenity/bar", "amenity/hookah_lounge"],
    ),
    (
        "Leisure", "Fast Food", "🍔",
        ["amenity/fast_food"],
    ),
    (
        "Leisure", "Nightclub", "🪩",
        [
            "amenity/nightclub", "amenity/stripclub", "amenity/casino",
            "amenity/gambling",
        ],
    ),
    (
        "Leisure", "Cinema", "🎬",
        ["amenity/cinema"],
    ),
    (
        "Leisure", "Theatre", "🎭",
        ["amenity/theatre"],
    ),
    (
        "Leisure", "Live Music & Events", "🎶",
        ["amenity/music_venue", "amenity/events_venue", "leisure/dance"],
    ),
    (
        "Leisure", "Park", "🌳",
        [
            "leisure/park", "leisure/garden", "leisure/common",
            "leisure/nature_reserve", "leisure/dog_park", "leisure/bandstand",
            "leisure/bird_hide", "leisure/firepit", "leisure/outdoor_seating",
            "leisure/picnic_table", "leisure/wildlife_hide",
        ],
    ),
    (
        "Leisure", "Playground", "🛝",
        ["leisure/playground", "leisure/indoor_play"],
    ),
    (
        "Leisure", "Sports Centre", "🏟️",
        [
            "leisure/sports_centre", "leisure/sports_hall", "leisure/pitch",
            "leisure/track", "leisure/golf_course", "leisure/miniature_golf",
            "leisure/horse_riding", "leisure/fishing", "leisure/ice_rink",
            "leisure/paddling_pool", "leisure/practice_pitch",
            "leisure/shooting_ground", "leisure/stadium",
            "leisure/swimming_pool", "leisure/swimming_area",
            "leisure/water_park", "leisure/bathing_place",
        ],
    ),
    (
        "Leisure", "Entertainment", "🎳",
        [
            "leisure/bowling_alley", "leisure/amusement_arcade",
            "leisure/adult_gaming_centre", "leisure/escape_game",
            "leisure/maze", "leisure/trampoline_park", "leisure/sauna",
            "leisure/tanning_salon", "shop/amusements", "tourism/theme_park",
            "amenity/bicycle_rental", "amenity/boat_rental", "leisure/marina",
            "leisure/slipway", "leisure/hackerspace", "leisure/yes",
        ],
    ),
    (
        "Groceries", "Supermarket", "🛒",
        ["shop/supermarket"],
    ),
    (
        "Groceries", "Convenience Store", "🏪",
        ["shop/convenience", "shop/general", "shop/kiosk", "shop/grocery"],
    ),
    (
        "Groceries", "Bakery", "🥐",
        ["shop/bakery", "shop/pastry", "craft/bakery", "craft/confectionery"],
    ),
    (
        "Groceries", "Butcher & Fishmonger", "🥩",
        ["shop/butcher", "shop/seafood"],
    ),
    (
        "Groceries", "Greengrocer", "🥬",
        ["shop/greengrocer", "shop/farm", "shop/market", "amenity/marketplace"],
    ),
    (
        "Groceries", "Off-Licence", "🍷",
        ["shop/alcohol", "shop/wine", "shop/beverages"],
    ),
    (
        "Groceries", "Deli & Specialty", "🧆",
        [
            "shop/deli", "shop/cheese", "shop/chocolate", "shop/coffee",
            "shop/confectionery", "shop/dairy", "shop/food",
            "shop/frozen_food", "shop/health_food", "shop/ice_cream",
            "shop/nutrition_supplements", "shop/tea",
        ],
    ),
    (
        "Shops", "Fashion & Clothing", "👕",
        [
            "shop/clothes", "shop/boutique", "shop/shoes", "shop/accessories",
            "shop/bag", "shop/fashion_accessories", "shop/jewelry",
            "shop/leather", "shop/watches",
        ],
    ),
    (
        "Shops", "Electronics", "📱",
        [
            "shop/electronics", "shop/mobile_phone",
            "shop/mobile_phone_accessories", "shop/computer", "shop/appliance",
            "shop/electrical", "shop/hifi", "shop/vacuum_cleaner",
            "shop/video_games", "shop/games",
        ],
    ),
    (
        "Shops", "Charity Shop", "❤️",
        ["shop/charity", "shop/second_hand"],
    ),
    (
        "Shops", "DIY & Hardware", "🔨",
        [
            "shop/doityourself", "shop/hardware", "shop/builders_merchant",
            "shop/paint", "shop/plumbing",
        ],
    ),
    (
        "Shops", "Home & Garden", "🪑",
        [
            "shop/furniture", "shop/garden_centre", "shop/kitchen",
            "shop/bathroom", "shop/bathroom_furnishing", "shop/bed",
            "shop/carpet", "shop/curtain", "shop/flooring", "shop/fireplace",
            "shop/garden_furniture", "shop/groundskeeping", "shop/household",
            "shop/household_linen", "shop/houseware", "shop/homeware",
            "shop/interior_decoration", "shop/lighting", "shop/kitchenware",
            "shop/window_blind",
        ],
    ),
    (
        "Shops", "Bookshop", "📚",
        ["shop/books", "shop/stationery"],
    ),
    (
        "Shops", "Pet Shop", "🐾",
        ["shop/pet"],
    ),
    (
        "Shops", "Sports & Outdoor", "🏕️",
        [
            "shop/sports", "shop/angling", "shop/outdoor", "shop/bicycle",
            "shop/equestrian", "shop/surf",
        ],
    ),
    (
        "Shops", "Newsagent", "📰",
        ["shop/newsagent", "shop/tobacco"],
    ),
    (
        "Shops", "Department Store", "🏬",
        [
            "shop/department_store", "shop/mall", "shop/variety_store",
            "shop/discount",
        ],
    ),
    (
        "Shops", "Gift & Hobby", "🎁",
        [
            "shop/gift", "shop/florist", "shop/toys", "shop/craft",
            "shop/candles", "shop/party", "shop/art", "shop/music",
            "shop/musical_instrument", "shop/antiques", "shop/anime",
            "shop/baby_goods", "shop/fabric", "shop/haberdashery",
            "shop/hobby", "shop/wool", "shop/pottery",
        ],
    ),
    (
        "Shops", "Specialist Shop", "🏪",
        [
            "shop/agrarian", "shop/boat", "shop/bookmaker",
            "shop/building_materials", "shop/camera", "shop/cannabis",
            "shop/car", "shop/caravan", "shop/catalogue", "shop/auction",
            "shop/auction_house", "shop/chandler", "shop/collector",
            "shop/copyshop", "shop/country_store", "shop/doors",
            "shop/e-cigarette", "shop/erotic", "shop/esoteric", "shop/fan",
            "shop/fireworks", "shop/fishing", "shop/frame", "shop/fuel",
            "shop/gas", "shop/hairdresser_supply", "shop/military_surplus",
            "shop/model", "shop/money_lender", "shop/motorcycle",
            "shop/outpost", "shop/pawnbroker", "shop/photo",
            "shop/photo_studio", "shop/plant_hire", "shop/printer_ink",
            "shop/printing", "shop/psychic", "shop/pyrotechnics",
            "shop/religion", "shop/rental", "shop/scuba_diving",
            "shop/security", "shop/sewing", "shop/ship_chandler", "shop/signs",
            "shop/storage_rental", "shop/swimming_pool",
            "shop/telecommunication", "shop/ticket", "shop/tiles",
            "shop/tool_hire", "shop/trade", "shop/trophy", "shop/truck",
            "shop/vacant", "shop/van", "shop/video", "shop/water_sports",
            "shop/weapons", "shop/wedding", "shop/wholesale", "shop/wigs",
            "shop/yes",
        ],
    ),
    # ── Services ─────────────────────────────────────────────
    (
        "Services", "Hairdresser & Beauty", "💇",
        [
            "shop/hairdresser", "shop/beauty", "shop/cosmetics",
            "shop/massage", "shop/perfumery", "leisure/spa",
        ],
    ),
    (
        "Services", "Gym & Fitness", "🏋️",
        [
            "leisure/fitness_centre", "leisure/fitness_station",
            "amenity/dojo", "amenity/dancing_school",
        ],
    ),
    (
        "Services", "Dry Cleaner & Laundry", "👔",
        [
            "shop/dry_cleaning", "shop/laundry", "shop/tailor",
            "shop/shoe_repair", "shop/repair", "craft/cleaning",
            "craft/dressmaker", "craft/shoemaker", "craft/tailor",
        ],
    ),
    (
        "Services", "Car Services", "🔧",
        [
            "shop/car_repair", "shop/car;car_repair", "shop/car_parts",
            "shop/motorcycle_repair", "shop/tyres", "amenity/car_wash",
            "amenity/car_rental", "amenity/car_sharing",
            "amenity/bicycle_repair_station",
        ],
    ),
    (
        "Services", "Post Office", "🏤",
        ["amenity/post_office"],
    ),
    (
        "Services", "Vet & Pet Care", "🐕",
        ["amenity/veterinary", "shop/pet_grooming"],
    ),
    (
        "Services", "Bank", "🏦",
        ["amenity/bank"],
    ),
    (
        "Services", "Travel Agent", "✈️",
        ["shop/travel_agency", "office/travel_agent"],
    ),
    (
        "Services", "Other", "🛎️",
        ["shop/tattoo", "shop/piercing", "shop/locksmith", "craft/key_cutter"],
    ),
    (
        "Emergency Services", "Police", "👮",
        ["amenity/police"],
    ),
    (
        "Emergency Services", "Fire Station", "🚒",
        ["amenity/fire_station"],
    ),
    (
        "Emergency Services", "Ambulance Station", "🚑",
        ["emergency/ambulance_station"],
    ),
    (
        "Health", "GP Surgery", "👨‍⚕️",
        ["amenity/doctors", "healthcare/doctor"],
    ),
    (
        "Health", "Dentist", "🦷",
        ["amenity/dentist", "healthcare/dentist"],
    ),
    (
        "Health", "Pharmacy", "💊",
        [
            "amenity/pharmacy", "healthcare/pharmacy", "shop/chemist",
            "shop/herbalist", "shop/health", "healthcare/alternative",
        ],
    ),
    (
        "Health", "Hospital & Clinic", "🏥",
        [
            "amenity/hospital", "amenity/clinic", "amenity/health_centre",
            "healthcare/blood_donation", "healthcare/hospital",
            "healthcare/centre", "healthcare/clinic", "office/healthcare",
            "healthcare/laboratory", "healthcare/rehabilitation",
            "healthcare/vaccination_centre", "healthcare/yes",
        ],
    ),
    (
        "Health", "Optician", "👓",
        [
            "shop/optician", "healthcare/optometrist", "shop/hearing_aids",
            "healthcare/audiologist",
        ],
    ),
    (
        "Health", "Physiotherapy", "🏃",
        [
            "healthcare/physiotherapist", "healthcare/podiatrist",
            "healthcare/occupational_therapist",
        ],
    ),
    (
        "Health", "Counselling & Therapy", "🧠",
        [
            "healthcare/counselling", "healthcare/psychotherapist",
            "office/therapist",
        ],
    ),
    (
        "Health", "Care Home", "🏠",
        [
            "amenity/care_home", "amenity/nursing_home",
            "amenity/retirement_home", "healthcare/hospice",
            "healthcare/nursing_home", "office/home_care",
        ],
    ),
    (
        "Health", "Medical & Mobility", "♿",
        ["shop/medical_supply", "shop/mobility", "shop/mobility_scooter"],
    ),
    (
        "Culture", "Museum", "🏛️",
        ["tourism/museum"],
    ),
    (
        "Culture", "Gallery", "🖼️",
        ["tourism/gallery", "tourism/artwork"],
    ),
    (
        "Culture", "Library", "📚",
        ["amenity/library"],
    ),
    (
        "Culture", "Place of Worship", "⛪",
        ["amenity/place_of_worship", "amenity/monastery", "building/church"],
    ),
    (
        "Culture", "Arts Centre", "🎨",
        ["amenity/arts_centre"],
    ),
    (
        "Culture", "Zoo", "🦁",
        ["tourism/zoo"],
    ),
    (
        "Culture", "Tourist Attraction", "📸",
        [
            "tourism/attraction", "tourism/aquarium", "amenity/fountain",
            "amenity/courthouse", "tourism/chalet",
        ],
    ),
    # Note: schools come from the GIAS register (see transform_gias_schools).
    # Niche/tertiary education amenities that GIAS does not cover are dropped
    # rather than mixed in with state-funded schools.
    (
        "Local Businesses", "Hotel", "🏨",
        [
            "tourism/hotel", "tourism/hostel", "tourism/guest_house",
            "tourism/motel", "tourism/camp_site", "leisure/resort",
            "tourism/holiday_park", "tourism/self_catering",
        ],
    ),
    (
        "Local Businesses", "Local Business", "🛠️",
        [
            # Tradespeople
            "craft/builder", "craft/carpenter", "craft/electrician",
            "craft/electronics_repair", "craft/floorer", "craft/gardener",
            "craft/glaziery", "craft/hvac", "craft/joiner", "craft/locksmith",
            "craft/painter", "craft/plumber", "craft/roofer",
            "craft/window_construction", "craft/agricultural_engines",
            "craft/atelier", "craft/beekeeper", "craft/blacksmith",
            "craft/bookbinder", "craft/boatbuilder", "craft/caterer",
            "craft/carpet_layer", "craft/clockmaker", "craft/handicraft",
            "craft/jeweller", "craft/metal_construction",
            "craft/photographer", "craft/photographic_laboratory",
            "craft/plasterer", "craft/pottery", "craft/printer",
            "craft/sawmill", "craft/scaffolder", "craft/sculptor",
            "craft/signmaker", "craft/stonemason", "craft/upholsterer",
            "craft/watchmaker", "craft/yes", "amenity/workshop",
            "shop/glaziery", "shop/windows",
            # Professional offices & estate agents
            "shop/estate_agent", "office/accountant", "office/architect",
            "office/auctioneer", "office/builder", "office/construction",
            "office/construction_company", "office/engineer",
            "office/estate_agent", "office/financial",
            "office/financial_advisor", "office/financial_services",
            "office/insurance", "office/lawyer", "office/mortgage",
            "office/property_management", "office/solicitor",
            "office/solicitors", "office/surveyor", "office/tax_advisor",
        ],
    ),
    (
        "Local Businesses", "Offices", "🏢",
        [
            "amenity/coworking_space", "amenity/research_institute",
            "office/administrative", "office/advertising_agency",
            "office/association", "office/charity", "office/company",
            "office/consulting", "office/courier", "office/coworking",
            "office/design", "office/diplomatic",
            "office/educational_institution", "office/employment_agency",
            "office/energy_supplier", "office/foundation",
            "office/government", "office/graphic_design",
            "office/interior_design", "office/it", "office/logistics",
            "office/marketing", "office/moving_company", "office/newspaper",
            "office/ngo", "office/notary", "office/political_party",
            "office/politician", "office/publisher", "office/quango",
            "office/recruitment", "office/religion", "office/research",
            "office/security", "office/taxi", "office/telecommunication",
            "office/transport", "office/union", "office/university",
            "office/vacant", "office/web_design", "office/yes",
        ],
    ),
    # ── Other ────────────────────────────────────────────────
    (
        "Other", "EV Charging", "🔌",
        ["amenity/charging_station"],
    ),
    (
        "Other", "Fuel Station", "⛽",
        ["amenity/fuel"],
    ),
    (
        "Other", "Community Centre", "🤝",
        [
            "amenity/church_hall", "amenity/clubhouse",
            "amenity/community_centre", "amenity/community_hall",
            "amenity/scout_hall", "amenity/social_centre", "amenity/townhall",
        ],
    ),
]

# Build flat lookup: OSM category → (group, friendly_name, emoji)
CATEGORY_MAP: dict[str, tuple[str, str, str]] = {
    osm_key: (group, name, emoji)
    for group, name, emoji, osm_keys in _CATEGORIES
    for osm_key in osm_keys
}

# Emoji shown for each NaPTAN category value.
NAPTAN_EMOJIS: dict[str, str] = {
    "Airport": "✈️",
    "Ferry": "⛴️",
    "Rail station": "🚆",
    "Bus stop": "🚏",
    "Bus station": "🚌",
    "Taxi rank": "🚕",
    "Tube station": "🚇",
}

# Retailer names that are all presented under the single "Co-op" category.
COOP_RETAILERS = {
    "Allendale Co-operative Society",
    "Central England Co-operative",
    "Channel Islands Co-operative Society",
    "Chelmsford Star Co-operative Society",
    "Clydebank Co-operative",
    "Coniston Co-operative Society",
    "East of England Co-operative",
    "Heart of England Co-operative",
    "Langdale Co-operative Society",
    "Lincolnshire Co-operative",
    "Midcounties Co-operative",
    "Scottish Midland Co-operative",
    "Tamworth Co-operative Society",
    "The Co-operative Group",
    "The Radstock Co-operative Society",
    "The Southern Co-operative",
}

# Retailers with fewer (in-boundary) locations than this are left out.
MIN_GROCERY_CHAIN_LOCATIONS = 5

GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES: dict[str, str] = {
    "Cook": "COOK",
    "Heron": "Heron Foods",
    "Marks and Spencer": "M&S",
    "Sainsburys": "Sainsbury's",
}

# Fascia → icon category name; fascias missing here fall back to the retailer.
GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
    "Aldi": "Aldi",
    "Aldi Local": "Aldi",
    "Asda": "Asda",
    "Asda Express": "Asda Express",
    "Asda Living": "Asda Living",
    "Asda PFS": "Asda",
    "Asda Supercentre": "Asda Supercentre",
    "Asda Supermarket": "Asda Supermarket",
    "Asda Superstore": "Asda Superstore",
    "Booths": "Booths",
    "Budgens": "Budgens",
    "Centra": "Centra",
    "Cooltrader": "Heron Foods",
    "Co-op Food": "Co-op",
    "Cook": "COOK",
    "Costco": "Costco",
    "Dunnes Stores": "Dunnes Stores",
    "Eurospar": "Spar",
    "Eurospar PFS": "Spar",
    "Farmfoods": "Farmfoods",
    "Heron": "Heron Foods",
    "Iceland": "Iceland",
    "Lidl": "Lidl",
    "Little Waitrose": "Little Waitrose",
    "Little Waitrose Shell": "Little Waitrose",
    "Makro": "Makro",
    "Marks and Spencer": "M&S",
    "Marks and Spencer BP": "M&S Food",
    "Marks and Spencer Clothing": "M&S Clothing",
    "Marks and Spencer Food To Go": "M&S Food",
    "Marks and Spencer Food Outlet": "M&S Outlet",
    "Marks and Spencer Foodhall": "M&S Food",
    "Marks and Spencer Hospital": "M&S Hospital",
    "Marks and Spencer MSA": "M&S MSA",
    "Marks and Spencer Outlet": "M&S Outlet",
    "Marks and Spencer Simply Food": "M&S Food",
    "Marks and Spencer Travel SF": "M&S Food",
    "Morrisons Daily": "Morrisons Daily",
    "Morrisons Select": "Morrisons",
    "Planet Organic": "Planet Organic",
    "Sainsbury's Local": "Sainsbury's Local",
    "Sainsburys": "Sainsbury's",
    "Sainsburys Local": "Sainsbury's Local",
    "Spar": "Spar",
    "Spar PFS": "Spar",
    "Tesco": "Tesco",
    "Tesco Express": "Tesco Express",
    "Tesco Express Esso": "Tesco Express",
    "Tesco Extra": "Tesco Extra",
    "The Co-operative Food": "Co-op",
    "The Co-operative Food PFS": "Co-op",
    "The Food Warehouse": "The Food Warehouse",
    "Waitrose": "Waitrose",
    "Waitrose MSA": "Waitrose",
    "Whole Foods Market": "Whole Foods Market",
}


def normalize_grocery_retailer(retailer: str | None) -> str:
    """Return the display name for a grocery retailer.

    ``None`` becomes ``""``. Any member of ``COOP_RETAILERS`` becomes
    ``"Co-op"``; otherwise the stripped name is passed through
    ``GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES`` and returned unchanged when no
    override exists.
    """
    if retailer is None:
        return ""
    retailer = retailer.strip()
    if retailer in COOP_RETAILERS:
        return "Co-op"
    return GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES.get(retailer, retailer)


def normalize_grocery_icon_category(fascia: str | None, retailer: str | None) -> str:
    """Return the icon category for a store.

    A fascia listed in ``GROCERY_FASCIA_ICON_NAMES`` wins; a missing, empty or
    unlisted fascia falls back to the normalised retailer name.
    """
    if fascia:
        icon_name = GROCERY_FASCIA_ICON_NAMES.get(fascia.strip())
        if icon_name is not None:
            return icon_name
    return normalize_grocery_retailer(retailer)


def transform_grocery_retail_points(
    grocery_df: pl.DataFrame,
    boundary_path: Path | None = None,
    min_chain_locations: int = MIN_GROCERY_CHAIN_LOCATIONS,
) -> pl.DataFrame:
    """Convert GEOLYTIX Grocery Retail Points into the POI parquet schema.

    Args:
        grocery_df: Frame with at least the columns ``id``, ``retailer``,
            ``fascia``, ``store_name``, ``long_wgs`` and ``lat_wgs``.
        boundary_path: When given, rows are kept only where
            ``in_england_mask`` is true for their coordinates.
        min_chain_locations: Minimum number of remaining rows a retailer
            needs in order to be kept.

    Returns:
        Frame with columns ``id``, ``name``, ``category``, ``icon_category``,
        ``group``, ``lat``, ``lng`` and ``emoji``.

    Raises:
        ValueError: If any required input column is missing.
    """
    required = {"id", "retailer", "fascia", "store_name", "long_wgs", "lat_wgs"}
    missing = required - set(grocery_df.columns)
    if missing:
        raise ValueError(f"GEOLYTIX retail points missing columns: {sorted(missing)}")

    df = (
        grocery_df.select(
            pl.col("id").cast(pl.String),
            pl.col("retailer").cast(pl.String),
            pl.col("fascia").cast(pl.String),
            pl.col("store_name").cast(pl.String),
            pl.col("lat_wgs").cast(pl.Float64).alias("lat"),
            pl.col("long_wgs").cast(pl.Float64).alias("lng"),
        )
        .with_columns(
            pl.col("retailer").str.strip_chars(),
            pl.col("fascia").str.strip_chars(),
            pl.col("store_name").str.strip_chars(),
        )
        .drop_nulls(["id", "retailer", "lat", "lng"])
        .filter(pl.col("retailer").str.len_chars() > 0)
    )

    if boundary_path is not None and len(df) > 0:
        mask = in_england_mask(
            boundary_path,
            df["lat"].to_numpy(),
            df["lng"].to_numpy(),
        )
        df = df.filter(pl.Series(mask))

    # Chain-size threshold is applied after the boundary filter, so only
    # in-boundary locations count towards it.
    eligible_retailers = (
        df.group_by("retailer")
        .len()
        .filter(pl.col("len") >= min_chain_locations)
        .select("retailer")
    )
    df = df.join(eligible_retailers, on="retailer", how="semi")

    # coalesce() only skips nulls, so a blank (post-strip) store_name or fascia
    # would otherwise win and produce an empty POI name. Treat empty strings as
    # missing so the fallback chain store_name → fascia → retailer applies.
    name_candidates = [
        pl.when(pl.col(column).str.len_chars() > 0).then(pl.col(column))
        for column in ("store_name", "fascia", "retailer")
    ]

    return df.with_columns(
        pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
        pl.coalesce(name_candidates).str.replace_all("''", "'").alias("name"),
        pl.col("retailer")
        .map_elements(normalize_grocery_retailer, return_dtype=pl.String)
        .alias("category"),
        pl.struct(["fascia", "retailer"])
        .map_elements(
            lambda row: normalize_grocery_icon_category(row["fascia"], row["retailer"]),
            return_dtype=pl.String,
        )
        .alias("icon_category"),
        pl.lit("Groceries").alias("group"),
        pl.lit("🛒").alias("emoji"),
    ).select("id", "name", "category", "icon_category", "group", "lat", "lng", "emoji")


# Emoji for every label that _school_icon_category_expr can produce.
SCHOOL_ICON_CATEGORIES: dict[str, str] = {
    "Nursery school": "🧸",
    "Primary school": "🎒",
    "Secondary school": "🏫",
    "All-through school": "🏫",
    "Sixth form": "📚",
    "Further education college": "📚",
    "University": "🎓",
    "Special school": "🤝",
    "School": "🏫",
}


def _school_icon_category_expr() -> pl.Expr:
    """Pick an icon category from GIAS phase/type_group/age_range.

    type_group wins for universities, FE colleges and special schools (which
    span multiple phases); otherwise phase determines the bucket. For
    independent and other non-statutory schools where GIAS leaves phase null,
    fall back to the age_range bounds so they still split into the right pill.
    Rows matching nothing get the generic "School" label.
    """
    # GIAS phase mixes casing ("Middle deemed Primary" vs "Middle deemed
    # primary") so we normalise before matching.
    phase = pl.col("phase").str.to_lowercase()
    # age_range is expected to look like "<min>–<max>", separated by an en-dash
    # (U+2013). Either end may be missing or non-numeric, so the casts are
    # non-strict and yield null, which makes the comparisons below not match.
    age_parts = pl.col("age_range").str.split_exact("–", 1)
    min_age = age_parts.struct.field("field_0").cast(pl.Int32, strict=False)
    max_age = age_parts.struct.field("field_1").cast(pl.Int32, strict=False)

    return (
        pl.when(pl.col("type_group") == "Universities")
        .then(pl.lit("University"))
        .when(pl.col("type_group") == "Special schools")
        .then(pl.lit("Special school"))
        .when(pl.col("type_group") == "Colleges")
        .then(pl.lit("Further education college"))
        .when(phase == "nursery")
        .then(pl.lit("Nursery school"))
        .when(phase.is_in(["primary", "middle deemed primary"]))
        .then(pl.lit("Primary school"))
        .when(phase.is_in(["secondary", "middle deemed secondary"]))
        .then(pl.lit("Secondary school"))
        .when(phase == "all-through")
        .then(pl.lit("All-through school"))
        .when(phase.is_in(["16 plus", "sixth form"]))
        .then(pl.lit("Sixth form"))
        # Age-range fallback for null-phase rows (≈3k Independents + Academies
        # GIAS doesn't classify by phase).
        .when(max_age <= 5)
        .then(pl.lit("Nursery school"))
        .when(min_age >= 16)
        .then(pl.lit("Sixth form"))
        .when((min_age <= 6) & (max_age >= 16))
        .then(pl.lit("All-through school"))
        .when(max_age <= 11)
        .then(pl.lit("Primary school"))
        .when(min_age >= 10)
        .then(pl.lit("Secondary school"))
        .otherwise(pl.lit("School"))
    )


OFSTED_OEIF_LABELS = {
    "1": "Outstanding",
    "2": "Good",
    "3": "Requires improvement",
    "4": "Inadequate",
}


def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
    """Project the latest OEIF effectiveness grade to a human-readable label,
    keyed by URN so it can be joined onto the GIAS register.

    Grades 1-4 map to the conventional Ofsted labels; "Not judged" (post-2025
    reform schools that only have a report card) is preserved verbatim; null
    and any other grades drop out.

    Returns:
        LazyFrame with columns ``urn`` (Int64) and ``ofsted_rating``.
    """
    grade_col = pl.col("Latest OEIF overall effectiveness")

    # Build the when/then chain from the label table instead of repeating it;
    # the conditions are mutually exclusive, so branch order does not matter.
    label = pl.when(grade_col == "Not judged").then(pl.lit("Not judged"))
    for grade, text in OFSTED_OEIF_LABELS.items():
        label = label.when(grade_col == grade).then(pl.lit(text))
    label = label.otherwise(None)

    return (
        pl.scan_parquet(ofsted_path)
        .select(
            pl.col("URN").cast(pl.Int64).alias("urn"),
            label.alias("ofsted_rating"),
        )
        .filter(pl.col("ofsted_rating").is_not_null())
    )


def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
    """Convert the GIAS register parquet into POI rows with school metadata.

    Ofsted ratings are left-joined by URN so each school carries its latest
    OEIF overall effectiveness grade (Outstanding/Good/Requires improvement/
    Inadequate/Not judged) where one exists, surfaced in the map popup.
    """
    icon_category_expr = _school_icon_category_expr()
    emoji_expr = icon_category_expr.replace_strict(SCHOOL_ICON_CATEGORIES)
    ofsted = _load_ofsted_ratings(ofsted_path)

    # category mirrors icon_category so the dashboard renders one toggle per
    # school type (Nursery / Primary / Secondary / Sixth form / University /…)
    # instead of bundling every GIAS row under a single "School" pill.
    return pl.scan_parquet(gias_path).join(ofsted, on="urn", how="left").select(
        pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
        pl.col("name"),
        icon_category_expr.alias("category"),
        icon_category_expr.alias("icon_category"),
        pl.lit("Education").alias("group"),
        pl.col("lat").cast(pl.Float64),
        pl.col("lng").cast(pl.Float64),
        emoji_expr.alias("emoji"),
        pl.col("phase").alias("school_phase"),
        pl.col("type").alias("school_type"),
        pl.col("type_group").alias("school_type_group"),
        pl.col("age_range").alias("school_age_range"),
        pl.col("gender").alias("school_gender"),
        pl.col("religious_character").alias("school_religious_character"),
        pl.col("admissions_policy").alias("school_admissions_policy"),
        pl.col("nursery_provision").alias("school_nursery_provision"),
        pl.col("sixth_form").alias("school_sixth_form"),
        pl.col("capacity").cast(pl.Int32, strict=False).alias("school_capacity"),
        pl.col("pupils").cast(pl.Int32, strict=False).alias("school_pupils"),
        pl.col("fsm_percent").cast(pl.Float32, strict=False).alias("school_fsm_percent"),
        pl.col("trust").alias("school_trust"),
        pl.col("address").alias("school_address"),
        pl.col("postcode").alias("school_postcode"),
        pl.col("local_authority").alias("school_local_authority"),
        pl.col("website").alias("school_website"),
        pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
        pl.col("head_name").alias("school_head_name"),
        pl.col("ofsted_rating").alias("school_ofsted_rating"),
    )


def transform(
    input_path: Path,
    naptan_path: Path,
    boundary_path: Path,
    grocery_retail_points_path: Path,
    gias_path: Path,
    ofsted_path: Path,
) -> pl.LazyFrame:
    """Build the combined POI frame from all sources.

    Raw POIs are validated against ``DROP_CATEGORIES``/``CATEGORY_MAP``,
    filtered and relabelled, then concatenated (``diagonal_relaxed``) with
    NaPTAN points restricted by ``in_england_mask``, the grocery retail points
    and the GIAS schools.

    Raises:
        ValueError: If the raw data contains a category that is neither
            dropped nor mapped, or if a mapping has an empty friendly name or
            emoji.
    """
    lf = pl.scan_parquet(input_path)

    # Get all unique categories present in the data
    all_categories = (
        lf.select("category").unique().collect(engine="streaming").to_series().to_list()
    )
    all_set = set(all_categories)

    # Verify every non-dropped category has a mapping
    unmapped = [
        cat
        for cat in all_categories
        if cat not in DROP_CATEGORIES and cat not in CATEGORY_MAP
    ]
    if unmapped:
        # key=str keeps the sort from raising TypeError when a null category
        # sits alongside string categories, so the ValueError is what surfaces.
        raise ValueError(
            f"Categories missing from CATEGORY_MAP: {sorted(unmapped, key=str)}"
        )

    # Warn about CATEGORY_MAP keys not in data (may be absent in regional extracts)
    mapped_but_absent = [cat for cat in CATEGORY_MAP if cat not in all_set]
    if mapped_but_absent:
        print(
            f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}"
        )

    # Drop unwanted categories
    lf = lf.filter(~pl.col("category").is_in(sorted(DROP_CATEGORIES)))

    # Build lookup expressions from the 3-tuple mapping
    group_mapping = {k: v[0] for k, v in CATEGORY_MAP.items()}
    name_mapping = {k: v[1] for k, v in CATEGORY_MAP.items()}
    emoji_mapping = {k: v[2] for k, v in CATEGORY_MAP.items()}

    # Check no friendly names or emojis are empty (defensive)
    missing_names = [k for k, v in CATEGORY_MAP.items() if not v[1]]
    if missing_names:
        raise ValueError(f"Empty friendly names for: {missing_names}")
    missing_emojis = [k for k, v in CATEGORY_MAP.items() if not v[2]]
    if missing_emojis:
        raise ValueError(f"Empty emojis for: {missing_emojis}")

    # All four expressions read the ORIGINAL category column: with_columns
    # evaluates them against the input frame, not against each other.
    lf = lf.with_columns(
        pl.col("category").replace_strict(group_mapping).alias("group"),
        pl.col("category").replace_strict(name_mapping).alias("category"),
        pl.col("category").replace_strict(name_mapping).alias("icon_category"),
        pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
    )

    naptan_df = pl.scan_parquet(naptan_path).collect()
    mask = in_england_mask(
        boundary_path,
        naptan_df["lat"].to_numpy(),
        naptan_df["lng"].to_numpy(),
    )
    naptan_df = naptan_df.filter(pl.Series(mask))
    naptan = naptan_df.lazy().with_columns(
        pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
        pl.lit("Public Transport").alias("group"),
        pl.col("category").alias("icon_category"),
    )

    grocery_df = pl.read_parquet(grocery_retail_points_path)
    grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)

    frames = [
        lf,
        naptan,
        grocery_pois.lazy(),
        transform_gias_schools(gias_path, ofsted_path),
    ]
    return pl.concat(frames, how="diagonal_relaxed")


def main():
    """CLI entry point: run the transform, write the parquet, print a summary."""
    parser = argparse.ArgumentParser(
        description="Transform raw POIs to filtered version with friendly names"
    )
    parser.add_argument(
        "--input", type=Path, required=True, help="Raw POIs parquet file"
    )
    parser.add_argument(
        "--naptan", type=Path, required=True, help="NaPTAN stations parquet file"
    )
    parser.add_argument(
        "--boundary",
        type=Path,
        required=True,
        help="England boundary GeoJSON file",
    )
    parser.add_argument(
        "--grocery-retail-points",
        type=Path,
        required=True,
        help="GEOLYTIX Grocery Retail Points parquet",
    )
    parser.add_argument(
        "--gias",
        type=Path,
        required=True,
        help="GIAS schools register parquet (replaces OSM schools)",
    )
    parser.add_argument(
        "--ofsted",
        type=Path,
        required=True,
        help="Ofsted latest-inspections parquet (provides per-URN ratings)",
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output filtered POIs parquet file"
    )
    args = parser.parse_args()

    df = transform(
        args.input,
        args.naptan,
        args.boundary,
        args.grocery_retail_points,
        args.gias,
        args.ofsted,
    ).collect(engine="streaming")
    df.write_parquet(args.output)

    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB, {len(df):,} POIs)")
    print(f"\nCategories ({df['category'].n_unique()}):")
    counts = (
        df.group_by("group", "category", "emoji").len().sort("len", descending=True)
    )
    for row in counts.iter_rows(named=True):
        print(f"  [{row['group']}] {row['emoji']} {row['category']}: {row['len']:,}")


if __name__ == "__main__":
    main()