import argparse
from pathlib import Path

import polars as pl

from pipeline.utils.england_geometry import in_england_mask

# Raw "key/value" category strings that transform() filters out of the POI
# data before the remaining categories are mapped to friendly names.
# transform() also treats a category as "known" if it appears here, so a new
# raw category must be added either to this set or to CATEGORY_MAP.
DROP_CATEGORIES = {
    # Street furniture & infrastructure
    "amenity/advice",
    "amenity/atm",
    "amenity/bbq",
    "amenity/bench",
    "amenity/bicycle_parking",
    "amenity/binoculars",
    "amenity/boot_scraper",
    "amenity/check_in",
    "amenity/clock",
    "amenity/compressed_air",
    "amenity/donation_box",
    "amenity/dressing_room",
    "amenity/drinking_water",
    "shop/taxi",
    "amenity/feeding_place",
    "amenity/fixme",
    "amenity/grit_bin",
    "amenity/hunting_stand",
    "amenity/letter_box",
    "amenity/loading_dock",
    "amenity/lounge",
    "tourism/preserved_railway",
    "amenity/lounger",
    "amenity/motorcycle_parking",
    "amenity/mounting_block",
    "amenity/notice_board",
    "amenity/parcel_locker",
    "amenity/parking",
    "amenity/parking_entrance",
    "amenity/parking_space",
    "amenity/payment_terminal",
    "amenity/photo_booth",
    "amenity/piano",
    "amenity/post_box",
    "amenity/public_bookcase",
    "amenity/reception_desk",
    "amenity/sanitary_dump_station",
    "amenity/shelter",
    "amenity/shower",
    "amenity/smoking_area",
    "amenity/table",
    "amenity/telephone",
    "amenity/telescope",
    "amenity/ticket_validator",
    "amenity/toilets",
    "amenity/trolley_bay",
    "amenity/vacuum_cleaner",
    "amenity/vending_machine",
    "amenity/washing_machine",
    "amenity/washingline",
    "amenity/waste_basket",
    "amenity/waste_disposal",
    "amenity/waste_transfer_station",
    "amenity/water_point",
    "amenity/watering_place",
    "amenity/weighbridge",
    # Niche amenities not useful for home buyers
    "amenity/animal_boarding",
    "amenity/animal_breeding",
    "amenity/animal_shelter",
    "amenity/boat_storage",
    "amenity/bureau_de_change",
    "amenity/bus_station",
    "amenity/conference_centre",
    "amenity/crematorium",
    "amenity/driving_school",
    "amenity/escooter_rental",
    "amenity/ferry_terminal",
    "amenity/grave_yard",
    "amenity/hall",
    "shop/funeral_directors",
    "amenity/kick-scooter_rental",
    "amenity/money_transfer",
    "amenity/post_depot",
    "amenity/public_building",
    "amenity/recycling",
    "amenity/scout_hut",
    "amenity/social_facility",
    "amenity/studio",
    "amenity/taxi",
    "amenity/training",
    "amenity/vehicle_inspection",
    # Buildings (except church & university which are mapped)
    "building/air_shaft",
    "building/apartments",
    "building/barn",
    "building/bunker",
    "building/chapel",
    "building/commercial",
    "building/construction",
    "building/detached",
    "building/entrance",
    "building/entry",
    "building/farm",
    "building/farm_auxiliary",
    "building/garage",
    "building/garages",
    "building/greenhouse",
    "building/house",
    "building/hut",
    "building/industrial",
    "building/kiosk",
    "building/no",
    "building/office",
    "building/public",
    "building/residential",
    "building/retail",
    "building/roof",
    "building/ruins",
    "building/school",
    "building/semidetached_house",
    "building/service",
    "building/shed",
    "building/terrace",
    "building/warehouse",
    "building/yes",
    # Emergency equipment & access points
    # (emergency/ambulance_station is not dropped: it is mapped in _CATEGORIES)
    "emergency/access_point",
    "emergency/assembly_point",
    "emergency/bleed_control_kit",
    "emergency/defibrillator",
    "emergency/designated",
    "emergency/dry_riser_inlet",
    "emergency/emergency_ward_entrance",
    "emergency/fire_alarm_box",
    "emergency/fire_extinguisher",
    "emergency/fire_hydrant",
    "emergency/fire_service_inlet",
    "emergency/first_aid_kit",
    "emergency/life_ring",
    "emergency/lifeguard",
    "emergency/no",
    "emergency/phone",
    "emergency/rescue_equipment",
    "emergency/siren",
    "emergency/throw_bag",
    "emergency/water_rescue",
    "emergency/yes",
    # Tourism tags that are dropped rather than mapped
    "tourism/apartment",
    "tourism/apartments",
    "tourism/camp_pitch",
    "tourism/caravan_site",
    "tourism/information",
    "tourism/picnic_site",
    "tourism/viewpoint",
    "tourism/village_sign",
    "tourism/yes",
    # Public transport (from NaPTAN instead)
    "public_transport/entrance",
    "public_transport/platform",
    "public_transport/station",
    "public_transport/stop_position",
}

# Each output category defined once: (group, friendly_name, emoji, [osm_keys...])
# The flat CATEGORY_MAP lookup dict is built from this at the bottom.
_CATEGORIES: list[tuple[str, str, str, list[str]]] = [
    # ── Leisure ──────────────────────────────────────────────
    (
        "Leisure",
        "Café",
        "☕",
        [
            "amenity/cafe",
            "amenity/ice_cream",
            "amenity/internet_cafe",
        ],
    ),
    (
        "Leisure",
        "Restaurant",
        "🍽️",
        [
            "amenity/restaurant",
            "amenity/food_court",
        ],
    ),
    (
        "Leisure",
        "Pub",
        "🍺",
        [
            "amenity/pub",
            "amenity/social_club",
            "amenity/club",
            "leisure/social_club",
            "craft/brewery",
            "craft/distillery",
            "craft/winery",
        ],
    ),
    (
        "Leisure",
        "Bar",
        "🍸",
        [
            "amenity/bar",
            "amenity/hookah_lounge",
        ],
    ),
    (
        "Leisure",
        "Fast Food",
        "🍔",
        [
            "amenity/fast_food",
        ],
    ),
    (
        "Leisure",
        "Nightclub",
        "🪩",
        [
            "amenity/nightclub",
            "amenity/stripclub",
            "amenity/casino",
            "amenity/gambling",
        ],
    ),
    (
        "Leisure",
        "Cinema",
        "🎬",
        [
            "amenity/cinema",
        ],
    ),
    (
        "Leisure",
        "Theatre",
        "🎭",
        [
            "amenity/theatre",
        ],
    ),
    (
        "Leisure",
        "Live Music & Events",
        "🎶",
        [
            "amenity/music_venue",
            "amenity/events_venue",
            "leisure/dance",
        ],
    ),
    (
        "Leisure",
        "Park",
        "🌳",
        [
            "leisure/park",
            "leisure/garden",
            "leisure/common",
            "leisure/nature_reserve",
            "leisure/dog_park",
            "leisure/bandstand",
            "leisure/bird_hide",
            "leisure/firepit",
            "leisure/outdoor_seating",
            "leisure/picnic_table",
            "leisure/wildlife_hide",
        ],
    ),
    (
        "Leisure",
        "Playground",
        "🛝",
        [
            "leisure/playground",
            "leisure/indoor_play",
        ],
    ),
    (
        "Leisure",
        "Sports Centre",
        "🏟️",
        [
            "leisure/sports_centre",
            "leisure/sports_hall",
            "leisure/pitch",
            "leisure/track",
            "leisure/golf_course",
            "leisure/miniature_golf",
            "leisure/horse_riding",
            "leisure/fishing",
            "leisure/swimming_pool",
            "leisure/water_park",
            "leisure/bathing_place",
        ],
    ),
    (
        "Leisure",
        "Entertainment",
        "🎳",
        [
            "leisure/bowling_alley",
            "leisure/amusement_arcade",
            "leisure/adult_gaming_centre",
            "leisure/escape_game",
            "leisure/trampoline_park",
            "leisure/sauna",
            "leisure/tanning_salon",
            "tourism/theme_park",
            "amenity/bicycle_rental",
            "amenity/boat_rental",
            "leisure/marina",
            "leisure/slipway",
            "leisure/hackerspace",
            "leisure/yes",
        ],
    ),
    # ── Groceries ────────────────────────────────────────────
    (
        "Groceries",
        "Supermarket",
        "🛒",
        [
            "shop/supermarket",
        ],
    ),
    (
        "Groceries",
        "Convenience Store",
        "🏪",
        [
            "shop/convenience",
            "shop/general",
            "shop/kiosk",
            "shop/grocery",
        ],
    ),
    (
        "Groceries",
        "Bakery",
        "🥐",
        [
            "shop/bakery",
            "shop/pastry",
            "craft/confectionery",
        ],
    ),
    (
        "Groceries",
        "Butcher & Fishmonger",
        "🥩",
        [
            "shop/butcher",
            "shop/seafood",
        ],
    ),
    (
        "Groceries",
        "Greengrocer",
        "🥬",
        [
            "shop/greengrocer",
            "shop/farm",
            "amenity/marketplace",
        ],
    ),
    (
        "Groceries",
        "Off-Licence",
        "🍷",
        [
            "shop/alcohol",
            "shop/wine",
            "shop/beverages",
        ],
    ),
    (
        "Groceries",
        "Deli & Specialty",
        "🧆",
        [
            "shop/deli",
            "shop/cheese",
            "shop/chocolate",
            "shop/coffee",
            "shop/confectionery",
            "shop/dairy",
            "shop/food",
            "shop/frozen_food",
            "shop/health_food",
            "shop/ice_cream",
            "shop/nutrition_supplements",
            "shop/tea",
        ],
    ),
    # ── Shops ────────────────────────────────────────────────
    (
        "Shops",
        "Fashion & Clothing",
        "👕",
        [
            "shop/clothes",
            "shop/boutique",
            "shop/shoes",
            "shop/accessories",
            "shop/bag",
            "shop/fashion_accessories",
            "shop/jewelry",
            "shop/leather",
            "shop/watches",
        ],
    ),
    (
        "Shops",
        "Electronics",
        "📱",
        [
            "shop/electronics",
            "shop/mobile_phone",
            "shop/mobile_phone_accessories",
            "shop/computer",
            "shop/appliance",
            "shop/electrical",
            "shop/hifi",
            "shop/video_games",
            "shop/games",
        ],
    ),
    (
        "Shops",
        "Charity Shop",
        "❤️",
        [
            "shop/charity",
            "shop/second_hand",
        ],
    ),
    (
        "Shops",
        "DIY & Hardware",
        "🔨",
        [
            "shop/doityourself",
            "shop/hardware",
            "shop/paint",
        ],
    ),
    (
        "Shops",
        "Home & Garden",
        "🪑",
        [
            "shop/furniture",
            "shop/garden_centre",
            "shop/kitchen",
            "shop/bathroom",
            "shop/bathroom_furnishing",
            "shop/bed",
            "shop/carpet",
            "shop/curtain",
            "shop/flooring",
            "shop/fireplace",
            "shop/household",
            "shop/household_linen",
            "shop/houseware",
            "shop/interior_decoration",
            "shop/lighting",
            "shop/window_blind",
        ],
    ),
    (
        "Shops",
        "Bookshop",
        "📚",
        [
            "shop/books",
            "shop/stationery",
        ],
    ),
    (
        "Shops",
        "Pet Shop",
        "🐾",
        [
            "shop/pet",
        ],
    ),
    (
        "Shops",
        "Sports & Outdoor",
        "🏕️",
        [
            "shop/sports",
            "shop/outdoor",
            "shop/bicycle",
        ],
    ),
    (
        "Shops",
        "Newsagent",
        "📰",
        [
            "shop/newsagent",
            "shop/tobacco",
        ],
    ),
    (
        "Shops",
        "Department Store",
        "🏬",
        [
            "shop/department_store",
            "shop/mall",
            "shop/variety_store",
            "shop/discount",
        ],
    ),
    (
        "Shops",
        "Gift & Hobby",
        "🎁",
        [
            "shop/gift",
            "shop/florist",
            "shop/toys",
            "shop/craft",
            "shop/candles",
            "shop/party",
            "shop/art",
            "shop/music",
            "shop/musical_instrument",
            "shop/antiques",
            "shop/baby_goods",
            "shop/fabric",
            "shop/haberdashery",
            "shop/wool",
            "shop/pottery",
        ],
    ),
    (
        "Shops",
        "Specialist Shop",
        "🏪",
        [
            "shop/agrarian",
            "shop/boat",
            "shop/bookmaker",
            "shop/building_materials",
            "shop/camera",
            "shop/car",
            "shop/caravan",
            "shop/catalogue",
            "shop/collector",
            "shop/copyshop",
            "shop/country_store",
            "shop/doors",
            "shop/e-cigarette",
            "shop/erotic",
            "shop/esoteric",
            "shop/fan",
            "shop/fishing",
            "shop/frame",
            "shop/fuel",
            "shop/gas",
            "shop/hairdresser_supply",
            "shop/military_surplus",
            "shop/model",
            "shop/money_lender",
            "shop/motorcycle",
            "shop/outpost",
            "shop/pawnbroker",
            "shop/photo",
            "shop/plant_hire",
            "shop/printer_ink",
            "shop/printing",
            "shop/psychic",
            "shop/pyrotechnics",
            "shop/religion",
            "shop/rental",
            "shop/scuba_diving",
            "shop/security",
            "shop/sewing",
            "shop/storage_rental",
            "shop/swimming_pool",
            "shop/telecommunication",
            "shop/ticket",
            "shop/tiles",
            "shop/tool_hire",
            "shop/trade",
            "shop/trophy",
            "shop/vacant",
            "shop/video",
            "shop/water_sports",
            "shop/weapons",
            "shop/wedding",
            "shop/wholesale",
            "shop/wigs",
            "shop/yes",
        ],
    ),
    # ── Services ─────────────────────────────────────────────
    (
        "Services",
        "Hairdresser & Beauty",
        "💇",
        [
            "shop/hairdresser",
            "shop/beauty",
            "shop/cosmetics",
            "shop/massage",
            "shop/perfumery",
        ],
    ),
    (
        "Services",
        "Gym & Fitness",
        "🏋️",
        [
            "leisure/fitness_centre",
            "leisure/fitness_station",
            "amenity/dojo",
            "amenity/dancing_school",
        ],
    ),
    (
        "Services",
        "Dry Cleaner & Laundry",
        "👔",
        [
            "shop/dry_cleaning",
            "shop/laundry",
            "shop/tailor",
            "shop/shoe_repair",
            "shop/repair",
            "craft/cleaning",
            "craft/dressmaker",
            "craft/shoemaker",
            "craft/tailor",
        ],
    ),
    (
        "Services",
        "Car Services",
        "🔧",
        [
            "shop/car_repair",
            "shop/car;car_repair",
            "shop/car_parts",
            "shop/motorcycle_repair",
            "shop/tyres",
            "amenity/car_wash",
            "amenity/car_rental",
            "amenity/car_sharing",
            "amenity/bicycle_repair_station",
        ],
    ),
    (
        "Services",
        "Post Office",
        "🏤",
        [
            "amenity/post_office",
        ],
    ),
    (
        "Services",
        "Vet & Pet Care",
        "🐕",
        [
            "amenity/veterinary",
            "shop/pet_grooming",
        ],
    ),
    (
        "Services",
        "Bank",
        "🏦",
        [
            "amenity/bank",
        ],
    ),
    (
        "Services",
        "Travel Agent",
        "✈️",
        [
            "shop/travel_agency",
            "office/travel_agent",
        ],
    ),
    (
        "Services",
        "Other",
        "🛎️",
        [
            "shop/tattoo",
            "shop/piercing",
            "shop/locksmith",
            "craft/key_cutter",
        ],
    ),
    # ── Emergency Services ───────────────────────────────────
    (
        "Emergency Services",
        "Police",
        "👮",
        ["amenity/police"],
    ),
    (
        "Emergency Services",
        "Fire Station",
        "🚒",
        ["amenity/fire_station"],
    ),
    (
        "Emergency Services",
        "Ambulance Station",
        "🚑",
        ["emergency/ambulance_station"],
    ),
    # ── Health ───────────────────────────────────────────────
    (
        "Health",
        "GP Surgery",
        "👨‍⚕️",
        [
            "amenity/doctors",
            "healthcare/doctor",
        ],
    ),
    (
        "Health",
        "Dentist",
        "🦷",
        [
            "amenity/dentist",
            "healthcare/dentist",
        ],
    ),
    (
        "Health",
        "Pharmacy",
        "💊",
        [
            "amenity/pharmacy",
            "healthcare/pharmacy",
            "shop/chemist",
            "shop/herbalist",
            "shop/health",
            "healthcare/alternative",
        ],
    ),
    (
        "Health",
        "Hospital & Clinic",
        "🏥",
        [
            "amenity/hospital",
            "amenity/clinic",
            "healthcare/hospital",
            "healthcare/centre",
            "healthcare/clinic",
            "office/healthcare",
            "healthcare/laboratory",
            "healthcare/rehabilitation",
            "healthcare/vaccination_centre",
            "healthcare/yes",
        ],
    ),
    (
        "Health",
        "Optician",
        "👓",
        [
            "shop/optician",
            "healthcare/optometrist",
            "shop/hearing_aids",
            "healthcare/audiologist",
        ],
    ),
    (
        "Health",
        "Physiotherapy",
        "🏃",
        [
            "healthcare/physiotherapist",
            "healthcare/podiatrist",
        ],
    ),
    (
        "Health",
        "Counselling & Therapy",
        "🧠",
        [
            "healthcare/counselling",
            "healthcare/psychotherapist",
            "office/therapist",
        ],
    ),
    (
        "Health",
        "Care Home",
        "🏠",
        [
            "amenity/care_home",
            "amenity/nursing_home",
            "office/home_care",
        ],
    ),
    (
        "Health",
        "Medical & Mobility",
        "♿",
        [
            "shop/medical_supply",
            "shop/mobility",
            "shop/mobility_scooter",
        ],
    ),
    # ── Culture ──────────────────────────────────────────────
    (
        "Culture",
        "Museum",
        "🏛️",
        [
            "tourism/museum",
        ],
    ),
    (
        "Culture",
        "Gallery",
        "🖼️",
        [
            "tourism/gallery",
            "tourism/artwork",
        ],
    ),
    (
        "Culture",
        "Library",
        "📚",
        [
            "amenity/library",
        ],
    ),
    (
        "Culture",
        "Place of Worship",
        "⛪",
        [
            "amenity/place_of_worship",
            "building/church",
        ],
    ),
    (
        "Culture",
        "Arts Centre",
        "🎨",
        [
            "amenity/arts_centre",
        ],
    ),
    (
        "Culture",
        "Zoo",
        "🦁",
        [
            "tourism/zoo",
        ],
    ),
    (
        "Culture",
        "Tourist Attraction",
        "📸",
        [
            "tourism/attraction",
            "amenity/fountain",
            "amenity/courthouse",
            "tourism/chalet",
        ],
    ),
    # ── Education ────────────────────────────────────────────
    (
        "Education",
        "School",
        "🏫",
        [
            "amenity/school",
            "amenity/prep_school",
            "amenity/language_school",
            "amenity/music_school",
            "amenity/university",
            "amenity/college",
            "building/university",
            "amenity/kindergarten",
            "amenity/childcare",
        ],
    ),
    # ── Local Businesses ─────────────────────────────────────
    (
        "Local Businesses",
        "Hotel",
        "🏨",
        [
            "tourism/hotel",
            "tourism/hostel",
            "tourism/guest_house",
            "tourism/motel",
            "tourism/camp_site",
        ],
    ),
    (
        "Local Businesses",
        "Local Business",
        "🛠️",
        [
            # Tradespeople
            "craft/builder",
            "craft/carpenter",
            "craft/electrician",
            "craft/electronics_repair",
            "craft/floorer",
            "craft/gardener",
            "craft/glaziery",
            "craft/hvac",
            "craft/joiner",
            "craft/locksmith",
            "craft/painter",
            "craft/plumber",
            "craft/roofer",
            "craft/window_construction",
            "craft/agricultural_engines",
            "craft/atelier",
            "craft/blacksmith",
            "craft/bookbinder",
            "craft/caterer",
            "craft/handicraft",
            "craft/jeweller",
            "craft/metal_construction",
            "craft/photographer",
            "craft/photographic_laboratory",
            "craft/pottery",
            "craft/printer",
            "craft/sawmill",
            "craft/scaffolder",
            "craft/sculptor",
            "craft/signmaker",
            "craft/stonemason",
            "craft/upholsterer",
            "craft/watchmaker",
            "craft/yes",
            "shop/glaziery",
            "shop/windows",
            # Professional offices & estate agents
            "shop/estate_agent",
            "office/accountant",
            "office/architect",
            "office/construction_company",
            "office/engineer",
            "office/estate_agent",
            "office/financial",
            "office/financial_advisor",
            "office/insurance",
            "office/lawyer",
            "office/mortgage",
            "office/property_management",
            "office/solicitor",
            "office/surveyor",
            "office/tax_advisor",
        ],
    ),
    (
        "Local Businesses",
        "Offices",
        "🏢",
        [
            "amenity/coworking_space",
            "office/advertising_agency",
            "office/association",
            "office/charity",
            "office/company",
            "office/consulting",
            "office/courier",
            "office/coworking",
            "office/design",
            "office/diplomatic",
            "office/educational_institution",
            "office/employment_agency",
            "office/energy_supplier",
            "office/foundation",
            "office/government",
            "office/graphic_design",
            "office/interior_design",
            "office/it",
            "office/logistics",
            "office/marketing",
            "office/moving_company",
            "office/newspaper",
            "office/ngo",
            "office/notary",
            "office/political_party",
            "office/politician",
            "office/recruitment",
            "office/religion",
            "office/research",
            "office/security",
            "office/taxi",
            "office/telecommunication",
            "office/union",
            "office/university",
            "office/vacant",
            "office/web_design",
            "office/yes",
        ],
    ),
    # ── Other ────────────────────────────────────────────────
    (
        "Other",
        "EV Charging",
        "🔌",
        [
            "amenity/charging_station",
        ],
    ),
    (
        "Other",
        "Fuel Station",
        "⛽",
        [
            "amenity/fuel",
        ],
    ),
    (
        "Other",
        "Community Centre",
        "🤝",
        [
            "amenity/community_centre",
            "amenity/social_centre",
            "amenity/townhall",
        ],
    ),
]

# Build flat lookup: OSM category → (group, friendly_name, emoji)
CATEGORY_MAP: dict[str, tuple[str, str, str]] = {
    osm_key: (group, name, emoji)
    for group, name, emoji, osm_keys in _CATEGORIES
    for osm_key in osm_keys
}

# Emoji for each NaPTAN category value; transform() looks these up strictly,
# so a NaPTAN category missing from this table makes the lookup fail.
NAPTAN_EMOJIS: dict[str, str] = {
    "Airport": "✈️",
    "Ferry": "⛴️",
    "Rail station": "🚆",
    "Bus stop": "🚏",
    "Bus station": "🚌",
    "Taxi rank": "🚕",
    "Tube station": "🚇",
}

# Retailer names that are all displayed under the single "Co-op" name.
COOP_RETAILERS = {
    "Allendale Co-operative Society",
    "Central England Co-operative",
    "Channel Islands Co-operative Society",
    "Chelmsford Star Co-operative Society",
    "Clydebank Co-operative",
    "Coniston Co-operative Society",
    "East of England Co-operative",
    "Heart of England Co-operative",
    "Langdale Co-operative Society",
    "Lincolnshire Co-operative",
    "Midcounties Co-operative",
    "Scottish Midland Co-operative",
    "Tamworth Co-operative Society",
    "The Co-operative Group",
    "The Radstock Co-operative Society",
    "The Southern Co-operative",
}

# Retailer value → name written to the "category" column of grocery POIs.
GROCERY_RETAILER_DISPLAY_NAMES: dict[str, str] = {
    "Cook": "COOK",
    "Heron": "Heron Foods",
    "Marks and Spencer": "M&S",
    "Sainsburys": "Sainsbury's",
    **{retailer: "Co-op" for retailer in COOP_RETAILERS},
}

# Fascia value → name written to the "icon_category" column of grocery POIs.
GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
    "Aldi Local": "Aldi",
    "Asda Express": "Asda Express",
    "Asda Living": "Asda Living",
    "Asda PFS": "Asda PFS",
    "Cooltrader": "Heron Foods",
    "Co-op Food": "Co-op",
    "Cook": "COOK",
    "Eurospar": "Spar",
    "Eurospar PFS": "Spar",
    "Heron": "Heron Foods",
    "Little Waitrose": "Little Waitrose",
    "Little Waitrose Shell": "Little Waitrose",
    "Marks and Spencer": "M&S",
    "Marks and Spencer BP": "M&S Food",
    "Marks and Spencer Clothing": "M&S Clothing",
    "Marks and Spencer Food To Go": "M&S Food",
    "Marks and Spencer Food Outlet": "M&S Outlet",
    "Marks and Spencer Foodhall": "M&S Food",
    "Marks and Spencer Hospital": "M&S Hospital",
    "Marks and Spencer MSA": "M&S MSA",
    "Marks and Spencer Outlet": "M&S Outlet",
    "Marks and Spencer Simply Food": "M&S Food",
    "Marks and Spencer Travel SF": "M&S Food",
    "Morrisons Daily": "Morrisons Daily",
    "Morrisons Select": "Morrisons",
    "Sainsburys": "Sainsbury's",
    "Sainsburys Local": "Sainsbury's Local",
    "Spar PFS": "Spar",
    "Tesco Express": "Tesco Express",
    "Tesco Express Esso": "Tesco Express",
    "Tesco Extra": "Tesco Extra",
    "The Co-operative Food": "Co-op",
    "The Co-operative Food PFS": "Co-op",
    "The Food Warehouse": "The Food Warehouse",
    "Waitrose MSA": "Waitrose",
}


def normalize_grocery_retailer(retailer: str | None) -> str:
    """Return the display name for a grocery retailer.

    Args:
        retailer: Retailer name as found in the source data, or ``None``.

    Returns:
        The mapped display name, or ``""`` when ``retailer`` is ``None``.

    Raises:
        ValueError: If ``retailer`` has no entry in
            ``GROCERY_RETAILER_DISPLAY_NAMES``.
    """
    if retailer is None:
        return ""
    # NOTE(review): the lookup is strict, so any retailer absent from the
    # table aborts the run — confirm the table is meant to be exhaustive.
    display_name = GROCERY_RETAILER_DISPLAY_NAMES.get(retailer)
    if display_name is None:
        raise ValueError(f"Missing grocery retailer display name for {retailer!r}")
    return display_name


def normalize_grocery_icon_category(fascia: str | None, retailer: str | None) -> str:
    """Return the icon category for a grocery store.

    A non-empty ``fascia`` takes precedence; otherwise the retailer's display
    name is used.

    Args:
        fascia: Store fascia, or ``None``/``""`` when not available.
        retailer: Retailer name, used only when ``fascia`` is falsy.

    Returns:
        The icon name for ``fascia``, else
        ``normalize_grocery_retailer(retailer)``.

    Raises:
        ValueError: If a non-empty ``fascia`` has no entry in
            ``GROCERY_FASCIA_ICON_NAMES``, or the retailer fallback fails.
    """
    if not fascia:
        return normalize_grocery_retailer(retailer)
    icon_name = GROCERY_FASCIA_ICON_NAMES.get(fascia)
    if icon_name is None:
        raise ValueError(f"Missing grocery fascia icon name for {fascia!r}")
    return icon_name


def transform_grocery_retail_points(
    grocery_df: pl.DataFrame,
    boundary_path: Path | None = None,
) -> pl.DataFrame:
    """Convert GEOLYTIX Grocery Retail Points into the POI parquet schema.

    Rows with a null ``id``, ``retailer``, latitude or longitude, or with an
    empty ``retailer`` string, are discarded.

    Args:
        grocery_df: Frame with at least the columns ``id``, ``retailer``,
            ``fascia``, ``store_name``, ``long_wgs`` and ``lat_wgs``.
        boundary_path: Optional boundary file passed to ``in_england_mask``;
            when given, only rows the mask keeps are retained.

    Returns:
        A frame with the columns ``id`` (prefixed with ``glx-``), ``name``,
        ``category``, ``icon_category``, ``group``, ``lat``, ``lng`` and
        ``emoji``.

    Raises:
        ValueError: If a required column is missing, or a retailer/fascia has
            no display-name mapping.
    """
    required = {"id", "retailer", "fascia", "store_name", "long_wgs", "lat_wgs"}
    missing = required - set(grocery_df.columns)
    if missing:
        raise ValueError(f"GEOLYTIX retail points missing columns: {sorted(missing)}")

    df = (
        grocery_df.select(
            pl.col("id").cast(pl.String),
            pl.col("retailer").cast(pl.String),
            pl.col("fascia").cast(pl.String),
            pl.col("store_name").cast(pl.String),
            pl.col("lat_wgs").cast(pl.Float64).alias("lat"),
            pl.col("long_wgs").cast(pl.Float64).alias("lng"),
        )
        .drop_nulls(["id", "retailer", "lat", "lng"])
        .filter(pl.col("retailer").str.len_chars() > 0)
    )

    # Skip the boundary test entirely when nothing survived the filters.
    if boundary_path is not None and len(df) > 0:
        mask = in_england_mask(
            boundary_path,
            df["lat"].to_numpy(),
            df["lng"].to_numpy(),
        )
        df = df.filter(pl.Series(mask))

    return df.with_columns(
        pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
        # First non-null of store name, fascia, retailer; doubled apostrophes
        # in the source text are collapsed to a single one.
        pl.coalesce(["store_name", "fascia", "retailer"])
        .str.replace_all("''", "'")
        .alias("name"),
        pl.col("retailer")
        .map_elements(normalize_grocery_retailer, return_dtype=pl.String)
        .alias("category"),
        pl.struct(["fascia", "retailer"])
        .map_elements(
            lambda row: normalize_grocery_icon_category(row["fascia"], row["retailer"]),
            return_dtype=pl.String,
        )
        .alias("icon_category"),
        pl.lit("Groceries").alias("group"),
        pl.lit("🛒").alias("emoji"),
    ).select("id", "name", "category", "icon_category", "group", "lat", "lng", "emoji")


def transform(
    input_path: Path,
    naptan_path: Path | None = None,
    boundary_path: Path | None = None,
    grocery_retail_points_path: Path | None = None,
) -> pl.LazyFrame:
    """Build the filtered, friendly-named POI frame.

    The raw POIs are scanned from ``input_path``, categories listed in
    ``DROP_CATEGORIES`` are removed, and the remaining raw categories are
    replaced by the group, friendly name and emoji from ``CATEGORY_MAP``.
    NaPTAN and grocery retail points are appended when their paths are given.

    Args:
        input_path: Raw POI parquet file; must contain a ``category`` column.
        naptan_path: Optional NaPTAN parquet file. The code reads its
            ``lat``, ``lng`` and ``category`` columns. When ``None`` no
            public-transport rows are added.
        boundary_path: Optional boundary file passed to ``in_england_mask``
            to filter the NaPTAN and grocery rows.
        grocery_retail_points_path: Optional GEOLYTIX Grocery Retail Points
            parquet file.

    Returns:
        A lazy frame concatenating all sources with
        ``how="diagonal_relaxed"``.

    Raises:
        ValueError: If the data contains a category that is neither dropped
            nor mapped, or if ``CATEGORY_MAP`` has an empty name or emoji.
    """
    lf = pl.scan_parquet(input_path)

    # Every distinct raw category present in the data.
    present = set(
        lf.select("category").unique().collect(engine="streaming").to_series().to_list()
    )

    # Verify every non-dropped category has a mapping.
    unmapped = [
        cat
        for cat in present
        if cat not in DROP_CATEGORIES and cat not in CATEGORY_MAP
    ]
    if unmapped:
        # key=str keeps the report working even if a null category is present
        # (sorting None against str would otherwise raise TypeError).
        raise ValueError(
            f"Categories missing from CATEGORY_MAP: {sorted(unmapped, key=str)}"
        )

    # Warn about CATEGORY_MAP keys not in data (may be absent in regional extracts).
    mapped_but_absent = [cat for cat in CATEGORY_MAP if cat not in present]
    if mapped_but_absent:
        print(
            f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}"
        )

    # Drop unwanted categories.
    lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))

    # Build lookup expressions from the 3-tuple mapping.
    group_mapping = {k: v[0] for k, v in CATEGORY_MAP.items()}
    name_mapping = {k: v[1] for k, v in CATEGORY_MAP.items()}
    emoji_mapping = {k: v[2] for k, v in CATEGORY_MAP.items()}

    # Check no friendly names or emojis are empty (defensive).
    missing_names = [k for k, v in CATEGORY_MAP.items() if not v[1]]
    if missing_names:
        raise ValueError(f"Empty friendly names for: {missing_names}")
    missing_emojis = [k for k, v in CATEGORY_MAP.items() if not v[2]]
    if missing_emojis:
        raise ValueError(f"Empty emojis for: {missing_emojis}")

    # All four expressions read the *raw* category: expressions inside one
    # with_columns call are evaluated against the input frame, so overwriting
    # "category" here does not affect the sibling lookups.
    raw_category = pl.col("category")
    lf = lf.with_columns(
        raw_category.replace_strict(group_mapping).alias("group"),
        raw_category.replace_strict(name_mapping).alias("category"),
        raw_category.replace_strict(name_mapping).alias("icon_category"),
        raw_category.replace_strict(emoji_mapping).alias("emoji"),
    )

    frames = [lf]

    # naptan_path defaults to None, so only read NaPTAN when a path was given
    # (previously scan_parquet(None) was attempted unconditionally).
    if naptan_path is not None:
        naptan_df = pl.scan_parquet(naptan_path).collect()
        # Same empty-frame guard as transform_grocery_retail_points.
        if boundary_path is not None and len(naptan_df) > 0:
            mask = in_england_mask(
                boundary_path,
                naptan_df["lat"].to_numpy(),
                naptan_df["lng"].to_numpy(),
            )
            naptan_df = naptan_df.filter(pl.Series(mask))
        naptan = naptan_df.lazy().with_columns(
            pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
            pl.lit("Public Transport").alias("group"),
            pl.col("category").alias("icon_category"),
        )
        frames.append(naptan)

    if grocery_retail_points_path is not None:
        grocery_df = pl.read_parquet(grocery_retail_points_path)
        grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
        frames.append(grocery_pois.lazy())

    return pl.concat(frames, how="diagonal_relaxed")


def main():
    """Command-line entry point: transform the inputs and write the parquet.

    Parses the arguments, runs ``transform``, writes the result to
    ``--output`` and prints the file size plus a per-category row count.
    """
    parser = argparse.ArgumentParser(
        description="Transform raw POIs to filtered version with friendly names"
    )
    parser.add_argument(
        "--input", type=Path, required=True, help="Raw POIs parquet file"
    )
    parser.add_argument(
        "--naptan", type=Path, required=True, help="NaPTAN stations parquet file"
    )
    parser.add_argument(
        "--boundary",
        type=Path,
        required=True,
        help="England boundary GeoJSON file",
    )
    parser.add_argument(
        "--grocery-retail-points",
        type=Path,
        help="GEOLYTIX Grocery Retail Points parquet",
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output filtered POIs parquet file"
    )
    args = parser.parse_args()

    df = transform(
        args.input,
        args.naptan,
        args.boundary,
        args.grocery_retail_points,
    ).collect(engine="streaming")
    df.write_parquet(args.output)

    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB, {len(df):,} POIs)")

    # Summary of rows per output category, largest first.
    print(f"\nCategories ({df['category'].n_unique()}):")
    counts = (
        df.group_by("group", "category", "emoji").len().sort("len", descending=True)
    )
    for row in counts.iter_rows(named=True):
        print(f"  [{row['group']}] {row['emoji']} {row['category']}: {row['len']:,}")


if __name__ == "__main__":
    main()