# perfect-postcode/pipeline/transform/transform_poi.py
# 2026-03-15 14:03:38 +00:00
#
# 1163 lines
# 28 KiB
# Python

import argparse
from pathlib import Path
import polars as pl
from pipeline.utils.england_geometry import in_england_mask
# Raw "key/value" categories that transform() filters out of the output.
# A category seen in the data must be listed either here or in CATEGORY_MAP,
# otherwise transform() raises.
DROP_CATEGORIES = {
    # --- amenity: street furniture & infrastructure ---
    "amenity/advice",
    "amenity/atm",
    "amenity/bbq",
    "amenity/bench",
    "amenity/bicycle_parking",
    "amenity/binoculars",
    "amenity/boot_scraper",
    "amenity/check_in",
    "amenity/clock",
    "amenity/compressed_air",
    "amenity/donation_box",
    "amenity/dressing_room",
    "amenity/drinking_water",
    "amenity/feeding_place",
    "amenity/fixme",
    "amenity/grit_bin",
    "amenity/hunting_stand",
    "amenity/letter_box",
    "amenity/loading_dock",
    "amenity/lounge",
    "amenity/lounger",
    "amenity/motorcycle_parking",
    "amenity/mounting_block",
    "amenity/notice_board",
    "amenity/parcel_locker",
    "amenity/parking",
    "amenity/parking_entrance",
    "amenity/parking_space",
    "amenity/payment_terminal",
    "amenity/photo_booth",
    "amenity/piano",
    "amenity/post_box",
    "amenity/public_bookcase",
    "amenity/reception_desk",
    "amenity/sanitary_dump_station",
    "amenity/shelter",
    "amenity/shower",
    "amenity/smoking_area",
    "amenity/table",
    "amenity/telephone",
    "amenity/telescope",
    "amenity/ticket_validator",
    "amenity/toilets",
    "amenity/trolley_bay",
    "amenity/vacuum_cleaner",
    "amenity/vending_machine",
    "amenity/washing_machine",
    "amenity/washingline",
    "amenity/waste_basket",
    "amenity/waste_disposal",
    "amenity/waste_transfer_station",
    "amenity/water_point",
    "amenity/watering_place",
    "amenity/weighbridge",
    # --- amenity: niche amenities not useful for home buyers ---
    "amenity/animal_boarding",
    "amenity/animal_breeding",
    "amenity/animal_shelter",
    "amenity/boat_storage",
    "amenity/bureau_de_change",
    "amenity/bus_station",
    "amenity/conference_centre",
    "amenity/crematorium",
    "amenity/driving_school",
    "amenity/escooter_rental",
    "amenity/ferry_terminal",
    "amenity/grave_yard",
    "amenity/hall",
    "amenity/kick-scooter_rental",
    "amenity/money_transfer",
    "amenity/post_depot",
    "amenity/public_building",
    "amenity/recycling",
    "amenity/scout_hut",
    "amenity/social_facility",
    "amenity/studio",
    "amenity/taxi",
    "amenity/training",
    "amenity/vehicle_inspection",
    # --- shop: the only shop tags that are dropped rather than mapped ---
    "shop/funeral_directors",
    "shop/taxi",
    # --- building: everything except church & university, which are mapped ---
    "building/air_shaft",
    "building/apartments",
    "building/barn",
    "building/bunker",
    "building/chapel",
    "building/commercial",
    "building/construction",
    "building/detached",
    "building/entrance",
    "building/entry",
    "building/farm",
    "building/farm_auxiliary",
    "building/garage",
    "building/garages",
    "building/greenhouse",
    "building/house",
    "building/hut",
    "building/industrial",
    "building/kiosk",
    "building/no",
    "building/office",
    "building/public",
    "building/residential",
    "building/retail",
    "building/roof",
    "building/ruins",
    "building/school",
    "building/semidetached_house",
    "building/service",
    "building/shed",
    "building/terrace",
    "building/warehouse",
    "building/yes",
    # --- emergency: everything except ambulance_station, which is mapped ---
    "emergency/access_point",
    "emergency/assembly_point",
    "emergency/bleed_control_kit",
    "emergency/defibrillator",
    "emergency/designated",
    "emergency/dry_riser_inlet",
    "emergency/emergency_ward_entrance",
    "emergency/fire_alarm_box",
    "emergency/fire_extinguisher",
    "emergency/fire_hydrant",
    "emergency/fire_service_inlet",
    "emergency/first_aid_kit",
    "emergency/life_ring",
    "emergency/lifeguard",
    "emergency/no",
    "emergency/phone",
    "emergency/rescue_equipment",
    "emergency/siren",
    "emergency/throw_bag",
    "emergency/water_rescue",
    "emergency/yes",
    # --- tourism: tags that are dropped (the remaining tourism tags are mapped) ---
    "tourism/apartment",
    "tourism/apartments",
    "tourism/camp_pitch",
    "tourism/caravan_site",
    "tourism/information",
    "tourism/picnic_site",
    "tourism/preserved_railway",
    "tourism/viewpoint",
    "tourism/village_sign",
    "tourism/yes",
    # --- public_transport: dropped because stops come from NaPTAN instead ---
    "public_transport/entrance",
    "public_transport/platform",
    "public_transport/station",
    "public_transport/stop_position",
}
# Each output category is defined exactly once as
# (group, friendly_name, emoji, [osm_keys...]).
# The flat CATEGORY_MAP lookup dict is built from this list right after it.
# Every entry needs a non-empty friendly name AND a non-empty emoji:
# transform() raises ValueError when either is blank.
_CATEGORIES: list[tuple[str, str, str, list[str]]] = [
    # ── Leisure ──────────────────────────────────────────────
    (
        "Leisure",
        "Café",
        "☕",
        [
            "amenity/cafe",
            "amenity/ice_cream",
            "amenity/internet_cafe",
        ],
    ),
    (
        "Leisure",
        "Restaurant",
        "🍽️",
        [
            "amenity/restaurant",
            "amenity/food_court",
        ],
    ),
    (
        "Leisure",
        "Pub",
        "🍺",
        [
            "amenity/pub",
            "amenity/social_club",
            "amenity/club",
            "leisure/social_club",
            "craft/brewery",
            "craft/distillery",
            "craft/winery",
        ],
    ),
    (
        "Leisure",
        "Bar",
        "🍸",
        [
            "amenity/bar",
            "amenity/hookah_lounge",
        ],
    ),
    (
        "Leisure",
        "Fast Food",
        "🍔",
        [
            "amenity/fast_food",
        ],
    ),
    (
        "Leisure",
        "Nightclub",
        "🪩",
        [
            "amenity/nightclub",
            "amenity/stripclub",
            "amenity/casino",
            "amenity/gambling",
        ],
    ),
    (
        "Leisure",
        "Cinema",
        "🎬",
        [
            "amenity/cinema",
        ],
    ),
    (
        "Leisure",
        "Theatre",
        "🎭",
        [
            "amenity/theatre",
        ],
    ),
    (
        "Leisure",
        "Live Music & Events",
        "🎶",
        [
            "amenity/music_venue",
            "amenity/events_venue",
            "leisure/dance",
        ],
    ),
    (
        "Leisure",
        "Park",
        "🌳",
        [
            "leisure/park",
            "leisure/garden",
            "leisure/common",
            "leisure/nature_reserve",
            "leisure/dog_park",
            "leisure/bandstand",
            "leisure/bird_hide",
            "leisure/firepit",
            "leisure/outdoor_seating",
            "leisure/picnic_table",
            "leisure/wildlife_hide",
        ],
    ),
    (
        "Leisure",
        "Playground",
        "🛝",
        [
            "leisure/playground",
            "leisure/indoor_play",
        ],
    ),
    (
        "Leisure",
        "Sports Centre",
        "🏟️",
        [
            "leisure/sports_centre",
            "leisure/sports_hall",
            "leisure/pitch",
            "leisure/track",
            "leisure/golf_course",
            "leisure/miniature_golf",
            "leisure/horse_riding",
            "leisure/fishing",
            "leisure/swimming_pool",
            "leisure/water_park",
            "leisure/bathing_place",
        ],
    ),
    (
        "Leisure",
        "Entertainment",
        "🎳",
        [
            "leisure/bowling_alley",
            "leisure/amusement_arcade",
            "leisure/adult_gaming_centre",
            "leisure/escape_game",
            "leisure/trampoline_park",
            "leisure/sauna",
            "leisure/tanning_salon",
            "tourism/theme_park",
            "amenity/bicycle_rental",
            "amenity/boat_rental",
            "leisure/marina",
            "leisure/slipway",
            "leisure/hackerspace",
            "leisure/yes",
        ],
    ),
    # ── Groceries ────────────────────────────────────────────
    (
        "Groceries",
        "Supermarket",
        "🛒",
        [
            "shop/supermarket",
        ],
    ),
    (
        "Groceries",
        "Convenience Store",
        "🏪",
        [
            "shop/convenience",
            "shop/general",
            "shop/kiosk",
            "shop/grocery",
        ],
    ),
    (
        "Groceries",
        "Bakery",
        "🥐",
        [
            "shop/bakery",
            "shop/pastry",
            "craft/confectionery",
        ],
    ),
    (
        "Groceries",
        "Butcher & Fishmonger",
        "🥩",
        [
            "shop/butcher",
            "shop/seafood",
        ],
    ),
    (
        "Groceries",
        "Greengrocer",
        "🥬",
        [
            "shop/greengrocer",
            "shop/farm",
            "amenity/marketplace",
        ],
    ),
    (
        "Groceries",
        "Off-Licence",
        "🍷",
        [
            "shop/alcohol",
            "shop/wine",
            "shop/beverages",
        ],
    ),
    (
        "Groceries",
        "Deli & Specialty",
        "🧆",
        [
            "shop/deli",
            "shop/cheese",
            "shop/chocolate",
            "shop/coffee",
            "shop/confectionery",
            "shop/dairy",
            "shop/food",
            "shop/frozen_food",
            "shop/health_food",
            "shop/ice_cream",
            "shop/nutrition_supplements",
            "shop/tea",
        ],
    ),
    # ── Shops ────────────────────────────────────────────────
    (
        "Shops",
        "Fashion & Clothing",
        "👕",
        [
            "shop/clothes",
            "shop/boutique",
            "shop/shoes",
            "shop/accessories",
            "shop/bag",
            "shop/fashion_accessories",
            "shop/jewelry",
            "shop/leather",
            "shop/watches",
        ],
    ),
    (
        "Shops",
        "Electronics",
        "📱",
        [
            "shop/electronics",
            "shop/mobile_phone",
            "shop/mobile_phone_accessories",
            "shop/computer",
            "shop/appliance",
            "shop/electrical",
            "shop/hifi",
            "shop/video_games",
            "shop/games",
        ],
    ),
    (
        "Shops",
        "Charity Shop",
        "❤️",
        [
            "shop/charity",
            "shop/second_hand",
        ],
    ),
    (
        "Shops",
        "DIY & Hardware",
        "🔨",
        [
            "shop/doityourself",
            "shop/hardware",
            "shop/paint",
        ],
    ),
    (
        "Shops",
        "Home & Garden",
        "🪑",
        [
            "shop/furniture",
            "shop/garden_centre",
            "shop/kitchen",
            "shop/bathroom",
            "shop/bathroom_furnishing",
            "shop/bed",
            "shop/carpet",
            "shop/curtain",
            "shop/flooring",
            "shop/fireplace",
            "shop/household",
            "shop/household_linen",
            "shop/houseware",
            "shop/interior_decoration",
            "shop/lighting",
            "shop/window_blind",
        ],
    ),
    (
        "Shops",
        "Bookshop",
        "📚",
        [
            "shop/books",
            "shop/stationery",
        ],
    ),
    (
        "Shops",
        "Pet Shop",
        "🐾",
        [
            "shop/pet",
        ],
    ),
    (
        "Shops",
        "Sports & Outdoor",
        "🏕️",
        [
            "shop/sports",
            "shop/outdoor",
            "shop/bicycle",
        ],
    ),
    (
        "Shops",
        "Newsagent",
        "📰",
        [
            "shop/newsagent",
            "shop/tobacco",
        ],
    ),
    (
        "Shops",
        "Department Store",
        "🏬",
        [
            "shop/department_store",
            "shop/mall",
            "shop/variety_store",
            "shop/discount",
        ],
    ),
    (
        "Shops",
        "Gift & Hobby",
        "🎁",
        [
            "shop/gift",
            "shop/florist",
            "shop/toys",
            "shop/craft",
            "shop/candles",
            "shop/party",
            "shop/art",
            "shop/music",
            "shop/musical_instrument",
            "shop/antiques",
            "shop/baby_goods",
            "shop/fabric",
            "shop/haberdashery",
            "shop/wool",
            "shop/pottery",
        ],
    ),
    (
        "Shops",
        "Specialist Shop",
        "🏪",
        [
            "shop/agrarian",
            "shop/boat",
            "shop/bookmaker",
            "shop/building_materials",
            "shop/camera",
            "shop/car",
            "shop/caravan",
            "shop/catalogue",
            "shop/collector",
            "shop/copyshop",
            "shop/country_store",
            "shop/doors",
            "shop/e-cigarette",
            "shop/erotic",
            "shop/esoteric",
            "shop/fan",
            "shop/fishing",
            "shop/frame",
            "shop/fuel",
            "shop/gas",
            "shop/hairdresser_supply",
            "shop/military_surplus",
            "shop/model",
            "shop/money_lender",
            "shop/motorcycle",
            "shop/outpost",
            "shop/pawnbroker",
            "shop/photo",
            "shop/plant_hire",
            "shop/printer_ink",
            "shop/printing",
            "shop/psychic",
            "shop/pyrotechnics",
            "shop/religion",
            "shop/rental",
            "shop/scuba_diving",
            "shop/security",
            "shop/sewing",
            "shop/storage_rental",
            "shop/swimming_pool",
            "shop/telecommunication",
            "shop/ticket",
            "shop/tiles",
            "shop/tool_hire",
            "shop/trade",
            "shop/trophy",
            "shop/vacant",
            "shop/video",
            "shop/water_sports",
            "shop/weapons",
            "shop/wedding",
            "shop/wholesale",
            "shop/wigs",
            "shop/yes",
        ],
    ),
    # ── Services ─────────────────────────────────────────────
    (
        "Services",
        "Hairdresser & Beauty",
        "💇",
        [
            "shop/hairdresser",
            "shop/beauty",
            "shop/cosmetics",
            "shop/massage",
            "shop/perfumery",
        ],
    ),
    (
        "Services",
        "Gym & Fitness",
        "🏋️",
        [
            "leisure/fitness_centre",
            "leisure/fitness_station",
            "amenity/dojo",
            "amenity/dancing_school",
        ],
    ),
    (
        "Services",
        "Dry Cleaner & Laundry",
        "👔",
        [
            "shop/dry_cleaning",
            "shop/laundry",
            "shop/tailor",
            "shop/shoe_repair",
            "shop/repair",
            "craft/cleaning",
            "craft/dressmaker",
            "craft/shoemaker",
            "craft/tailor",
        ],
    ),
    (
        "Services",
        "Car Services",
        "🔧",
        [
            "shop/car_repair",
            "shop/car;car_repair",
            "shop/car_parts",
            "shop/motorcycle_repair",
            "shop/tyres",
            "amenity/car_wash",
            "amenity/car_rental",
            "amenity/car_sharing",
            "amenity/bicycle_repair_station",
        ],
    ),
    (
        "Services",
        "Post Office",
        "🏤",
        [
            "amenity/post_office",
        ],
    ),
    (
        "Services",
        "Vet & Pet Care",
        "🐕",
        [
            "amenity/veterinary",
            "shop/pet_grooming",
        ],
    ),
    (
        "Services",
        "Bank",
        "🏦",
        [
            "amenity/bank",
        ],
    ),
    (
        "Services",
        "Travel Agent",
        "✈️",
        [
            "shop/travel_agency",
            "office/travel_agent",
        ],
    ),
    (
        "Services",
        "Other",
        "🛎️",
        [
            "shop/tattoo",
            "shop/piercing",
            "shop/locksmith",
            "craft/key_cutter",
        ],
    ),
    # ── Emergency Services ───────────────────────────────────
    (
        "Emergency Services",
        "Police",
        "👮",
        ["amenity/police"],
    ),
    (
        "Emergency Services",
        "Fire Station",
        "🚒",
        ["amenity/fire_station"],
    ),
    (
        "Emergency Services",
        "Ambulance Station",
        "🚑",
        ["emergency/ambulance_station"],
    ),
    # ── Health ───────────────────────────────────────────────
    (
        "Health",
        "GP Surgery",
        "👨‍⚕️",
        [
            "amenity/doctors",
            "healthcare/doctor",
        ],
    ),
    (
        "Health",
        "Dentist",
        "🦷",
        [
            "amenity/dentist",
            "healthcare/dentist",
        ],
    ),
    (
        "Health",
        "Pharmacy",
        "💊",
        [
            "amenity/pharmacy",
            "healthcare/pharmacy",
            "shop/chemist",
            "shop/herbalist",
            "shop/health",
            "healthcare/alternative",
        ],
    ),
    (
        "Health",
        "Hospital & Clinic",
        "🏥",
        [
            "amenity/hospital",
            "amenity/clinic",
            "healthcare/hospital",
            "healthcare/centre",
            "healthcare/clinic",
            "office/healthcare",
            "healthcare/laboratory",
            "healthcare/rehabilitation",
            "healthcare/vaccination_centre",
            "healthcare/yes",
        ],
    ),
    (
        "Health",
        "Optician",
        "👓",
        [
            "shop/optician",
            "healthcare/optometrist",
            "shop/hearing_aids",
            "healthcare/audiologist",
        ],
    ),
    (
        "Health",
        "Physiotherapy",
        "🏃",
        [
            "healthcare/physiotherapist",
            "healthcare/podiatrist",
        ],
    ),
    (
        "Health",
        "Counselling & Therapy",
        "🧠",
        [
            "healthcare/counselling",
            "healthcare/psychotherapist",
            "office/therapist",
        ],
    ),
    (
        "Health",
        "Care Home",
        "🏠",
        [
            "amenity/care_home",
            "amenity/nursing_home",
            "office/home_care",
        ],
    ),
    (
        "Health",
        "Medical & Mobility",
        "♿",
        [
            "shop/medical_supply",
            "shop/mobility",
            "shop/mobility_scooter",
        ],
    ),
    # ── Culture ──────────────────────────────────────────────
    (
        "Culture",
        "Museum",
        "🏛️",
        [
            "tourism/museum",
        ],
    ),
    (
        "Culture",
        "Gallery",
        "🖼️",
        [
            "tourism/gallery",
            "tourism/artwork",
        ],
    ),
    (
        "Culture",
        "Library",
        "📚",
        [
            "amenity/library",
        ],
    ),
    (
        "Culture",
        "Place of Worship",
        "⛪",
        [
            "amenity/place_of_worship",
            "building/church",
        ],
    ),
    (
        "Culture",
        "Arts Centre",
        "🎨",
        [
            "amenity/arts_centre",
        ],
    ),
    (
        "Culture",
        "Zoo",
        "🦁",
        [
            "tourism/zoo",
        ],
    ),
    (
        "Culture",
        "Tourist Attraction",
        "📸",
        [
            "tourism/attraction",
            "amenity/fountain",
            "amenity/courthouse",
            "tourism/chalet",
        ],
    ),
    # ── Education ────────────────────────────────────────────
    (
        "Education",
        "School",
        "🏫",
        [
            "amenity/school",
            "amenity/prep_school",
            "amenity/language_school",
            "amenity/music_school",
            "amenity/university",
            "amenity/college",
            "building/university",
            "amenity/kindergarten",
            "amenity/childcare",
        ],
    ),
    # ── Local Businesses ─────────────────────────────────────
    (
        "Local Businesses",
        "Hotel",
        "🏨",
        [
            "tourism/hotel",
            "tourism/hostel",
            "tourism/guest_house",
            "tourism/motel",
            "tourism/camp_site",
        ],
    ),
    (
        "Local Businesses",
        "Local Business",
        "🛠️",
        [
            # Tradespeople
            "craft/builder",
            "craft/carpenter",
            "craft/electrician",
            "craft/electronics_repair",
            "craft/floorer",
            "craft/gardener",
            "craft/glaziery",
            "craft/hvac",
            "craft/joiner",
            "craft/locksmith",
            "craft/painter",
            "craft/plumber",
            "craft/roofer",
            "craft/window_construction",
            "craft/agricultural_engines",
            "craft/atelier",
            "craft/blacksmith",
            "craft/bookbinder",
            "craft/caterer",
            "craft/handicraft",
            "craft/jeweller",
            "craft/metal_construction",
            "craft/photographer",
            "craft/photographic_laboratory",
            "craft/pottery",
            "craft/printer",
            "craft/sawmill",
            "craft/scaffolder",
            "craft/sculptor",
            "craft/signmaker",
            "craft/stonemason",
            "craft/upholsterer",
            "craft/watchmaker",
            "craft/yes",
            "shop/glaziery",
            "shop/windows",
            # Professional offices & estate agents
            "shop/estate_agent",
            "office/accountant",
            "office/architect",
            "office/construction_company",
            "office/engineer",
            "office/estate_agent",
            "office/financial",
            "office/financial_advisor",
            "office/insurance",
            "office/lawyer",
            "office/mortgage",
            "office/property_management",
            "office/solicitor",
            "office/surveyor",
            "office/tax_advisor",
        ],
    ),
    (
        "Local Businesses",
        "Offices",
        "🏢",
        [
            "amenity/coworking_space",
            "office/advertising_agency",
            "office/association",
            "office/charity",
            "office/company",
            "office/consulting",
            "office/courier",
            "office/coworking",
            "office/design",
            "office/diplomatic",
            "office/educational_institution",
            "office/employment_agency",
            "office/energy_supplier",
            "office/foundation",
            "office/government",
            "office/graphic_design",
            "office/interior_design",
            "office/it",
            "office/logistics",
            "office/marketing",
            "office/moving_company",
            "office/newspaper",
            "office/ngo",
            "office/notary",
            "office/political_party",
            "office/politician",
            "office/recruitment",
            "office/religion",
            "office/research",
            "office/security",
            "office/taxi",
            "office/telecommunication",
            "office/union",
            "office/university",
            "office/vacant",
            "office/web_design",
            "office/yes",
        ],
    ),
    # ── Other ────────────────────────────────────────────────
    (
        "Other",
        "EV Charging",
        "🔌",
        [
            "amenity/charging_station",
        ],
    ),
    (
        "Other",
        "Fuel Station",
        "⛽",
        [
            "amenity/fuel",
        ],
    ),
    (
        "Other",
        "Community Centre",
        "🤝",
        [
            "amenity/community_centre",
            "amenity/social_centre",
            "amenity/townhall",
        ],
    ),
]
# Flat lookup derived from _CATEGORIES: OSM category -> (group, friendly_name, emoji).
# entry[:3] is the (group, friendly_name, emoji) prefix of each 4-tuple and
# entry[3] is its list of OSM keys.
CATEGORY_MAP: dict[str, tuple[str, str, str]] = {
    osm_key: entry[:3] for entry in _CATEGORIES for osm_key in entry[3]
}
# Emoji for each NaPTAN category value; transform() uses this to fill the
# "emoji" column of the NaPTAN rows.
NAPTAN_EMOJIS: dict[str, str] = {
    # Long-distance travel
    "Airport": "✈️",
    "Ferry": "⛴️",
    "Rail station": "🚆",
    # Local road transport
    "Bus stop": "🚏",
    "Bus station": "🚌",
    "Taxi rank": "🚕",
    # Light rail
    "Metro or Tram stop": "🚊",
}
def transform(
    input_path: Path,
    naptan_path: Path | None = None,
    boundary_path: Path | None = None,
) -> pl.LazyFrame:
    """Filter raw POIs, attach friendly group/category/emoji, and append NaPTAN stops.

    Args:
        input_path: Parquet file of raw POIs. It must have a ``category``
            column holding the raw category strings that DROP_CATEGORIES and
            CATEGORY_MAP are keyed by.
        naptan_path: Optional parquet file of NaPTAN stops. Its ``category``
            values are looked up in NAPTAN_EMOJIS. When ``None``, only the
            POIs from ``input_path`` are returned.
        boundary_path: Optional boundary file handed to ``in_england_mask``.
            When given, NaPTAN rows are filtered by the mask computed from
            their ``lat`` / ``lng`` columns.

    Returns:
        A LazyFrame of the kept POIs, with ``category`` replaced by the
        friendly name and ``group`` / ``emoji`` columns added. The NaPTAN
        rows (group "Public Transport") are appended when ``naptan_path``
        is given.

    Raises:
        ValueError: If CATEGORY_MAP contains an empty friendly name or emoji,
            or if the data contains a category that is neither dropped nor
            mapped.
    """
    # Validate the static mapping table first, so a broken table fails
    # before any data is scanned.
    missing_names = [k for k, v in CATEGORY_MAP.items() if not v[1]]
    if missing_names:
        raise ValueError(f"Empty friendly names for: {missing_names}")
    missing_emojis = [k for k, v in CATEGORY_MAP.items() if not v[2]]
    if missing_emojis:
        raise ValueError(f"Empty emojis for: {missing_emojis}")

    lf = pl.scan_parquet(input_path)

    # Distinct categories actually present in the data.
    present = set(
        lf.select("category").unique().collect(engine="streaming").to_series().to_list()
    )

    # Every category that is not dropped must have a mapping.
    unmapped = [
        cat
        for cat in present
        if cat not in DROP_CATEGORIES and cat not in CATEGORY_MAP
    ]
    if unmapped:
        # key=str keeps the report sortable even if a null category slipped in.
        raise ValueError(
            f"Categories missing from CATEGORY_MAP: {sorted(unmapped, key=str)}"
        )

    # Mapped keys that never occur are only reported: regional extracts may
    # legitimately lack some categories.
    mapped_but_absent = sorted(CATEGORY_MAP.keys() - present)
    if mapped_but_absent:
        print(f"CATEGORY_MAP categories not in data (skipped): {mapped_but_absent}")

    # Drop unwanted categories.
    lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))

    # Split the 3-tuple mapping into one lookup dict per output column.
    group_mapping = {k: v[0] for k, v in CATEGORY_MAP.items()}
    name_mapping = {k: v[1] for k, v in CATEGORY_MAP.items()}
    emoji_mapping = {k: v[2] for k, v in CATEGORY_MAP.items()}

    # All three expressions sit in ONE with_columns call so each reads the
    # raw "category" column; splitting them up would make the later lookups
    # see the already-replaced friendly names.
    lf = lf.with_columns(
        pl.col("category").replace_strict(group_mapping).alias("group"),
        pl.col("category").replace_strict(name_mapping).alias("category"),
        pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
    )

    # naptan_path is optional: without it there is nothing to merge.
    if naptan_path is None:
        return lf

    naptan_df = pl.scan_parquet(naptan_path).collect()
    if boundary_path is not None:
        mask = in_england_mask(
            boundary_path,
            naptan_df["lat"].to_numpy(),
            naptan_df["lng"].to_numpy(),
        )
        naptan_df = naptan_df.filter(pl.Series(mask))

    naptan = naptan_df.lazy().with_columns(
        pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
        pl.lit("Public Transport").alias("group"),
    )
    # diagonal_relaxed: the two frames need not share the same columns/dtypes.
    return pl.concat([lf, naptan], how="diagonal_relaxed")
def main():
    """Command-line entry point.

    Parses the four required path options, runs transform(), writes the
    result to the output parquet file, and prints the file size, the row
    count and a per-category breakdown.
    """
    cli = argparse.ArgumentParser(
        description="Transform raw POIs to filtered version with friendly names"
    )
    # Every option is a required Path; register them in a fixed order.
    option_help = (
        ("--input", "Raw POIs parquet file"),
        ("--naptan", "NaPTAN stations parquet file"),
        ("--boundary", "England boundary GeoJSON file"),
        ("--output", "Output filtered POIs parquet file"),
    )
    for flag, help_text in option_help:
        cli.add_argument(flag, type=Path, required=True, help=help_text)
    opts = cli.parse_args()

    lazy_pois = transform(opts.input, opts.naptan, opts.boundary)
    pois = lazy_pois.collect(engine="streaming")
    pois.write_parquet(opts.output)

    megabytes = opts.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {opts.output} ({megabytes:.1f} MB, {len(pois):,} POIs)")

    # Per-category summary, largest categories first.
    print(f"\nCategories ({pois['category'].n_unique()}):")
    per_category = pois.group_by("group", "category", "emoji").len()
    ranked = per_category.sort("len", descending=True)
    for rec in ranked.iter_rows(named=True):
        print(f" [{rec['group']}] {rec['emoji']} {rec['category']}: {rec['len']:,}")


if __name__ == "__main__":
    main()