"""Configuration for POI extraction from OpenStreetMap.""" from pathlib import Path # File paths DATA_DIR = Path(__file__).parent.parent.parent / "data_sources" GB_PBF_FILE = DATA_DIR / "great-britain-latest.osm.pbf" OUTPUT_FILE = DATA_DIR / "uk_pois.parquet" # Geofabrik download URL for Great Britain (~1.1GB PBF, much faster than planet XML) GEOFABRIK_GB_URL = ( "https://download.geofabrik.de/europe/great-britain-latest.osm.pbf" ) # UK bounding box (west, south, east, north) — used for way centroid filtering UK_BBOX_WEST = -7.57 UK_BBOX_SOUTH = 49.96 UK_BBOX_EAST = 1.68 UK_BBOX_NORTH = 58.64 # OSM tag mapping to categories # Maps (tag_key, tag_value) -> category name OSM_TAG_MAPPING: dict[tuple[str, str], str] = { # Education ("amenity", "school"): "school", ("amenity", "kindergarten"): "preschool", ("amenity", "college"): "college_university", ("amenity", "university"): "college_university", ("amenity", "library"): "library", ("amenity", "language_school"): "school", ("amenity", "music_school"): "school", ("amenity", "driving_school"): "school", # Healthcare ("amenity", "hospital"): "hospital", ("amenity", "clinic"): "public_health_clinic", ("amenity", "doctors"): "doctor", ("amenity", "dentist"): "dentist", ("amenity", "pharmacy"): "pharmacy", ("amenity", "veterinary"): "veterinary", ("amenity", "nursing_home"): "nursing_home", ("amenity", "social_facility"): "social_facility", # Transport ("railway", "station"): "train_station", ("railway", "halt"): "train_station", ("railway", "tram_stop"): "tram_stop", ("amenity", "bus_station"): "bus_station", ("amenity", "ferry_terminal"): "ferry_terminal", ("public_transport", "station"): "train_station", ("public_transport", "stop_position"): "bus_stop", ("station", "subway"): "metro_station", ("station", "light_rail"): "light_rail_station", ("aeroway", "aerodrome"): "airport", ("highway", "bus_stop"): "bus_stop", # Parks & Leisure ("leisure", "park"): "park", ("leisure", "nature_reserve"): "nature_reserve", ("leisure", "dog_park"): "dog_park", ("leisure", "playground"): "playground", ("leisure", "sports_centre"): "sports_centre", ("leisure", "swimming_pool"): "swimming_pool", ("leisure", "fitness_centre"): "gym", ("leisure", "golf_course"): "golf_course", ("leisure", "garden"): "garden", ("leisure", "marina"): "marina", ("boundary", "national_park"): "national_park", # Emergency ("amenity", "police"): "police_department", ("amenity", "fire_station"): "fire_department", # Shopping ("shop", "supermarket"): "supermarket", ("shop", "convenience"): "convenience_store", ("shop", "grocery"): "grocery_store", ("shop", "bakery"): "bakery", ("shop", "butcher"): "butcher", ("shop", "greengrocer"): "greengrocer", ("shop", "deli"): "deli", ("shop", "department_store"): "department_store", ("shop", "clothes"): "clothing_store", ("shop", "shoes"): "shoe_store", ("shop", "electronics"): "electronics_store", ("shop", "hardware"): "hardware_store", ("shop", "furniture"): "furniture_store", ("shop", "car"): "car_dealer", ("shop", "car_repair"): "car_repair", ("shop", "hairdresser"): "hairdresser", ("shop", "beauty"): "beauty_salon", ("shop", "optician"): "optician", ("shop", "newsagent"): "newsagent", ("shop", "books"): "bookshop", ("shop", "charity"): "charity_shop", ("shop", "alcohol"): "off_licence", ("shop", "laundry"): "laundry", ("shop", "dry_cleaning"): "dry_cleaning", ("shop", "mall"): "shopping_centre", # Food & Drink ("amenity", "restaurant"): "restaurant", ("amenity", "cafe"): "cafe", ("amenity", "pub"): "pub", ("amenity", "bar"): "bar", ("amenity", "fast_food"): "fast_food", ("amenity", "food_court"): "food_court", ("amenity", "ice_cream"): "ice_cream", ("amenity", "biergarten"): "beer_garden", # Finance ("amenity", "bank"): "bank", ("amenity", "atm"): "atm", ("amenity", "bureau_de_change"): "bureau_de_change", # Entertainment & Culture ("amenity", "cinema"): "cinema", ("amenity", "theatre"): "theatre", ("amenity", "nightclub"): "nightclub", ("amenity", "community_centre"): "community_centre", ("amenity", "arts_centre"): "arts_centre", ("tourism", "museum"): "museum", ("tourism", "gallery"): "gallery", ("tourism", "attraction"): "attraction", ("tourism", "zoo"): "zoo", ("tourism", "theme_park"): "theme_park", ("tourism", "viewpoint"): "viewpoint", # Accommodation ("tourism", "hotel"): "hotel", ("tourism", "hostel"): "hostel", ("tourism", "guest_house"): "guest_house", ("tourism", "camp_site"): "campsite", ("tourism", "caravan_site"): "caravan_site", # Religion ("amenity", "place_of_worship"): "place_of_worship", # Government & Public ("amenity", "townhall"): "town_hall", ("amenity", "courthouse"): "courthouse", ("amenity", "post_office"): "post_office", ("amenity", "prison"): "prison", ("amenity", "recycling"): "recycling", ("amenity", "waste_disposal"): "waste_disposal", ("amenity", "toilets"): "public_toilets", # Fuel ("amenity", "fuel"): "petrol_station", ("amenity", "charging_station"): "ev_charging", # Parking ("amenity", "parking"): "parking", ("amenity", "bicycle_parking"): "bicycle_parking", } # Build reverse lookup: tag_key -> set of tag_values we care about TAG_KEYS_TO_CHECK: dict[str, set[str]] = {} for (key, value), _ in OSM_TAG_MAPPING.items(): TAG_KEYS_TO_CHECK.setdefault(key, set()).add(value)