Add POIs and journey times to map

This commit is contained in:
Andras Schmelczer 2026-01-28 22:10:41 +00:00
parent 7bfb1729bf
commit 500b9ef2aa
11 changed files with 914 additions and 177 deletions

View file

181
pipeline/pois/__main__.py Normal file
View file

@ -0,0 +1,181 @@
"""Single-pass POI extraction from OSM PBF file using pyosmium."""
import json
import urllib.request
import osmium
import polars as pl
from tqdm import tqdm
from .config import (
GB_PBF_FILE,
GEOFABRIK_GB_URL,
OSM_TAG_MAPPING,
OUTPUT_FILE,
TAG_KEYS_TO_CHECK,
UK_BBOX_EAST,
UK_BBOX_NORTH,
UK_BBOX_SOUTH,
UK_BBOX_WEST,
)
# Approximate element count for the GB PBF extract (for progress estimation).
# It is only passed to tqdm as the progress-bar total in main(); an inaccurate
# value skews the displayed percentage/ETA but not the extracted data.
ESTIMATED_ELEMENTS = 500_000_000
def download_pbf() -> None:
    """Download the Great Britain PBF extract from Geofabrik.

    The response is streamed in 1 MiB chunks into a temporary file next to
    ``GB_PBF_FILE`` and only moved into place once the transfer has been
    checked, so an interrupted download never leaves a partial file at the
    final path.

    Raises:
        OSError: if fewer (or more) bytes were received than the server
            announced in ``Content-Length``.
    """
    GB_PBF_FILE.parent.mkdir(parents=True, exist_ok=True)
    tmp = GB_PBF_FILE.with_suffix(".pbf.tmp")
    print(f"Downloading {GEOFABRIK_GB_URL}")
    try:
        with (
            tqdm(unit="B", unit_scale=True, desc="Downloading") as bar,
            urllib.request.urlopen(GEOFABRIK_GB_URL) as resp,
            open(tmp, "wb") as f,
        ):
            length = resp.headers.get("Content-Length")
            expected = int(length) if length else None
            if expected is not None:
                bar.total = expected
            received = 0
            while chunk := resp.read(1 << 20):
                f.write(chunk)
                received += len(chunk)
                bar.update(len(chunk))
        # The read loop ends on an empty chunk, which looks the same for a
        # finished transfer and a dropped connection, so check the size
        # explicitly before trusting the file.
        if expected is not None and received != expected:
            raise OSError(
                f"Incomplete download of {GEOFABRIK_GB_URL}: "
                f"received {received} of {expected} bytes"
            )
    except BaseException:
        # Also covers Ctrl-C: never leave a partial temp file behind.
        tmp.unlink(missing_ok=True)
        raise
    # replace() overwrites an existing target on every platform.
    tmp.replace(GB_PBF_FILE)
    print(f"Saved to {GB_PBF_FILE}")
class POIHandler(osmium.SimpleHandler):
    """Streams OSM data, filters to UK bbox, extracts matching POIs.

    Every node and way passed to the handler advances the progress bar by
    one. Elements whose tags match ``OSM_TAG_MAPPING`` and whose position
    (the node location, or the mean of a way's node locations) falls inside
    the UK bounding box are appended to ``pois`` as plain dicts.
    """

    def __init__(self, progress: tqdm) -> None:
        """Create a handler that reports progress on ``progress``."""
        super().__init__()
        self.pois: list[dict] = []
        self._poi_count = 0
        self._progress = progress

    def _in_uk(self, lat: float, lon: float) -> bool:
        """Return True when (lat, lon) lies inside the UK bbox (edges included)."""
        return (
            UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
            and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
        )

    def _match_tags(self, tags: osmium.osm.TagList) -> str | None:
        """Return the category of the first matching tag, or None.

        Keys are tried in the iteration order of ``TAG_KEYS_TO_CHECK``; the
        first key whose value is one we care about decides the category.
        """
        for key, wanted_values in TAG_KEYS_TO_CHECK.items():
            if key not in tags:
                continue
            value = tags[key]
            if value in wanted_values:
                return OSM_TAG_MAPPING[(key, value)]
        return None

    def _get_name(self, tags: osmium.osm.TagList) -> str:
        """Return ``name:en`` if present, else ``name``, else an empty string."""
        return tags.get("name:en", tags.get("name", ""))

    def _tags_to_json(self, tags: osmium.osm.TagList) -> str:
        """Serialise all tags of an element as a JSON object string."""
        return json.dumps({tag.k: tag.v for tag in tags})

    def _add_poi(
        self,
        osm_id: str,
        tags: osmium.osm.TagList,
        category: str,
        lat: float,
        lng: float,
    ) -> None:
        """Append one POI record and update the POI counter on the progress bar."""
        self.pois.append(
            {
                "id": osm_id,
                "name": self._get_name(tags),
                "category": category,
                "lat": lat,
                "lng": lng,
                "osm_tags": self._tags_to_json(tags),
            }
        )
        self._poi_count += 1
        # refresh=False: the postfix is redrawn on the bar's next regular update.
        self._progress.set_postfix(pois=f"{self._poi_count:,}", refresh=False)

    def _tick(self) -> None:
        """Advance the progress bar by one processed element."""
        self._progress.update(1)

    def node(self, n: osmium.osm.Node) -> None:
        """Handle a node: record it as a POI if it is in the bbox and matches."""
        self._tick()
        # Location.valid is a method; it must be called. The bare attribute is
        # always truthy, which silently disabled this guard.
        if not n.location.valid():
            return
        lat, lon = n.location.lat, n.location.lon
        if not self._in_uk(lat, lon):
            return
        category = self._match_tags(n.tags)
        if category:
            self._add_poi(f"n{n.id}", n.tags, category, lat, lon)

    def way(self, w: osmium.osm.Way) -> None:
        """Handle a way: record it as a POI located at the mean of its nodes."""
        self._tick()
        category = self._match_tags(w.tags)
        if not category:
            return
        lats = []
        lons = []
        for node in w.nodes:
            # Skip nodes without a usable location, same check as in node().
            if not node.location.valid():
                continue
            lats.append(node.location.lat)
            lons.append(node.location.lon)
        if not lats:
            return
        centroid_lat = sum(lats) / len(lats)
        centroid_lng = sum(lons) / len(lons)
        if not self._in_uk(centroid_lat, centroid_lng):
            return
        self._add_poi(f"w{w.id}", w.tags, category, centroid_lat, centroid_lng)
def main() -> None:
    """Run the extraction end to end.

    Downloads the PBF extract if it is not on disk, streams it through a
    ``POIHandler``, writes the collected POIs to ``OUTPUT_FILE`` as parquet
    and prints a per-category summary. Nothing is written when no POI matched.
    """
    if not GB_PBF_FILE.exists():
        download_pbf()

    print(f"=== POI Extraction from {GB_PBF_FILE} ===")
    print(
        f"UK bbox: ({UK_BBOX_WEST}, {UK_BBOX_SOUTH}, {UK_BBOX_EAST}, {UK_BBOX_NORTH})"
    )
    print(f"Categories: {len(OSM_TAG_MAPPING)}")
    print()

    bar_options = {
        "total": ESTIMATED_ELEMENTS,
        "unit": " elements",
        "unit_scale": True,
        "desc": "Streaming",
        "smoothing": 0.05,
        "mininterval": 1.0,
    }
    with tqdm(**bar_options) as progress:
        handler = POIHandler(progress)
        # locations=True makes node coordinates available on way nodes.
        handler.apply_file(str(GB_PBF_FILE), locations=True)

    records = handler.pois
    print(f"Extracted {len(records):,} POIs")
    if not records:
        print("No POIs found.")
        return

    frame = pl.DataFrame(records)
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    frame.write_parquet(OUTPUT_FILE)
    print(f"Saved to {OUTPUT_FILE}")

    print("\n=== Summary ===")
    print(f"Total POIs: {len(frame):,}")
    print("\nPOIs by category:")
    per_category = frame.group_by("category").agg(pl.len().alias("count"))
    ranked = per_category.sort("count", descending=True)
    for row in ranked.iter_rows(named=True):
        print(f" {row['category']}: {row['count']:,}")


if __name__ == "__main__":
    main()

147
pipeline/pois/config.py Normal file
View file

@ -0,0 +1,147 @@
"""Configuration for POI extraction from OpenStreetMap."""
from pathlib import Path
# File paths
# DATA_DIR is the "data_sources" directory three levels above this file.
DATA_DIR = Path(__file__).parent.parent.parent / "data_sources"
# Local copy of the PBF extract; downloaded on first run when it is missing.
GB_PBF_FILE = DATA_DIR / "great-britain-latest.osm.pbf"
# Parquet file the extracted POIs are written to.
OUTPUT_FILE = DATA_DIR / "uk_pois.parquet"
# Geofabrik download URL for Great Britain (~1.1GB PBF, much faster than planet XML)
GEOFABRIK_GB_URL = (
"https://download.geofabrik.de/europe/great-britain-latest.osm.pbf"
)
# UK bounding box (west, south, east, north). The extraction handler applies
# it to node locations as well as to way centroids; bounds are inclusive.
# NOTE(review): these limits look tighter than the full extent of Great
# Britain — e.g. a northern bound of 58.64 appears to exclude Orkney and
# Shetland, and the eastern/southern/western bounds may clip Lowestoft, the
# Isles of Scilly and parts of the Outer Hebrides. Confirm this is intended.
UK_BBOX_WEST = -7.57
UK_BBOX_SOUTH = 49.96
UK_BBOX_EAST = 1.68
UK_BBOX_NORTH = 58.64
# OSM tag mapping to categories
# Maps (tag_key, tag_value) -> category name
#
# Order matters: the reverse lookup built from this dict keeps tag keys in
# the order they first appear here, and the handler returns the category of
# the first key that matches. More specific keys must therefore be listed
# before the generic keys they refine (see the Transport section).
OSM_TAG_MAPPING: dict[tuple[str, str], str] = {
    # Education
    ("amenity", "school"): "school",
    ("amenity", "kindergarten"): "preschool",
    ("amenity", "college"): "college_university",
    ("amenity", "university"): "college_university",
    ("amenity", "library"): "library",
    ("amenity", "language_school"): "school",
    ("amenity", "music_school"): "school",
    ("amenity", "driving_school"): "school",
    # Healthcare
    ("amenity", "hospital"): "hospital",
    ("amenity", "clinic"): "public_health_clinic",
    ("amenity", "doctors"): "doctor",
    ("amenity", "dentist"): "dentist",
    ("amenity", "pharmacy"): "pharmacy",
    ("amenity", "veterinary"): "veterinary",
    ("amenity", "nursing_home"): "nursing_home",
    ("amenity", "social_facility"): "social_facility",
    # Transport
    # `station=*` refines `railway=station` / `public_transport=station`, so
    # it is listed first; otherwise the generic key would win and metro /
    # light-rail stations would all be categorised as "train_station".
    ("station", "subway"): "metro_station",
    ("station", "light_rail"): "light_rail_station",
    ("railway", "station"): "train_station",
    ("railway", "halt"): "train_station",
    ("railway", "tram_stop"): "tram_stop",
    ("amenity", "bus_station"): "bus_station",
    ("amenity", "ferry_terminal"): "ferry_terminal",
    ("public_transport", "station"): "train_station",
    ("public_transport", "stop_position"): "bus_stop",
    ("aeroway", "aerodrome"): "airport",
    ("highway", "bus_stop"): "bus_stop",
    # Parks & Leisure
    ("leisure", "park"): "park",
    ("leisure", "nature_reserve"): "nature_reserve",
    ("leisure", "dog_park"): "dog_park",
    ("leisure", "playground"): "playground",
    ("leisure", "sports_centre"): "sports_centre",
    ("leisure", "swimming_pool"): "swimming_pool",
    ("leisure", "fitness_centre"): "gym",
    ("leisure", "golf_course"): "golf_course",
    ("leisure", "garden"): "garden",
    ("leisure", "marina"): "marina",
    ("boundary", "national_park"): "national_park",
    # Emergency
    ("amenity", "police"): "police_department",
    ("amenity", "fire_station"): "fire_department",
    # Shopping
    ("shop", "supermarket"): "supermarket",
    ("shop", "convenience"): "convenience_store",
    ("shop", "grocery"): "grocery_store",
    ("shop", "bakery"): "bakery",
    ("shop", "butcher"): "butcher",
    ("shop", "greengrocer"): "greengrocer",
    ("shop", "deli"): "deli",
    ("shop", "department_store"): "department_store",
    ("shop", "clothes"): "clothing_store",
    ("shop", "shoes"): "shoe_store",
    ("shop", "electronics"): "electronics_store",
    ("shop", "hardware"): "hardware_store",
    ("shop", "furniture"): "furniture_store",
    ("shop", "car"): "car_dealer",
    ("shop", "car_repair"): "car_repair",
    ("shop", "hairdresser"): "hairdresser",
    ("shop", "beauty"): "beauty_salon",
    ("shop", "optician"): "optician",
    ("shop", "newsagent"): "newsagent",
    ("shop", "books"): "bookshop",
    ("shop", "charity"): "charity_shop",
    ("shop", "alcohol"): "off_licence",
    ("shop", "laundry"): "laundry",
    ("shop", "dry_cleaning"): "dry_cleaning",
    ("shop", "mall"): "shopping_centre",
    # Food & Drink
    ("amenity", "restaurant"): "restaurant",
    ("amenity", "cafe"): "cafe",
    ("amenity", "pub"): "pub",
    ("amenity", "bar"): "bar",
    ("amenity", "fast_food"): "fast_food",
    ("amenity", "food_court"): "food_court",
    ("amenity", "ice_cream"): "ice_cream",
    ("amenity", "biergarten"): "beer_garden",
    # Finance
    ("amenity", "bank"): "bank",
    ("amenity", "atm"): "atm",
    ("amenity", "bureau_de_change"): "bureau_de_change",
    # Entertainment & Culture
    ("amenity", "cinema"): "cinema",
    ("amenity", "theatre"): "theatre",
    ("amenity", "nightclub"): "nightclub",
    ("amenity", "community_centre"): "community_centre",
    ("amenity", "arts_centre"): "arts_centre",
    ("tourism", "museum"): "museum",
    ("tourism", "gallery"): "gallery",
    ("tourism", "attraction"): "attraction",
    ("tourism", "zoo"): "zoo",
    ("tourism", "theme_park"): "theme_park",
    ("tourism", "viewpoint"): "viewpoint",
    # Accommodation
    ("tourism", "hotel"): "hotel",
    ("tourism", "hostel"): "hostel",
    ("tourism", "guest_house"): "guest_house",
    ("tourism", "camp_site"): "campsite",
    ("tourism", "caravan_site"): "caravan_site",
    # Religion
    ("amenity", "place_of_worship"): "place_of_worship",
    # Government & Public
    ("amenity", "townhall"): "town_hall",
    ("amenity", "courthouse"): "courthouse",
    ("amenity", "post_office"): "post_office",
    ("amenity", "prison"): "prison",
    ("amenity", "recycling"): "recycling",
    ("amenity", "waste_disposal"): "waste_disposal",
    ("amenity", "toilets"): "public_toilets",
    # Fuel
    ("amenity", "fuel"): "petrol_station",
    ("amenity", "charging_station"): "ev_charging",
    # Parking
    ("amenity", "parking"): "parking",
    ("amenity", "bicycle_parking"): "bicycle_parking",
}
# Reverse lookup: tag_key -> set of tag_values we care about.
# Keys keep the order in which they first appear in OSM_TAG_MAPPING.
TAG_KEYS_TO_CHECK: dict[str, set[str]] = {}
for key, value in OSM_TAG_MAPPING:
    if key not in TAG_KEYS_TO_CHECK:
        TAG_KEYS_TO_CHECK[key] = set()
    TAG_KEYS_TO_CHECK[key].add(value)