Add POIs and journey times to map

2026-01-28 22:10:41 +00:00 · 2026-01-28 22:10:41 +00:00 · 500b9ef2aa
commit 500b9ef2aa
parent 7bfb1729bf
11 changed files with 914 additions and 177 deletions
--- a/pipeline/pois/init.py
+++ b/pipeline/pois/init.py
--- a/pipeline/pois/main.py
+++ b/pipeline/pois/main.py
@ -0,0 +1,181 @@
+"""Single-pass POI extraction from OSM PBF file using pyosmium."""
+
+import json
+import urllib.request
+
+import osmium
+import polars as pl
+from tqdm import tqdm
+
+from .config import (
+    GB_PBF_FILE,
+    GEOFABRIK_GB_URL,
+    OSM_TAG_MAPPING,
+    OUTPUT_FILE,
+    TAG_KEYS_TO_CHECK,
+    UK_BBOX_EAST,
+    UK_BBOX_NORTH,
+    UK_BBOX_SOUTH,
+    UK_BBOX_WEST,
+)
+
+# Approximate element count for the GB PBF extract. In this module it is only
+# passed to tqdm as the progress bar total, so it does not affect extraction.
+ESTIMATED_ELEMENTS = 500_000_000
+
+
+def download_pbf() -> None:
+    """Download the Great Britain PBF extract from Geofabrik.
+
+    The response is streamed in 1 MiB chunks into a temporary sibling file,
+    which is moved to ``GB_PBF_FILE`` only after the transfer has finished,
+    so a failed run never leaves a truncated extract at the final path.
+
+    Raises:
+        OSError: If the request fails or times out (``urllib.error.URLError``
+            is an ``OSError``), or the file cannot be written or moved.
+    """
+    GB_PBF_FILE.parent.mkdir(parents=True, exist_ok=True)
+    tmp = GB_PBF_FILE.with_suffix(".pbf.tmp")
+    print(f"Downloading {GEOFABRIK_GB_URL}")
+
+    try:
+        # The timeout bounds each blocking socket operation; without it a
+        # stalled connection would hang the pipeline indefinitely.
+        with (
+            tqdm(unit="B", unit_scale=True, desc="Downloading") as bar,
+            urllib.request.urlopen(GEOFABRIK_GB_URL, timeout=60) as resp,
+            open(tmp, "wb") as f,
+        ):
+            length = resp.headers.get("Content-Length")
+            if length:
+                bar.total = int(length)
+            while chunk := resp.read(1 << 20):
+                f.write(chunk)
+                bar.update(len(chunk))
+    except BaseException:
+        # Also covers Ctrl-C: drop the partial download, then re-raise unchanged.
+        tmp.unlink(missing_ok=True)
+        raise
+
+    # replace() overwrites an existing target on every platform, whereas
+    # rename() raises FileExistsError on Windows.
+    tmp.replace(GB_PBF_FILE)
+    print(f"Saved to {GB_PBF_FILE}")
+
+
+class POIHandler(osmium.SimpleHandler):
+    """Collect POIs from an OSM stream, restricted to the UK bounding box.
+
+    Nodes and ways whose tags match ``OSM_TAG_MAPPING`` are appended to
+    ``self.pois`` as plain dicts with the keys ``id``, ``name``, ``category``,
+    ``lat``, ``lng`` and ``osm_tags``. Ways are reduced to the mean of their
+    node coordinates, so way nodes need locations attached (``main()`` applies
+    the file with ``locations=True``).
+    """
+
+    def __init__(self, progress: tqdm) -> None:
+        """Store the progress bar that is advanced once per visited element."""
+        super().__init__()
+        self.pois: list[dict] = []
+        self._poi_count = 0
+        self._progress = progress
+
+    def _in_uk(self, lat: float, lon: float) -> bool:
+        """Return True when the coordinate lies inside the UK bbox (inclusive)."""
+        return (
+            UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
+            and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
+        )
+
+    def _match_tags(self, tags: osmium.osm.TagList) -> str | None:
+        """Return the category of the first matching tag, or None.
+
+        Keys are tried in the iteration order of ``TAG_KEYS_TO_CHECK``, so an
+        element carrying several matching tags gets the category of the key
+        that comes first there.
+        """
+        for key, wanted_values in TAG_KEYS_TO_CHECK.items():
+            # A single lookup: get() yields None for an absent key, which is
+            # never a member of the wanted values.
+            value = tags.get(key)
+            if value in wanted_values:
+                return OSM_TAG_MAPPING[(key, value)]
+        return None
+
+    def _get_name(self, tags: osmium.osm.TagList) -> str:
+        """Return ``name:en`` if present, else ``name``, else an empty string."""
+        return tags.get("name:en", tags.get("name", ""))
+
+    def _tags_to_json(self, tags: osmium.osm.TagList) -> str:
+        """Serialize all tags of an element as a JSON object string."""
+        return json.dumps({tag.k: tag.v for tag in tags})
+
+    def _add_poi(
+        self, osm_id: str, tags: osmium.osm.TagList, category: str, lat: float, lng: float
+    ) -> None:
+        """Append one POI record and update the counter shown on the progress bar."""
+        self.pois.append(
+            {
+                "id": osm_id,
+                "name": self._get_name(tags),
+                "category": category,
+                "lat": lat,
+                "lng": lng,
+                "osm_tags": self._tags_to_json(tags),
+            }
+        )
+        self._poi_count += 1
+        # refresh=False: the postfix is drawn on the bar's next regular update.
+        self._progress.set_postfix(pois=f"{self._poi_count:,}", refresh=False)
+
+    def _tick(self) -> None:
+        """Advance the progress bar by one element."""
+        self._progress.update(1)
+
+    def node(self, n: osmium.osm.Node) -> None:
+        """Record a node as a POI when it is inside the bbox and its tags match."""
+        self._tick()
+        # valid() is a method; the bare attribute is always truthy, which made
+        # this guard dead code and let .lat raise on an invalid location.
+        if not n.location.valid():
+            return
+        lat, lon = n.location.lat, n.location.lon
+        if not self._in_uk(lat, lon):
+            return
+        category = self._match_tags(n.tags)
+        if category:
+            self._add_poi(f"n{n.id}", n.tags, category, lat, lon)
+
+    def way(self, w: osmium.osm.Way) -> None:
+        """Record a matching way at the mean position of its located nodes.
+
+        Nodes without a valid location are ignored; a way with no located
+        node, or whose mean position falls outside the bbox, is skipped.
+        """
+        self._tick()
+        category = self._match_tags(w.tags)
+        if not category:
+            return
+
+        refs = w.nodes
+        node_count = len(refs)
+        # A closed way repeats its first node as its last one; counting it
+        # twice would pull the averaged position towards that vertex.
+        if node_count > 1 and w.is_closed():
+            node_count -= 1
+
+        lats = []
+        lons = []
+        for index, node in enumerate(refs):
+            if index >= node_count:
+                break
+            location = node.location
+            if not location.valid():
+                continue
+            lats.append(location.lat)
+            lons.append(location.lon)
+
+        if not lats:
+            return
+
+        centroid_lat = sum(lats) / len(lats)
+        centroid_lng = sum(lons) / len(lons)
+
+        if not self._in_uk(centroid_lat, centroid_lng):
+            return
+
+        self._add_poi(f"w{w.id}", w.tags, category, centroid_lat, centroid_lng)
+
+
+def main() -> None:
+    """Extract UK POIs from the GB PBF file and write them to ``OUTPUT_FILE``.
+
+    Downloads the extract first when it is not on disk, streams it through
+    ``POIHandler``, saves the result as parquet and prints per-category
+    counts. Nothing is written when no POI matched.
+    """
+    if not GB_PBF_FILE.exists():
+        download_pbf()
+
+    print(f"=== POI Extraction from {GB_PBF_FILE} ===")
+    print(
+        f"UK bbox: ({UK_BBOX_WEST}, {UK_BBOX_SOUTH}, {UK_BBOX_EAST}, {UK_BBOX_NORTH})"
+    )
+    # Several tag pairs map to the same category, so count distinct values
+    # rather than mapping entries.
+    print(f"Categories: {len(set(OSM_TAG_MAPPING.values()))}")
+    print()
+
+    # The total is only an estimate of the number of elements in the file.
+    with tqdm(
+        total=ESTIMATED_ELEMENTS,
+        unit=" elements",
+        unit_scale=True,
+        desc="Streaming",
+        smoothing=0.05,
+        mininterval=1.0,
+    ) as progress:
+        handler = POIHandler(progress)
+        # locations=True attaches node coordinates to ways, which
+        # POIHandler.way() needs to compute a position.
+        handler.apply_file(str(GB_PBF_FILE), locations=True)
+
+    print(f"Extracted {len(handler.pois):,} POIs")
+
+    if not handler.pois:
+        print("No POIs found.")
+        return
+
+    df = pl.DataFrame(handler.pois)
+
+    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
+    df.write_parquet(OUTPUT_FILE)
+    print(f"Saved to {OUTPUT_FILE}")
+
+    print("\n=== Summary ===")
+    print(f"Total POIs: {len(df):,}")
+    print("\nPOIs by category:")
+    # Largest categories first.
+    category_counts = (
+        df.group_by("category")
+        .agg(pl.len().alias("count"))
+        .sort("count", descending=True)
+    )
+    for row in category_counts.iter_rows(named=True):
+        print(f"  {row['category']}: {row['count']:,}")
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/pois/config.py
+++ b/pipeline/pois/config.py
@ -0,0 +1,147 @@
+"""Configuration for POI extraction from OpenStreetMap."""
+
+from pathlib import Path
+
+# File paths. DATA_DIR is the data_sources/ directory that sits next to the
+# top-level pipeline/ package (three .parent hops up from this file).
+DATA_DIR = Path(__file__).parent.parent.parent / "data_sources"
+GB_PBF_FILE = DATA_DIR / "great-britain-latest.osm.pbf"
+OUTPUT_FILE = DATA_DIR / "uk_pois.parquet"
+
+# Geofabrik download URL for Great Britain (~1.1GB PBF, much faster than planet XML)
+GEOFABRIK_GB_URL = (
+    "https://download.geofabrik.de/europe/great-britain-latest.osm.pbf"
+)
+
+# UK bounding box (west, south, east, north), compared against lon/lat values.
+# The extractor applies it to node positions as well as to way centroids.
+UK_BBOX_WEST = -7.57
+UK_BBOX_SOUTH = 49.96
+UK_BBOX_EAST = 1.68
+UK_BBOX_NORTH = 58.64
+
+# OSM tag mapping: (tag_key, tag_value) -> category name.
+# Entry order matters: the first appearance of each tag_key sets its match priority.
+OSM_TAG_MAPPING: dict[tuple[str, str], str] = {
+    # Education
+    ("amenity", "school"): "school",
+    ("amenity", "kindergarten"): "preschool",
+    ("amenity", "college"): "college_university",
+    ("amenity", "university"): "college_university",
+    ("amenity", "library"): "library",
+    ("amenity", "language_school"): "school",
+    ("amenity", "music_school"): "school",
+    ("amenity", "driving_school"): "school",
+    # Healthcare
+    ("amenity", "hospital"): "hospital",
+    ("amenity", "clinic"): "public_health_clinic",
+    ("amenity", "doctors"): "doctor",
+    ("amenity", "dentist"): "dentist",
+    ("amenity", "pharmacy"): "pharmacy",
+    ("amenity", "veterinary"): "veterinary",
+    ("amenity", "nursing_home"): "nursing_home",
+    ("amenity", "social_facility"): "social_facility",
+    # Transport
+    ("railway", "station"): "train_station",
+    ("railway", "halt"): "train_station",
+    ("railway", "tram_stop"): "tram_stop",
+    ("amenity", "bus_station"): "bus_station",
+    ("amenity", "ferry_terminal"): "ferry_terminal",
+    ("public_transport", "station"): "train_station",
+    ("public_transport", "stop_position"): "bus_stop",
+    ("station", "subway"): "metro_station",
+    ("station", "light_rail"): "light_rail_station",
+    ("aeroway", "aerodrome"): "airport",
+    ("highway", "bus_stop"): "bus_stop",
+    # Parks & Leisure
+    ("leisure", "park"): "park",
+    ("leisure", "nature_reserve"): "nature_reserve",
+    ("leisure", "dog_park"): "dog_park",
+    ("leisure", "playground"): "playground",
+    ("leisure", "sports_centre"): "sports_centre",
+    ("leisure", "swimming_pool"): "swimming_pool",
+    ("leisure", "fitness_centre"): "gym",
+    ("leisure", "golf_course"): "golf_course",
+    ("leisure", "garden"): "garden",
+    ("leisure", "marina"): "marina",
+    ("boundary", "national_park"): "national_park",
+    # Emergency
+    ("amenity", "police"): "police_department",
+    ("amenity", "fire_station"): "fire_department",
+    # Shopping
+    ("shop", "supermarket"): "supermarket",
+    ("shop", "convenience"): "convenience_store",
+    ("shop", "grocery"): "grocery_store",
+    ("shop", "bakery"): "bakery",
+    ("shop", "butcher"): "butcher",
+    ("shop", "greengrocer"): "greengrocer",
+    ("shop", "deli"): "deli",
+    ("shop", "department_store"): "department_store",
+    ("shop", "clothes"): "clothing_store",
+    ("shop", "shoes"): "shoe_store",
+    ("shop", "electronics"): "electronics_store",
+    ("shop", "hardware"): "hardware_store",
+    ("shop", "furniture"): "furniture_store",
+    ("shop", "car"): "car_dealer",
+    ("shop", "car_repair"): "car_repair",
+    ("shop", "hairdresser"): "hairdresser",
+    ("shop", "beauty"): "beauty_salon",
+    ("shop", "optician"): "optician",
+    ("shop", "newsagent"): "newsagent",
+    ("shop", "books"): "bookshop",
+    ("shop", "charity"): "charity_shop",
+    ("shop", "alcohol"): "off_licence",
+    ("shop", "laundry"): "laundry",
+    ("shop", "dry_cleaning"): "dry_cleaning",
+    ("shop", "mall"): "shopping_centre",
+    # Food & Drink
+    ("amenity", "restaurant"): "restaurant",
+    ("amenity", "cafe"): "cafe",
+    ("amenity", "pub"): "pub",
+    ("amenity", "bar"): "bar",
+    ("amenity", "fast_food"): "fast_food",
+    ("amenity", "food_court"): "food_court",
+    ("amenity", "ice_cream"): "ice_cream",
+    ("amenity", "biergarten"): "beer_garden",
+    # Finance
+    ("amenity", "bank"): "bank",
+    ("amenity", "atm"): "atm",
+    ("amenity", "bureau_de_change"): "bureau_de_change",
+    # Entertainment & Culture
+    ("amenity", "cinema"): "cinema",
+    ("amenity", "theatre"): "theatre",
+    ("amenity", "nightclub"): "nightclub",
+    ("amenity", "community_centre"): "community_centre",
+    ("amenity", "arts_centre"): "arts_centre",
+    ("tourism", "museum"): "museum",
+    ("tourism", "gallery"): "gallery",
+    ("tourism", "attraction"): "attraction",
+    ("tourism", "zoo"): "zoo",
+    ("tourism", "theme_park"): "theme_park",
+    ("tourism", "viewpoint"): "viewpoint",
+    # Accommodation
+    ("tourism", "hotel"): "hotel",
+    ("tourism", "hostel"): "hostel",
+    ("tourism", "guest_house"): "guest_house",
+    ("tourism", "camp_site"): "campsite",
+    ("tourism", "caravan_site"): "caravan_site",
+    # Religion
+    ("amenity", "place_of_worship"): "place_of_worship",
+    # Government & Public
+    ("amenity", "townhall"): "town_hall",
+    ("amenity", "courthouse"): "courthouse",
+    ("amenity", "post_office"): "post_office",
+    ("amenity", "prison"): "prison",
+    ("amenity", "recycling"): "recycling",
+    ("amenity", "waste_disposal"): "waste_disposal",
+    ("amenity", "toilets"): "public_toilets",
+    # Fuel
+    ("amenity", "fuel"): "petrol_station",
+    ("amenity", "charging_station"): "ev_charging",
+    # Parking
+    ("amenity", "parking"): "parking",
+    ("amenity", "bicycle_parking"): "bicycle_parking",
+}
+
+# Reverse lookup: tag_key -> set of tag_values we care about.
+# Keys keep their first-seen order from OSM_TAG_MAPPING (dicts preserve
+# insertion order). Only the mapping's keys are needed, so iterate it directly.
+TAG_KEYS_TO_CHECK: dict[str, set[str]] = {}
+for key, value in OSM_TAG_MAPPING:
+    TAG_KEYS_TO_CHECK.setdefault(key, set()).add(value)
--- a/pipeline/processors/journey_times_aggregator.py
+++ b/pipeline/processors/journey_times_aggregator.py
@ -6,31 +6,47 @@ import polars as pl

 from pipeline.config import AGGREGATES_DIR, H3_RESOLUTIONS, PROCESSED_DIR

+JOURNEY_COLS = [
+    "public_transport_easy_minutes",
+    "public_transport_quick_minutes",
+    "cycling_minutes",
+]
+
+AGGREGATE_COLS = [
+    "median_pt_easy_minutes",
+    "median_pt_quick_minutes",
+    "median_cycling_minutes",
+    "median_journey_minutes",
+]
+

 def aggregate_journey_times(
    journey_times_path: Path | None = None,
    postcodes_h3_path: Path | None = None,
-    output_dir: Path | None = None,
+    aggregates_dir: Path | None = None,
 ) -> list[Path]:
    """
-    Aggregate journey times by H3 cells at all resolutions.
+    Add journey times to existing H3 aggregate parquet files.

-    Joins journey_times_bank.parquet with postcodes_h3.parquet on postcode,
-    then groups by H3 cell to compute median journey time.
+    Joins journey_times_bank_checkpoint.parquet with postcodes_h3.parquet on postcode,
+    aggregates by H3 cell, then merges into existing res{N}.parquet files.
    """
-    journey_times_path = journey_times_path or PROCESSED_DIR / "journey_times_bank.parquet"
+    journey_times_path = (
+        journey_times_path
+        or PROCESSED_DIR / "journey_times_bank_checkpoint.parquet"
+    )
    postcodes_h3_path = postcodes_h3_path or PROCESSED_DIR / "postcodes_h3.parquet"
-    output_dir = output_dir or AGGREGATES_DIR
-
-    output_dir.mkdir(parents=True, exist_ok=True)
+    aggregates_dir = aggregates_dir or AGGREGATES_DIR

    # Load journey times data
    journey_df = pl.read_parquet(journey_times_path).select(
-        ["postcode", "public_transport_minutes"]
+        ["postcode"] + JOURNEY_COLS
    )

-    # Filter out null journey times
-    journey_df = journey_df.filter(pl.col("public_transport_minutes").is_not_null())
+    # Filter out rows where all journey time columns are null
+    journey_df = journey_df.filter(
+        pl.any_horizontal(pl.col(c).is_not_null() for c in JOURNEY_COLS)
+    )

    if journey_df.height == 0:
        print("No valid journey times found")
@ -48,31 +64,63 @@ def aggregate_journey_times(

    print(f"Joined {joined_df.height} postcodes with journey times")

-    saved_paths = []
+    updated_paths = []

    for resolution in H3_RESOLUTIONS:
        h3_col = f"h3_res{resolution}"
+        parquet_path = aggregates_dir / f"res{resolution}.parquet"
+
+        if not parquet_path.exists():
+            print(f"Skipping resolution {resolution} - {parquet_path} not found")
+            continue

        if h3_col not in joined_df.columns:
            print(f"Skipping resolution {resolution} - column {h3_col} not found")
            continue

-        # Aggregate by H3 cell - compute median journey time
-        agg_df = (
+        # Aggregate journey times by H3 cell
+        journey_agg = (
            joined_df.group_by(h3_col)
            .agg(
-                pl.col("public_transport_minutes").median().alias("median_journey_minutes"),
-                pl.col("public_transport_minutes").count().alias("journey_count"),
+                pl.col("public_transport_easy_minutes")
+                .median()
+                .alias("median_pt_easy_minutes"),
+                pl.col("public_transport_quick_minutes")
+                .median()
+                .alias("median_pt_quick_minutes"),
+                pl.col("cycling_minutes")
+                .median()
+                .alias("median_cycling_minutes"),
+                pl.col("public_transport_quick_minutes")
+                .median()
+                .alias("median_journey_minutes"),
            )
            .rename({h3_col: "h3"})
        )

-        output_path = output_dir / f"journey_times_res{resolution}.parquet"
-        agg_df.write_parquet(output_path)
-        saved_paths.append(output_path)
-        print(f"Saved {agg_df.height} cells to {output_path}")
+        # Load existing parquet
+        existing_df = pl.read_parquet(parquet_path)

-    return saved_paths
+        # Drop existing journey time columns if present
+        existing_df = existing_df.drop(
+            [c for c in AGGREGATE_COLS if c in existing_df.columns]
+        )
+
+        # Left join journey times onto existing data
+        updated_df = existing_df.join(journey_agg, on="h3", how="left")
+
+        # Save back to parquet
+        updated_df.write_parquet(parquet_path)
+        updated_paths.append(parquet_path)
+        matched = updated_df.filter(
+            pl.col("median_journey_minutes").is_not_null()
+        ).height
+        print(
+            f"Updated {parquet_path.name}: {matched} rows with journey times "
+            f"(out of {updated_df.height} total)"
+        )
+
+    return updated_paths


 if __name__ == "__main__":
--- a/pipeline/run.py
+++ b/pipeline/run.py
@ -5,6 +5,7 @@ import polars as pl
 from pipeline.sources.postcodes import save_postcodes
 from pipeline.sources.property_prices import PropertyPricesSource
 from pipeline.processors.h3_aggregator import save_aggregates
+from pipeline.processors.journey_times_aggregator import aggregate_journey_times


 def run_pipeline():
@ -14,22 +15,31 @@ def run_pipeline():
    print("=" * 60)

    # Step 1: Process postcodes with H3 indices
-    print("\n[1/3] Processing postcodes with H3 indices...")
+    print("\n[1/4] Processing postcodes with H3 indices...")
    postcodes_path = save_postcodes()
    print(f"      Saved: {postcodes_path}")

-    print("\n[2/3] Processing property prices...")
+    print("\n[2/4] Processing property prices...")
    postcodes = pl.scan_parquet(postcodes_path)
    property_source = PropertyPricesSource()
    properties = property_source.process(postcodes)
    print("      Joined property prices with postcodes")

-    print("\n[3/3] Aggregating at H3 resolutions...")
+    print("\n[3/4] Aggregating at H3 resolutions...")
    saved_paths = save_aggregates(properties)
    for path in saved_paths:
        size_mb = path.stat().st_size / (1024 * 1024)
        print(f"      Saved: {path.name} ({size_mb:.1f} MB)")

+    print("\n[4/4] Adding journey times to aggregates...")
+    updated_paths = aggregate_journey_times()
+    if updated_paths:
+        for path in updated_paths:
+            size_mb = path.stat().st_size / (1024 * 1024)
+            print(f"      Updated: {path.name} ({size_mb:.1f} MB)")
+    else:
+        print("      Skipped (no journey time data found)")
+

 if __name__ == "__main__":
    run_pipeline()