Add POIs and journey times to map

2026-01-28 22:10:41 +00:00 · 2026-01-28 22:10:41 +00:00 · 500b9ef2aa
commit 500b9ef2aa
parent 7bfb1729bf
11 changed files with 914 additions and 177 deletions
--- a/pipeline/processors/journey_times_aggregator.py
+++ b/pipeline/processors/journey_times_aggregator.py
@@ -6,31 +6,47 @@ import polars as pl

 from pipeline.config import AGGREGATES_DIR, H3_RESOLUTIONS, PROCESSED_DIR

+JOURNEY_COLS = [
+    "public_transport_easy_minutes",
+    "public_transport_quick_minutes",
+    "cycling_minutes",
+]
+
+AGGREGATE_COLS = [
+    "median_pt_easy_minutes",
+    "median_pt_quick_minutes",
+    "median_cycling_minutes",
+    "median_journey_minutes",
+]
+

 def aggregate_journey_times(
    journey_times_path: Path | None = None,
    postcodes_h3_path: Path | None = None,
-    output_dir: Path | None = None,
+    aggregates_dir: Path | None = None,
 ) -> list[Path]:
    """
-    Aggregate journey times by H3 cells at all resolutions.
+    Add journey times to existing H3 aggregate parquet files.

-    Joins journey_times_bank.parquet with postcodes_h3.parquet on postcode,
-    then groups by H3 cell to compute median journey time.
+    Joins journey_times_bank_checkpoint.parquet with postcodes_h3.parquet on postcode,
+    aggregates by H3 cell, then merges into existing res{N}.parquet files.
    """
-    journey_times_path = journey_times_path or PROCESSED_DIR / "journey_times_bank.parquet"
+    journey_times_path = (
+        journey_times_path
+        or PROCESSED_DIR / "journey_times_bank_checkpoint.parquet"
+    )
    postcodes_h3_path = postcodes_h3_path or PROCESSED_DIR / "postcodes_h3.parquet"
-    output_dir = output_dir or AGGREGATES_DIR
-
-    output_dir.mkdir(parents=True, exist_ok=True)
+    aggregates_dir = aggregates_dir or AGGREGATES_DIR

    # Load journey times data
    journey_df = pl.read_parquet(journey_times_path).select(
-        ["postcode", "public_transport_minutes"]
+        ["postcode"] + JOURNEY_COLS
    )

-    # Filter out null journey times
-    journey_df = journey_df.filter(pl.col("public_transport_minutes").is_not_null())
+    # Filter out rows where all journey time columns are null
+    journey_df = journey_df.filter(
+        pl.any_horizontal(pl.col(c).is_not_null() for c in JOURNEY_COLS)
+    )

    if journey_df.height == 0:
        print("No valid journey times found")
@@ -48,31 +64,63 @@ def aggregate_journey_times(

    print(f"Joined {joined_df.height} postcodes with journey times")

-    saved_paths = []
+    updated_paths = []

    for resolution in H3_RESOLUTIONS:
        h3_col = f"h3_res{resolution}"
+        parquet_path = aggregates_dir / f"res{resolution}.parquet"
+
+        if not parquet_path.exists():
+            print(f"Skipping resolution {resolution} - {parquet_path} not found")
+            continue

        if h3_col not in joined_df.columns:
            print(f"Skipping resolution {resolution} - column {h3_col} not found")
            continue

-        # Aggregate by H3 cell - compute median journey time
-        agg_df = (
+        # Aggregate journey times by H3 cell
+        journey_agg = (
            joined_df.group_by(h3_col)
            .agg(
-                pl.col("public_transport_minutes").median().alias("median_journey_minutes"),
-                pl.col("public_transport_minutes").count().alias("journey_count"),
+                pl.col("public_transport_easy_minutes")
+                .median()
+                .alias("median_pt_easy_minutes"),
+                pl.col("public_transport_quick_minutes")
+                .median()
+                .alias("median_pt_quick_minutes"),
+                pl.col("cycling_minutes")
+                .median()
+                .alias("median_cycling_minutes"),
+                pl.col("public_transport_quick_minutes")
+                .median()
+                .alias("median_journey_minutes"),
            )
            .rename({h3_col: "h3"})
        )

-        output_path = output_dir / f"journey_times_res{resolution}.parquet"
-        agg_df.write_parquet(output_path)
-        saved_paths.append(output_path)
-        print(f"Saved {agg_df.height} cells to {output_path}")
+        # Load existing parquet
+        existing_df = pl.read_parquet(parquet_path)

-    return saved_paths
+        # Drop existing journey time columns if present
+        existing_df = existing_df.drop(
+            [c for c in AGGREGATE_COLS if c in existing_df.columns]
+        )
+
+        # Left join journey times onto existing data
+        updated_df = existing_df.join(journey_agg, on="h3", how="left")
+
+        # Save back to parquet
+        updated_df.write_parquet(parquet_path)
+        updated_paths.append(parquet_path)
+        matched = updated_df.filter(
+            pl.col("median_journey_minutes").is_not_null()
+        ).height
+        print(
+            f"Updated {parquet_path.name}: {matched} rows with journey times "
+            f"(out of {updated_df.height} total)"
+        )
+
+    return updated_paths


 if __name__ == "__main__":