# perfect-postcode/pipeline/processors/journey_times_aggregator.py

"""Aggregate journey times data by H3 hexagonal cells."""

from pathlib import Path

import polars as pl

from pipeline.config import AGGREGATES_DIR, H3_RESOLUTIONS, PROCESSED_DIR

# Per-postcode journey time columns selected from the journey times parquet.
# A row is kept only if at least one of these columns is non-null.
JOURNEY_COLS = [
    "public_transport_easy_minutes",
    "public_transport_quick_minutes",
    "cycling_minutes",
]

# Per-cell columns written into the H3 aggregate files. Any of these already
# present in an aggregate file are dropped before the freshly computed values
# are joined back in, so re-running the step replaces rather than duplicates
# them.
AGGREGATE_COLS = [
    "median_pt_easy_minutes",
    "median_pt_quick_minutes",
    "median_cycling_minutes",
    "median_journey_minutes",
]


def aggregate_journey_times(
    journey_times_path: Path | None = None,
    postcodes_h3_path: Path | None = None,
    aggregates_dir: Path | None = None,
) -> list[Path]:
    """
    Merge per-cell median journey times into the H3 aggregate parquet files.

    Per-postcode journey times are inner-joined to the postcode/H3 lookup on
    ``postcode``. For every resolution in ``H3_RESOLUTIONS`` the joined rows
    are grouped by that resolution's ``h3_res{N}`` column and reduced to
    medians, which are then left-joined onto the existing ``res{N}.parquet``
    on its ``h3`` column. The file is overwritten in place.

    Args:
        journey_times_path: Parquet with ``postcode`` and the journey time
            columns. Defaults to
            ``PROCESSED_DIR / "journey_times_bank_checkpoint.parquet"``.
        postcodes_h3_path: Parquet mapping ``postcode`` to ``h3_res{N}``
            columns. Defaults to ``PROCESSED_DIR / "postcodes_h3.parquet"``.
        aggregates_dir: Directory holding the ``res{N}.parquet`` files.
            Defaults to ``AGGREGATES_DIR``.

    Returns:
        Paths of the aggregate files that were rewritten. Empty when there are
        no usable journey times or no postcode matches; resolutions whose
        aggregate file or H3 column is missing are skipped.
    """
    if not journey_times_path:
        journey_times_path = PROCESSED_DIR / "journey_times_bank_checkpoint.parquet"
    if not postcodes_h3_path:
        postcodes_h3_path = PROCESSED_DIR / "postcodes_h3.parquet"
    if not aggregates_dir:
        aggregates_dir = AGGREGATES_DIR

    # Keep only postcodes that carry at least one journey time value.
    has_any_time = pl.any_horizontal(
        [pl.col(name).is_not_null() for name in JOURNEY_COLS]
    )
    times = (
        pl.read_parquet(journey_times_path)
        .select(["postcode", *JOURNEY_COLS])
        .filter(has_any_time)
    )
    if times.height == 0:
        print("No valid journey times found")
        return []

    # Attach the H3 cell indices of every resolution to each postcode.
    with_cells = times.join(
        pl.read_parquet(postcodes_h3_path), on="postcode", how="inner"
    )
    if with_cells.height == 0:
        print("No matching postcodes found")
        return []

    print(f"Joined {with_cells.height} postcodes with journey times")

    # (source column, output column) pairs, in output column order.
    # "median_journey_minutes" is the median of the quick public transport
    # time, i.e. the same value as "median_pt_quick_minutes".
    median_specs = (
        ("public_transport_easy_minutes", "median_pt_easy_minutes"),
        ("public_transport_quick_minutes", "median_pt_quick_minutes"),
        ("cycling_minutes", "median_cycling_minutes"),
        ("public_transport_quick_minutes", "median_journey_minutes"),
    )
    median_exprs = [pl.col(src).median().alias(out) for src, out in median_specs]

    available_cols = with_cells.columns
    written: list[Path] = []

    for resolution in H3_RESOLUTIONS:
        cell_col = f"h3_res{resolution}"
        target = aggregates_dir / f"res{resolution}.parquet"

        if not target.exists():
            print(f"Skipping resolution {resolution} - {target} not found")
            continue

        if cell_col not in available_cols:
            print(f"Skipping resolution {resolution} - column {cell_col} not found")
            continue

        # One row per H3 cell, keyed as "h3" to match the aggregate files.
        per_cell = with_cells.group_by(cell_col).agg(median_exprs)
        per_cell = per_cell.rename({cell_col: "h3"})

        # Strip journey columns left by a previous run before re-joining.
        current = pl.read_parquet(target)
        stale = [name for name in AGGREGATE_COLS if name in current.columns]
        merged = current.drop(stale).join(per_cell, on="h3", how="left")

        merged.write_parquet(target)
        written.append(target)

        total = merged.height
        matched = total - merged.get_column("median_journey_minutes").null_count()
        print(
            f"Updated {target.name}: {matched} rows with journey times "
            f"(out of {total} total)"
        )

    return written


# When executed as a script, run the aggregation with the default paths.
if __name__ == "__main__":
    aggregate_journey_times()