Add POIs and journey times to map
This commit is contained in:
parent
7bfb1729bf
commit
500b9ef2aa
11 changed files with 914 additions and 177 deletions
|
|
@ -6,31 +6,47 @@ import polars as pl
|
|||
|
||||
from pipeline.config import AGGREGATES_DIR, H3_RESOLUTIONS, PROCESSED_DIR
|
||||
|
||||
JOURNEY_COLS = [
|
||||
"public_transport_easy_minutes",
|
||||
"public_transport_quick_minutes",
|
||||
"cycling_minutes",
|
||||
]
|
||||
|
||||
AGGREGATE_COLS = [
|
||||
"median_pt_easy_minutes",
|
||||
"median_pt_quick_minutes",
|
||||
"median_cycling_minutes",
|
||||
"median_journey_minutes",
|
||||
]
|
||||
|
||||
|
||||
def aggregate_journey_times(
|
||||
journey_times_path: Path | None = None,
|
||||
postcodes_h3_path: Path | None = None,
|
||||
output_dir: Path | None = None,
|
||||
aggregates_dir: Path | None = None,
|
||||
) -> list[Path]:
|
||||
"""
|
||||
Aggregate journey times by H3 cells at all resolutions.
|
||||
Add journey times to existing H3 aggregate parquet files.
|
||||
|
||||
Joins journey_times_bank.parquet with postcodes_h3.parquet on postcode,
|
||||
then groups by H3 cell to compute median journey time.
|
||||
Joins journey_times_bank_checkpoint.parquet with postcodes_h3.parquet on postcode,
|
||||
aggregates by H3 cell, then merges into existing res{N}.parquet files.
|
||||
"""
|
||||
journey_times_path = journey_times_path or PROCESSED_DIR / "journey_times_bank.parquet"
|
||||
journey_times_path = (
|
||||
journey_times_path
|
||||
or PROCESSED_DIR / "journey_times_bank_checkpoint.parquet"
|
||||
)
|
||||
postcodes_h3_path = postcodes_h3_path or PROCESSED_DIR / "postcodes_h3.parquet"
|
||||
output_dir = output_dir or AGGREGATES_DIR
|
||||
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
aggregates_dir = aggregates_dir or AGGREGATES_DIR
|
||||
|
||||
# Load journey times data
|
||||
journey_df = pl.read_parquet(journey_times_path).select(
|
||||
["postcode", "public_transport_minutes"]
|
||||
["postcode"] + JOURNEY_COLS
|
||||
)
|
||||
|
||||
# Filter out null journey times
|
||||
journey_df = journey_df.filter(pl.col("public_transport_minutes").is_not_null())
|
||||
# Filter out rows where all journey time columns are null
|
||||
journey_df = journey_df.filter(
|
||||
pl.any_horizontal(pl.col(c).is_not_null() for c in JOURNEY_COLS)
|
||||
)
|
||||
|
||||
if journey_df.height == 0:
|
||||
print("No valid journey times found")
|
||||
|
|
@ -48,31 +64,63 @@ def aggregate_journey_times(
|
|||
|
||||
print(f"Joined {joined_df.height} postcodes with journey times")
|
||||
|
||||
saved_paths = []
|
||||
updated_paths = []
|
||||
|
||||
for resolution in H3_RESOLUTIONS:
|
||||
h3_col = f"h3_res{resolution}"
|
||||
parquet_path = aggregates_dir / f"res{resolution}.parquet"
|
||||
|
||||
if not parquet_path.exists():
|
||||
print(f"Skipping resolution {resolution} - {parquet_path} not found")
|
||||
continue
|
||||
|
||||
if h3_col not in joined_df.columns:
|
||||
print(f"Skipping resolution {resolution} - column {h3_col} not found")
|
||||
continue
|
||||
|
||||
# Aggregate by H3 cell - compute median journey time
|
||||
agg_df = (
|
||||
# Aggregate journey times by H3 cell
|
||||
journey_agg = (
|
||||
joined_df.group_by(h3_col)
|
||||
.agg(
|
||||
pl.col("public_transport_minutes").median().alias("median_journey_minutes"),
|
||||
pl.col("public_transport_minutes").count().alias("journey_count"),
|
||||
pl.col("public_transport_easy_minutes")
|
||||
.median()
|
||||
.alias("median_pt_easy_minutes"),
|
||||
pl.col("public_transport_quick_minutes")
|
||||
.median()
|
||||
.alias("median_pt_quick_minutes"),
|
||||
pl.col("cycling_minutes")
|
||||
.median()
|
||||
.alias("median_cycling_minutes"),
|
||||
pl.col("public_transport_quick_minutes")
|
||||
.median()
|
||||
.alias("median_journey_minutes"),
|
||||
)
|
||||
.rename({h3_col: "h3"})
|
||||
)
|
||||
|
||||
output_path = output_dir / f"journey_times_res{resolution}.parquet"
|
||||
agg_df.write_parquet(output_path)
|
||||
saved_paths.append(output_path)
|
||||
print(f"Saved {agg_df.height} cells to {output_path}")
|
||||
# Load existing parquet
|
||||
existing_df = pl.read_parquet(parquet_path)
|
||||
|
||||
return saved_paths
|
||||
# Drop existing journey time columns if present
|
||||
existing_df = existing_df.drop(
|
||||
[c for c in AGGREGATE_COLS if c in existing_df.columns]
|
||||
)
|
||||
|
||||
# Left join journey times onto existing data
|
||||
updated_df = existing_df.join(journey_agg, on="h3", how="left")
|
||||
|
||||
# Save back to parquet
|
||||
updated_df.write_parquet(parquet_path)
|
||||
updated_paths.append(parquet_path)
|
||||
matched = updated_df.filter(
|
||||
pl.col("median_journey_minutes").is_not_null()
|
||||
).height
|
||||
print(
|
||||
f"Updated {parquet_path.name}: {matched} rows with journey times "
|
||||
f"(out of {updated_df.height} total)"
|
||||
)
|
||||
|
||||
return updated_paths
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue