perfect-postcode/pipeline/wide.py

143 lines
5.1 KiB
Python

"""Build a wide property dataframe and H3 aggregates from epc_pp output."""
import polars as pl
import h3
from pipeline.config import AGGREGATES_DIR, H3_RESOLUTIONS, DATA_DIR, PROCESSED_DIR
def _build_wide() -> pl.DataFrame:
"""Build the wide dataframe by joining epc_pp with all auxiliary data."""
print("Loading epc_pp...")
wide = pl.read_parquet(PROCESSED_DIR / "epc_pp.parquet")
print(f" {wide.shape[0]:,} rows")
# GPS coordinates + LSOA from ArcGIS
print("Joining GPS coordinates...")
arcgis = pl.read_parquet(DATA_DIR / "arcgis_data.parquet").select(
pl.col("pcds").alias("postcode"),
"lat",
pl.col("long").alias("lon"),
"lsoa21",
)
wide = wide.join(arcgis, on="postcode", how="inner")
print(f" {wide.shape[0]:,} rows after GPS join")
# Journey times (optional)
journey_path = PROCESSED_DIR / "journey_times_bank_checkpoint.parquet"
if journey_path.exists():
print("Joining journey times...")
journey_times = pl.read_parquet(journey_path).select(
"postcode",
"public_transport_easy_minutes",
"public_transport_quick_minutes",
"cycling_minutes",
)
wide = wide.join(journey_times, on="postcode", how="left")
# Index of Deprivation
iod_path = DATA_DIR / "IoD2025_Scores.parquet"
if iod_path.exists():
print("Joining IoD scores...")
iod = pl.read_parquet(iod_path).drop(
"LSOA name (2021)",
"Local Authority District code (2024)",
"Local Authority District name (2024)",
)
# Rename IoD columns to clean snake_case
iod = iod.rename(_IOD_RENAMES)
wide = wide.join(
iod, left_on="lsoa21", right_on="lsoa_code", how="left"
)
return wide
_IOD_RENAMES = {
"LSOA code (2021)": "lsoa_code",
"Index of Multiple Deprivation (IMD) Score": "imd_score",
"Income Score (rate)": "income_score",
"Employment Score (rate)": "employment_score",
"Education, Skills and Training Score": "education_score",
"Health Deprivation and Disability Score": "health_score",
"Crime Score": "crime_score",
"Barriers to Housing and Services Score": "housing_barriers_score",
"Living Environment Score": "living_environment_score",
"Income Deprivation Affecting Children Index (IDACI) Score (rate)": "idaci_score",
"Income Deprivation Affecting Older People (IDAOPI) Score (rate)": "idaopi_score",
"Children and Young People Sub-domain Score": "children_young_people_score",
"Adult Skills Sub-domain Score": "adult_skills_score",
"Geographical Barriers Sub-domain Score": "geographical_barriers_score",
"Wider Barriers Sub-domain Score": "wider_barriers_score",
"Indoors Sub-domain Score": "indoors_score",
"Outdoors Sub-domain Score": "outdoors_score",
}
def _add_h3_indices(df: pl.DataFrame) -> pl.DataFrame:
"""Compute H3 indices from lat/lon for all configured resolutions."""
print("Computing H3 indices...")
# Compute per unique postcode for efficiency, then join back
postcodes = df.select("postcode", "lat", "lon").unique(subset=["postcode"])
for res in H3_RESOLUTIONS:
col_name = f"h3_res{res}"
postcodes = postcodes.with_columns(
pl.struct(["lat", "lon"])
.map_elements(
lambda x, r=res: h3.latlng_to_cell(x["lat"], x["lon"], r),
return_dtype=pl.Utf8,
)
.alias(col_name)
)
print(f" res{res}: {postcodes[col_name].n_unique():,} unique cells")
h3_cols = [f"h3_res{res}" for res in H3_RESOLUTIONS]
return df.join(
postcodes.select("postcode", *h3_cols), on="postcode", how="left"
)
def _aggregate_to_h3(df: pl.DataFrame) -> None:
"""Aggregate min/max of every numeric feature per H3 cell at each resolution."""
AGGREGATES_DIR.mkdir(parents=True, exist_ok=True)
exclude = {"lat", "lon"}
numeric_cols = [
col
for col, dtype in zip(df.columns, df.dtypes)
if dtype.is_numeric() and not col.startswith("h3_res") and col not in exclude
]
agg_exprs = [pl.len().alias("count")]
for col in numeric_cols:
agg_exprs.append(pl.col(col).min().alias(f"min_{col}"))
agg_exprs.append(pl.col(col).max().alias(f"max_{col}"))
print("Aggregating to H3 cells...")
for res in H3_RESOLUTIONS:
h3_col = f"h3_res{res}"
result = df.group_by(h3_col).agg(agg_exprs).rename({h3_col: "h3"})
path = AGGREGATES_DIR / f"res{res}.parquet"
result.write_parquet(path)
size_mb = path.stat().st_size / (1024 * 1024)
print(f" {path.name}: {result.shape[0]:,} cells ({size_mb:.1f} MB)")
def run():
"""Run the full wide pipeline: build wide df, compute H3, aggregate."""
wide = _build_wide()
wide_path = PROCESSED_DIR / "wide.parquet"
wide.write_parquet(wide_path)
size_mb = wide_path.stat().st_size / (1024 * 1024)
print(f"Wrote {wide_path} ({size_mb:.1f} MB)")
wide = _add_h3_indices(wide)
_aggregate_to_h3(wide)
print("Done.")
if __name__ == "__main__":
run()