This commit is contained in:
Andras Schmelczer 2026-02-15 22:39:49 +00:00
parent 03445188ea
commit 524580eb25
102 changed files with 36625 additions and 1295 deletions

View file

@ -3,7 +3,8 @@ import argparse
import polars as pl
from pathlib import Path
MIN_PRICE = 10_000
from pipeline.utils.postcode_mapping import build_postcode_mapping
MIN_FLOOR_AREA_M2 = 10
@ -45,20 +46,23 @@ def _build_wide(
rental_prices_path: Path,
) -> pl.DataFrame:
"""Build the wide dataframe by joining epc_pp with all auxiliary data."""
wide = (
pl.scan_parquet(epc_pp_path)
.filter(
pl.col("total_floor_area").is_null()
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
)
.filter(
pl.col("latest_price").is_null() | (pl.col("latest_price") >= MIN_PRICE)
)
wide = pl.scan_parquet(epc_pp_path).filter(
pl.col("total_floor_area").is_null()
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
)
# Remap terminated postcodes to their nearest active successor; postcodes with no entry in the mapping are kept unchanged
postcode_mapping = build_postcode_mapping(arcgis_path)
wide = wide.join(
postcode_mapping.lazy(), left_on="postcode", right_on="old_postcode", how="left"
).with_columns(
pl.coalesce("new_postcode", "postcode").alias("postcode"),
).drop("new_postcode")
arcgis = (
pl.scan_parquet(arcgis_path)
.filter(pl.col("ctry") == "E92000001") # England only
.filter(pl.col("doterm").is_null()) # Active postcodes only
.select(
pl.col("pcds").alias("postcode"),
"lat",
@ -67,7 +71,7 @@ def _build_wide(
"oa21",
)
)
wide = wide.join(arcgis, on="postcode", how="full", coalesce=True)
wide = wide.join(arcgis, on="postcode", how="left")
wide = _join_journey_times(wide, journey_times_bank_path, "Bank")
wide = _join_journey_times(wide, journey_times_fitzrovia_path, "Fitzrovia")
@ -147,11 +151,6 @@ def _build_wide(
.with_columns(
pl.max_horizontal(*noise_cols).alias("noise_lden_db"),
)
.with_columns(
pl.col("noise_lden_db")
.fill_null(pl.col("noise_lden_db").min())
.alias("noise_lden_db"),
)
.select("postcode", "noise_lden_db")
)
wide = wide.join(noise, on="postcode", how="left")
@ -181,7 +180,7 @@ def _build_wide(
.group_by("bb_postcode")
.agg(pl.col("max_download_speed").max())
)
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left").drop("bb_postcode")
geosure = pl.scan_parquet(geosure_path)
wide = wide.join(geosure, on="postcode", how="left")