This commit is contained in:
Andras Schmelczer 2026-06-02 20:14:32 +01:00
parent fbfebc651c
commit aab85fe32e
33 changed files with 2016 additions and 283 deletions

View file

@ -807,6 +807,22 @@ def _remap_terminated_postcodes(
)
def _dedupe_collapsed_properties(wide: pl.LazyFrame) -> pl.LazyFrame:
    """Keep one row per (postcode, pp_address) — the most-recent transaction.

    The terminated-postcode remap can map two distinct postcodes onto one active
    successor, collapsing the same physical address onto a single
    (postcode, pp_address) key with conflicting sale records. Keep the row with
    the latest date_of_transfer so the headline price/date reflect the most
    recent transaction; genuinely distinct addresses (a different pp_address) are
    untouched. pp_address is non-null here (join_epc_pp filters it), so the key
    never merges unrelated rows.

    Rows with a null date_of_transfer sort last, so one is kept only when no
    dated row exists for the same key. Rows that tie on date_of_transfer
    (including all-null groups) are resolved by their order in ``wide``.

    Args:
        wide: Frame carrying ``postcode``, ``pp_address`` and
            ``date_of_transfer`` columns.

    Returns:
        ``wide`` reduced to a single row per (postcode, pp_address).
    """
    # Sort is unstable by default; without maintain_order=True the survivor
    # among rows sharing the same date_of_transfer would be arbitrary and could
    # change between runs. A stable sort pins ties to the incoming row order.
    newest_first = wide.sort(
        "date_of_transfer",
        descending=True,
        nulls_last=True,
        maintain_order=True,
    )
    # keep="first" on the newest-first frame selects the latest transaction;
    # maintain_order=True makes "first" refer to the sorted order above.
    return newest_first.unique(
        subset=["postcode", "pp_address"],
        keep="first",
        maintain_order=True,
    )
def _filter_to_active_english_postcodes(
wide: pl.LazyFrame, active_postcodes: pl.LazyFrame
) -> pl.LazyFrame:
@ -837,38 +853,19 @@ def _join_area_side_tables(
)
# Crime is counted spatially per postcode (incidents within 50m of the
# postcode boundary), so it joins on postcode rather than LSOA.
base = base.join(crime, on="postcode", how="left")
serious_crime_cols = [
"Violence and sexual offences (avg/yr)",
"Robbery (avg/yr)",
"Burglary (avg/yr)",
"Possession of weapons (avg/yr)",
]
minor_crime_cols = [
"Anti-social behaviour (avg/yr)",
"Criminal damage and arson (avg/yr)",
"Shoplifting (avg/yr)",
"Bicycle theft (avg/yr)",
"Theft from the person (avg/yr)",
"Other theft (avg/yr)",
"Vehicle crime (avg/yr)",
"Public order (avg/yr)",
"Drugs (avg/yr)",
"Other crime (avg/yr)",
]
# The LEFT join leaves every per-type column null for postcodes absent from
# the crime table; sum_horizontal alone would fabricate a "zero crime"
# rollup there, so keep the rollup null when ALL components are null.
base = base.with_columns(
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in serious_crime_cols]))
.then(None)
.otherwise(pl.sum_horizontal(serious_crime_cols))
.alias("serious_crime_avg_yr"),
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in minor_crime_cols]))
.then(None)
.otherwise(pl.sum_horizontal(minor_crime_cols))
.alias("minor_crime_avg_yr"),
# postcode boundary), so it joins on postcode rather than LSOA. crime_spatial
# precomputes the Serious/Minor headline rollups as the mean of the by-year
# rollup bars; read those straight through (renamed to the internal columns
# _finalize_merged_columns expects) rather than re-summing the per-type
# avg/yr columns — summing divides each type by its OWN years-present and
# overstates the rollup when types differ in coverage. A postcode absent from
# the crime table keeps null rollups via the left join (no fabricated zero);
# the per-type avg/yr columns pass through unchanged for display.
base = base.join(crime, on="postcode", how="left").rename(
{
"Serious crime (avg/yr)": "serious_crime_avg_yr",
"Minor crime (avg/yr)": "minor_crime_avg_yr",
}
)
base = base.join(median_age, on="lsoa21", how="left")
@ -881,7 +878,37 @@ def _join_area_side_tables(
)
if tree_density is not None:
base = base.join(tree_density, on="postcode", how="left")
return base.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
# Broadband is the one side table sourced straight from a third-party CSV
# (Ofcom `postcode_space`) rather than from a sibling pcds-keyed pipeline
# step, so its postcode may drift in spacing/casing from the NSPL `pcds`
# base key. Normalize BOTH sides to the same canonical pcds form (reusing
# `_canonical_postcode_expr`, exactly as the listing/EPC re-anchor joins do)
# before joining, otherwise a real postcode silently misses and its
# `max_download_speed` reads as null "no data" downstream. Re-aggregate on
# the canonical key so two raw spellings collapsing to one key can't fan out
# the base; drop a null canonical key so an unparseable Ofcom row joins
# nothing rather than matching a null-key base row.
broadband_canonical = (
broadband.with_columns(
_canonical_postcode_expr("bb_postcode").alias("_bb_canonical_postcode")
)
.drop_nulls("_bb_canonical_postcode")
.group_by("_bb_canonical_postcode")
.agg(pl.col("max_download_speed").max())
)
return (
base.with_columns(
_canonical_postcode_expr("postcode").alias("_base_canonical_postcode")
)
.join(
broadband_canonical,
left_on="_base_canonical_postcode",
right_on="_bb_canonical_postcode",
how="left",
)
.drop("_base_canonical_postcode")
)
def _finalize_merged_columns(frame: pl.LazyFrame) -> pl.LazyFrame:
@ -1328,7 +1355,7 @@ def _load_direct_epc_candidates(
)
return (
epc_base.sort("inspection_date", descending=True)
epc_base.sort("inspection_date", descending=True, nulls_last=True)
.group_by("_direct_epc_match_address", "_direct_epc_match_postcode")
.first()
.join(
@ -1918,6 +1945,10 @@ def _build(
# terminated English postcodes are retained under their successor postcode.
postcode_mapping = build_postcode_mapping(arcgis_path)
wide = _remap_terminated_postcodes(wide, postcode_mapping.lazy())
# The remap can collapse two terminated postcodes onto one active successor,
# duplicating a physical address's (postcode, pp_address) key; keep only the
# most-recent transaction per address before the per-postcode joins.
wide = _dedupe_collapsed_properties(wide)
arcgis_raw = pl.scan_parquet(arcgis_path)
arcgis = _active_english_postcode_area(arcgis_raw)
active_postcodes = arcgis.select("postcode").unique()