idgf

2026-06-02 20:14:32 +01:00 · 2026-06-02 20:14:32 +01:00 · aab85fe32e
commit aab85fe32e
parent fbfebc651c
33 changed files with 2016 additions and 283 deletions
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -807,6 +807,22 @@ def _remap_terminated_postcodes(
    )


+def _dedupe_collapsed_properties(wide: pl.LazyFrame) -> pl.LazyFrame:
+    """Keep one row per (postcode, pp_address) — the most-recent transaction.
+
+    The terminated-postcode remap can map two distinct postcodes onto one active
+    successor, collapsing the same physical address onto a single
+    (postcode, pp_address) key with conflicting sale records. Keep the row with
+    the latest date_of_transfer (null dates sort last; equal dates keep their
+    input order via the stable sort instead of an arbitrary pick); genuinely
+    distinct addresses (a different pp_address) are untouched. pp_address is
+    non-null here (join_epc_pp filters it), so the key never merges unrelated rows.
+    """
+    return wide.sort(
+        "date_of_transfer", descending=True, nulls_last=True, maintain_order=True
+    ).unique(subset=["postcode", "pp_address"], keep="first", maintain_order=True)
+
+
 def _filter_to_active_english_postcodes(
    wide: pl.LazyFrame, active_postcodes: pl.LazyFrame
 ) -> pl.LazyFrame:
@ -837,38 +853,19 @@ def _join_area_side_tables(
    )

    # Crime is counted spatially per postcode (incidents within 50m of the
-    # postcode boundary), so it joins on postcode rather than LSOA.
-    base = base.join(crime, on="postcode", how="left")
-    serious_crime_cols = [
-        "Violence and sexual offences (avg/yr)",
-        "Robbery (avg/yr)",
-        "Burglary (avg/yr)",
-        "Possession of weapons (avg/yr)",
-    ]
-    minor_crime_cols = [
-        "Anti-social behaviour (avg/yr)",
-        "Criminal damage and arson (avg/yr)",
-        "Shoplifting (avg/yr)",
-        "Bicycle theft (avg/yr)",
-        "Theft from the person (avg/yr)",
-        "Other theft (avg/yr)",
-        "Vehicle crime (avg/yr)",
-        "Public order (avg/yr)",
-        "Drugs (avg/yr)",
-        "Other crime (avg/yr)",
-    ]
-    # The LEFT join leaves every per-type column null for postcodes absent from
-    # the crime table; sum_horizontal alone would fabricate a "zero crime"
-    # rollup there, so keep the rollup null when ALL components are null.
-    base = base.with_columns(
-        pl.when(pl.all_horizontal([pl.col(c).is_null() for c in serious_crime_cols]))
-        .then(None)
-        .otherwise(pl.sum_horizontal(serious_crime_cols))
-        .alias("serious_crime_avg_yr"),
-        pl.when(pl.all_horizontal([pl.col(c).is_null() for c in minor_crime_cols]))
-        .then(None)
-        .otherwise(pl.sum_horizontal(minor_crime_cols))
-        .alias("minor_crime_avg_yr"),
+    # postcode boundary), so it joins on postcode rather than LSOA. crime_spatial
+    # precomputes the Serious/Minor headline rollups as the mean of the by-year
+    # rollup bars; read those straight through (renamed to the internal columns
+    # _finalize_merged_columns expects) rather than re-summing the per-type
+    # avg/yr columns — each per-type average is divided by its OWN years-present,
+    # so their sum overstates the rollup when types differ in coverage. A postcode
+    # absent from the crime table keeps null rollups via the left join (no
+    # fabricated zero); the per-type avg/yr columns pass through unchanged for display.
+    base = base.join(crime, on="postcode", how="left").rename(
+        {
+            "Serious crime (avg/yr)": "serious_crime_avg_yr",
+            "Minor crime (avg/yr)": "minor_crime_avg_yr",
+        }
    )

    base = base.join(median_age, on="lsoa21", how="left")
@ -881,7 +878,37 @@ def _join_area_side_tables(
    )
    if tree_density is not None:
        base = base.join(tree_density, on="postcode", how="left")
-    return base.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
+
+    # Broadband is the one side table sourced straight from a third-party CSV
+    # (Ofcom `postcode_space`) rather than from a sibling pcds-keyed pipeline
+    # step, so its postcode may drift in spacing/casing from the NSPL `pcds`
+    # base key. Normalize BOTH sides to the same canonical pcds form (reusing
+    # `_canonical_postcode_expr`, exactly as the listing/EPC re-anchor joins do)
+    # before joining, otherwise a real postcode silently misses and its
+    # `max_download_speed` reads as null "no data" downstream. Re-aggregate on
+    # the canonical key so two raw spellings collapsing to one key can't fan out
+    # the base; drop a null canonical key so an unparseable Ofcom row joins
+    # nothing rather than matching a null-key base row.
+    broadband_canonical = (
+        broadband.with_columns(
+            _canonical_postcode_expr("bb_postcode").alias("_bb_canonical_postcode")
+        )
+        .drop_nulls("_bb_canonical_postcode")
+        .group_by("_bb_canonical_postcode")
+        .agg(pl.col("max_download_speed").max())
+    )
+    return (
+        base.with_columns(
+            _canonical_postcode_expr("postcode").alias("_base_canonical_postcode")
+        )
+        .join(
+            broadband_canonical,
+            left_on="_base_canonical_postcode",
+            right_on="_bb_canonical_postcode",
+            how="left",
+        )
+        .drop("_base_canonical_postcode")
+    )


 def _finalize_merged_columns(frame: pl.LazyFrame) -> pl.LazyFrame:
@ -1328,7 +1355,7 @@ def _load_direct_epc_candidates(
    )

    return (
-        epc_base.sort("inspection_date", descending=True)
+        epc_base.sort("inspection_date", descending=True, nulls_last=True)
        .group_by("_direct_epc_match_address", "_direct_epc_match_postcode")
        .first()
        .join(
@ -1918,6 +1945,10 @@ def _build(
    # terminated English postcodes are retained under their successor postcode.
    postcode_mapping = build_postcode_mapping(arcgis_path)
    wide = _remap_terminated_postcodes(wide, postcode_mapping.lazy())
+    # The remap can collapse two terminated postcodes onto one active successor,
+    # duplicating a physical address's (postcode, pp_address) key; keep only the
+    # most-recent transaction per address before the per-postcode joins.
+    wide = _dedupe_collapsed_properties(wide)
    arcgis_raw = pl.scan_parquet(arcgis_path)
    arcgis = _active_english_postcode_area(arcgis_raw)
    active_postcodes = arcgis.select("postcode").unique()