idgf

2026-06-02 20:14:32 +01:00 · 2026-06-02 20:14:32 +01:00 · aab85fe32e
commit aab85fe32e
parent fbfebc651c
33 changed files with 2016 additions and 283 deletions
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@ -14,6 +14,7 @@ from pipeline.transform.merge import (
    _build_unmatched_listing_seed_rows,
    _canonical_postcode_expr,
    _coalesce_direct_epc_columns,
+    _dedupe_collapsed_properties,
    _filter_to_active_english_postcodes,
    _join_area_side_tables,
    _finalize_listings,
@ -193,6 +194,159 @@ def test_postcode_feature_validation_rejects_unsupported_or_ungeocoded_rows() ->
        _validate_postcode_feature_output(postcode_df, expected_postcode_count=2)


+def test_postcode_feature_validation_rejects_wrong_count() -> None:
+    # Universe-size invariant: the postcode feature output has to hold EXACTLY
+    # the active-England universe (two postcodes in this test). Each frame below
+    # breaks that invariant in a different way, and every one of them must be
+    # rejected with the same error so that neither a truncated build nor a
+    # one-to-many join can ship.
+    england = "E92000001"
+    bad_frames = [
+        # Too few rows: a postcode was silently dropped.
+        pl.DataFrame(
+            {
+                "Postcode": ["AA1 1AA"],
+                "lat": [51.0],
+                "lon": [-0.1],
+                "ctry25cd": [england],
+            }
+        ),
+        # Too many rows: more postcodes than the expected universe.
+        pl.DataFrame(
+            {
+                "Postcode": ["AA1 1AA", "BB1 1BB", "CC1 1CC"],
+                "lat": [51.0, 52.0, 53.0],
+                "lon": [-0.1, -0.2, -0.3],
+                "ctry25cd": [england] * 3,
+            }
+        ),
+        # Right row count but a duplicated key (n_unique < height) -- the
+        # signature of a join fan-out.
+        pl.DataFrame(
+            {
+                "Postcode": ["AA1 1AA", "AA1 1AA"],
+                "lat": [51.0, 51.0],
+                "lon": [-0.1, -0.1],
+                "ctry25cd": [england, england],
+            }
+        ),
+    ]
+
+    for frame in bad_frames:
+        with pytest.raises(ValueError, match="active England postcode universe"):
+            _validate_postcode_feature_output(frame, expected_postcode_count=2)
+
+
+def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
+    # Soundness: when every side table is unique on its join key, the
+    # per-postcode feature joins must emit exactly one row per input postcode.
+    # A fan-out here would inflate the postcode universe above the
+    # active-England count -- the failure the universe-size validation is the
+    # backstop for.
+    postcodes = ["AA1 1AA", "BB2 2BB"]
+    lsoas = ["E01000001", "E01000002"]
+    districts = ["E09000001", "E09000002"]
+    constituencies = ["E14000001", "E14000002"]
+
+    def _by_postcode(extra: dict) -> pl.LazyFrame:
+        # Side table keyed on the same two postcodes, plus any extra columns.
+        return pl.LazyFrame({"postcode": postcodes, **extra})
+
+    base = pl.LazyFrame(
+        {
+            "postcode": postcodes,
+            "lsoa21": lsoas,
+            "Local Authority District code (2024)": districts,
+            "pcon": constituencies,
+        }
+    )
+    side_tables = {
+        "iod": pl.LazyFrame({"LSOA code (2021)": lsoas}),
+        "ethnicity": pl.LazyFrame({"Geography_code": districts}),
+        "crime": _by_postcode(
+            {
+                "Serious crime (avg/yr)": [1.0, 2.0],
+                "Minor crime (avg/yr)": [3.0, 4.0],
+            }
+        ),
+        "median_age": pl.LazyFrame({"lsoa21": lsoas}),
+        "election": pl.LazyFrame({"pcon": constituencies}),
+        "poi_counts": _by_postcode({}),
+        "noise": _by_postcode({}),
+        "school_proximity": _by_postcode({}),
+        "conservation_areas": _by_postcode(
+            {CONSERVATION_AREA_FEATURE: ["Yes", "No"]}
+        ),
+        "tree_density": None,
+        "broadband": pl.LazyFrame(
+            {
+                "bb_postcode": postcodes,
+                "max_download_speed": pl.Series([100, 300], dtype=pl.UInt16),
+            }
+        ),
+    }
+
+    joined = _join_area_side_tables(base, **side_tables).collect()
+
+    # One row per postcode in -> one row out; the universe is not inflated.
+    assert joined.height == 2
+    assert sorted(joined["postcode"].to_list()) == ["AA1 1AA", "BB2 2BB"]
+
+
+def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
+    # Broadband comes straight from Ofcom's CSV, so its postcode can drift in
+    # spacing/casing from the NSPL `pcds` base key. Both sides must be reduced
+    # to the same canonical form so a real postcode populates
+    # `max_download_speed` instead of silently missing the left join.
+    base = pl.LazyFrame(
+        {
+            "postcode": ["AB1 2CD", "EF3 4GH"],
+            "lsoa21": ["E01000001", "E01000002"],
+            "Local Authority District code (2024)": ["E09000001", "E09000002"],
+            "pcon": ["E14000001", "E14000002"],
+        }
+    )
+
+    def _by_postcode(extra: dict) -> pl.LazyFrame:
+        # Side table keyed on the same two postcodes, plus any extra columns.
+        return pl.LazyFrame({"postcode": ["AB1 2CD", "EF3 4GH"], **extra})
+
+    crime = pl.LazyFrame(
+        {
+            "postcode": ["AB1 2CD", "EF3 4GH"],
+            "Serious crime (avg/yr)": [1.0, 2.0],
+            "Minor crime (avg/yr)": [3.0, 4.0],
+        }
+    )
+    # AB1 2CD arrives lowercase + un-spaced ("ab12cd"). It is the ONLY broadband
+    # row for that postcode, so its speed can reach the output only if both
+    # casing and spacing are canonicalized -- a spaced spelling here would let a
+    # case-only normalizer pass. EF3 4GH arrives under two distinct raw
+    # spellings that canonicalize to one key (the max speed must win, with no
+    # fan-out of the base row).
+    broadband = pl.LazyFrame(
+        {
+            "bb_postcode": ["ab12cd", "ef34gh", "EF3 4GH"],
+            "max_download_speed": pl.Series([300, 30, 1000], dtype=pl.UInt16),
+        }
+    )
+    joined = _join_area_side_tables(
+        base,
+        iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
+        ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
+        crime=crime,
+        median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
+        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
+        poi_counts=_by_postcode({}),
+        noise=_by_postcode({}),
+        school_proximity=_by_postcode({}),
+        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
+        tree_density=None,
+        broadband=broadband,
+    ).collect()
+
+    # No fan-out: still one row per base postcode.
+    assert joined.height == 2
+    speeds = dict(
+        zip(joined["postcode"].to_list(), joined["max_download_speed"].to_list())
+    )
+    # Spacing/casing drift still joins (the raw key was "ab12cd").
+    assert speeds["AB1 2CD"] == 300
+    # Two raw spellings collapse to one canonical key; the max wins.
+    assert speeds["EF3 4GH"] == 1000
+    # The temporary canonical join key is not leaked into the output schema.
+    assert "_base_canonical_postcode" not in joined.columns
+    assert "_bb_canonical_postcode" not in joined.columns
+    assert "bb_postcode" not in joined.columns
+
+
 def test_listed_building_feature_is_property_level() -> None:
    assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS

@ -758,8 +912,10 @@ def test_coalesce_direct_epc_was_council_house_prefers_yes() -> None:

 def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
    # The crime table is LEFT-joined per postcode; a postcode absent from it
-    # must NOT be fabricated as "zero crime" (the safest value). When every
-    # per-type column is null the Serious/Minor rollups must stay null.
+    # must NOT be fabricated as "zero crime" (the safest value). The Serious/Minor
+    # rollups are precomputed in crime_spatial (the mean of the by-year rollup
+    # bars), so the merge reads them straight through; a missing postcode leaves
+    # them null.
    base = pl.LazyFrame(
        {
            "postcode": ["AA1 1AA", "BB2 2BB"],
@ -772,7 +928,10 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
    def _by_postcode(extra: dict) -> pl.LazyFrame:
        return pl.LazyFrame({"postcode": ["AA1 1AA", "BB2 2BB"], **extra})

-    # Crime is present only for AA1 1AA; BB2 2BB is absent from the table.
+    # Crime is present only for AA1 1AA; BB2 2BB is absent from the table. The
+    # rollup headlines are precomputed values (deliberately NOT the per-type sum,
+    # which would be 10.0 each) so this test proves the merge consumes the
+    # precomputed column rather than re-summing per-type columns.
    crime = pl.LazyFrame(
        {
            "postcode": ["AA1 1AA"],
@ -790,6 +949,8 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
            "Public order (avg/yr)": [1.0],
            "Drugs (avg/yr)": [1.0],
            "Other crime (avg/yr)": [1.0],
+            "Serious crime (avg/yr)": [7.5],
+            "Minor crime (avg/yr)": [4.2],
        }
    )

@ -805,7 +966,12 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
        school_proximity=_by_postcode({}),
        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
-        broadband=pl.LazyFrame({"bb_postcode": ["AA1 1AA", "BB2 2BB"]}),
+        broadband=pl.LazyFrame(
+            {
+                "bb_postcode": ["AA1 1AA", "BB2 2BB"],
+                "max_download_speed": pl.Series([100, 300], dtype=pl.UInt16),
+            }
+        ),
    ).collect()

    by_postcode = {
@ -814,14 +980,50 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
            "postcode", "serious_crime_avg_yr", "minor_crime_avg_yr"
        ).iter_rows(named=True)
    }
-    # Present postcode: rollups are the component sums (1+2+3+4, 10×1).
-    assert by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 10.0
-    assert by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 10.0
+    # Present postcode: rollups are the precomputed headline values, read through
+    # unchanged (NOT the per-type sum of 10.0).
+    assert by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 7.5
+    assert by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 4.2
    # Missing postcode: rollups stay null rather than fabricating 0.0.
    assert by_postcode["BB2 2BB"]["serious_crime_avg_yr"] is None
    assert by_postcode["BB2 2BB"]["minor_crime_avg_yr"] is None


+def test_dedupe_collapsed_properties_keeps_most_recent_per_address() -> None:
+    # The terminated-postcode remap can merge two distinct postcodes onto one
+    # active successor, collapsing the same physical address onto a single
+    # (postcode, pp_address) key with conflicting sale records. The dedup must
+    # keep exactly one row per (postcode, pp_address) -- the most recent
+    # transaction -- and must not collapse genuinely distinct addresses.
+    from datetime import datetime
+
+    # The newest ELYSTAN PLACE sale (2015) deliberately sits BETWEEN two older
+    # ones: a purely positional dedup would keep 1990 (keep-first) or 2003
+    # (keep-last), so only a dedup that really orders by date_of_transfer can
+    # satisfy the assertions below.
+    wide = pl.LazyFrame(
+        {
+            "postcode": ["SW3 3JY", "SW3 3JY", "SW3 3JY", "SW3 3JY"],
+            "pp_address": [
+                "45 ELYSTAN PLACE",
+                "45 ELYSTAN PLACE",
+                "45 ELYSTAN PLACE",
+                "9 OTHER ROAD",
+            ],
+            "date_of_transfer": [
+                datetime(1990, 1, 1),
+                datetime(2015, 6, 1),
+                datetime(2003, 3, 1),
+                datetime(2000, 1, 1),
+            ],
+            "latest_price": [1_587_700, 4_500_000, 2_000_000, 250_000],
+        }
+    )
+
+    out = _dedupe_collapsed_properties(wide).collect()
+
+    # One row per (postcode, pp_address): the three ELYSTAN PLACE rows collapse
+    # to one.
+    assert out.height == 2
+    assert out.select(["postcode", "pp_address"]).is_unique().all()
+    by_addr = {r["pp_address"]: r for r in out.iter_rows(named=True)}
+    # The kept ELYSTAN PLACE row is the most recent transaction (2015 @ 4.5M),
+    # not an arbitrary one.
+    assert by_addr["45 ELYSTAN PLACE"]["date_of_transfer"] == datetime(2015, 6, 1)
+    assert by_addr["45 ELYSTAN PLACE"]["latest_price"] == 4_500_000
+    # A genuinely distinct address in the same postcode is untouched.
+    assert by_addr["9 OTHER ROAD"]["latest_price"] == 250_000
+
+
 def _property_candidates(rows: list[dict]) -> pl.DataFrame:
    base = {
        "postcode": "AA1 1AA",