idk

2026-06-02 13:46:18 +01:00 · 2026-06-02 13:46:18 +01:00 · d43da9708c
commit d43da9708c
parent a04ac2d857
47 changed files with 4120 additions and 573 deletions
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@@ -13,7 +13,9 @@ from pipeline.transform.merge import (
    _active_english_postcode_area,
    _build_unmatched_listing_seed_rows,
    _canonical_postcode_expr,
+    _coalesce_direct_epc_columns,
    _filter_to_active_english_postcodes,
+    _join_area_side_tables,
    _finalize_listings,
    _integrate_listings,
    _match_direct_epc,
@@ -506,6 +508,25 @@ def test_load_listings_for_merge_canonicalises_and_exposes_overlay_columns(
    assert loaded["_actual_lat"].to_list() == [51.5]


+def test_load_listings_for_merge_uprn_key_matches_normalize_uprn(tmp_path) -> None:
+    # A Float64 UPRN (e.g. read from a NaN-bearing parquet column) must give
+    # the same digits-only key as `_normalize_uprn` on the candidate side, or
+    # the exact UPRN match is lost: naively stringifying "100023336956.0" and
+    # stripping non-digits yields "1000233369560" (bogus trailing zero), not
+    # "100023336956". Checked against both the helper and the literal key.
+    listings_path = tmp_path / "listings.parquet"
+    arcgis_path = tmp_path / "arcgis.parquet"
+    _sample_listings_frame().with_columns(
+        pl.lit(100023336956.0, dtype=pl.Float64).alias("UPRN")
+    ).write_parquet(listings_path)
+    _stub_arcgis(arcgis_path)
+
+    loaded = _load_listings_for_merge(listings_path, arcgis_path)
+
+    assert loaded["_listing_uprn"].to_list() == [_normalize_uprn(100023336956.0)]
+    assert loaded["_listing_uprn"].to_list() == ["100023336956"]
+
+
 def test_build_unmatched_listing_seed_rows_fills_property_shape_fields(
    tmp_path,
 ) -> None:
@@ -697,6 +718,110 @@ def test_normalize_uprn_handles_types_and_floats() -> None:
    assert _normalize_uprn(float("nan")) is None


+def test_coalesce_direct_epc_was_council_house_prefers_yes() -> None:
+    # Raw value is fill_null("No") upstream, so a plain coalesce would let a
+    # "No" override a directly-matched listing "Yes"; "Former council house"
+    # should fire if EITHER side says "Yes". Rows 3-5 cover null direct/raw/both.
+    none_col = [None] * 5
+    wide = pl.LazyFrame(
+        {
+            "was_council_house": ["No", "Yes", "No", None, None],
+            "_direct_was_council_house": ["Yes", "No", None, "Yes", None],
+            # An unrelated direct-EPC column keeps the plain-coalesce behaviour.
+            "current_energy_rating": [None, "C", "D", None, None],
+            "_direct_current_energy_rating": ["B", "A", None, "E", None],
+            # _coalesce_direct_epc_columns coalesces every pair in
+            # _DIRECT_EPC_RAW_COLUMN_MAP, so the rest must be present too.
+            "epc_address": none_col,
+            "_direct_epc_address": none_col,
+            "potential_energy_rating": none_col,
+            "_direct_potential_energy_rating": none_col,
+            "total_floor_area": none_col,
+            "_direct_total_floor_area": none_col,
+            "number_habitable_rooms": none_col,
+            "_direct_number_habitable_rooms": none_col,
+            "floor_height": none_col,
+            "_direct_floor_height": none_col,
+            "construction_age_band": none_col,
+            "_direct_construction_age_band": none_col,
+            "is_construction_date_approximate": none_col,
+            "_direct_is_construction_date_approximate": none_col,
+        }
+    )
+
+    result = _coalesce_direct_epc_columns(wide).collect()
+
+    assert result["was_council_house"].to_list() == ["Yes", "Yes", "No", "Yes", None]
+    # Plain coalesce (raw wins when non-null) is untouched for other columns.
+    assert result["current_energy_rating"].to_list() == ["B", "C", "D", "E", None]
+
+
+def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
+    # The crime table is LEFT-joined per postcode; a postcode absent from it
+    # must NOT be fabricated as "zero crime" (the safest value). When every
+    # per-type column is null the Serious/Minor rollups must stay null.
+    base = pl.LazyFrame(
+        {
+            "postcode": ["AA1 1AA", "BB2 2BB"],
+            "lsoa21": ["E01000001", "E01000002"],
+            "Local Authority District code (2024)": ["E09000001", "E09000002"],
+            "pcon": ["E14000001", "E14000002"],
+        }
+    )
+
+    def _by_postcode(extra: dict) -> pl.LazyFrame:
+        return pl.LazyFrame({"postcode": ["AA1 1AA", "BB2 2BB"], **extra})
+
+    # Crime is present only for AA1 1AA; BB2 2BB is absent from the table.
+    crime = pl.LazyFrame(
+        {
+            "postcode": ["AA1 1AA"],
+            "Violence and sexual offences (avg/yr)": [1.0],
+            "Robbery (avg/yr)": [2.0],
+            "Burglary (avg/yr)": [3.0],
+            "Possession of weapons (avg/yr)": [4.0],
+            "Anti-social behaviour (avg/yr)": [1.0],
+            "Criminal damage and arson (avg/yr)": [1.0],
+            "Shoplifting (avg/yr)": [1.0],
+            "Bicycle theft (avg/yr)": [1.0],
+            "Theft from the person (avg/yr)": [1.0],
+            "Other theft (avg/yr)": [1.0],
+            "Vehicle crime (avg/yr)": [1.0],
+            "Public order (avg/yr)": [1.0],
+            "Drugs (avg/yr)": [1.0],
+            "Other crime (avg/yr)": [1.0],
+        }
+    )
+
+    joined = _join_area_side_tables(
+        base,
+        iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
+        ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
+        crime=crime,
+        median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
+        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
+        poi_counts=_by_postcode({}),  # postcode-only stubs: no feature columns
+        noise=_by_postcode({}),
+        school_proximity=_by_postcode({}),
+        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
+        tree_density=None,
+        broadband=pl.LazyFrame({"bb_postcode": ["AA1 1AA", "BB2 2BB"]}),
+    ).collect()
+
+    by_postcode = {
+        row["postcode"]: row
+        for row in joined.select(
+            "postcode", "serious_crime_avg_yr", "minor_crime_avg_yr"
+        ).iter_rows(named=True)
+    }
+    # Present postcode: Serious = 1+2+3+4 and Minor = 10×1 (component sums).
+    assert by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 10.0
+    assert by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 10.0
+    # Missing postcode: rollups stay null rather than fabricating 0.0.
+    assert by_postcode["BB2 2BB"]["serious_crime_avg_yr"] is None
+    assert by_postcode["BB2 2BB"]["minor_crime_avg_yr"] is None
+
+
 def _property_candidates(rows: list[dict]) -> pl.DataFrame:
    base = {
        "postcode": "AA1 1AA",