This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@ -13,7 +13,9 @@ from pipeline.transform.merge import (
_active_english_postcode_area,
_build_unmatched_listing_seed_rows,
_canonical_postcode_expr,
_coalesce_direct_epc_columns,
_filter_to_active_english_postcodes,
_join_area_side_tables,
_finalize_listings,
_integrate_listings,
_match_direct_epc,
@ -506,6 +508,25 @@ def test_load_listings_for_merge_canonicalises_and_exposes_overlay_columns(
assert loaded["_actual_lat"].to_list() == [51.5]
def test_load_listings_for_merge_uprn_key_matches_normalize_uprn(tmp_path) -> None:
    # UPRNs can arrive as Float64 (for instance from a parquet column that
    # also holds NaN). The listing-side key must then be the same digits-only
    # string that `_normalize_uprn` yields for candidates, otherwise the exact
    # UPRN match silently disappears. Stringifying "100023336956.0" and
    # dropping non-digits would give "1000233369560" -- a spurious trailing
    # zero that can never equal the candidate key "100023336956".
    float_uprn = 100023336956.0
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"

    listings_with_float_uprn = _sample_listings_frame().with_columns(
        pl.lit(float_uprn, dtype=pl.Float64).alias("UPRN")
    )
    listings_with_float_uprn.write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    merged_input = _load_listings_for_merge(listings_path, arcgis_path)
    uprn_keys = merged_input["_listing_uprn"].to_list()

    # Same key as the candidate-side normaliser, and pinned to the literal
    # digits so a regression in both code paths at once is still caught.
    assert uprn_keys == [_normalize_uprn(float_uprn)]
    assert uprn_keys == ["100023336956"]
def test_build_unmatched_listing_seed_rows_fills_property_shape_fields(
tmp_path,
) -> None:
@ -697,6 +718,110 @@ def test_normalize_uprn_handles_types_and_floats() -> None:
assert _normalize_uprn(float("nan")) is None
def test_coalesce_direct_epc_was_council_house_prefers_yes() -> None:
    # Upstream fills a missing raw council-house flag with "No", so a
    # raw-first coalesce would let that non-null "No" beat a "Yes" coming from
    # a directly matched listing. The flag has to read "Yes" whenever EITHER
    # side says "Yes".
    all_null = [None] * 5
    columns = {
        "was_council_house": ["No", "Yes", "No", None, None],
        "_direct_was_council_house": ["Yes", "No", None, "Yes", None],
        # Control pair: any other direct-EPC column keeps plain coalescing.
        "current_energy_rating": [None, "C", "D", None, None],
        "_direct_current_energy_rating": ["B", "A", None, "E", None],
    }
    # Every pair in _DIRECT_EPC_RAW_COLUMN_MAP gets coalesced, so the
    # remaining pairs must exist in the frame even though they stay null.
    placeholder_pairs = (
        ("epc_address", "_direct_epc_address"),
        ("potential_energy_rating", "_direct_potential_energy_rating"),
        ("total_floor_area", "_direct_total_floor_area"),
        ("number_habitable_rooms", "_direct_number_habitable_rooms"),
        ("floor_height", "_direct_floor_height"),
        ("construction_age_band", "_direct_construction_age_band"),
        (
            "is_construction_date_approximate",
            "_direct_is_construction_date_approximate",
        ),
    )
    for raw_name, direct_name in placeholder_pairs:
        columns[raw_name] = all_null
        columns[direct_name] = all_null

    coalesced = _coalesce_direct_epc_columns(pl.LazyFrame(columns)).collect()

    council_flags = coalesced["was_council_house"].to_list()
    assert council_flags == ["Yes", "Yes", "No", "Yes", None]

    # Other columns still follow plain coalesce: raw value wins when non-null.
    energy_ratings = coalesced["current_energy_rating"].to_list()
    assert energy_ratings == ["B", "C", "D", "E", None]
def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
    # Crime is LEFT-joined on postcode. A postcode with no crime row must not
    # come out looking like "zero crime" (the most flattering value): when all
    # per-type columns are null, the Serious/Minor rollups have to stay null.
    postcodes = ["AA1 1AA", "BB2 2BB"]
    lsoa_codes = ["E01000001", "E01000002"]
    lad_codes = ["E09000001", "E09000002"]
    constituency_codes = ["E14000001", "E14000002"]

    properties = pl.LazyFrame(
        {
            "postcode": postcodes,
            "lsoa21": lsoa_codes,
            "Local Authority District code (2024)": lad_codes,
            "pcon": constituency_codes,
        }
    )

    def _postcode_frame(extra_columns: dict | None = None) -> pl.LazyFrame:
        # Side table keyed on both postcodes, plus any extra columns given.
        return pl.LazyFrame({"postcode": list(postcodes), **(extra_columns or {})})

    # Only AA1 1AA has a crime row; BB2 2BB is deliberately missing.
    serious_offences = {
        "Violence and sexual offences (avg/yr)": [1.0],
        "Robbery (avg/yr)": [2.0],
        "Burglary (avg/yr)": [3.0],
        "Possession of weapons (avg/yr)": [4.0],
    }
    minor_offence_names = (
        "Anti-social behaviour (avg/yr)",
        "Criminal damage and arson (avg/yr)",
        "Shoplifting (avg/yr)",
        "Bicycle theft (avg/yr)",
        "Theft from the person (avg/yr)",
        "Other theft (avg/yr)",
        "Vehicle crime (avg/yr)",
        "Public order (avg/yr)",
        "Drugs (avg/yr)",
        "Other crime (avg/yr)",
    )
    minor_offences = {name: [1.0] for name in minor_offence_names}
    crime_table = pl.LazyFrame(
        {"postcode": ["AA1 1AA"], **serious_offences, **minor_offences}
    )

    joined = _join_area_side_tables(
        properties,
        iod=pl.LazyFrame({"LSOA code (2021)": lsoa_codes}),
        ethnicity=pl.LazyFrame({"Geography_code": lad_codes}),
        crime=crime_table,
        median_age=pl.LazyFrame({"lsoa21": lsoa_codes}),
        election=pl.LazyFrame({"pcon": constituency_codes}),
        poi_counts=_postcode_frame(),
        noise=_postcode_frame(),
        school_proximity=_postcode_frame(),
        conservation_areas=_postcode_frame({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=pl.LazyFrame({"bb_postcode": postcodes}),
    ).collect()

    rollup_columns = joined.select(
        "postcode", "serious_crime_avg_yr", "minor_crime_avg_yr"
    )
    rollups = {}
    for record in rollup_columns.iter_rows(named=True):
        rollups[record["postcode"]] = (
            record["serious_crime_avg_yr"],
            record["minor_crime_avg_yr"],
        )

    # Postcode with crime data: rollups equal the sums (1+2+3+4 and 10×1).
    present_serious, present_minor = rollups["AA1 1AA"]
    assert present_serious == 10.0
    assert present_minor == 10.0

    # Postcode without crime data: null rollups, never a fabricated 0.0.
    missing_serious, missing_minor = rollups["BB2 2BB"]
    assert missing_serious is None
    assert missing_minor is None
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
base = {
"postcode": "AA1 1AA",