This commit is contained in:
Andras Schmelczer 2026-06-02 20:14:32 +01:00
parent fbfebc651c
commit aab85fe32e
33 changed files with 2016 additions and 283 deletions

View file

@ -14,6 +14,7 @@ from pipeline.transform.merge import (
_build_unmatched_listing_seed_rows,
_canonical_postcode_expr,
_coalesce_direct_epc_columns,
_dedupe_collapsed_properties,
_filter_to_active_english_postcodes,
_join_area_side_tables,
_finalize_listings,
@ -193,6 +194,159 @@ def test_postcode_feature_validation_rejects_unsupported_or_ungeocoded_rows() ->
_validate_postcode_feature_output(postcode_df, expected_postcode_count=2)
def test_postcode_feature_validation_rejects_wrong_count() -> None:
    """Reject postcode feature outputs that are not exactly the expected universe.

    The validator is called with ``expected_postcode_count=2`` and must raise
    for every malformed output below, so neither a truncated build (silently
    dropped postcodes) nor a one-to-many join (extra or duplicated rows) can
    ship.
    """

    def _feature_frame(rows: list[tuple[str, float, float]]) -> pl.DataFrame:
        # Rows are (postcode, lat, lon); every row gets the same country code.
        return pl.DataFrame(
            {
                "Postcode": [postcode for postcode, _, _ in rows],
                "lat": [lat for _, lat, _ in rows],
                "lon": [lon for _, _, lon in rows],
                "ctry25cd": ["E92000001"] * len(rows),
            }
        )

    malformed_outputs = [
        # Too few rows: one postcode where two are expected.
        [("AA1 1AA", 51.0, -0.1)],
        # Too many rows: three postcodes where two are expected.
        [("AA1 1AA", 51.0, -0.1), ("BB1 1BB", 52.0, -0.2), ("CC1 1CC", 53.0, -0.3)],
        # Right row count but a duplicated key (n_unique < height) -- the
        # signature of a join fan-out.
        [("AA1 1AA", 51.0, -0.1), ("AA1 1AA", 51.0, -0.1)],
    ]

    for rows in malformed_outputs:
        with pytest.raises(ValueError, match="active England postcode universe"):
            _validate_postcode_feature_output(
                _feature_frame(rows), expected_postcode_count=2
            )
def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
    """Joining side tables that are unique on their key keeps one row per postcode.

    Every side table in this fixture has exactly one row per join key, so the
    joined output must contain the same two postcodes that went in. A fan-out
    here would inflate the postcode universe above the expected count.
    """
    postcodes = ["AA1 1AA", "BB2 2BB"]
    lsoa_codes = ["E01000001", "E01000002"]
    district_codes = ["E09000001", "E09000002"]
    constituency_codes = ["E14000001", "E14000002"]

    def _postcode_keyed(columns: dict | None = None) -> pl.LazyFrame:
        # A side table keyed on "postcode", with optional extra columns.
        return pl.LazyFrame({"postcode": list(postcodes), **(columns or {})})

    base = pl.LazyFrame(
        {
            "postcode": list(postcodes),
            "lsoa21": list(lsoa_codes),
            "Local Authority District code (2024)": list(district_codes),
            "pcon": list(constituency_codes),
        }
    )
    crime = _postcode_keyed(
        {
            "Serious crime (avg/yr)": [1.0, 2.0],
            "Minor crime (avg/yr)": [3.0, 4.0],
        }
    )
    broadband = pl.LazyFrame(
        {
            "bb_postcode": list(postcodes),
            "max_download_speed": pl.Series([100, 300], dtype=pl.UInt16),
        }
    )

    lazy_joined = _join_area_side_tables(
        base,
        iod=pl.LazyFrame({"LSOA code (2021)": list(lsoa_codes)}),
        ethnicity=pl.LazyFrame({"Geography_code": list(district_codes)}),
        crime=crime,
        median_age=pl.LazyFrame({"lsoa21": list(lsoa_codes)}),
        election=pl.LazyFrame({"pcon": list(constituency_codes)}),
        poi_counts=_postcode_keyed(),
        noise=_postcode_keyed(),
        school_proximity=_postcode_keyed(),
        conservation_areas=_postcode_keyed({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=broadband,
    )
    joined = lazy_joined.collect()

    # One row per postcode in -> one row out; the universe is not inflated.
    assert joined.height == 2
    assert sorted(joined["postcode"].to_list()) == ["AA1 1AA", "BB2 2BB"]
def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
    """Broadband rows join on a canonical postcode despite spacing/casing drift.

    The broadband postcodes here differ in spacing and casing from the base
    `postcode` key. Both sides must reduce to the same canonical form so a real
    postcode populates `max_download_speed` instead of silently missing the
    left join.
    """
    postcodes = ["AB1 2CD", "EF3 4GH"]
    lsoa_codes = ["E01000001", "E01000002"]
    district_codes = ["E09000001", "E09000002"]
    constituency_codes = ["E14000001", "E14000002"]

    def _postcode_keyed(columns: dict | None = None) -> pl.LazyFrame:
        # A side table keyed on "postcode", with optional extra columns.
        return pl.LazyFrame({"postcode": list(postcodes), **(columns or {})})

    base = pl.LazyFrame(
        {
            "postcode": list(postcodes),
            "lsoa21": list(lsoa_codes),
            "Local Authority District code (2024)": list(district_codes),
            "pcon": list(constituency_codes),
        }
    )
    crime = _postcode_keyed(
        {
            "Serious crime (avg/yr)": [1.0, 2.0],
            "Minor crime (avg/yr)": [3.0, 4.0],
        }
    )
    # AB1 2CD arrives lowercase; EF3 4GH arrives under two distinct raw
    # spellings (one un-spaced and lowercase) that canonicalize to one key.
    # The max speed must win, with no fan-out of the base row.
    raw_broadband = pl.LazyFrame(
        {
            "bb_postcode": ["ab1 2cd", "ef34gh", "EF3 4GH"],
            "max_download_speed": pl.Series([300, 30, 1000], dtype=pl.UInt16),
        }
    )

    lazy_joined = _join_area_side_tables(
        base,
        iod=pl.LazyFrame({"LSOA code (2021)": list(lsoa_codes)}),
        ethnicity=pl.LazyFrame({"Geography_code": list(district_codes)}),
        crime=crime,
        median_age=pl.LazyFrame({"lsoa21": list(lsoa_codes)}),
        election=pl.LazyFrame({"pcon": list(constituency_codes)}),
        poi_counts=_postcode_keyed(),
        noise=_postcode_keyed(),
        school_proximity=_postcode_keyed(),
        conservation_areas=_postcode_keyed({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=raw_broadband,
    )
    joined = lazy_joined.collect()

    # No fan-out: still one row per base postcode.
    assert joined.height == 2

    speed_by_postcode = {
        postcode: speed
        for postcode, speed in joined.select(
            "postcode", "max_download_speed"
        ).iter_rows()
    }
    # Spacing/casing drift still joins.
    assert speed_by_postcode["AB1 2CD"] == 300
    # Two raw spellings collapse to one canonical key; the max wins.
    assert speed_by_postcode["EF3 4GH"] == 1000

    # Neither the temporary canonical join keys nor the raw broadband key may
    # leak into the output schema.
    for transient_column in (
        "_base_canonical_postcode",
        "_bb_canonical_postcode",
        "bb_postcode",
    ):
        assert transient_column not in joined.columns
def test_listed_building_feature_is_property_level() -> None:
assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS
@ -758,8 +912,10 @@ def test_coalesce_direct_epc_was_council_house_prefers_yes() -> None:
def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
# The crime table is LEFT-joined per postcode; a postcode absent from it
# must NOT be fabricated as "zero crime" (the safest value). When every
# per-type column is null the Serious/Minor rollups must stay null.
# must NOT be fabricated as "zero crime" (the safest value). The Serious/Minor
# rollups are precomputed in crime_spatial (the mean of the by-year rollup
# bars), so the merge reads them straight through; a missing postcode leaves
# them null.
base = pl.LazyFrame(
{
"postcode": ["AA1 1AA", "BB2 2BB"],
@ -772,7 +928,10 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
def _by_postcode(extra: dict) -> pl.LazyFrame:
return pl.LazyFrame({"postcode": ["AA1 1AA", "BB2 2BB"], **extra})
# Crime is present only for AA1 1AA; BB2 2BB is absent from the table.
# Crime is present only for AA1 1AA; BB2 2BB is absent from the table. The
# rollup headlines are precomputed values (deliberately NOT the per-type sum,
# which would be 10.0 each) so this test proves the merge consumes the
# precomputed column rather than re-summing per-type columns.
crime = pl.LazyFrame(
{
"postcode": ["AA1 1AA"],
@ -790,6 +949,8 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
"Public order (avg/yr)": [1.0],
"Drugs (avg/yr)": [1.0],
"Other crime (avg/yr)": [1.0],
"Serious crime (avg/yr)": [7.5],
"Minor crime (avg/yr)": [4.2],
}
)
@ -805,7 +966,12 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
school_proximity=_by_postcode({}),
conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
tree_density=None,
broadband=pl.LazyFrame({"bb_postcode": ["AA1 1AA", "BB2 2BB"]}),
broadband=pl.LazyFrame(
{
"bb_postcode": ["AA1 1AA", "BB2 2BB"],
"max_download_speed": pl.Series([100, 300], dtype=pl.UInt16),
}
),
).collect()
by_postcode = {
@ -814,14 +980,50 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
"postcode", "serious_crime_avg_yr", "minor_crime_avg_yr"
).iter_rows(named=True)
}
# Present postcode: rollups are the component sums (1+2+3+4, 10×1).
assert by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 10.0
assert by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 10.0
# Present postcode: rollups are the precomputed headline values, read through
# unchanged (NOT the per-type sum of 10.0).
assert by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 7.5
assert by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 4.2
# Missing postcode: rollups stay null rather than fabricating 0.0.
assert by_postcode["BB2 2BB"]["serious_crime_avg_yr"] is None
assert by_postcode["BB2 2BB"]["minor_crime_avg_yr"] is None
def test_dedupe_collapsed_properties_keeps_most_recent_per_address() -> None:
    """Dedup keeps one row per (postcode, pp_address): the latest transaction.

    The fixture has the same address twice under one postcode with conflicting
    sale records, plus a genuinely distinct address in that postcode. The
    duplicate pair must collapse to its most recent transaction, and the
    distinct address must survive untouched.
    """
    from datetime import datetime

    older_sale = datetime(1990, 1, 1)
    newer_sale = datetime(2015, 6, 1)
    other_sale = datetime(2000, 1, 1)

    wide = pl.LazyFrame(
        {
            "postcode": ["SW3 3JY"] * 3,
            "pp_address": ["45 ELYSTAN PLACE", "45 ELYSTAN PLACE", "9 OTHER ROAD"],
            "date_of_transfer": [older_sale, newer_sale, other_sale],
            "latest_price": [1_587_700, 4_500_000, 250_000],
        }
    )

    deduped = _dedupe_collapsed_properties(wide).collect()

    # One row per (postcode, pp_address): the two ELYSTAN PLACE rows collapse to one.
    assert deduped.height == 2
    assert deduped.select(["postcode", "pp_address"]).is_unique().all()

    row_by_address = {row["pp_address"]: row for row in deduped.to_dicts()}

    # The kept ELYSTAN PLACE row is the most recent transaction (2015 @ 4.5M),
    # not an arbitrary one.
    elystan = row_by_address["45 ELYSTAN PLACE"]
    assert elystan["date_of_transfer"] == newer_sale
    assert elystan["latest_price"] == 4_500_000

    # A genuinely distinct address in the same postcode is untouched.
    assert row_by_address["9 OTHER ROAD"]["latest_price"] == 250_000
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
base = {
"postcode": "AA1 1AA",