idgf
This commit is contained in:
parent
fbfebc651c
commit
aab85fe32e
33 changed files with 2016 additions and 283 deletions
|
|
@ -14,6 +14,7 @@ from pipeline.transform.merge import (
|
|||
_build_unmatched_listing_seed_rows,
|
||||
_canonical_postcode_expr,
|
||||
_coalesce_direct_epc_columns,
|
||||
_dedupe_collapsed_properties,
|
||||
_filter_to_active_english_postcodes,
|
||||
_join_area_side_tables,
|
||||
_finalize_listings,
|
||||
|
|
@ -193,6 +194,159 @@ def test_postcode_feature_validation_rejects_unsupported_or_ungeocoded_rows() ->
|
|||
_validate_postcode_feature_output(postcode_df, expected_postcode_count=2)
|
||||
|
||||
|
||||
def test_postcode_feature_validation_rejects_wrong_count() -> None:
    """The validator must reject any output that is not exactly the universe.

    Universe-size invariant: the postcode feature output has to contain
    EXACTLY the active-England universe. A truncated build (too few rows), an
    inflated build (too many rows) and a duplicated key (the signature of a
    join fan-out) must each raise, so none of them can ship.
    """
    england_code = "E92000001"
    universe_error = "active England postcode universe"

    # Fewer rows than expected: postcodes were silently dropped.
    truncated = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "lat": [51.0],
            "lon": [-0.1],
            "ctry25cd": [england_code],
        }
    )
    with pytest.raises(ValueError, match=universe_error):
        _validate_postcode_feature_output(truncated, expected_postcode_count=2)

    # More rows than expected: the universe has been inflated.
    inflated = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "BB1 1BB", "CC1 1CC"],
            "lat": [51.0, 52.0, 53.0],
            "lon": [-0.1, -0.2, -0.3],
            "ctry25cd": [england_code, england_code, england_code],
        }
    )
    with pytest.raises(ValueError, match=universe_error):
        _validate_postcode_feature_output(inflated, expected_postcode_count=2)

    # Correct height but a repeated key (n_unique < height) -- what a
    # one-to-many join would produce.
    repeated_key = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "AA1 1AA"],
            "lat": [51.0, 51.0],
            "lon": [-0.1, -0.1],
            "ctry25cd": [england_code, england_code],
        }
    )
    with pytest.raises(ValueError, match=universe_error):
        _validate_postcode_feature_output(repeated_key, expected_postcode_count=2)
|
||||
|
||||
|
||||
def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
    """Side tables that are unique on their join key must not multiply rows.

    Soundness check: when every side table is unique on its key, the
    per-postcode feature joins emit exactly one row per postcode. A fan-out
    here would inflate the postcode universe above the active-England count,
    which is the failure the universe-size assertion is the backstop for.
    """
    postcodes = ["AA1 1AA", "BB2 2BB"]
    lsoa_codes = ["E01000001", "E01000002"]
    district_codes = ["E09000001", "E09000002"]
    constituency_codes = ["E14000001", "E14000002"]

    def _keyed_by_postcode(extra: dict) -> pl.LazyFrame:
        # Minimal side table: the postcode key plus any extra feature columns.
        return pl.LazyFrame({"postcode": postcodes, **extra})

    base = pl.LazyFrame(
        {
            "postcode": postcodes,
            "lsoa21": lsoa_codes,
            "Local Authority District code (2024)": district_codes,
            "pcon": constituency_codes,
        }
    )

    # Every side table carries each join key exactly once.
    side_tables = {
        "iod": pl.LazyFrame({"LSOA code (2021)": lsoa_codes}),
        "ethnicity": pl.LazyFrame({"Geography_code": district_codes}),
        "crime": pl.LazyFrame(
            {
                "postcode": postcodes,
                "Serious crime (avg/yr)": [1.0, 2.0],
                "Minor crime (avg/yr)": [3.0, 4.0],
            }
        ),
        "median_age": pl.LazyFrame({"lsoa21": lsoa_codes}),
        "election": pl.LazyFrame({"pcon": constituency_codes}),
        "poi_counts": _keyed_by_postcode({}),
        "noise": _keyed_by_postcode({}),
        "school_proximity": _keyed_by_postcode({}),
        "conservation_areas": _keyed_by_postcode(
            {CONSERVATION_AREA_FEATURE: ["Yes", "No"]}
        ),
        "tree_density": None,
        "broadband": pl.LazyFrame(
            {
                "bb_postcode": postcodes,
                "max_download_speed": pl.Series([100, 300], dtype=pl.UInt16),
            }
        ),
    }

    joined = _join_area_side_tables(base, **side_tables).collect()

    # Two postcodes in, two rows out: the universe is not inflated.
    assert joined.height == 2
    assert sorted(joined["postcode"].to_list()) == ["AA1 1AA", "BB2 2BB"]
|
||||
|
||||
|
||||
def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
    """Broadband rows must join on a canonical postcode, not the raw spelling.

    Broadband comes straight from Ofcom's CSV, so its postcode can drift in
    spacing/casing from the NSPL `pcds` base key. Both sides have to be
    reduced to the same canonical form so that a real postcode populates
    `max_download_speed` instead of silently missing the left join.
    """
    postcodes = ["AB1 2CD", "EF3 4GH"]
    lsoa_codes = ["E01000001", "E01000002"]
    district_codes = ["E09000001", "E09000002"]
    constituency_codes = ["E14000001", "E14000002"]

    def _keyed_by_postcode(extra: dict) -> pl.LazyFrame:
        # Minimal side table: the postcode key plus any extra feature columns.
        return pl.LazyFrame({"postcode": postcodes, **extra})

    base = pl.LazyFrame(
        {
            "postcode": postcodes,
            "lsoa21": lsoa_codes,
            "Local Authority District code (2024)": district_codes,
            "pcon": constituency_codes,
        }
    )

    # AB1 2CD arrives lowercase; EF3 4GH arrives under two distinct raw
    # spellings that canonicalize to one key (the max speed must win, with
    # no fan-out of the base row).
    drifting_broadband = pl.LazyFrame(
        {
            "bb_postcode": ["ab1 2cd", "ef34gh", "EF3 4GH"],
            "max_download_speed": pl.Series([300, 30, 1000], dtype=pl.UInt16),
        }
    )

    side_tables = {
        "iod": pl.LazyFrame({"LSOA code (2021)": lsoa_codes}),
        "ethnicity": pl.LazyFrame({"Geography_code": district_codes}),
        "crime": pl.LazyFrame(
            {
                "postcode": postcodes,
                "Serious crime (avg/yr)": [1.0, 2.0],
                "Minor crime (avg/yr)": [3.0, 4.0],
            }
        ),
        "median_age": pl.LazyFrame({"lsoa21": lsoa_codes}),
        "election": pl.LazyFrame({"pcon": constituency_codes}),
        "poi_counts": _keyed_by_postcode({}),
        "noise": _keyed_by_postcode({}),
        "school_proximity": _keyed_by_postcode({}),
        "conservation_areas": _keyed_by_postcode(
            {CONSERVATION_AREA_FEATURE: ["Yes", "No"]}
        ),
        "tree_density": None,
        "broadband": drifting_broadband,
    }

    joined = _join_area_side_tables(base, **side_tables).collect()

    # No fan-out: still one row per base postcode.
    assert joined.height == 2

    joined_postcodes = joined["postcode"].to_list()
    joined_speeds = joined["max_download_speed"].to_list()
    speed_by_postcode = dict(zip(joined_postcodes, joined_speeds))

    # Spacing/casing drift still joins.
    assert speed_by_postcode["AB1 2CD"] == 300
    # Two raw spellings collapse to one canonical key; the max wins.
    assert speed_by_postcode["EF3 4GH"] == 1000

    # Neither the temporary canonical join keys nor the raw broadband key may
    # leak into the output schema.
    for helper_column in (
        "_base_canonical_postcode",
        "_bb_canonical_postcode",
        "bb_postcode",
    ):
        assert helper_column not in joined.columns
|
||||
|
||||
|
||||
def test_listed_building_feature_is_property_level() -> None:
    """The listed-building feature must not be registered as an area column."""
    listed_as_area_column = LISTED_BUILDING_FEATURE in _AREA_COLUMNS
    assert not listed_as_area_column
|
||||
|
||||
|
|
@ -758,8 +912,10 @@ def test_coalesce_direct_epc_was_council_house_prefers_yes() -> None:
|
|||
|
||||
def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
|
||||
# The crime table is LEFT-joined per postcode; a postcode absent from it
|
||||
# must NOT be fabricated as "zero crime" (the safest value). When every
|
||||
# per-type column is null the Serious/Minor rollups must stay null.
|
||||
# must NOT be fabricated as "zero crime" (the safest value). The Serious/Minor
|
||||
# rollups are precomputed in crime_spatial (the mean of the by-year rollup
|
||||
# bars), so the merge reads them straight through; a missing postcode leaves
|
||||
# them null.
|
||||
base = pl.LazyFrame(
|
||||
{
|
||||
"postcode": ["AA1 1AA", "BB2 2BB"],
|
||||
|
|
@ -772,7 +928,10 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
|
|||
def _by_postcode(extra: dict) -> pl.LazyFrame:
|
||||
return pl.LazyFrame({"postcode": ["AA1 1AA", "BB2 2BB"], **extra})
|
||||
|
||||
# Crime is present only for AA1 1AA; BB2 2BB is absent from the table.
|
||||
# Crime is present only for AA1 1AA; BB2 2BB is absent from the table. The
|
||||
# rollup headlines are precomputed values (deliberately NOT the per-type sum,
|
||||
# which would be 10.0 each) so this test proves the merge consumes the
|
||||
# precomputed column rather than re-summing per-type columns.
|
||||
crime = pl.LazyFrame(
|
||||
{
|
||||
"postcode": ["AA1 1AA"],
|
||||
|
|
@ -790,6 +949,8 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
|
|||
"Public order (avg/yr)": [1.0],
|
||||
"Drugs (avg/yr)": [1.0],
|
||||
"Other crime (avg/yr)": [1.0],
|
||||
"Serious crime (avg/yr)": [7.5],
|
||||
"Minor crime (avg/yr)": [4.2],
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -805,7 +966,12 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
|
|||
school_proximity=_by_postcode({}),
|
||||
conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
|
||||
tree_density=None,
|
||||
broadband=pl.LazyFrame({"bb_postcode": ["AA1 1AA", "BB2 2BB"]}),
|
||||
broadband=pl.LazyFrame(
|
||||
{
|
||||
"bb_postcode": ["AA1 1AA", "BB2 2BB"],
|
||||
"max_download_speed": pl.Series([100, 300], dtype=pl.UInt16),
|
||||
}
|
||||
),
|
||||
).collect()
|
||||
|
||||
by_postcode = {
|
||||
|
|
@ -814,14 +980,50 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
|
|||
"postcode", "serious_crime_avg_yr", "minor_crime_avg_yr"
|
||||
).iter_rows(named=True)
|
||||
}
|
||||
# Present postcode: rollups are the component sums (1+2+3+4, 10×1).
|
||||
assert by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 10.0
|
||||
assert by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 10.0
|
||||
# Present postcode: rollups are the precomputed headline values, read through
|
||||
# unchanged (NOT the per-type sum of 10.0).
|
||||
assert by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 7.5
|
||||
assert by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 4.2
|
||||
# Missing postcode: rollups stay null rather than fabricating 0.0.
|
||||
assert by_postcode["BB2 2BB"]["serious_crime_avg_yr"] is None
|
||||
assert by_postcode["BB2 2BB"]["minor_crime_avg_yr"] is None
|
||||
|
||||
|
||||
def test_dedupe_collapsed_properties_keeps_most_recent_per_address() -> None:
    """Dedup keeps the latest sale per (postcode, pp_address) and nothing else.

    The terminated-postcode remap can merge two distinct postcodes onto one
    active successor, collapsing the same physical address onto a single
    (postcode, pp_address) key with conflicting sale records. The dedup must
    keep exactly one row per key -- the most recent transaction -- and must
    not collapse genuinely distinct addresses.
    """
    from datetime import datetime

    collapsed_address = "45 ELYSTAN PLACE"
    distinct_address = "9 OTHER ROAD"
    latest_sale_date = datetime(2015, 6, 1)

    wide = pl.LazyFrame(
        {
            "postcode": ["SW3 3JY", "SW3 3JY", "SW3 3JY"],
            "pp_address": [collapsed_address, collapsed_address, distinct_address],
            "date_of_transfer": [
                datetime(1990, 1, 1),
                latest_sale_date,
                datetime(2000, 1, 1),
            ],
            "latest_price": [1_587_700, 4_500_000, 250_000],
        }
    )

    deduped = _dedupe_collapsed_properties(wide).collect()

    # The two ELYSTAN PLACE rows collapse to one, leaving a unique key.
    assert deduped.height == 2
    assert deduped.select(["postcode", "pp_address"]).is_unique().all()

    rows_by_address = {}
    for row in deduped.iter_rows(named=True):
        rows_by_address[row["pp_address"]] = row

    # The survivor is the most recent transaction (2015 @ 4.5M), not an
    # arbitrary one.
    survivor = rows_by_address[collapsed_address]
    assert survivor["date_of_transfer"] == latest_sale_date
    assert survivor["latest_price"] == 4_500_000

    # A genuinely distinct address in the same postcode is untouched.
    assert rows_by_address[distinct_address]["latest_price"] == 250_000
|
||||
|
||||
|
||||
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
|
||||
base = {
|
||||
"postcode": "AA1 1AA",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue