idk
This commit is contained in:
parent
a04ac2d857
commit
d43da9708c
47 changed files with 4120 additions and 573 deletions
|
|
@ -13,7 +13,9 @@ from pipeline.transform.merge import (
|
|||
_active_english_postcode_area,
|
||||
_build_unmatched_listing_seed_rows,
|
||||
_canonical_postcode_expr,
|
||||
_coalesce_direct_epc_columns,
|
||||
_filter_to_active_english_postcodes,
|
||||
_join_area_side_tables,
|
||||
_finalize_listings,
|
||||
_integrate_listings,
|
||||
_match_direct_epc,
|
||||
|
|
@ -506,6 +508,25 @@ def test_load_listings_for_merge_canonicalises_and_exposes_overlay_columns(
|
|||
assert loaded["_actual_lat"].to_list() == [51.5]
|
||||
|
||||
|
||||
def test_load_listings_for_merge_uprn_key_matches_normalize_uprn(tmp_path) -> None:
    """A Float64 UPRN must load to the same digits-only key as `_normalize_uprn`.

    A UPRN can arrive as a float (e.g. read from a NaN-bearing parquet
    column). The listing-side key has to equal what `_normalize_uprn`
    produces on the candidate side, otherwise the exact UPRN match is lost:
    naively stringifying "100023336956.0" and stripping non-digits gives
    "1000233369560" (a bogus trailing zero), which never collides with the
    candidate key "100023336956".
    """
    float_uprn = 100023336956.0
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"

    # Overwrite the sample frame's UPRN with an explicitly Float64 literal.
    float_uprn_listings = _sample_listings_frame().with_columns(
        pl.lit(float_uprn, dtype=pl.Float64).alias("UPRN")
    )
    float_uprn_listings.write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    loaded = _load_listings_for_merge(listings_path, arcgis_path)
    listing_keys = loaded["_listing_uprn"].to_list()

    # Agrees with the candidate-side normaliser...
    assert listing_keys == [_normalize_uprn(float_uprn)]
    # ...and is pinned to the exact digits, with no trailing zero.
    assert listing_keys == ["100023336956"]
|
||||
|
||||
|
||||
def test_build_unmatched_listing_seed_rows_fills_property_shape_fields(
|
||||
tmp_path,
|
||||
) -> None:
|
||||
|
|
@ -697,6 +718,110 @@ def test_normalize_uprn_handles_types_and_floats() -> None:
|
|||
assert _normalize_uprn(float("nan")) is None
|
||||
|
||||
|
||||
def test_coalesce_direct_epc_was_council_house_prefers_yes() -> None:
    """`was_council_house` resolves to "Yes" if either side says "Yes".

    The raw property value is fill_null("No") upstream, so a plain coalesce
    would let a non-null "No" override a directly-matched listing "Yes".
    "Former council house" should fire if EITHER side says "Yes"; every
    other direct-EPC column keeps the plain-coalesce behaviour.
    """
    all_null = [None] * 5

    columns = {
        "was_council_house": ["No", "Yes", "No", None, None],
        "_direct_was_council_house": ["Yes", "No", None, "Yes", None],
        # An unrelated direct-EPC column keeps the plain-coalesce behaviour.
        "current_energy_rating": [None, "C", "D", None, None],
        "_direct_current_energy_rating": ["B", "A", None, "E", None],
    }
    # _coalesce_direct_epc_columns coalesces every pair in
    # _DIRECT_EPC_RAW_COLUMN_MAP, so the remaining raw/direct pairs must be
    # present too; they are all-null filler for this test.
    filler_raw_columns = (
        "epc_address",
        "potential_energy_rating",
        "total_floor_area",
        "number_habitable_rooms",
        "floor_height",
        "construction_age_band",
        "is_construction_date_approximate",
    )
    for raw_name in filler_raw_columns:
        columns[raw_name] = all_null
        columns[f"_direct_{raw_name}"] = all_null

    result = _coalesce_direct_epc_columns(pl.LazyFrame(columns)).collect()

    council_house = result["was_council_house"].to_list()
    energy_rating = result["current_energy_rating"].to_list()

    assert council_house == ["Yes", "Yes", "No", "Yes", None]
    # Plain coalesce (raw wins when non-null) is untouched for other columns.
    assert energy_rating == ["B", "C", "D", "E", None]
|
||||
|
||||
|
||||
def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
    """A postcode absent from the crime table keeps null crime rollups.

    The crime table is LEFT-joined per postcode; a postcode absent from it
    must NOT be fabricated as "zero crime" (the safest value). When every
    per-type column is null the Serious/Minor rollups must stay null.
    """
    postcodes = ["AA1 1AA", "BB2 2BB"]
    lsoa_codes = ["E01000001", "E01000002"]
    district_codes = ["E09000001", "E09000002"]
    constituency_codes = ["E14000001", "E14000002"]

    base = pl.LazyFrame(
        {
            "postcode": postcodes,
            "lsoa21": lsoa_codes,
            "Local Authority District code (2024)": district_codes,
            "pcon": constituency_codes,
        }
    )

    def _postcode_keyed(extra_columns: dict) -> pl.LazyFrame:
        # Side table keyed on both postcodes, plus any extra columns.
        frame_columns = {"postcode": postcodes}
        frame_columns.update(extra_columns)
        return pl.LazyFrame(frame_columns)

    # Crime is present only for AA1 1AA; BB2 2BB is absent from the table.
    # Four categories carry distinct rates (1+2+3+4) and ten carry 1.0 each,
    # matching the component sums asserted for the rollups below.
    distinct_rates = {
        "Violence and sexual offences (avg/yr)": 1.0,
        "Robbery (avg/yr)": 2.0,
        "Burglary (avg/yr)": 3.0,
        "Possession of weapons (avg/yr)": 4.0,
    }
    unit_rate_categories = (
        "Anti-social behaviour (avg/yr)",
        "Criminal damage and arson (avg/yr)",
        "Shoplifting (avg/yr)",
        "Bicycle theft (avg/yr)",
        "Theft from the person (avg/yr)",
        "Other theft (avg/yr)",
        "Vehicle crime (avg/yr)",
        "Public order (avg/yr)",
        "Drugs (avg/yr)",
        "Other crime (avg/yr)",
    )
    crime_columns = {"postcode": ["AA1 1AA"]}
    crime_columns.update({name: [rate] for name, rate in distinct_rates.items()})
    crime_columns.update({name: [1.0] for name in unit_rate_categories})
    crime = pl.LazyFrame(crime_columns)

    joined = _join_area_side_tables(
        base,
        iod=pl.LazyFrame({"LSOA code (2021)": lsoa_codes}),
        ethnicity=pl.LazyFrame({"Geography_code": district_codes}),
        crime=crime,
        median_age=pl.LazyFrame({"lsoa21": lsoa_codes}),
        election=pl.LazyFrame({"pcon": constituency_codes}),
        poi_counts=_postcode_keyed({}),
        noise=_postcode_keyed({}),
        school_proximity=_postcode_keyed({}),
        conservation_areas=_postcode_keyed({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=pl.LazyFrame({"bb_postcode": postcodes}),
    ).collect()

    rollup_rows = joined.select(
        "postcode", "serious_crime_avg_yr", "minor_crime_avg_yr"
    )
    by_postcode = {}
    for row in rollup_rows.iter_rows(named=True):
        by_postcode[row["postcode"]] = row

    # Present postcode: rollups are the component sums (1+2+3+4, 10×1).
    present = by_postcode["AA1 1AA"]
    assert present["serious_crime_avg_yr"] == 10.0
    assert present["minor_crime_avg_yr"] == 10.0

    # Missing postcode: rollups stay null rather than fabricating 0.0.
    missing = by_postcode["BB2 2BB"]
    assert missing["serious_crime_avg_yr"] is None
    assert missing["minor_crime_avg_yr"] is None
|
||||
|
||||
|
||||
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
|
||||
base = {
|
||||
"postcode": "AA1 1AA",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue