This commit is contained in:
Andras Schmelczer 2026-05-31 20:20:41 +01:00
parent 8688b7475e
commit e8345cbdc1
40 changed files with 1980 additions and 904 deletions

View file

@ -10,8 +10,10 @@ from pipeline.transform.merge import (
LISTED_BUILDING_FEATURE,
TREE_DENSITY_FEATURE,
_LISTING_OVERLAY_SOURCES,
_active_english_postcode_area,
_build_unmatched_listing_seed_rows,
_canonical_postcode_expr,
_filter_to_active_english_postcodes,
_finalize_listings,
_integrate_listings,
_match_direct_epc,
@ -24,8 +26,11 @@ from pipeline.transform.merge import (
_matched_listed_building_flags,
_postcode_conservation_area_flags,
_postcode_listed_building_candidates,
_remap_terminated_postcodes,
_split_normal_outputs,
_tree_density_by_postcode,
_validate_lad_source_coverage,
_validate_postcode_feature_output,
_validate_property_postcodes,
)
@ -79,6 +84,113 @@ def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
assert "Minor crime per 1k residents (avg/yr)" not in _AREA_COLUMNS
def test_active_english_postcode_area_filters_to_active_england() -> None:
    """Check that only the un-terminated ``E92000001`` row is returned.

    The fixture holds three postcodes: one with no ``doterm`` and country
    code ``E92000001``, one with the same country code but a ``doterm``
    value, and one with a different country code. The expected output is the
    first row only, exposed under the renamed columns ``postcode``, ``lon``,
    ``lsoa21``, ``oa21`` and ``pcon`` and without ``doterm``.
    """
    source_rows = pl.DataFrame(
        {
            "pcds": ["AA1 1AA", "AA1 1AB", "CF1 1AA"],
            "ctry25cd": ["E92000001", "E92000001", "W92000004"],
            "doterm": [None, "2020-01-01", None],
            "lat": [51.0, 51.1, 52.0],
            "long": [-0.1, -0.2, -3.0],
            "lsoa21cd": ["L1", "L2", "L3"],
            "oa21cd": ["O1", "O2", "O3"],
            "pcon24cd": ["P1", "P2", "P3"],
        }
    )
    expected_row = {
        "postcode": "AA1 1AA",
        "lat": 51.0,
        "lon": -0.1,
        "ctry25cd": "E92000001",
        "lsoa21": "L1",
        "oa21": "O1",
        "pcon": "P1",
    }

    filtered = _active_english_postcode_area(source_rows.lazy())

    assert filtered.collect().to_dicts() == [expected_row]
def test_remap_then_active_filter_keeps_terminated_english_properties() -> None:
    """Check that remapping runs before the active-postcode filter.

    Row 1 sits on ``OLD 1AA``, which the mapping rewrites to ``NEW 1AA``;
    row 2 is already on ``NEW 1AA``; row 3 is on ``CF1 1AA``, which is not in
    the active set. After remapping and filtering, rows 1 and 2 must both be
    present under ``NEW 1AA`` and row 3 must be gone.
    """
    properties = pl.DataFrame(
        {
            "postcode": ["OLD 1AA", "NEW 1AA", "CF1 1AA"],
            "row_id": [1, 2, 3],
        }
    ).lazy()
    terminated_to_successor = pl.DataFrame(
        {"old_postcode": ["OLD 1AA"], "new_postcode": ["NEW 1AA"]}
    ).lazy()
    live_postcodes = pl.DataFrame({"postcode": ["NEW 1AA"]}).lazy()

    remapped = _remap_terminated_postcodes(properties, terminated_to_successor)
    kept = _filter_to_active_english_postcodes(remapped, live_postcodes)
    # Sort so the comparison does not depend on the row order of the result.
    rows = kept.collect().sort("row_id").to_dicts()

    assert rows == [
        {"postcode": "NEW 1AA", "row_id": 1},
        {"postcode": "NEW 1AA", "row_id": 2},
    ]
def test_split_normal_outputs_uses_postcode_feature_universe() -> None:
    """Check that the postcode output follows the feature table's postcodes.

    The property frame only contains ``AA1 1AA``, while the postcode feature
    frame also contains ``BB1 1BB``. The postcode output must list both
    postcodes and carry the amenity-distance column. The property output
    must hold just the postcode, address and price of the single property.
    """
    amenity_column = "Distance to nearest amenity (Park) (km)"
    property_rows = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "Address per Property Register": ["1 Example Road"],
            "Last known price": [250_000],
            "lat": [51.0],
            "lon": [-0.1],
            "ctry25cd": ["E92000001"],
            "lsoa21": ["L1"],
        }
    )
    feature_rows = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "BB1 1BB"],
            "lat": [51.0, 52.0],
            "lon": [-0.1, -0.2],
            "ctry25cd": ["E92000001", "E92000001"],
            "lsoa21": ["L1", "L2"],
            "Distance to nearest amenity (Park) (km)": [0.3, 0.8],
        }
    )

    postcode_out, properties_out = _split_normal_outputs(
        property_rows, feature_rows, expected_postcode_count=2
    )

    assert postcode_out["Postcode"].to_list() == ["AA1 1AA", "BB1 1BB"]
    assert amenity_column in postcode_out.columns
    expected_property = {
        "Postcode": "AA1 1AA",
        "Address per Property Register": "1 Example Road",
        "Last known price": 250_000,
    }
    assert properties_out.to_dicts() == [expected_property]
def test_postcode_feature_validation_rejects_unsupported_or_ungeocoded_rows() -> None:
    """Check that validation raises for a bad postcode row.

    The second fixture row has a country code other than ``E92000001`` and
    null coordinates. Validation must raise ``ValueError`` with a message
    matching ``unsupported or ungeocoded``.
    """
    invalid_rows = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "CF1 1AA"],
            "lat": [51.0, None],
            "lon": [-0.1, None],
            "ctry25cd": ["E92000001", "W92000004"],
        }
    )
    expected_error = pytest.raises(ValueError, match="unsupported or ungeocoded")

    with expected_error:
        _validate_postcode_feature_output(invalid_rows, expected_postcode_count=2)
def test_listed_building_feature_is_property_level() -> None:
assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS
@ -418,9 +530,7 @@ def test_build_unmatched_listing_seed_rows_fills_property_shape_fields(
)
unmatched_idxs = listings.select("_listing_idx")
seed = _build_unmatched_listing_seed_rows(
unmatched_idxs, listings, template_schema
)
seed = _build_unmatched_listing_seed_rows(unmatched_idxs, listings, template_schema)
assert seed.height == 1
assert seed["postcode"].to_list() == ["SW1A 1AA"]
@ -550,7 +660,12 @@ def test_match_direct_epc_matches_by_uprn_across_postcodes() -> None:
[{"_listing_uprn": "100000000001", "_listing_match_postcode": "ZZ99ZZ"}]
),
_direct_epc_candidates(
[{"_direct_epc_uprn": "100000000001", "_direct_epc_match_postcode": "AA11AA"}]
[
{
"_direct_epc_uprn": "100000000001",
"_direct_epc_match_postcode": "AA11AA",
}
]
),
)