try
Some checks failed
CI / Check (push) Failing after 3m22s
Build and publish Docker image / build-and-push (push) Successful in 7m25s

This commit is contained in:
Andras Schmelczer 2026-06-04 22:34:26 +01:00
parent 843d14b7ba
commit c938b71904
13 changed files with 698 additions and 109 deletions

View file

@ -13,6 +13,7 @@ from pipeline.transform.merge import (
_active_english_postcode_area,
_build_unmatched_listing_seed_rows,
_canonical_postcode_expr,
_best_listing_match,
_coalesce_direct_epc_columns,
_dedupe_collapsed_properties,
_filter_to_active_english_postcodes,
@ -78,6 +79,40 @@ def test_conservation_area_feature_is_area_level() -> None:
assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
def test_tree_density_is_area_level_and_survives_the_split() -> None:
    # Street tree density is computed as a postcode-centroid percentile, i.e.
    # it is constant within a postcode. It therefore belongs in the
    # postcode/area output: _area_columns_from must not strip it, and it must
    # not be copied into the property output as well. This guards against the
    # earlier drift where the column ended up only in properties.parquet and
    # vanished for the ~308k postcodes that have no properties.
    assert TREE_DENSITY_FEATURE in _AREA_COLUMNS

    # One merged property row, living in the first of the two postcodes.
    merged_rows = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "Last known price": [250_000],
            TREE_DENSITY_FEATURE: [42.0],
        }
    )
    # Two postcodes' worth of area features; "BB1 1BB" has no property row.
    area_features = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "BB1 1BB"],
            "lat": [51.0, 52.0],
            "lon": [-0.1, -0.2],
            "ctry25cd": ["E92000001", "E92000001"],
            TREE_DENSITY_FEATURE: [42.0, 7.0],
        }
    )

    area_out, property_out = _split_normal_outputs(
        merged_rows, area_features, expected_postcode_count=2
    )

    assert TREE_DENSITY_FEATURE in area_out.columns
    assert area_out[TREE_DENSITY_FEATURE].to_list() == [42.0, 7.0]
    assert TREE_DENSITY_FEATURE not in property_out.columns
def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
# Crime is now a raw spatial count per postcode; the per-1k-residents
# variants were dropped along with the LSOA population denominator.
@ -767,6 +802,41 @@ def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
assert seed["was_council_house"].to_list() == ["No"]
def test_build_unmatched_listing_seed_rows_prefers_direct_epc_rooms_over_listing(
    tmp_path,
) -> None:
    # With both a listing room count and a direct-EPC count present, the EPC
    # figure has to take precedence. The scraped "Number of bedrooms & living
    # rooms" is really bedrooms + bathrooms (an upstream defect), so letting
    # it win would overstate the count. This test fixes the coalesce order:
    # direct-EPC first, listing second.
    listings_file = tmp_path / "listings.parquet"
    arcgis_file = tmp_path / "arcgis.parquet"

    # The corrupt listing room count (beds + baths).
    corrupt_listing_rooms = pl.lit(5, dtype=pl.Int32).alias(
        "Number of bedrooms & living rooms"
    )
    _sample_listings_frame().with_columns(corrupt_listing_rooms).write_parquet(
        listings_file
    )
    _stub_arcgis(arcgis_file)

    # The genuine EPC habitable-room count.
    epc_habitable_rooms = pl.lit(3, dtype=pl.Int16).alias(
        "_direct_number_habitable_rooms"
    )
    listings = _load_listings_for_merge(listings_file, arcgis_file).with_columns(
        epc_habitable_rooms
    )

    # Base columns first, then every overlay destination column with its dtype.
    schema_fields = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "number_habitable_rooms": pl.Int16,
    }
    schema_fields.update(
        {dst: dtype for _src, dst, dtype in _LISTING_OVERLAY_SOURCES}
    )
    template_schema = pl.Schema(schema_fields)

    unmatched_idx = listings.select("_listing_idx")
    seed = _build_unmatched_listing_seed_rows(unmatched_idx, listings, template_schema)

    assert seed["number_habitable_rooms"].to_list() == [3]
_DIRECT_EPC_CANDIDATE_SCHEMA = {
"_direct_epc_row": pl.UInt32,
"_direct_epc_match_address": pl.Utf8,
@ -1249,6 +1319,98 @@ def test_integrate_listings_seeds_listing_with_unmatched_street(tmp_path) -> Non
assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"]
def test_best_listing_match_rejects_numberless_listing_against_numbered_property() -> (
    None
):
    # Regression: a listing without a house number (street/locality only) must
    # never be paired with a numbered property. The number gate applies
    # unconditionally (as in fuzzy_join) and scoring uses token_sort_ratio
    # only, so one locality token can no longer subset-inflate to 100 against
    # a long numbered address.
    numbered_property = {"pp_address": "FLAT A3 CHESHAM HEIGHTS ST MONICAS ROAD"}

    match = _best_listing_match(
        listing_uprn=None,
        query="KINGSWOOD",
        uprn_index={},
        bucket_candidates=[numbered_property],
        addressed_fields=["pp_address"],
    )

    assert match is None
def test_best_listing_match_allows_numberless_to_numberless_named_house() -> None:
    # A number-less listing CAN still match a number-less (named-house) property
    # when the street/name matches almost exactly.
    candidates = [{"pp_address": "WOODLANDS HOUSE OAK LANE"}]
    result = _best_listing_match(
        listing_uprn=None,
        query="WOODLANDS HOUSE OAK LANE",
        uprn_index={},
        bucket_candidates=candidates,
        addressed_fields=["pp_address"],
    )
    assert result is not None
    # Only the score and the match method are checked; the other two tuple
    # slots are underscore-prefixed to mark them as intentionally unused, the
    # same way the sibling _best_listing_match tests do.
    _candidate, score, method, _field = result
    assert method == "address"
    assert score >= 90.0
def test_best_listing_match_still_matches_numbered_listing_to_numbered_property() -> (
    None
):
    # Numbered listings must be unaffected: a compatible house number passes
    # the number gate, and the lower with-numbers threshold is the one applied.
    numbered_property = {"pp_address": "10 OAK LANE"}

    match = _best_listing_match(
        listing_uprn=None,
        query="10 OAK LANE",
        uprn_index={},
        bucket_candidates=[numbered_property],
        addressed_fields=["pp_address"],
    )

    assert match is not None
    _candidate, score, method, _field = match
    assert method == "address"
    assert score >= 82.0
def test_best_listing_match_numbered_listing_with_trailing_locality_still_matches() -> (
    None
):
    # Scraped numbered listings frequently carry town/county tokens at the end
    # that the bare Price-Paid register address leaves out. On token_sort
    # alone this pair scores ~73 (under 82), which would discard a correct
    # match; token_set -- permitted for numbered queries because the number
    # gate makes it safe -- brings it back.
    register_address = "105 RIDGEWAY DRIVE"
    pool = [{"pp_address": register_address}]

    match = _best_listing_match(
        listing_uprn=None,
        query="105 RIDGEWAY DRIVE BROMLEY KENT",
        uprn_index={},
        bucket_candidates=pool,
        addressed_fields=["pp_address"],
    )

    assert match is not None
    candidate, score, _method, _field = match
    assert candidate["pp_address"] == "105 RIDGEWAY DRIVE"
    assert score >= 82.0
def test_best_listing_match_numbered_query_cannot_subset_inflate_across_numbers() -> (
    None
):
    # Using token_set for numbered queries is only safe because the number
    # gate is evaluated first: when the query and candidate carry incompatible
    # house numbers they never get scored at all, so token_set has no chance
    # to inflate "10 OAK LANE" onto "12 OAK LANE".
    neighbouring_property = {"pp_address": "12 OAK LANE KINGSTON"}

    match = _best_listing_match(
        listing_uprn=None,
        query="10 OAK LANE",
        uprn_index={},
        bucket_candidates=[neighbouring_property],
        addressed_fields=["pp_address"],
    )

    assert match is None
def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows() -> (
None
):
@ -1325,9 +1487,12 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
assert finalized["Est. price per sqm"].to_list() == [5_000, 7_368]
assert finalized["Estimated current price"].to_list() == [600_000, 700_000]
assert finalized["Last known price"].to_list() == [500_000, 700_000]
# Listing's preferred floor area / rooms / property type / tenure.
# Listing's preferred floor area / property type / tenure.
assert finalized["Total floor area (sqm)"].to_list() == [110.0, 95.0]
assert finalized["Number of bedrooms & living rooms"].to_list() == [4, 3]
# Rooms prefer the EPC habitable-room count over the listing's beds+baths
# value: row 0 keeps the EPC 3 (not the listing's _actual 4); row 1 has no
# EPC count so it falls back to the listing's 3.
assert finalized["Number of bedrooms & living rooms"].to_list() == [3, 3]
assert finalized["Property type"].to_list() == ["Terraced", "Flats/Maisonettes"]
assert finalized["Leasehold/Freehold"].to_list() == ["Freehold", "Leasehold"]
# Postcode-level feature carried through to both matched and unmatched rows.