try
This commit is contained in:
parent
843d14b7ba
commit
c938b71904
13 changed files with 698 additions and 109 deletions
|
|
@ -13,6 +13,7 @@ from pipeline.transform.merge import (
|
|||
_active_english_postcode_area,
|
||||
_build_unmatched_listing_seed_rows,
|
||||
_canonical_postcode_expr,
|
||||
_best_listing_match,
|
||||
_coalesce_direct_epc_columns,
|
||||
_dedupe_collapsed_properties,
|
||||
_filter_to_active_english_postcodes,
|
||||
|
|
@ -78,6 +79,40 @@ def test_conservation_area_feature_is_area_level() -> None:
|
|||
assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
|
||||
|
||||
|
||||
def test_tree_density_is_area_level_and_survives_the_split() -> None:
    """Tree density must reach the postcode output and only that output.

    Street tree density is a postcode-centroid percentile, so it is constant
    per postcode. It therefore has to be routed to the postcode/area output
    (rather than being stripped by _area_columns_from) and must NOT also be
    copied into the property output.

    Regression guard for the drift where the column ended up only in
    properties.parquet and was lost for the ~308k postcodes that have no
    properties.
    """
    assert TREE_DENSITY_FEATURE in _AREA_COLUMNS

    # One property row, living in the first of the two postcodes below.
    property_rows = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "Last known price": [250_000],
            TREE_DENSITY_FEATURE: [42.0],
        }
    )
    # Two postcodes; "BB1 1BB" has no matching property row.
    area_rows = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "BB1 1BB"],
            "lat": [51.0, 52.0],
            "lon": [-0.1, -0.2],
            "ctry25cd": ["E92000001", "E92000001"],
            TREE_DENSITY_FEATURE: [42.0, 7.0],
        }
    )

    split = _split_normal_outputs(property_rows, area_rows, expected_postcode_count=2)
    postcode_out, properties_out = split

    # Present, with both postcode values, on the postcode side ...
    assert TREE_DENSITY_FEATURE in postcode_out.columns
    assert postcode_out[TREE_DENSITY_FEATURE].to_list() == [42.0, 7.0]
    # ... and absent on the property side.
    assert TREE_DENSITY_FEATURE not in properties_out.columns
|
||||
|
||||
|
||||
def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
|
||||
# Crime is now a raw spatial count per postcode; the per-1k-residents
|
||||
# variants were dropped along with the LSOA population denominator.
|
||||
|
|
@ -767,6 +802,41 @@ def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
|
|||
assert seed["was_council_house"].to_list() == ["No"]
|
||||
|
||||
|
||||
def test_build_unmatched_listing_seed_rows_prefers_direct_epc_rooms_over_listing(
    tmp_path,
) -> None:
    """Pin the coalesce direction: direct-EPC room count before the listing's.

    When BOTH the listing room count and a direct-EPC count exist, the EPC
    value must win. The scraped "Number of bedrooms & living rooms" is
    actually bedrooms + bathrooms (upstream defect), so preferring it would
    inflate the count.
    """
    listings_file = tmp_path / "listings.parquet"
    arcgis_file = tmp_path / "arcgis.parquet"

    # The corrupt listing room count (beds + baths).
    corrupt_rooms = pl.lit(5, dtype=pl.Int32).alias("Number of bedrooms & living rooms")
    sample = _sample_listings_frame().with_columns(corrupt_rooms)
    sample.write_parquet(listings_file)
    _stub_arcgis(arcgis_file)

    # The genuine EPC habitable-room count.
    epc_rooms = pl.lit(3, dtype=pl.Int16).alias("_direct_number_habitable_rooms")
    loaded = _load_listings_for_merge(listings_file, arcgis_file)
    listings = loaded.with_columns(epc_rooms)

    # Fixed columns first, then one column per listing overlay destination.
    schema_fields = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "number_habitable_rooms": pl.Int16,
    }
    schema_fields.update(
        (dst, dtype) for _src, dst, dtype in _LISTING_OVERLAY_SOURCES
    )
    template_schema = pl.Schema(schema_fields)

    unmatched_idx = listings.select("_listing_idx")
    seed = _build_unmatched_listing_seed_rows(unmatched_idx, listings, template_schema)

    assert seed["number_habitable_rooms"].to_list() == [3]
|
||||
|
||||
|
||||
_DIRECT_EPC_CANDIDATE_SCHEMA = {
|
||||
"_direct_epc_row": pl.UInt32,
|
||||
"_direct_epc_match_address": pl.Utf8,
|
||||
|
|
@ -1249,6 +1319,98 @@ def test_integrate_listings_seeds_listing_with_unmatched_street(tmp_path) -> Non
|
|||
assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"]
|
||||
|
||||
|
||||
def test_best_listing_match_rejects_numberless_listing_against_numbered_property() -> (
    None
):
    """A number-less listing must NOT match a numbered property.

    Regression: the query here is street/locality only. The number gate is
    unconditional (like fuzzy_join) and the score is token_sort_ratio only,
    so a single locality token can no longer subset-inflate to 100 against a
    long numbered address.
    """
    numbered_property = {"pp_address": "FLAT A3 CHESHAM HEIGHTS ST MONICAS ROAD"}

    match = _best_listing_match(
        listing_uprn=None,
        query="KINGSWOOD",
        uprn_index={},
        bucket_candidates=[numbered_property],
        addressed_fields=["pp_address"],
    )

    assert match is None
|
||||
|
||||
|
||||
def test_best_listing_match_allows_numberless_to_numberless_named_house() -> None:
    """A number-less listing can still match a number-less (named-house) property.

    The match is expected when the street/name matches almost exactly; here
    the query and the candidate address are identical.
    """
    candidates = [{"pp_address": "WOODLANDS HOUSE OAK LANE"}]
    result = _best_listing_match(
        listing_uprn=None,
        query="WOODLANDS HOUSE OAK LANE",
        uprn_index={},
        bucket_candidates=candidates,
        addressed_fields=["pp_address"],
    )
    assert result is not None
    # Only the score and method are asserted on; the unused tuple slots are
    # underscore-prefixed, matching the sibling _best_listing_match tests.
    _candidate, score, method, _field = result
    assert method == "address"
    assert score >= 90.0
|
||||
|
||||
|
||||
def test_best_listing_match_still_matches_numbered_listing_to_numbered_property() -> (
    None
):
    """Numbered listings keep matching numbered properties (no regression).

    The number gate still permits a compatible house number and the lower
    with-numbers threshold applies.
    """
    numbered_property = {"pp_address": "10 OAK LANE"}

    match = _best_listing_match(
        listing_uprn=None,
        query="10 OAK LANE",
        uprn_index={},
        bucket_candidates=[numbered_property],
        addressed_fields=["pp_address"],
    )

    assert match is not None
    # Tuple layout: (candidate, score, method, field); only two are checked.
    match_score = match[1]
    match_method = match[2]
    assert len(match) == 4
    assert match_method == "address"
    assert match_score >= 82.0
|
||||
|
||||
|
||||
def test_best_listing_match_numbered_listing_with_trailing_locality_still_matches() -> (
    None
):
    """Trailing town/county tokens must not sink a correct numbered match.

    A scraped numbered listing often appends town/county tokens that the bare
    Price-Paid register address omits. token_sort alone would score this ~73
    (below 82) and drop a correct match; token_set (allowed for numbered
    queries, where the number gate makes it safe) recovers it.
    """
    register_row = {"pp_address": "105 RIDGEWAY DRIVE"}

    match = _best_listing_match(
        listing_uprn=None,
        query="105 RIDGEWAY DRIVE BROMLEY KENT",
        uprn_index={},
        bucket_candidates=[register_row],
        addressed_fields=["pp_address"],
    )

    assert match is not None
    matched_row, match_score, _method, _field = match
    assert matched_row["pp_address"] == "105 RIDGEWAY DRIVE"
    assert match_score >= 82.0
|
||||
|
||||
|
||||
def test_best_listing_match_numbered_query_cannot_subset_inflate_across_numbers() -> (
    None
):
    """Incompatible house numbers must never produce a match.

    token_set for numbered queries is safe only because the number gate runs
    first: a query and candidate with incompatible house numbers never reach
    scoring, so token_set cannot inflate "10 OAK LANE" onto "12 OAK LANE".
    """
    wrong_number_row = {"pp_address": "12 OAK LANE KINGSTON"}

    match = _best_listing_match(
        listing_uprn=None,
        query="10 OAK LANE",
        uprn_index={},
        bucket_candidates=[wrong_number_row],
        addressed_fields=["pp_address"],
    )

    assert match is None
|
||||
|
||||
|
||||
def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows() -> (
|
||||
None
|
||||
):
|
||||
|
|
@ -1325,9 +1487,12 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
|
|||
assert finalized["Est. price per sqm"].to_list() == [5_000, 7_368]
|
||||
assert finalized["Estimated current price"].to_list() == [600_000, 700_000]
|
||||
assert finalized["Last known price"].to_list() == [500_000, 700_000]
|
||||
# Listing's preferred floor area / rooms / property type / tenure.
|
||||
# Listing's preferred floor area / property type / tenure.
|
||||
assert finalized["Total floor area (sqm)"].to_list() == [110.0, 95.0]
|
||||
assert finalized["Number of bedrooms & living rooms"].to_list() == [4, 3]
|
||||
# Rooms prefer the EPC habitable-room count over the listing's beds+baths
|
||||
# value: row 0 keeps the EPC 3 (not the listing's _actual 4); row 1 has no
|
||||
# EPC count so it falls back to the listing's 3.
|
||||
assert finalized["Number of bedrooms & living rooms"].to_list() == [3, 3]
|
||||
assert finalized["Property type"].to_list() == ["Terraced", "Flats/Maisonettes"]
|
||||
assert finalized["Leasehold/Freehold"].to_list() == ["Freehold", "Leasehold"]
|
||||
# Postcode-level feature carried through to both matched and unmatched rows.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue