try

2026-06-04 22:34:26 +01:00 · 2026-06-04 22:34:26 +01:00 · c938b71904
commit c938b71904
parent 843d14b7ba
13 changed files with 698 additions and 109 deletions
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@@ -13,6 +13,7 @@ from pipeline.transform.merge import (
    _active_english_postcode_area,
    _build_unmatched_listing_seed_rows,
    _canonical_postcode_expr,
+    _best_listing_match,
    _coalesce_direct_epc_columns,
    _dedupe_collapsed_properties,
    _filter_to_active_english_postcodes,
@@ -78,6 +79,40 @@ def test_conservation_area_feature_is_area_level() -> None:
    assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS


+def test_tree_density_is_area_level_and_survives_the_split() -> None:
+    # Street tree density is a postcode-centroid percentile (constant per
+    # postcode), so it must route to the postcode/area output -- not be stripped
+    # by _area_columns_from -- and must NOT be duplicated into the property
+    # output. Regression for the drift where it landed only in properties.parquet
+    # and was lost for the ~308k property-less postcodes.
+    assert TREE_DENSITY_FEATURE in _AREA_COLUMNS
+
+    df = pl.DataFrame(
+        {
+            "Postcode": ["AA1 1AA"],
+            "Last known price": [250_000],
+            TREE_DENSITY_FEATURE: [42.0],
+        }
+    )
+    postcode_features = pl.DataFrame(
+        {
+            "Postcode": ["AA1 1AA", "BB1 1BB"],
+            "lat": [51.0, 52.0],
+            "lon": [-0.1, -0.2],
+            "ctry25cd": ["E92000001", "E92000001"],
+            TREE_DENSITY_FEATURE: [42.0, 7.0],
+        }
+    )
+
+    postcode_df, properties_df = _split_normal_outputs(
+        df, postcode_features, expected_postcode_count=2
+    )
+
+    assert TREE_DENSITY_FEATURE in postcode_df.columns
+    assert postcode_df[TREE_DENSITY_FEATURE].to_list() == [42.0, 7.0]
+    assert TREE_DENSITY_FEATURE not in properties_df.columns
+
+
 def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
    # Crime is now a raw spatial count per postcode; the per-1k-residents
    # variants were dropped along with the LSOA population denominator.
@@ -767,6 +802,41 @@ def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
    assert seed["was_council_house"].to_list() == ["No"]


+def test_build_unmatched_listing_seed_rows_prefers_direct_epc_rooms_over_listing(
+    tmp_path,
+) -> None:
+    # When BOTH the listing room count and a direct-EPC count exist, the EPC
+    # value must win: the scraped "Number of bedrooms & living rooms" is actually
+    # bedrooms + bathrooms (upstream defect), so preferring it would inflate the
+    # count. This pins the coalesce direction (direct-EPC before listing).
+    listings_path = tmp_path / "listings.parquet"
+    arcgis_path = tmp_path / "arcgis.parquet"
+    _sample_listings_frame().with_columns(
+        # The corrupt listing room count (beds + baths).
+        pl.lit(5, dtype=pl.Int32).alias("Number of bedrooms & living rooms"),
+    ).write_parquet(listings_path)
+    _stub_arcgis(arcgis_path)
+
+    listings = _load_listings_for_merge(listings_path, arcgis_path).with_columns(
+        # The genuine EPC habitable-room count.
+        pl.lit(3, dtype=pl.Int16).alias("_direct_number_habitable_rooms"),
+    )
+    template_schema = pl.Schema(
+        {
+            "postcode": pl.Utf8,
+            "pp_address": pl.Utf8,
+            "number_habitable_rooms": pl.Int16,
+            **{dst: dtype for _src, dst, dtype in _LISTING_OVERLAY_SOURCES},
+        }
+    )
+
+    seed = _build_unmatched_listing_seed_rows(
+        listings.select("_listing_idx"), listings, template_schema
+    )
+
+    assert seed["number_habitable_rooms"].to_list() == [3]
+
+
 _DIRECT_EPC_CANDIDATE_SCHEMA = {
    "_direct_epc_row": pl.UInt32,
    "_direct_epc_match_address": pl.Utf8,
@@ -1249,6 +1319,98 @@ def test_integrate_listings_seeds_listing_with_unmatched_street(tmp_path) -> Non
    assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"]


+def test_best_listing_match_rejects_numberless_listing_against_numbered_property() -> (
+    None
+):
+    # Regression: a number-less listing (street/locality only) must NOT match a
+    # numbered property. The number gate is unconditional (like fuzzy_join), and
+    # the score is token_sort_ratio only, so a single locality token can no
+    # longer subset-inflate to 100 against a long numbered address.
+    candidates = [{"pp_address": "FLAT A3 CHESHAM HEIGHTS ST MONICAS ROAD"}]
+    result = _best_listing_match(
+        listing_uprn=None,
+        query="KINGSWOOD",
+        uprn_index={},
+        bucket_candidates=candidates,
+        addressed_fields=["pp_address"],
+    )
+    assert result is None
+
+
+def test_best_listing_match_allows_numberless_to_numberless_named_house() -> None:
+    # A number-less listing CAN still match a number-less (named-house) property
+    # when the street/name matches almost exactly.
+    candidates = [{"pp_address": "WOODLANDS HOUSE OAK LANE"}]
+    result = _best_listing_match(
+        listing_uprn=None,
+        query="WOODLANDS HOUSE OAK LANE",
+        uprn_index={},
+        bucket_candidates=candidates,
+        addressed_fields=["pp_address"],
+    )
+    assert result is not None
+    candidate, score, method, field = result
+    assert method == "address"
+    assert score >= 90.0
+
+
+def test_best_listing_match_still_matches_numbered_listing_to_numbered_property() -> (
+    None
+):
+    # No regression for numbered listings: the number gate still permits a
+    # compatible house number and the lower with-numbers threshold applies.
+    candidates = [{"pp_address": "10 OAK LANE"}]
+    result = _best_listing_match(
+        listing_uprn=None,
+        query="10 OAK LANE",
+        uprn_index={},
+        bucket_candidates=candidates,
+        addressed_fields=["pp_address"],
+    )
+    assert result is not None
+    _candidate, score, method, _field = result
+    assert method == "address"
+    assert score >= 82.0
+
+
+def test_best_listing_match_numbered_listing_with_trailing_locality_still_matches() -> (
+    None
+):
+    # A scraped numbered listing often appends town/county tokens that the bare
+    # Price-Paid register address omits. token_sort alone would score this ~73
+    # (below 82) and drop a correct match; token_set (allowed for numbered
+    # queries, where the number gate makes it safe) recovers it.
+    candidates = [{"pp_address": "105 RIDGEWAY DRIVE"}]
+    result = _best_listing_match(
+        listing_uprn=None,
+        query="105 RIDGEWAY DRIVE BROMLEY KENT",
+        uprn_index={},
+        bucket_candidates=candidates,
+        addressed_fields=["pp_address"],
+    )
+    assert result is not None
+    candidate, score, _method, _field = result
+    assert candidate["pp_address"] == "105 RIDGEWAY DRIVE"
+    assert score >= 82.0
+
+
+def test_best_listing_match_numbered_query_cannot_subset_inflate_across_numbers() -> (
+    None
+):
+    # token_set for numbered queries is safe only because the number gate runs
+    # first: a query and candidate with incompatible house numbers never reach
+    # scoring, so token_set cannot inflate "10 OAK LANE" onto "12 OAK LANE".
+    candidates = [{"pp_address": "12 OAK LANE KINGSTON"}]
+    result = _best_listing_match(
+        listing_uprn=None,
+        query="10 OAK LANE",
+        uprn_index={},
+        bucket_candidates=candidates,
+        addressed_fields=["pp_address"],
+    )
+    assert result is None
+
+
 def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows() -> (
    None
 ):
@@ -1325,9 +1487,12 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
    assert finalized["Est. price per sqm"].to_list() == [5_000, 7_368]
    assert finalized["Estimated current price"].to_list() == [600_000, 700_000]
    assert finalized["Last known price"].to_list() == [500_000, 700_000]
-    # Listing's preferred floor area / rooms / property type / tenure.
+    # Listing's preferred floor area / property type / tenure.
    assert finalized["Total floor area (sqm)"].to_list() == [110.0, 95.0]
-    assert finalized["Number of bedrooms & living rooms"].to_list() == [4, 3]
+    # Rooms prefer the EPC habitable-room count over the listing's beds+baths
+    # value: row 0 keeps the EPC 3 (not the listing's _actual 4); row 1 has no
+    # EPC count so it falls back to the listing's 3.
+    assert finalized["Number of bedrooms & living rooms"].to_list() == [3, 3]
    assert finalized["Property type"].to_list() == ["Terraced", "Flats/Maisonettes"]
    assert finalized["Leasehold/Freehold"].to_list() == ["Freehold", "Leasehold"]
    # Postcode-level feature carried through to both matched and unmatched rows.