try

2026-06-04 22:34:26 +01:00 · 2026-06-04 22:34:26 +01:00 · c938b71904
commit c938b71904
parent 843d14b7ba
13 changed files with 698 additions and 109 deletions
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@@ -30,6 +30,7 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping

 MIN_FLOOR_AREA_M2 = 10
 CONSERVATION_AREA_FEATURE = "Within conservation area"
+TREE_DENSITY_FEATURE = "Street tree density percentile"
 LISTED_BUILDING_FEATURE = "Listed building"
 LISTED_BUILDING_MATCH_RADIUS_M = 250.0
 LISTED_BUILDING_NEAREST_POSTCODES = 3
@@ -92,6 +93,10 @@ _AREA_COLUMNS = [
    "Noise (dB)",
    "Max available download speed (Mbps)",
    CONSERVATION_AREA_FEATURE,
+    # Tree canopy is a 50m-radius percentile around the postcode centroid, so it
+    # is postcode-grain: it belongs in the area output (one value per postcode,
+    # covering property-less postcodes too) rather than duplicated per property.
+    TREE_DENSITY_FEATURE,
    # Schools
    "Good+ primary schools within 5km",
    "Good+ secondary schools within 5km",
@@ -116,7 +121,6 @@ _AREA_COLUMNS = [

 _DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
 _DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")
-TREE_DENSITY_FEATURE = "Street tree density percentile"
 _POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
    r"^Tree canopy density percentile within \d+m$"
 )
@@ -818,9 +822,9 @@ def _dedupe_collapsed_properties(wide: pl.LazyFrame) -> pl.LazyFrame:
    untouched. pp_address is non-null here (join_epc_pp filters it), so the key
    never merges unrelated rows.
    """
-    return wide.sort(
-        "date_of_transfer", descending=True, nulls_last=True
-    ).unique(subset=["postcode", "pp_address"], keep="first", maintain_order=True)
+    return wide.sort("date_of_transfer", descending=True, nulls_last=True).unique(
+        subset=["postcode", "pp_address"], keep="first", maintain_order=True
+    )


 def _filter_to_active_english_postcodes(
@@ -1108,13 +1112,26 @@ def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
    return epc_band_to_year(pl.col(column))


-def _address_score(query: str, candidate: str | None) -> int:
+def _address_score(query: str, candidate: str | None, *, allow_token_set: bool) -> int:
    if not candidate:
        return 0
-    return max(
-        fuzz.token_set_ratio(query, candidate),
-        fuzz.token_sort_ratio(query, candidate),
-    )
+    # token_set_ratio returns 100 whenever the shorter token set is a subset of
+    # the longer. For a NUMBER-LESS query that is unsafe — a single locality
+    # token (e.g. "KINGSWOOD") subsets to 100 against any long address that
+    # merely contains it — so number-less queries score with token_sort_ratio
+    # only, matching the canonical fuzzy_join._score_bucket. For a NUMBERED
+    # query the unconditional _numbers_compatible gate has already guaranteed the
+    # candidate carries compatible house numbers, so token_set cannot inflate
+    # across different addresses; allowing it recovers genuine matches where the
+    # scraped listing appends trailing town/county tokens the bare register
+    # address omits (e.g. "105 RIDGEWAY DRIVE BROMLEY KENT" vs "105 RIDGEWAY
+    # DRIVE").
+    if allow_token_set:
+        return max(
+            fuzz.token_set_ratio(query, candidate),
+            fuzz.token_sort_ratio(query, candidate),
+        )
+    return fuzz.token_sort_ratio(query, candidate)


 def _has_number(address: str | None) -> bool:
@@ -1153,9 +1170,12 @@ def _best_listing_match(
    ``uprn_index`` (postcode-independent, so it is robust even when the
    listing's postcode is slightly off); (2) failing that, the highest
    fuzzy street-address similarity within the listing's own postcode bucket.
-    No property-attribute heuristics are used — a house number in the listing
-    address gates the fuzzy match (`_numbers_compatible`) and lowers the score
-    threshold; a number-less address must match the street almost exactly.
+    No property-attribute heuristics are used — `_numbers_compatible` gates
+    every fuzzy match unconditionally (so a number-less listing can never match
+    a numbered property, and vice versa), as in the canonical
+    `fuzzy_join._score_bucket`. A house number additionally lowers the score
+    threshold and (via `_address_score`) permits token_set scoring; a number-less
+    address scores on token_sort only and must match the street almost exactly.

    ``addressed_fields`` names the candidate columns to fuzzy-match against (a
    candidate may carry both a register and an EPC address). Returns
@@ -1180,9 +1200,11 @@ def _best_listing_match(
            address = candidate.get(field)
            if not address:
                continue
-            if listing_has_numbers and not _numbers_compatible(query, address):
+            # Unconditional number gate (matches fuzzy_join): a number-less
+            # listing cannot match a numbered candidate and vice versa.
+            if not _numbers_compatible(query, address):
                continue
-            score = _address_score(query, address)
+            score = _address_score(query, address, allow_token_set=listing_has_numbers)
            if score > best_score:
                best_score = score
                best = candidate
@@ -1675,7 +1697,9 @@ def _coalesce_direct_epc_columns(wide: pl.LazyFrame) -> pl.LazyFrame:
        # "Yes". "Former council house" should fire if EITHER side says so.
        if raw_column == "was_council_house":
            return (
-                pl.when((pl.col(raw_column) == "Yes") | (pl.col(direct_column) == "Yes"))
+                pl.when(
+                    (pl.col(raw_column) == "Yes") | (pl.col(direct_column) == "Yes")
+                )
                .then(pl.lit("Yes"))
                .otherwise(coalesce)
                .alias(raw_column)
@@ -1716,9 +1740,13 @@ def _build_unmatched_listing_seed_rows(
        "total_floor_area": pl.coalesce(
            pl.col("_actual_total_floor_area"), pl.col("_direct_total_floor_area")
        ),
+        # Prefer the direct-EPC habitable-room count over the listing's value:
+        # the scraped room count is bedrooms + bathrooms (upstream storage.py
+        # defect), so it over-counts. Fall back to the listing value only when
+        # the direct-EPC match has no count.
        "number_habitable_rooms": pl.coalesce(
-            pl.col("_actual_number_habitable_rooms"),
            pl.col("_direct_number_habitable_rooms"),
+            pl.col("_actual_number_habitable_rooms"),
        ),
        "latest_price": pl.col("_actual_asking_price"),
    }
@@ -1836,14 +1864,19 @@ def _finalize_listings(df: pl.DataFrame) -> pl.DataFrame:
        # Listing coordinates win over the postcode centroid.
        pl.coalesce(pl.col("_actual_lat").cast(pl.Float64), pl.col("lat")).alias("lat"),
        pl.coalesce(pl.col("_actual_lon").cast(pl.Float64), pl.col("lon")).alias("lon"),
-        # Listing's floor area / rooms override any EPC/PP value when present.
+        # Listing's floor area overrides any EPC/PP value when present.
        pl.coalesce(
            pl.col("_actual_total_floor_area").cast(pl.Float64),
            pl.col("Total floor area (sqm)"),
        ).alias("Total floor area (sqm)"),
+        # Rooms: prefer the EPC habitable-room count and fall back to the listing
+        # value only when no EPC count exists. The scraped "Number of bedrooms &
+        # living rooms" is actually bedrooms + bathrooms (an upstream storage.py
+        # defect), so preferring it would inflate the room count and overwrite a
+        # correct EPC value.
        pl.coalesce(
-            pl.col("_actual_number_habitable_rooms").cast(pl.Int16),
            pl.col("Number of bedrooms & living rooms"),
+            pl.col("_actual_number_habitable_rooms").cast(pl.Int16),
        ).alias("Number of bedrooms & living rooms"),
        pl.when(pl.col("_actual_property_type").is_in(_PROPERTY_TYPE_VALUES))
        .then(pl.col("_actual_property_type"))
@@ -2130,12 +2163,15 @@ def _build(
        pl.when(
            (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
            & (
-                (pl.col("latest_price") / pl.col("total_floor_area"))
-                .is_between(MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM)
+                (pl.col("latest_price") / pl.col("total_floor_area")).is_between(
+                    MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM
+                )
            )
        )
        .then(
-            (pl.col("latest_price") / pl.col("total_floor_area")).round(0).cast(pl.Int32)
+            (pl.col("latest_price") / pl.col("total_floor_area"))
+            .round(0)
+            .cast(pl.Int32)
        )
        .otherwise(None)
        .alias("Price per sqm"),