idk

2026-06-02 13:46:18 +01:00 · 2026-06-02 13:46:18 +01:00 · d43da9708c
commit d43da9708c
parent a04ac2d857
47 changed files with 4120 additions and 573 deletions
--- a/pipeline/utils/fuzzy_join.py
+++ b/pipeline/utils/fuzzy_join.py
@@ -13,7 +13,12 @@ from pipeline.local_temp import local_tmp_dir

 _NUMBER_RE = re.compile(r"\d+")
 _POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
-MIN_FUZZY_SCORE = 60
+# A house number is a strong disambiguator, so a pair that contains a number
+# (and is number-compatible) may match on a lower address-similarity score.
+# A number-less pair (named houses, flats known by building name) must match
+# almost exactly to be trusted.  Mirrors merge.py's listings convention.
+MIN_FUZZY_SCORE = 82
+MIN_FUZZY_SCORE_WITHOUT_NUMBERS = 90


 def normalize_address_key(s: pl.Expr) -> pl.Expr:
@@ -47,6 +52,7 @@ def fuzzy_join_on_postcode(
    left_postcode_col: str,
    right_postcode_col: str,
    min_score: int = MIN_FUZZY_SCORE,
+    min_score_without_numbers: int = MIN_FUZZY_SCORE_WITHOUT_NUMBERS,
 ) -> pl.LazyFrame:
    """Fuzzy join two LazyFrames by matching addresses within postcode buckets.

@@ -120,7 +126,12 @@ def fuzzy_join_on_postcode(

        # Build tasks for each postcode bucket
        tasks = [
-            (left_entries, right_by_postcode[postcode], min_score)
+            (
+                left_entries,
+                right_by_postcode[postcode],
+                min_score,
+                min_score_without_numbers,
+            )
            for postcode, left_entries in left_by_postcode.items()
            if postcode in right_by_postcode
        ]
@@ -201,16 +212,23 @@ def _numbers_compatible(a: str, b: str) -> bool:


 def _score_bucket(
-    args: tuple[list[tuple[int, str]], list[tuple[int, str]], int],
+    args: tuple[list[tuple[int, str]], list[tuple[int, str]], int, int],
 ) -> list[tuple[int, int, int]]:
    """Score all address pairs within a single postcode bucket."""
-    left_entries, right_entries, min_score = args
+    left_entries, right_entries, min_score, min_score_without_numbers = args
    pairs = []
    for left_row, left_address in left_entries:
        for right_row, right_address in right_entries:
            if not _numbers_compatible(left_address, right_address):
                continue
            score = fuzz.token_sort_ratio(left_address, right_address)
-            if score >= min_score:
+            # When neither address contains a digit (named houses, building-name
+            # flats) no house number can disambiguate, so require a near-exact match.
+            threshold = (
+                min_score
+                if _NUMBER_RE.search(left_address) or _NUMBER_RE.search(right_address)
+                else min_score_without_numbers
+            )
+            if score >= threshold:
                pairs.append((score, left_row, right_row))
    return pairs
--- a/pipeline/utils/postcode_mapping.py
+++ b/pipeline/utils/postcode_mapping.py
@@ -6,6 +6,16 @@ import numpy as np
 import polars as pl
 from scipy.spatial import cKDTree

+# Maximum distance (in OS National Grid metres) a terminated postcode may be from its
+# nearest active successor to be remapped. Beyond this we treat the postcode as having no
+# legitimate successor (e.g. demolished/redeveloped land) rather than re-homing it onto a
+# geometrically-nearest-but-unrelated postcode on a different street/estate/LSOA, which
+# would pollute the successor's crime/deprivation/school/noise/rent and price stats.
+# 1km is conservative: it keeps legitimate adjacent remaps while dropping gross
+# misattributions; dropped postcodes keep their terminated code and fall out at the
+# active-postcode filter downstream (the honest outcome confirmed by the merge audit).
+MAX_REMAP_DISTANCE_M = 1000.0
+

 def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
    """Build a mapping from terminated England postcodes to their nearest active postcode.
@@ -50,18 +60,30 @@ def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
    )

    tree = cKDTree(active_coords)
-    distances, indices = tree.query(terminated_coords)
+    distances, indices = tree.query(
+        terminated_coords, distance_upper_bound=MAX_REMAP_DISTANCE_M
+    )
+
+    # cKDTree returns distance=inf and index==len(active) for points with no neighbour
+    # within the bound. Drop those terminated postcodes rather than gather an out-of-range
+    # index; they keep their terminated code and fall out at the active-postcode filter.
+    within_bound = np.isfinite(distances)
+    dropped = int((~within_bound).sum())

    active_postcodes = active["pcds"]
    mapping = pl.DataFrame(
        {
-            "old_postcode": terminated["pcds"],
-            "new_postcode": active_postcodes.gather(indices),
+            "old_postcode": terminated["pcds"].filter(pl.Series(within_bound)),
+            "new_postcode": active_postcodes.gather(indices[within_bound]),
        }
    )

+    kept_distances = distances[within_bound]
    print(
-        f"Postcode mapping: max distance = {distances.max():.0f}m, median = {np.median(distances):.0f}m"
+        f"Postcode mapping: {dropped} terminated postcodes dropped (> {MAX_REMAP_DISTANCE_M:.0f}m), "
+        f"max distance = {kept_distances.max():.0f}m, median = {np.median(kept_distances):.0f}m"
+        if kept_distances.size
+        else f"Postcode mapping: {dropped} terminated postcodes dropped (> {MAX_REMAP_DISTANCE_M:.0f}m), none remapped"
    )

    return mapping
--- a/pipeline/utils/test_fuzzy_join.py
+++ b/pipeline/utils/test_fuzzy_join.py
@@ -134,6 +134,91 @@ def test_fuzzy_join_on_postcode_rejects_blank_and_invalid_match_keys():
    ]


+def test_fuzzy_join_rejects_mid_score_number_less_match():
+    # "THE COACH HOUSE" vs "THE OLD COACH HOUSE" scores 88 via token_sort_ratio:
+    # above the old MIN_FUZZY_SCORE of 60 (so it used to falsely match) but below
+    # the number-less threshold of 90, so it must NOT match now.
+    left = pl.LazyFrame(
+        {
+            "left_address": ["The Coach House"],
+            "left_postcode": ["AB1 2CD"],
+        }
+    )
+    right = pl.LazyFrame(
+        {
+            "right_address": ["The Old Coach House"],
+            "right_postcode": ["AB1 2CD"],
+        }
+    )
+
+    result = fuzzy_join_on_postcode(
+        left=left,
+        right=right,
+        left_address_col="left_address",
+        right_address_col="right_address",
+        left_postcode_col="left_postcode",
+        right_postcode_col="right_postcode",
+    ).collect()
+
+    assert result["right_address"].to_list() == [None]
+
+
+def test_fuzzy_join_matches_numbered_pair_at_baseline_threshold():
+    # "10 ACACIA AVENUE" vs "FLAT A 10 ACACIA AVENUE" scores exactly 82 and the
+    # house number is compatible, so the numbered baseline (>= 82) still matches.
+    left = pl.LazyFrame(
+        {
+            "left_address": ["10 Acacia Avenue"],
+            "left_postcode": ["AB1 2CD"],
+        }
+    )
+    right = pl.LazyFrame(
+        {
+            "right_address": ["Flat A, 10 Acacia Avenue"],
+            "right_postcode": ["AB1 2CD"],
+        }
+    )
+
+    result = fuzzy_join_on_postcode(
+        left=left,
+        right=right,
+        left_address_col="left_address",
+        right_address_col="right_address",
+        left_postcode_col="left_postcode",
+        right_postcode_col="right_postcode",
+    ).collect()
+
+    assert result["right_address"].to_list() == ["Flat A, 10 Acacia Avenue"]
+
+
+def test_fuzzy_join_matches_high_score_number_less_pair():
+    # A number-less pair that clears the 90 threshold (here an exact token match,
+    # score 100) must still match.
+    left = pl.LazyFrame(
+        {
+            "left_address": ["The Old Rectory"],
+            "left_postcode": ["AB1 2CD"],
+        }
+    )
+    right = pl.LazyFrame(
+        {
+            "right_address": ["THE OLD RECTORY"],
+            "right_postcode": ["AB1 2CD"],
+        }
+    )
+
+    result = fuzzy_join_on_postcode(
+        left=left,
+        right=right,
+        left_address_col="left_address",
+        right_address_col="right_address",
+        left_postcode_col="left_postcode",
+        right_postcode_col="right_postcode",
+    ).collect()
+
+    assert result["right_address"].to_list() == ["THE OLD RECTORY"]
+
+
 def test_normalize_postcode_key_requires_full_postcode():
    df = pl.DataFrame(
        {