Split up

2026-06-12 21:51:37 +01:00 · 2026-06-12 21:51:37 +01:00 · f59d01227b
commit f59d01227b
parent cf39ad754e
91 changed files with 10370 additions and 7562 deletions
--- a/pipeline/utils/fuzzy_join.py
+++ b/pipeline/utils/fuzzy_join.py
@ -1,6 +1,8 @@
 import re
 import shutil
 import tempfile
+from collections import Counter
+from collections.abc import Sequence
 from concurrent.futures import ProcessPoolExecutor
 from os import cpu_count
 from pathlib import Path
@ -10,6 +12,7 @@ from thefuzz import fuzz
 from tqdm import tqdm

 from pipeline.local_temp import local_tmp_dir
+from pipeline.utils.normalize import uppercase_alnum_key_expr

 # A house-number token includes any letter suffix: 8A, 8B and plain 8 are
 # three different properties on the same street, so digit-only extraction
@ -17,6 +20,10 @@ from pipeline.local_temp import local_tmp_dir
 # through normalize_address_key first, so tokens are uppercase and
 # space-separated and [A-Z] suffices for the suffix.
 _NUMBER_RE = re.compile(r"\d+[A-Z]?")
+# A single-letter flat designator ("FLAT B", "APARTMENT C") is a house-number-
+# grade disambiguator with no digit in it: without this, FLAT B and FLAT D in
+# the same building scored ~96 and cross-matched.
+_FLAT_LETTER_RE = re.compile(r"\b(?:FLAT|APARTMENT|APT|UNIT) ([A-Z])\b")
 _POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
 # A house number is a strong disambiguator, so a numbered, number-compatible
 # pair may match on a lower address-similarity score than a number-less one
@ -24,16 +31,30 @@ _POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
 # be trusted.  Mirrors merge.py's listings convention.
 MIN_FUZZY_SCORE = 82
 MIN_FUZZY_SCORE_WITHOUT_NUMBERS = 90
+# A score reached only through an address VARIANT (locality appended /
+# secondary address lines dropped) accepts a match the primary strings alone
+# would reject, so it must clear a near-exact bar: in the miss audit >99% of
+# genuine variant recoveries scored 100, while the rare false variant matches
+# scored in the 80s.
+MIN_VARIANT_SCORE = 90
+
+# Tokens that mark a sub-unit of a building. A variant whose added/dropped
+# tokens include one of these could score a single flat's certificate as if it
+# were the whole building, so such variants are inadmissible.
+_FLAT_TOKENS = {
+    "FLAT",
+    "FLATS",
+    "APARTMENT",
+    "APT",
+    "UNIT",
+    "MAISONETTE",
+    "STUDIO",
+    "ROOM",
+}


 def normalize_address_key(s: pl.Expr) -> pl.Expr:
-    normalized = (
-        s.cast(pl.String)
-        .str.to_uppercase()
-        .str.replace_all(r"[^0-9A-Z]+", " ")
-        .str.replace_all(r"\s+", " ")
-        .str.strip_chars()
-    )
+    normalized = uppercase_alnum_key_expr(s)
    return pl.when(normalized.str.contains(r"[A-Z]")).then(normalized).otherwise(None)


@ -58,6 +79,8 @@ def fuzzy_join_on_postcode(
    right_postcode_col: str,
    min_score: int = MIN_FUZZY_SCORE,
    min_score_without_numbers: int = MIN_FUZZY_SCORE_WITHOUT_NUMBERS,
+    left_variant_cols: Sequence[str] = (),
+    right_variant_cols: Sequence[str] = (),
 ) -> pl.LazyFrame:
    """Fuzzy join two LazyFrames by matching addresses within postcode buckets.

@ -66,6 +89,19 @@ def fuzzy_join_on_postcode(
    columns (index, address, postcode) via projection pushdown, and the
    final join reads the remaining columns lazily.

+    ``left_variant_cols`` / ``right_variant_cols`` name alternative address
+    columns for the same property (e.g. the EPC's first address line without
+    its locality suffix, or the price-paid address with its locality
+    appended). A pair is scored as the best token_sort_ratio over all
+    admissible variant combinations: source registers frequently disagree
+    only on a trailing village/locality token, which alone drags short
+    addresses below the match threshold. The number-compatibility gate is
+    always evaluated on the primary addresses, and `_admissible_variants`
+    rejects any variant whose added/dropped tokens carry digits or flat
+    designators, so a variant can never bypass the gate or score a single
+    flat as its whole building. Variant-only scores must clear
+    ``MIN_VARIANT_SCORE``.
+
    Returns a LazyFrame with all left and right columns, plus a
    ``_match_score`` (UInt8) audit column holding the token_sort_ratio of
    the accepted match (exact matches score 100).  Unmatched rows have null
@ -90,6 +126,10 @@ def fuzzy_join_on_postcode(
                normalize_postcode_key(pl.col(left_postcode_col)).alias(
                    "_left_postcode"
                ),
+                *(
+                    normalize_address_key(pl.col(col)).alias(f"_left_variant_{i}")
+                    for i, col in enumerate(left_variant_cols)
+                ),
            )
            .collect(engine="streaming")
        )
@ -104,30 +144,45 @@ def fuzzy_join_on_postcode(
                normalize_postcode_key(pl.col(right_postcode_col)).alias(
                    "_right_postcode"
                ),
+                *(
+                    normalize_address_key(pl.col(col)).alias(f"_right_variant_{i}")
+                    for i, col in enumerate(right_variant_cols)
+                ),
            )
            .unique(subset=["_right_address", "_right_postcode"], keep="first")
            .collect(engine="streaming")
        )

+        left_variant_names = [f"_left_variant_{i}" for i in range(len(left_variant_cols))]
+        right_variant_names = [
+            f"_right_variant_{i}" for i in range(len(right_variant_cols))
+        ]
+
        # Group right side by postcode for fast lookup
-        right_by_postcode: dict[str, list[tuple[int, str]]] = {}
-        for idx, postcode, address in zip(
+        right_by_postcode: dict[str, list[tuple[int, str, tuple[str, ...]]]] = {}
+        for idx, postcode, address, *variants in zip(
            right_match["_right_idx"],
            right_match["_right_postcode"],
            right_match["_right_address"],
+            *(right_match[name] for name in right_variant_names),
        ):
            if address is not None and postcode is not None:
-                right_by_postcode.setdefault(postcode, []).append((idx, address))
+                right_by_postcode.setdefault(postcode, []).append(
+                    (idx, address, _admissible_variants(address, variants))
+                )

        # Group left side by postcode
-        left_by_postcode: dict[str, list[tuple[int, str]]] = {}
-        for idx, postcode, address in zip(
+        left_by_postcode: dict[str, list[tuple[int, str, tuple[str, ...]]]] = {}
+        for idx, postcode, address, *variants in zip(
            left_match["_left_idx"],
            left_match["_left_postcode"],
            left_match["_left_address"],
+            *(left_match[name] for name in left_variant_names),
        ):
            if address is not None and postcode is not None:
-                left_by_postcode.setdefault(postcode, []).append((idx, address))
+                left_by_postcode.setdefault(postcode, []).append(
+                    (idx, address, _admissible_variants(address, variants))
+                )

        del left_match, right_match

@ -145,7 +200,12 @@ def fuzzy_join_on_postcode(

        # Score all pairwise matches in parallel, then greedily assign from
        # highest score downward so best pairs lock in first.
-        all_pairs: list[tuple[int, int, int]] = []  # (score, left_idx, right_idx)
+        # Pair tuples are (score, exact, left_idx, right_idx); `exact` marks a
+        # literally-equal primary pair so it wins greedy ties against a pair
+        # that merely token-sorts to the same score (e.g. "APARTMENT 3 1 HIGH
+        # ST" vs "APARTMENT 1 3 HIGH ST" both score 100 against each other's
+        # certificates, but each has a literal twin).
+        all_pairs: list[tuple[int, int, int, int]] = []
        with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
            for pairs in tqdm(
                executor.map(_score_bucket, tasks, chunksize=64),
@ -156,8 +216,9 @@ def fuzzy_join_on_postcode(

        del tasks, left_by_postcode, right_by_postcode

-        # Sort descending by score so best matches are assigned first
-        all_pairs.sort(key=lambda t: (t[0], -t[1]), reverse=True)
+        # Sort so the best matches are assigned first: score, then literal
+        # equality, then stable left-index order.
+        all_pairs.sort(key=lambda t: (t[0], t[1], -t[2]), reverse=True)

        # Keep the score alongside each accepted pair: it is emitted as the
        # _match_score audit column so downstream consumers can distinguish
@ -166,7 +227,7 @@ def fuzzy_join_on_postcode(
        matched_left: set[int] = set()
        matched_right: set[int] = set()

-        for score, left_idx, right_idx in all_pairs:
+        for score, _exact, left_idx, right_idx in all_pairs:
            if left_idx in matched_left or right_idx in matched_right:
                continue
            matches.append((left_idx, right_idx, score))
@ -208,40 +269,102 @@ def fuzzy_join_on_postcode(
    return result.lazy()


+def _number_tokens(address: str) -> set[str]:
+    tokens = set(_NUMBER_RE.findall(address))
+    tokens.update(_FLAT_LETTER_RE.findall(address))
+    return tokens
+
+
 def _numbers_compatible(a: str, b: str) -> bool:
    """Check that the number tokens (house/flat numbers, including any letter
-    suffix) of two addresses are IDENTICAL sets.
+    suffix, plus single-letter flat designators) of two addresses are
+    IDENTICAL sets.

    Equality, not subset: subset logic let "188 GREAT NORTH WAY" absorb
    "FLAT 1 188 GREAT NORTH WAY" ({188} is a subset of {1, 188}), attaching a
    single flat's EPC facts to the whole building — tens of thousands of
    wrong-property matches. Likewise digit-only tokens made "8A" and "8B"
-    both look like {8} and match each other (and plain "8"). Precision over
-    recall: a pair whose two sources genuinely disagree on number tokens is
-    safer left unmatched.
+    both look like {8} and match each other (and plain "8"), and ungated
+    letter flats let "FLAT D 39 X ST" cross-match "FLAT F 39 X ST" at ~96.
+    Precision over recall: a pair whose two sources genuinely disagree on
+    number tokens is safer left unmatched.

    One side numbered, the other not -> incompatible. Neither numbered ->
    compatible; such pairs are scored against the stricter no-numbers
    threshold instead.
    """
-    nums_a = set(_NUMBER_RE.findall(a))
-    nums_b = set(_NUMBER_RE.findall(b))
+    nums_a = _number_tokens(a)
+    nums_b = _number_tokens(b)
    if not nums_a and not nums_b:
        return True
    return nums_a == nums_b


+def _admissible_variants(
+    primary: str, variants: Sequence[str | None]
+) -> tuple[str, ...]:
+    """Variants of ``primary`` that are safe to score against the other side.
+
+    A variant may only ADD or DROP whole tokens relative to the primary (one
+    word multiset must contain the other) — never substitute, so a register
+    row whose address lines disagree with the combined address can't smuggle
+    in a different street. The number gate runs on the primary addresses
+    only, so the added/dropped tokens must additionally carry no digits
+    (house numbers) and no flat designator (a "Flat 1"-style secondary line
+    dropped from an EPC address would otherwise let a single flat score as
+    the whole building). The remaining admissible difference is exactly the
+    harmless kind variants exist for: trailing locality/village/town words.
+    """
+    primary_words = Counter(primary.split())
+    admissible: list[str] = []
+    for variant in variants:
+        if not variant or variant == primary:
+            continue
+        variant_words = Counter(variant.split())
+        if not (variant_words <= primary_words or primary_words <= variant_words):
+            continue
+        changed = (primary_words - variant_words) + (variant_words - primary_words)
+        if any(
+            any(ch.isdigit() for ch in token) or token in _FLAT_TOKENS
+            for token in changed
+        ):
+            continue
+        admissible.append(variant)
+    return tuple(dict.fromkeys(admissible))
+
+
 def _score_bucket(
-    args: tuple[list[tuple[int, str]], list[tuple[int, str]], int, int],
-) -> list[tuple[int, int, int]]:
+    args: tuple[
+        list[tuple[int, str, tuple[str, ...]]],
+        list[tuple[int, str, tuple[str, ...]]],
+        int,
+        int,
+    ],
+) -> list[tuple[int, int, int, int]]:
    """Score all address pairs within a single postcode bucket."""
    left_entries, right_entries, min_score, min_score_without_numbers = args
    pairs = []
-    for left_row, left_address in left_entries:
-        for right_row, right_address in right_entries:
+    for left_row, left_address, left_variants in left_entries:
+        for right_row, right_address, right_variants in right_entries:
            if not _numbers_compatible(left_address, right_address):
                continue
            score = fuzz.token_sort_ratio(left_address, right_address)
+            # Variant pairs recover same-property matches where one register
+            # carries a locality suffix the other lacks; a variant-only score
+            # must clear the near-exact MIN_VARIANT_SCORE bar.
+            if score < 100 and (left_variants or right_variants):
+                for left_variant in (left_address, *left_variants):
+                    for right_variant in (right_address, *right_variants):
+                        if (
+                            left_variant is left_address
+                            and right_variant is right_address
+                        ):
+                            continue
+                        variant_score = fuzz.token_sort_ratio(
+                            left_variant, right_variant
+                        )
+                        if variant_score >= MIN_VARIANT_SCORE and variant_score > score:
+                            score = variant_score
            # Number-less pairs (named houses, building-name flats) lack the
            # house-number disambiguator, so require a near-exact match.
            threshold = (
@ -250,5 +373,7 @@ def _score_bucket(
                else min_score_without_numbers
            )
            if score >= threshold:
-                pairs.append((score, left_row, right_row))
+                pairs.append(
+                    (score, int(left_address == right_address), left_row, right_row)
+                )
    return pairs