Fable findings in data

2026-06-11 07:49:23 +01:00 · 2026-06-11 07:49:23 +01:00 · 6a33b03fdf
commit 6a33b03fdf
parent b98bc6d611
20 changed files with 1502 additions and 274 deletions
--- a/pipeline/utils/fuzzy_join.py
+++ b/pipeline/utils/fuzzy_join.py
@@ -11,7 +11,12 @@ from tqdm import tqdm

 from pipeline.local_temp import local_tmp_dir

-_NUMBER_RE = re.compile(r"\d+")
+# A house-number token includes its letter suffix, if any: 8A, 8B and plain
+# 8 are three different properties on the same street, so digit-only
+# extraction (which collapsed all three to "8") is not enough. Addresses are
+# passed through normalize_address_key first, so tokens are uppercase and
+# space-separated; a single optional [A-Z] therefore suffices for the suffix.
+_NUMBER_RE = re.compile(r"\d+[A-Z]?")
 _POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
 # A house number is a strong disambiguator, so a numbered, number-compatible
 # pair may match on a lower address-similarity score than a number-less one
@@ -61,8 +66,10 @@ def fuzzy_join_on_postcode(
    columns (index, address, postcode) via projection pushdown, and the
    final join reads the remaining columns lazily.

-    Returns a LazyFrame with all left and right columns.  Unmatched rows
-    have null right columns.
+    Returns a LazyFrame with all left and right columns, plus a
+    ``_match_score`` (UInt8) audit column holding the token_sort_ratio of
+    the accepted match (exact matches score 100).  Unmatched rows have null
+    right columns and a null score.
    """

    tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_", dir=local_tmp_dir())
@@ -152,14 +159,17 @@ def fuzzy_join_on_postcode(
        # Sort descending by score so best matches are assigned first
        all_pairs.sort(key=lambda t: (t[0], -t[1]), reverse=True)

-        matches: list[tuple[int, int]] = []
+        # Keep the score alongside each accepted pair: it is emitted as the
+        # _match_score audit column so downstream consumers can distinguish
+        # exact (100) from looser fuzzy matches.
+        matches: list[tuple[int, int, int]] = []  # (left_idx, right_idx, score)
        matched_left: set[int] = set()
        matched_right: set[int] = set()

-        for _score, left_idx, right_idx in all_pairs:
+        for score, left_idx, right_idx in all_pairs:
            if left_idx in matched_left or right_idx in matched_right:
                continue
-            matches.append((left_idx, right_idx))
+            matches.append((left_idx, right_idx, score))
            matched_left.add(left_idx)
            matched_right.add(right_idx)

@@ -171,6 +181,7 @@ def fuzzy_join_on_postcode(
                {
                    "_left_idx": pl.Series([m[0] for m in matches], dtype=pl.UInt32),
                    "_right_idx": pl.Series([m[1] for m in matches], dtype=pl.UInt32),
+                    "_match_score": pl.Series([m[2] for m in matches], dtype=pl.UInt8),
                }
            )
        else:
@@ -178,6 +189,7 @@ def fuzzy_join_on_postcode(
                {
                    "_left_idx": pl.Series([], dtype=pl.UInt32),
                    "_right_idx": pl.Series([], dtype=pl.UInt32),
+                    "_match_score": pl.Series([], dtype=pl.UInt8),
                }
            )

@@ -197,18 +209,26 @@ def fuzzy_join_on_postcode(


 def _numbers_compatible(a: str, b: str) -> bool:
-    """Check that numeric tokens (flat/house numbers) in the shorter set are a subset of the longer.
+    """Check that the number tokens (house/flat numbers, including any letter
+    suffix) of two addresses are IDENTICAL sets.

-    Returns False if one address has numbers and the other doesn't.
+    Equality, not subset: subset logic let "188 GREAT NORTH WAY" absorb
+    "FLAT 1 188 GREAT NORTH WAY" ({188} is a subset of {1, 188}), attaching a
+    single flat's EPC facts to the whole building — tens of thousands of
+    wrong-property matches. Likewise digit-only tokens made "8A" and "8B"
+    both look like {8} and match each other (and plain "8"). Precision over
+    recall: a pair whose two sources genuinely disagree on number tokens is
+    safer left unmatched.
+
+    One side numbered, the other not -> incompatible. Neither numbered ->
+    compatible; such pairs are scored against the stricter no-numbers
+    threshold instead.
    """
    nums_a = set(_NUMBER_RE.findall(a))
    nums_b = set(_NUMBER_RE.findall(b))
-    smaller, larger = (
-        (nums_a, nums_b) if len(nums_a) <= len(nums_b) else (nums_b, nums_a)
-    )
-    if not smaller and larger:
-        return False
-    return smaller.issubset(larger)
+    if not nums_a and not nums_b:
+        return True
+    return nums_a == nums_b


 def _score_bucket(
--- a/pipeline/utils/test_fuzzy_join.py
+++ b/pipeline/utils/test_fuzzy_join.py
@@ -1,6 +1,7 @@
 import polars as pl

 from pipeline.utils import fuzzy_join_on_postcode, normalize_postcode_key
+from pipeline.utils.fuzzy_join import _numbers_compatible


 def test_fuzzy_join_on_postcode_matches_addresses_within_postcode():
@@ -219,6 +220,107 @@ def test_fuzzy_join_matches_high_score_number_less_pair():
    assert result["right_address"].to_list() == ["THE OLD RECTORY"]


+def test_numbers_compatible_treats_letter_suffix_as_part_of_the_number():
+    # 8A, 8B and plain 8 are three different properties on the same street;
+    # digit-only extraction collapsed all three to {8} and let them match.
+    assert not _numbers_compatible("8A HIGH STREET", "8B HIGH STREET")
+    assert not _numbers_compatible("8A HIGH STREET", "8 HIGH STREET")
+    assert _numbers_compatible("8A HIGH STREET", "8A HIGH STREET")
+
+
+def test_numbers_compatible_requires_equal_sets_not_subset():
+    # Subset logic let the whole-building record "188 ..." absorb its flat
+    # "FLAT 1 188 ..." ({188} is a subset of {1, 188}); the sets must be equal.
+    assert not _numbers_compatible("FLAT 1 188 GREAT NORTH WAY", "188 GREAT NORTH WAY")
+    assert _numbers_compatible(
+        "FLAT 1 188 GREAT NORTH WAY", "188 GREAT NORTH WAY FLAT 1"
+    )
+
+
+def test_numbers_compatible_number_less_and_one_sided_pairs():
+    # Neither side numbered -> compatible (gated by the stricter no-numbers
+    # score threshold instead); exactly one side numbered -> incompatible.
+    assert _numbers_compatible("ROSE COTTAGE", "ROSE COTTAGE")
+    assert not _numbers_compatible("ROSE COTTAGE", "8 HIGH STREET")
+
+
+def test_fuzzy_join_rejects_wrong_letter_suffix_match():
+    # End-to-end guard for the 8A/8B class of wrong-property matches: the only
+    # candidate in the postcode bucket differs solely in the number suffix, so
+    # the row must stay unmatched rather than borrow the neighbour's record.
+    left = pl.LazyFrame(
+        {
+            "left_address": ["8A High Street"],
+            "left_postcode": ["AB1 2CD"],
+        }
+    )
+    right = pl.LazyFrame(
+        {
+            "right_address": ["8B High Street"],
+            "right_postcode": ["AB1 2CD"],
+        }
+    )
+
+    result = fuzzy_join_on_postcode(
+        left=left,
+        right=right,
+        left_address_col="left_address",
+        right_address_col="right_address",
+        left_postcode_col="left_postcode",
+        right_postcode_col="right_postcode",
+    ).collect()
+
+    assert result["right_address"].to_list() == [None]
+
+
+def test_fuzzy_join_emits_match_score_column():
+    # The audit column carries the token_sort_ratio of the accepted match:
+    # 100 for an exact (post-normalisation) address match, the raw fuzzy score
+    # otherwise, and null for unmatched rows.
+    left = pl.LazyFrame(
+        {
+            "left_id": ["exact", "fuzzy", "unmatched"],
+            "left_address": [
+                "10 High Street",
+                "10 Acacia Avenue",
+                "99 Other Road",
+            ],
+            "left_postcode": ["AB1 2CD", "EF3 4GH", "ZZ9 9ZZ"],
+        }
+    )
+    right = pl.LazyFrame(
+        {
+            "right_address": [
+                "10 HIGH STREET",
+                # Scores exactly 82 against "10 Acacia Avenue" (see
+                # test_fuzzy_join_matches_numbered_pair_at_baseline_threshold).
+                "Flat A, 10 Acacia Avenue",
+            ],
+            "right_postcode": ["AB1 2CD", "EF3 4GH"],
+        }
+    )
+
+    result = (
+        fuzzy_join_on_postcode(
+            left=left,
+            right=right,
+            left_address_col="left_address",
+            right_address_col="right_address",
+            left_postcode_col="left_postcode",
+            right_postcode_col="right_postcode",
+        )
+        .sort("left_id")
+        .collect()
+    )
+
+    assert result.schema["_match_score"] == pl.UInt8
+    assert result.select("left_id", "_match_score").to_dicts() == [
+        {"left_id": "exact", "_match_score": 100},
+        {"left_id": "fuzzy", "_match_score": 82},
+        {"left_id": "unmatched", "_match_score": None},
+    ]
+
+
 def test_normalize_postcode_key_requires_full_postcode():
    df = pl.DataFrame(
        {