Fable findings in data

2026-06-11 07:49:23 +01:00 · 2026-06-11 07:49:23 +01:00 · 6a33b03fdf
commit 6a33b03fdf
parent b98bc6d611
20 changed files with 1502 additions and 274 deletions
--- a/pipeline/utils/test_fuzzy_join.py
+++ b/pipeline/utils/test_fuzzy_join.py
@@ -1,6 +1,7 @@
 import polars as pl

 from pipeline.utils import fuzzy_join_on_postcode, normalize_postcode_key
+from pipeline.utils.fuzzy_join import _numbers_compatible


 def test_fuzzy_join_on_postcode_matches_addresses_within_postcode():
@@ -219,6 +220,107 @@ def test_fuzzy_join_matches_high_score_number_less_pair():
    assert result["right_address"].to_list() == ["THE OLD RECTORY"]


+def test_numbers_compatible_treats_letter_suffix_as_part_of_the_number():
+    # 8A, 8B and plain 8 are three distinct properties on the same street;
+    # earlier digit-only extraction collapsed all three to {8} and let them match.
+    assert not _numbers_compatible("8A HIGH STREET", "8B HIGH STREET")  # A vs B
+    assert not _numbers_compatible("8A HIGH STREET", "8 HIGH STREET")  # A vs bare
+    assert _numbers_compatible("8A HIGH STREET", "8A HIGH STREET")  # identical
+
+
+def test_numbers_compatible_requires_equal_sets_not_subset():
+    # Subset matching let the whole-building record "188 ..." absorb its flat
+    # "FLAT 1 188 ..." ({188} is a subset of {1, 188}); the sets must be equal.
+    assert not _numbers_compatible("FLAT 1 188 GREAT NORTH WAY", "188 GREAT NORTH WAY")
+    assert _numbers_compatible(
+        "FLAT 1 188 GREAT NORTH WAY", "188 GREAT NORTH WAY FLAT 1"
+    )  # same numbers in a different token order still match
+
+
+def test_numbers_compatible_number_less_and_one_sided_pairs():
+    # Neither side numbered -> compatible (such pairs are gated by the stricter
+    # no-numbers score threshold instead); only one side numbered -> incompatible.
+    assert _numbers_compatible("ROSE COTTAGE", "ROSE COTTAGE")  # no numbers
+    assert not _numbers_compatible("ROSE COTTAGE", "8 HIGH STREET")
+
+
+def test_fuzzy_join_rejects_wrong_letter_suffix_match():
+    # End-to-end guard for the 8A/8B class of wrong-property matches: the only
+    # candidate in the postcode bucket differs solely in the number suffix, so
+    # the row must stay unmatched rather than borrow the neighbour's record.
+    left = pl.LazyFrame(
+        {
+            "left_address": ["8A High Street"],
+            "left_postcode": ["AB1 2CD"],
+        }
+    )
+    right = pl.LazyFrame(
+        {
+            "right_address": ["8B High Street"],
+            "right_postcode": ["AB1 2CD"],  # same postcode as the left row
+        }
+    )
+
+    result = fuzzy_join_on_postcode(
+        left=left,
+        right=right,
+        left_address_col="left_address",
+        right_address_col="right_address",
+        left_postcode_col="left_postcode",
+        right_postcode_col="right_postcode",
+    ).collect()
+
+    assert result["right_address"].to_list() == [None]  # one row, no match
+
+
+def test_fuzzy_join_emits_match_score_column():
+    # The audit column carries the token_sort_ratio of the accepted match:
+    # 100 for an exact (post-normalisation) address match, the raw fuzzy score
+    # otherwise, and null for unmatched rows.
+    left = pl.LazyFrame(
+        {
+            "left_id": ["exact", "fuzzy", "unmatched"],
+            "left_address": [
+                "10 High Street",
+                "10 Acacia Avenue",
+                "99 Other Road",  # its postcode has no right-side rows
+            ],
+            "left_postcode": ["AB1 2CD", "EF3 4GH", "ZZ9 9ZZ"],
+        }
+    )
+    right = pl.LazyFrame(
+        {
+            "right_address": [
+                "10 HIGH STREET",
+                # Scores exactly 82 against "10 Acacia Avenue" (see
+                # test_fuzzy_join_matches_numbered_pair_at_baseline_threshold).
+                "Flat A, 10 Acacia Avenue",
+            ],
+            "right_postcode": ["AB1 2CD", "EF3 4GH"],
+        }
+    )
+
+    result = (
+        fuzzy_join_on_postcode(
+            left=left,
+            right=right,
+            left_address_col="left_address",
+            right_address_col="right_address",
+            left_postcode_col="left_postcode",
+            right_postcode_col="right_postcode",
+        )
+        .sort("left_id")  # fixed row order for the list comparison below
+        .collect()
+    )
+
+    assert result.schema["_match_score"] == pl.UInt8  # integer dtype, not float
+    assert result.select("left_id", "_match_score").to_dicts() == [
+        {"left_id": "exact", "_match_score": 100},
+        {"left_id": "fuzzy", "_match_score": 82},
+        {"left_id": "unmatched", "_match_score": None},
+    ]
+
+
 def test_normalize_postcode_key_requires_full_postcode():
    df = pl.DataFrame(
        {