perfect-postcode/pipeline/utils/test_fuzzy_join.py
Andras Schmelczer f59d01227b
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 15s
CI / Check (push) Failing after 1m58s
Split up
2026-06-12 21:51:37 +01:00

485 lines
16 KiB
Python

import polars as pl
from pipeline.utils import fuzzy_join_on_postcode, normalize_postcode_key
from pipeline.utils.fuzzy_join import _admissible_variants, _numbers_compatible
def test_fuzzy_join_on_postcode_matches_addresses_within_postcode():
    """Addresses are expected to pair up only inside the same postcode.

    The flat and the house should find their reordered / hyphenated
    counterparts (one of them stored with a whitespace-padded postcode),
    whereas "99 Other Road" must stay unmatched because its only lookalike
    is filed under a different postcode.
    """
    shared_postcode = "AB1 2CD"
    left = pl.LazyFrame(
        {
            "left_id": ["flat", "house", "unmatched"],
            "left_address": [
                "Flat 2, 10 High Street",
                "12 High Street",
                "99 Other Road",
            ],
            "left_postcode": [shared_postcode] * 3,
        }
    )
    right = pl.LazyFrame(
        {
            "right_id": ["flat_epc", "house_epc", "other_postcode"],
            "right_address": [
                "10 HIGH STREET FLAT 2",
                "12 High-Street",
                "99 Other Road",
            ],
            "right_postcode": [" AB1 2CD ", shared_postcode, "ZZ9 9ZZ"],
        }
    )
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    joined = fuzzy_join_on_postcode(left=left, right=right, **column_names)
    result = joined.sort("left_id").collect()

    expected_pairs = [
        {"left_id": "flat", "right_id": "flat_epc"},
        {"left_id": "house", "right_id": "house_epc"},
        {"left_id": "unmatched", "right_id": None},
    ]
    assert result.select("left_id", "right_id").to_dicts() == expected_pairs
def test_fuzzy_join_on_postcode_requires_matching_numbers():
    """Number 10 and number 11 on the same street must not be joined."""
    left = pl.LazyFrame(
        {"left_address": ["10 High Street"], "left_postcode": ["AB1 2CD"]}
    )
    right = pl.LazyFrame(
        {"right_address": ["11 High Street"], "right_postcode": ["AB1 2CD"]}
    )
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    result = fuzzy_join_on_postcode(left=left, right=right, **column_names).collect()

    matched_addresses = result.get_column("right_address").to_list()
    assert matched_addresses == [None]
def test_fuzzy_join_on_postcode_rejects_low_score_same_number_matches():
    """Sharing a house number is not enough when the street text differs."""
    left = pl.LazyFrame(
        {"left_address": ["1 Example Street"], "left_postcode": ["AB1 2CD"]}
    )
    right = pl.LazyFrame(
        {
            "right_address": ["1 Totally Different Road"],
            "right_postcode": ["AB1 2CD"],
        }
    )
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    result = fuzzy_join_on_postcode(left=left, right=right, **column_names).collect()

    matched_addresses = result.get_column("right_address").to_list()
    assert matched_addresses == [None]
def test_fuzzy_join_on_postcode_rejects_blank_and_invalid_match_keys():
    """Blank and number-only addresses are expected to stay unmatched.

    The right side offers an empty string and a bare "10" alongside the real
    address; only the fully specified left row should pick up a match.
    """
    same_postcode = ["AB1 2CD", "AB1 2CD", "AB1 2CD"]
    left = pl.LazyFrame(
        {
            "left_id": ["blank", "number_only", "valid"],
            "left_address": [" ", "10", "10 High Street"],
            "left_postcode": list(same_postcode),
        }
    )
    right = pl.LazyFrame(
        {
            "right_address": ["", "10", "10 High Street"],
            "right_postcode": list(same_postcode),
        }
    )
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    joined = fuzzy_join_on_postcode(left=left, right=right, **column_names)
    result = joined.sort("left_id").collect()

    expected_rows = [
        {"left_id": "blank", "right_address": None},
        {"left_id": "number_only", "right_address": None},
        {"left_id": "valid", "right_address": "10 High Street"},
    ]
    assert result.select("left_id", "right_address").to_dicts() == expected_rows
def test_fuzzy_join_rejects_mid_score_number_less_match():
    """A number-less pair scoring in the high 80s must not match.

    "THE COACH HOUSE" against "THE OLD COACH HOUSE" gets 88 from
    token_sort_ratio. That beat the former MIN_FUZZY_SCORE of 60 (hence the
    old false match) but falls short of the number-less threshold of 90.
    """
    left = pl.LazyFrame(
        {"left_address": ["The Coach House"], "left_postcode": ["AB1 2CD"]}
    )
    right = pl.LazyFrame(
        {"right_address": ["The Old Coach House"], "right_postcode": ["AB1 2CD"]}
    )
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    result = fuzzy_join_on_postcode(left=left, right=right, **column_names).collect()

    matched_addresses = result.get_column("right_address").to_list()
    assert matched_addresses == [None]
def test_fuzzy_join_matches_numbered_pair_at_baseline_threshold():
    """A numbered pair sitting exactly on the baseline score still matches.

    "10 ACACIA AVENUE" against "10 ACACIA AVENUE OAKHAM" scores exactly 82
    with a compatible house number, so the numbered baseline (>= 82) accepts
    it.
    """
    expected_match = "10 Acacia Avenue, Oakham"
    left = pl.LazyFrame(
        {"left_address": ["10 Acacia Avenue"], "left_postcode": ["AB1 2CD"]}
    )
    right = pl.LazyFrame(
        {"right_address": [expected_match], "right_postcode": ["AB1 2CD"]}
    )
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    result = fuzzy_join_on_postcode(left=left, right=right, **column_names).collect()

    assert result.get_column("right_address").to_list() == [expected_match]
def test_fuzzy_join_matches_high_score_number_less_pair():
    """A number-less pair above the 90 threshold must keep matching.

    The pair used here is an exact token match (score 100).
    """
    expected_match = "THE OLD RECTORY"
    left = pl.LazyFrame(
        {"left_address": ["The Old Rectory"], "left_postcode": ["AB1 2CD"]}
    )
    right = pl.LazyFrame(
        {"right_address": [expected_match], "right_postcode": ["AB1 2CD"]}
    )
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    result = fuzzy_join_on_postcode(left=left, right=right, **column_names).collect()

    assert result.get_column("right_address").to_list() == [expected_match]
def test_numbers_compatible_treats_letter_suffix_as_part_of_the_number():
    """A letter suffix is part of the house number.

    8A, 8B and plain 8 are three separate properties on one street. Pulling
    out digits only used to reduce all three to {8}, which let them match.
    """
    cases = [
        ("8A HIGH STREET", "8B HIGH STREET", False),
        ("8A HIGH STREET", "8 HIGH STREET", False),
        ("8A HIGH STREET", "8A HIGH STREET", True),
    ]
    for first, second, should_be_compatible in cases:
        assert bool(_numbers_compatible(first, second)) == should_be_compatible
def test_numbers_compatible_requires_equal_sets_not_subset():
    """Number sets have to be equal; a subset is not good enough.

    Under subset logic the whole-building record "188 ..." swallowed its flat
    "FLAT 1 188 ..." because {188} is a subset of {1, 188}.
    """
    flat = "FLAT 1 188 GREAT NORTH WAY"
    whole_building = "188 GREAT NORTH WAY"
    same_flat_reordered = "188 GREAT NORTH WAY FLAT 1"

    assert not _numbers_compatible(flat, whole_building)
    assert _numbers_compatible(flat, same_flat_reordered)
def test_numbers_compatible_number_less_and_one_sided_pairs():
    """Cover the two cases where at least one side carries no number.

    When neither side is numbered the pair is compatible (the stricter
    no-numbers score threshold gates it instead); when exactly one side is
    numbered the pair is incompatible.
    """
    named_house = "ROSE COTTAGE"
    numbered_house = "8 HIGH STREET"

    assert _numbers_compatible(named_house, named_house)
    assert not _numbers_compatible(named_house, numbered_house)
def test_numbers_compatible_gates_single_letter_flats():
    """A lone flat letter is treated as a pseudo-number token.

    "FLAT D" and "FLAT F" are different flats even when the street number is
    identical; without the gate they token_sort to ~96 and cross-matched.
    Because the letter counts as a pseudo-number, it also stops a flat from
    matching the bare building address.
    """
    cases = [
        ("FLAT D 39 GERTRUDE STREET", "FLAT F 39 GERTRUDE STREET", False),
        ("FLAT D 39 GERTRUDE STREET", "39 GERTRUDE STREET FLAT D", True),
        ("FLAT B ROSE COURT", "ROSE COURT", False),
        # A letter glued to a number ("A3") is a unit name, not a flat letter.
        ("FLAT A3 CHESHAM HEIGHTS", "FLAT A3 CHESHAM HEIGHTS", True),
    ]
    for first, second, should_be_compatible in cases:
        assert bool(_numbers_compatible(first, second)) == should_be_compatible
def test_admissible_variants_allows_locality_suffix_only():
    """A variant may differ from its primary only by locality words.

    Digits and flat designators must not differ, because the number gate ran
    on the primary address only.
    """
    primary = "12 OAK ROAD"

    # An added locality word is fine; a variant equal to the primary is not
    # returned.
    with_locality = _admissible_variants(primary, ["12 OAK ROAD HALE", "12 OAK ROAD"])
    assert with_locality == ("12 OAK ROAD HALE",)

    # Dropping "FLAT 1" (digit) or "FLAT B" (flat designator) is inadmissible:
    # the variant would score a single flat as the whole building.
    dropped_flat_number = _admissible_variants(
        "FLAT 1 188 GREAT NORTH WAY", ["188 GREAT NORTH WAY"]
    )
    assert dropped_flat_number == ()
    dropped_flat_letter = _admissible_variants("FLAT B ROSE COURT", ["ROSE COURT"])
    assert dropped_flat_letter == ()

    # A null variant next to a duplicate of the primary yields nothing.
    assert _admissible_variants(primary, [None, "12 OAK ROAD"]) == ()

    # Substitution is never admissible: a register row whose address1
    # disagrees with the combined address must not smuggle in a different
    # street for scoring.
    assert _admissible_variants(primary, ["12 ELM ROAD"]) == ()
    different_street = _admissible_variants(
        "1 TOTALLY DIFFERENT ROAD", ["1 EXAMPLE STREET"]
    )
    assert different_street == ()
def test_fuzzy_join_variant_recovers_locality_suffix_mismatch():
    """Variant columns rescue a match lost to a trailing locality word.

    The EPC register holds "12 Oak Road, Hale" (address1 plus locality line)
    whereas price-paid holds the bare "12 Oak Road". token_sort gives 81,
    below 82, so the plain join loses the match. The EPC's address1-only
    variant scores 100.
    """
    left = pl.LazyFrame(
        {
            "left_address": ["12 Oak Road"],
            "left_postcode": ["AB1 2CD"],
            "left_with_locality": ["12 Oak Road Hale"],
        }
    )
    right = pl.LazyFrame(
        {
            "right_address": ["12 Oak Road, Hale"],
            "right_postcode": ["AB1 2CD"],
            "right_address1": ["12 Oak Road"],
        }
    )
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    # Without variant columns the pair stays unmatched.
    unmatched = fuzzy_join_on_postcode(
        left=left, right=right, **column_names
    ).collect()
    assert unmatched["_match_score"].to_list() == [None]

    # With variant columns on both sides the pair matches at full score.
    result = fuzzy_join_on_postcode(
        left=left,
        right=right,
        left_variant_cols=["left_with_locality"],
        right_variant_cols=["right_address1"],
        **column_names,
    ).collect()
    assert result["_match_score"].to_list() == [100]
def test_fuzzy_join_variant_cannot_unlock_a_flat_for_its_building():
    """A variant that drops the flat designator must be ruled inadmissible.

    The flat designator lives on the EPC's secondary line. Discarding it
    would score the flat's certificate 100 against the whole-building
    price-paid address, so the pair has to remain unmatched.
    """
    building = "188 Great North Way"
    left = pl.LazyFrame({"left_address": [building], "left_postcode": ["AB1 2CD"]})
    right = pl.LazyFrame(
        {
            "right_address": ["Flat 1, 188 Great North Way"],
            "right_postcode": ["AB1 2CD"],
            "right_address1": [building],
        }
    )
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    result = fuzzy_join_on_postcode(
        left=left,
        right=right,
        right_variant_cols=["right_address1"],
        **column_names,
    ).collect()

    assert result.get_column("_match_score").to_list() == [None]
def test_fuzzy_join_variant_score_must_be_near_exact():
    """A match reached only through a variant must clear MIN_VARIANT_SCORE.

    That threshold is 90. Pairs of the "2 MYRTLE COTTAGES" versus
    "2 LEITH VIEW COTTAGES" kind scored in the 80s via variants and were
    false matches.
    """
    left = pl.LazyFrame(
        {
            "left_address": ["2 Myrtle Cottages"],
            "left_postcode": ["AB1 2CD"],
            "left_with_locality": ["2 Myrtle Cottages Dorking"],
        }
    )
    right = pl.LazyFrame(
        {
            "right_address": ["2 Leith View Cottages, North Holmwood"],
            "right_postcode": ["AB1 2CD"],
            "right_address1": ["2 Leith View Cottages"],
        }
    )
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    result = fuzzy_join_on_postcode(
        left=left,
        right=right,
        left_variant_cols=["left_with_locality"],
        right_variant_cols=["right_address1"],
        **column_names,
    ).collect()

    assert result.get_column("_match_score").to_list() == [None]
def test_fuzzy_join_rejects_wrong_letter_suffix_match():
    """End-to-end guard against the 8A/8B class of wrong-property matches.

    The postcode bucket holds a single candidate that differs only in the
    number suffix, so the row has to stay unmatched instead of borrowing the
    neighbour's record.
    """
    left = pl.LazyFrame(
        {"left_address": ["8A High Street"], "left_postcode": ["AB1 2CD"]}
    )
    right = pl.LazyFrame(
        {"right_address": ["8B High Street"], "right_postcode": ["AB1 2CD"]}
    )
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    result = fuzzy_join_on_postcode(left=left, right=right, **column_names).collect()

    matched_addresses = result.get_column("right_address").to_list()
    assert matched_addresses == [None]
def test_fuzzy_join_emits_match_score_column():
    """The audit column reports the token_sort_ratio of the accepted match.

    It holds 100 for an exact (post-normalisation) address match, the raw
    fuzzy score for any other accepted match, and null for unmatched rows.
    """
    left = pl.LazyFrame(
        {
            "left_id": ["exact", "fuzzy", "unmatched"],
            "left_address": [
                "10 High Street",
                "10 Acacia Avenue",
                "99 Other Road",
            ],
            "left_postcode": ["AB1 2CD", "EF3 4GH", "ZZ9 9ZZ"],
        }
    )
    right = pl.LazyFrame(
        {
            "right_address": [
                "10 HIGH STREET",
                # Scores exactly 82 against "10 Acacia Avenue" (see
                # test_fuzzy_join_matches_numbered_pair_at_baseline_threshold).
                "10 Acacia Avenue, Oakham",
            ],
            "right_postcode": ["AB1 2CD", "EF3 4GH"],
        }
    )
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    joined = fuzzy_join_on_postcode(left=left, right=right, **column_names)
    result = joined.sort("left_id").collect()

    assert result.schema["_match_score"] == pl.UInt8

    expected_scores = [
        {"left_id": "exact", "_match_score": 100},
        {"left_id": "fuzzy", "_match_score": 82},
        {"left_id": "unmatched", "_match_score": None},
    ]
    assert result.select("left_id", "_match_score").to_dicts() == expected_scores
def test_normalize_postcode_key_requires_full_postcode():
    """Only complete postcodes are expected to produce a key.

    The padded and the hyphenated spelling of "SW1A 1AA" should both give
    "SW1A1AA"; the empty string, the outward code alone, a run of digits and
    free text should all give null.
    """
    raw_postcodes = [
        " SW1A 1AA ",
        "sw1a-1aa",
        "",
        "SW1A",
        "12345",
        "not a postcode",
    ]
    expected_keys = ["SW1A1AA", "SW1A1AA", None, None, None, None]

    df = pl.DataFrame({"postcode": raw_postcodes})
    key_expr = normalize_postcode_key(pl.col("postcode")).alias("key")
    result = df.select(key_expr)

    assert result["key"].to_list() == expected_keys