This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@@ -134,6 +134,91 @@ def test_fuzzy_join_on_postcode_rejects_blank_and_invalid_match_keys():
]
def test_fuzzy_join_rejects_mid_score_number_less_match():
    # Neither address carries a house number. Under token_sort_ratio,
    # "THE COACH HOUSE" vs "THE OLD COACH HOUSE" scores 88: high enough to clear
    # the old MIN_FUZZY_SCORE of 60 (so this pair used to be a false match), but
    # short of the number-less threshold of 90, so no match is expected now.
    postcode = "AB1 2CD"
    left_rows = {
        "left_address": ["The Coach House"],
        "left_postcode": [postcode],
    }
    right_rows = {
        "right_address": ["The Old Coach House"],
        "right_postcode": [postcode],
    }

    joined = fuzzy_join_on_postcode(
        left=pl.LazyFrame(left_rows),
        right=pl.LazyFrame(right_rows),
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    ).collect()

    # The left row stays unmatched: its right-hand address is null.
    matched_addresses = joined["right_address"].to_list()
    assert matched_addresses == [None]
def test_fuzzy_join_matches_numbered_pair_at_baseline_threshold():
    # Both addresses share a compatible house number, and
    # "10 ACACIA AVENUE" vs "FLAT A 10 ACACIA AVENUE" scores exactly 82, which
    # sits right on the numbered baseline (>= 82), so the pair must still match.
    postcode = "AB1 2CD"
    expected_address = "Flat A, 10 Acacia Avenue"
    left_rows = {
        "left_address": ["10 Acacia Avenue"],
        "left_postcode": [postcode],
    }
    right_rows = {
        "right_address": [expected_address],
        "right_postcode": [postcode],
    }

    joined = fuzzy_join_on_postcode(
        left=pl.LazyFrame(left_rows),
        right=pl.LazyFrame(right_rows),
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    ).collect()

    # The original (un-normalised) right-hand address comes back on the match.
    matched_addresses = joined["right_address"].to_list()
    assert matched_addresses == [expected_address]
def test_fuzzy_join_matches_high_score_number_less_pair():
    # Number-less addresses are still allowed to match once they clear the 90
    # threshold; this pair differs only in letter case (an exact token match,
    # score 100), so it must match.
    postcode = "AB1 2CD"
    expected_address = "THE OLD RECTORY"
    left_rows = {
        "left_address": ["The Old Rectory"],
        "left_postcode": [postcode],
    }
    right_rows = {
        "right_address": [expected_address],
        "right_postcode": [postcode],
    }

    joined = fuzzy_join_on_postcode(
        left=pl.LazyFrame(left_rows),
        right=pl.LazyFrame(right_rows),
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    ).collect()

    # The right-hand address is returned exactly as supplied.
    matched_addresses = joined["right_address"].to_list()
    assert matched_addresses == [expected_address]
def test_normalize_postcode_key_requires_full_postcode():
df = pl.DataFrame(
{