238 lines
6.7 KiB
Python
238 lines
6.7 KiB
Python
import polars as pl
|
|
|
|
from pipeline.utils import fuzzy_join_on_postcode, normalize_postcode_key
|
|
|
|
|
|
def test_fuzzy_join_on_postcode_matches_addresses_within_postcode():
    """Left addresses pair with right addresses only inside the same postcode.

    The right-hand rows vary word order, letter case, punctuation and
    postcode padding relative to the left-hand rows; the third right-hand
    row repeats a left address verbatim but sits under a different postcode,
    so the "unmatched" left row is expected to come back with no partner.
    """
    left_rows = {
        "left_id": ["flat", "house", "unmatched"],
        "left_address": [
            "Flat 2, 10 High Street",
            "12 High Street",
            "99 Other Road",
        ],
        "left_postcode": ["AB1 2CD", "AB1 2CD", "AB1 2CD"],
    }
    right_rows = {
        "right_id": ["flat_epc", "house_epc", "other_postcode"],
        "right_address": [
            "10 HIGH STREET FLAT 2",
            "12 High-Street",
            "99 Other Road",
        ],
        "right_postcode": [" AB1 2CD ", "AB1 2CD", "ZZ9 9ZZ"],
    }

    joined = fuzzy_join_on_postcode(
        left=pl.LazyFrame(left_rows),
        right=pl.LazyFrame(right_rows),
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    # Sort before collecting so the comparison does not rely on join order.
    outcome = joined.sort("left_id").collect()

    expected_pairs = [
        {"left_id": "flat", "right_id": "flat_epc"},
        {"left_id": "house", "right_id": "house_epc"},
        {"left_id": "unmatched", "right_id": None},
    ]
    assert outcome.select("left_id", "right_id").to_dicts() == expected_pairs
|
|
|
|
|
|
def test_fuzzy_join_on_postcode_requires_matching_numbers():
    """Same street and postcode but different house numbers must not match."""
    column_args = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }
    left_frame = pl.LazyFrame(
        {"left_address": ["10 High Street"], "left_postcode": ["AB1 2CD"]}
    )
    right_frame = pl.LazyFrame(
        {"right_address": ["11 High Street"], "right_postcode": ["AB1 2CD"]}
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame, right=right_frame, **column_args
    ).collect()

    # The single left row is kept, but with no right-hand address attached.
    matched_addresses = joined["right_address"].to_list()
    assert matched_addresses == [None]
|
|
|
|
|
|
def test_fuzzy_join_on_postcode_rejects_low_score_same_number_matches():
    """A shared house number alone is not enough when the street text differs."""
    left_frame = pl.LazyFrame(
        {
            "left_address": ["1 Example Street"],
            "left_postcode": ["AB1 2CD"],
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": ["1 Totally Different Road"],
            "right_postcode": ["AB1 2CD"],
        }
    )

    lazy_join = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    joined = lazy_join.collect()

    # Expect the left row to survive with a null right-hand address.
    assert joined.get_column("right_address").to_list() == [None]
|
|
|
|
|
|
def test_fuzzy_join_on_postcode_rejects_blank_and_invalid_match_keys():
    """Blank and number-only addresses never match; a full address still does.

    Both sides contain a blank-ish address, a bare "10" and a complete
    address, all under one postcode. Only the complete address is expected
    to find a partner.
    """
    shared_postcodes = ["AB1 2CD", "AB1 2CD", "AB1 2CD"]
    left_frame = pl.LazyFrame(
        {
            "left_id": ["blank", "number_only", "valid"],
            "left_address": [" ", "10", "10 High Street"],
            "left_postcode": list(shared_postcodes),
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": ["", "10", "10 High Street"],
            "right_postcode": list(shared_postcodes),
        }
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    # Sorting by id pins the row order used in the expectation below.
    outcome = joined.sort("left_id").collect()

    expected_rows = [
        {"left_id": "blank", "right_address": None},
        {"left_id": "number_only", "right_address": None},
        {"left_id": "valid", "right_address": "10 High Street"},
    ]
    observed_rows = outcome.select("left_id", "right_address").to_dicts()
    assert observed_rows == expected_rows
|
|
|
|
|
|
def test_fuzzy_join_rejects_mid_score_number_less_match():
    """A number-less pair scoring between the old and new thresholds is rejected.

    "THE COACH HOUSE" vs "THE OLD COACH HOUSE" scores 88 via
    token_sort_ratio: above the old MIN_FUZZY_SCORE of 60 (so it used to
    falsely match) but below the number-less threshold of 90, so it must
    NOT match now.
    """
    column_args = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }
    left_frame = pl.LazyFrame(
        {"left_address": ["The Coach House"], "left_postcode": ["AB1 2CD"]}
    )
    right_frame = pl.LazyFrame(
        {"right_address": ["The Old Coach House"], "right_postcode": ["AB1 2CD"]}
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame, right=right_frame, **column_args
    ).collect()

    matched_addresses = joined["right_address"].to_list()
    assert matched_addresses == [None]
|
|
|
|
|
|
def test_fuzzy_join_matches_numbered_pair_at_baseline_threshold():
    """A numbered pair sitting right at the baseline score still matches.

    "10 ACACIA AVENUE" vs "FLAT A 10 ACACIA AVENUE" scores exactly 82 and
    the house number is compatible, so the numbered baseline (>= 82) still
    matches.
    """
    left_data = {
        "left_address": ["10 Acacia Avenue"],
        "left_postcode": ["AB1 2CD"],
    }
    right_data = {
        "right_address": ["Flat A, 10 Acacia Avenue"],
        "right_postcode": ["AB1 2CD"],
    }

    lazy_join = fuzzy_join_on_postcode(
        left=pl.LazyFrame(left_data),
        right=pl.LazyFrame(right_data),
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    joined = lazy_join.collect()

    # The right-hand address comes back exactly as it was supplied.
    expected_addresses = ["Flat A, 10 Acacia Avenue"]
    assert joined["right_address"].to_list() == expected_addresses
|
|
|
|
|
|
def test_fuzzy_join_matches_high_score_number_less_pair():
    """A number-less pair above the strict threshold is still joined.

    A number-less pair that clears the 90 threshold (here an exact token
    match, score 100) must still match.
    """
    left_frame = pl.LazyFrame(
        {
            "left_address": ["The Old Rectory"],
            "left_postcode": ["AB1 2CD"],
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": ["THE OLD RECTORY"],
            "right_postcode": ["AB1 2CD"],
        }
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    ).collect()

    # The upper-case right-hand spelling is returned unchanged.
    matched_addresses = joined.get_column("right_address").to_list()
    assert matched_addresses == ["THE OLD RECTORY"]
|
|
|
|
|
|
def test_normalize_postcode_key_requires_full_postcode():
    """Only complete postcodes yield a key; everything else becomes null.

    Each case pairs a raw input with the key the expression is expected to
    produce for it.
    """
    cases = [
        (" SW1A 1AA ", "SW1A1AA"),
        ("sw1a-1aa", "SW1A1AA"),
        ("", None),
        ("SW1A", None),
        ("12345", None),
        ("not a postcode", None),
    ]
    raw_postcodes = [raw for raw, _ in cases]
    expected_keys = [key for _, key in cases]

    frame = pl.DataFrame({"postcode": raw_postcodes})
    key_expr = normalize_postcode_key(pl.col("postcode")).alias("key")
    keyed = frame.select(key_expr)

    assert keyed["key"].to_list() == expected_keys
|