idk
This commit is contained in:
parent
a04ac2d857
commit
d43da9708c
47 changed files with 4120 additions and 573 deletions
|
|
@ -134,6 +134,91 @@ def test_fuzzy_join_on_postcode_rejects_blank_and_invalid_match_keys():
|
|||
]
|
||||
|
||||
|
||||
def test_fuzzy_join_rejects_mid_score_number_less_match():
    """A number-less pair with a middling fuzzy score must stay unmatched.

    Rationale carried over from the original test notes: "THE COACH HOUSE"
    against "THE OLD COACH HOUSE" scores 88 via token_sort_ratio. That clears
    the old MIN_FUZZY_SCORE of 60 (so it used to match falsely) but falls
    short of the number-less threshold of 90, so no match is expected now.
    """
    shared_postcode = "AB1 2CD"

    left_frame = pl.LazyFrame(
        {
            "left_address": ["The Coach House"],
            "left_postcode": [shared_postcode],
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": ["The Old Coach House"],
            "right_postcode": [shared_postcode],
        }
    )

    # Column wiring is kept separate from the frames so the call reads cleanly.
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }
    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        **column_names,
    ).collect()

    # The single left row survives, but nothing is attached from the right.
    matched_addresses = joined["right_address"].to_list()
    assert matched_addresses == [None]
|
||||
|
||||
|
||||
def test_fuzzy_join_matches_numbered_pair_at_baseline_threshold():
    """A numbered pair sitting exactly on the baseline score is still joined.

    Rationale carried over from the original test notes: "10 ACACIA AVENUE"
    against "FLAT A 10 ACACIA AVENUE" scores exactly 82, and the house number
    is compatible, so the numbered baseline (>= 82) keeps accepting it.
    """
    shared_postcode = "AB1 2CD"
    expected_match = "Flat A, 10 Acacia Avenue"

    left_frame = pl.LazyFrame(
        {
            "left_address": ["10 Acacia Avenue"],
            "left_postcode": [shared_postcode],
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": [expected_match],
            "right_postcode": [shared_postcode],
        }
    )

    # Column wiring is kept separate from the frames so the call reads cleanly.
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }
    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        **column_names,
    ).collect()

    # The right-hand address must come through untouched (original casing
    # and punctuation), not in any normalised form.
    matched_addresses = joined["right_address"].to_list()
    assert matched_addresses == [expected_match]
|
||||
|
||||
|
||||
def test_fuzzy_join_matches_high_score_number_less_pair():
    """A number-less pair above the stricter threshold must still be joined.

    Rationale carried over from the original test notes: the two addresses
    differ only by letter case, i.e. an exact token match scoring 100, which
    clears the number-less threshold of 90.
    """
    shared_postcode = "AB1 2CD"
    expected_match = "THE OLD RECTORY"

    left_frame = pl.LazyFrame(
        {
            "left_address": ["The Old Rectory"],
            "left_postcode": [shared_postcode],
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": [expected_match],
            "right_postcode": [shared_postcode],
        }
    )

    # Column wiring is kept separate from the frames so the call reads cleanly.
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }
    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        **column_names,
    ).collect()

    # The right-hand value is expected back in its original upper-case form.
    matched_addresses = joined["right_address"].to_list()
    assert matched_addresses == [expected_match]
|
||||
|
||||
|
||||
def test_normalize_postcode_key_requires_full_postcode():
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue