# perfect-postcode/pipeline/utils/test_fuzzy_join.py
# 2026-05-13 12:11:54 +01:00
#
# 134 lines
# 3.7 KiB
# Python

import polars as pl
from pipeline.utils import fuzzy_join_on_postcode
def test_fuzzy_join_on_postcode_matches_addresses_within_postcode():
    """Check that addresses pair up only with candidates in the same postcode.

    The fixture expects a match despite reordered words, different casing, a
    hyphenated street name and a whitespace-padded right-hand postcode. The
    "unmatched" row has an identical address on the right, but under another
    postcode, so its right-hand id is expected to be null.
    """
    left_rows = {
        "left_id": ["flat", "house", "unmatched"],
        "left_address": [
            "Flat 2, 10 High Street",
            "12 High Street",
            "99 Other Road",
        ],
        "left_postcode": ["AB1 2CD", "AB1 2CD", "AB1 2CD"],
    }
    right_rows = {
        "right_id": ["flat_epc", "house_epc", "other_postcode"],
        "right_address": [
            "10 HIGH STREET FLAT 2",
            "12 High-Street",
            "99 Other Road",
        ],
        "right_postcode": [" AB1 2CD ", "AB1 2CD", "ZZ9 9ZZ"],
    }

    joined = fuzzy_join_on_postcode(
        left=pl.LazyFrame(left_rows),
        right=pl.LazyFrame(right_rows),
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    # Sort before collecting so the comparison does not depend on join order.
    result = joined.sort("left_id").collect()

    expected_pairs = [
        {"left_id": "flat", "right_id": "flat_epc"},
        {"left_id": "house", "right_id": "house_epc"},
        {"left_id": "unmatched", "right_id": None},
    ]
    assert result.select("left_id", "right_id").to_dicts() == expected_pairs
def test_fuzzy_join_on_postcode_requires_matching_numbers():
    """Check that a differing house number prevents a match.

    Both sides share the street name and postcode; only the number differs
    (10 vs 11), and the joined right-hand address is expected to be null.
    """
    left_frame = pl.LazyFrame(
        {
            "left_address": ["10 High Street"],
            "left_postcode": ["AB1 2CD"],
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": ["11 High Street"],
            "right_postcode": ["AB1 2CD"],
        }
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    result = joined.collect()

    matched_addresses = result["right_address"].to_list()
    assert matched_addresses == [None]
def test_fuzzy_join_on_postcode_rejects_low_score_same_number_matches():
    """Check that a shared number and postcode alone are not enough to match.

    The two addresses agree on the number "1" and the postcode but have
    unrelated street text, so the right-hand address is expected to be null.
    """
    left_frame = pl.LazyFrame(
        {
            "left_address": ["1 Example Street"],
            "left_postcode": ["AB1 2CD"],
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": ["1 Totally Different Road"],
            "right_postcode": ["AB1 2CD"],
        }
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    result = joined.collect()

    matched_addresses = result["right_address"].to_list()
    assert matched_addresses == [None]
def test_fuzzy_join_on_postcode_rejects_blank_and_invalid_match_keys():
    """Check that blank and number-only addresses are never matched.

    The right side offers an empty string and a bare "10" as candidates in
    the same postcode. The whitespace-only and number-only left rows are
    still expected to come back with a null right-hand address, while the
    full address is expected to match its identical counterpart.
    """
    left_rows = {
        "left_id": ["blank", "number_only", "valid"],
        "left_address": [" ", "10", "10 High Street"],
        "left_postcode": ["AB1 2CD", "AB1 2CD", "AB1 2CD"],
    }
    right_rows = {
        "right_address": ["", "10", "10 High Street"],
        "right_postcode": ["AB1 2CD", "AB1 2CD", "AB1 2CD"],
    }

    joined = fuzzy_join_on_postcode(
        left=pl.LazyFrame(left_rows),
        right=pl.LazyFrame(right_rows),
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    # Sort before collecting so the comparison does not depend on join order.
    result = joined.sort("left_id").collect()

    expected_rows = [
        {"left_id": "blank", "right_address": None},
        {"left_id": "number_only", "right_address": None},
        {"left_id": "valid", "right_address": "10 High Street"},
    ]
    assert result.select("left_id", "right_address").to_dicts() == expected_rows