Fable findings in data
This commit is contained in:
parent
b98bc6d611
commit
6a33b03fdf
20 changed files with 1502 additions and 274 deletions
|
|
@ -1,6 +1,7 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.utils import fuzzy_join_on_postcode, normalize_postcode_key
|
||||
from pipeline.utils.fuzzy_join import _numbers_compatible
|
||||
|
||||
|
||||
def test_fuzzy_join_on_postcode_matches_addresses_within_postcode():
|
||||
|
|
@ -219,6 +220,107 @@ def test_fuzzy_join_matches_high_score_number_less_pair():
|
|||
assert result["right_address"].to_list() == ["THE OLD RECTORY"]
|
||||
|
||||
|
||||
def test_numbers_compatible_treats_letter_suffix_as_part_of_the_number():
    """A letter suffix is part of the house number, not decoration.

    8A, 8B and plain 8 are three separate properties on one street. When only
    the digits were extracted, all three reduced to {8} and were allowed to
    match each other.
    """
    suffixed = "8A HIGH STREET"

    # A different suffix, or no suffix at all, is a different property.
    for neighbour in ("8B HIGH STREET", "8 HIGH STREET"):
        assert not _numbers_compatible(suffixed, neighbour)

    # The identical number-plus-suffix is still compatible with itself.
    assert _numbers_compatible(suffixed, "8A HIGH STREET")
|
||||
|
||||
|
||||
def test_numbers_compatible_requires_equal_sets_not_subset():
    """Number sets must be equal; a subset relationship is not enough.

    With subset logic the whole-building record "188 ..." absorbed its flat
    "FLAT 1 188 ..." because {188} is a subset of {1, 188}.
    """
    flat = "FLAT 1 188 GREAT NORTH WAY"
    whole_building = "188 GREAT NORTH WAY"
    flat_reordered = "188 GREAT NORTH WAY FLAT 1"

    assert not _numbers_compatible(flat, whole_building)
    # Same numbers in a different position within the address still agree.
    assert _numbers_compatible(flat, flat_reordered)
|
||||
|
||||
|
||||
def test_numbers_compatible_number_less_and_one_sided_pairs():
    """Cover pairs where neither side, or only one side, has a number.

    When neither address is numbered the pair is compatible (the stricter
    no-numbers score threshold gates it instead). When exactly one address
    is numbered the pair is incompatible.
    """
    unnumbered = "ROSE COTTAGE"
    numbered = "8 HIGH STREET"

    assert _numbers_compatible(unnumbered, "ROSE COTTAGE")
    assert not _numbers_compatible(unnumbered, numbered)
|
||||
|
||||
|
||||
def test_fuzzy_join_rejects_wrong_letter_suffix_match():
    """End-to-end guard against 8A/8B style wrong-property matches.

    The postcode bucket holds a single candidate that differs from the left
    row only in its number suffix, so the left row has to remain unmatched
    instead of borrowing the neighbour's record.
    """
    left_frame = pl.LazyFrame(
        {
            "left_address": ["8A High Street"],
            "left_postcode": ["AB1 2CD"],
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": ["8B High Street"],
            "right_postcode": ["AB1 2CD"],
        }
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    matched_addresses = joined.collect()["right_address"].to_list()

    assert matched_addresses == [None]
|
||||
|
||||
|
||||
def test_fuzzy_join_emits_match_score_column():
    """The join exposes an audit column, ``_match_score``.

    It carries the token_sort_ratio of the accepted match: 100 when the
    addresses match exactly after normalisation, the raw fuzzy score for a
    fuzzy match, and null when the row stayed unmatched.
    """
    # One row per scenario: (id, address, postcode).
    left_rows = [
        ("exact", "10 High Street", "AB1 2CD"),
        ("fuzzy", "10 Acacia Avenue", "EF3 4GH"),
        ("unmatched", "99 Other Road", "ZZ9 9ZZ"),
    ]
    left_ids, left_addresses, left_postcodes = (
        list(column) for column in zip(*left_rows)
    )
    left_frame = pl.LazyFrame(
        {
            "left_id": left_ids,
            "left_address": left_addresses,
            "left_postcode": left_postcodes,
        }
    )

    right_addresses = [
        "10 HIGH STREET",
        # Scores exactly 82 against "10 Acacia Avenue" (see
        # test_fuzzy_join_matches_numbered_pair_at_baseline_threshold).
        "Flat A, 10 Acacia Avenue",
    ]
    right_frame = pl.LazyFrame(
        {
            "right_address": right_addresses,
            "right_postcode": ["AB1 2CD", "EF3 4GH"],
        }
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    # Sort by id so the row order of the comparison below is deterministic.
    result = joined.sort("left_id").collect()

    assert result.schema["_match_score"] == pl.UInt8

    expected_scores = [("exact", 100), ("fuzzy", 82), ("unmatched", None)]
    expected_rows = [
        {"left_id": row_id, "_match_score": score}
        for row_id, score in expected_scores
    ]
    assert result.select("left_id", "_match_score").to_dicts() == expected_rows
|
||||
|
||||
|
||||
def test_normalize_postcode_key_requires_full_postcode():
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue