"""Tests for ``fuzzy_join_on_postcode``, ``normalize_postcode_key`` and the
private number / variant gates of ``pipeline.utils.fuzzy_join``."""

import polars as pl

from pipeline.utils import fuzzy_join_on_postcode, normalize_postcode_key
from pipeline.utils.fuzzy_join import _admissible_variants, _numbers_compatible


def _join(left, right, **variant_cols):
    """Run ``fuzzy_join_on_postcode`` with this module's fixed column names.

    Every fixture in this file names its columns ``left_address`` /
    ``left_postcode`` / ``right_address`` / ``right_postcode``, so the four
    column arguments are supplied here once.

    Args:
        left: Left-hand ``pl.LazyFrame`` fixture.
        right: Right-hand ``pl.LazyFrame`` fixture.
        **variant_cols: Extra keyword arguments forwarded untouched (the tests
            use ``left_variant_cols`` and ``right_variant_cols``).

    Returns:
        The un-collected result of ``fuzzy_join_on_postcode``; callers sort
        and/or ``collect()`` it themselves.
    """
    return fuzzy_join_on_postcode(
        left=left,
        right=right,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
        **variant_cols,
    )


def test_fuzzy_join_on_postcode_matches_addresses_within_postcode():
    """Reordered / re-punctuated addresses pair up, but only within a postcode."""
    lhs = pl.LazyFrame(
        {
            "left_id": ["flat", "house", "unmatched"],
            "left_address": [
                "Flat 2, 10 High Street",
                "12 High Street",
                "99 Other Road",
            ],
            "left_postcode": ["AB1 2CD", "AB1 2CD", "AB1 2CD"],
        }
    )
    rhs = pl.LazyFrame(
        {
            "right_id": ["flat_epc", "house_epc", "other_postcode"],
            "right_address": [
                "10 HIGH STREET FLAT 2",
                "12 High-Street",
                "99 Other Road",
            ],
            "right_postcode": [" AB1 2CD ", "AB1 2CD", "ZZ9 9ZZ"],
        }
    )

    joined = _join(lhs, rhs).sort("left_id").collect()

    # "99 Other Road" exists on the right only under a different postcode.
    expected = [
        {"left_id": "flat", "right_id": "flat_epc"},
        {"left_id": "house", "right_id": "house_epc"},
        {"left_id": "unmatched", "right_id": None},
    ]
    assert joined.select("left_id", "right_id").to_dicts() == expected


def test_fuzzy_join_on_postcode_requires_matching_numbers():
    """Same street and postcode but house number 10 vs 11 stays unmatched."""
    lhs = pl.LazyFrame(
        {
            "left_address": ["10 High Street"],
            "left_postcode": ["AB1 2CD"],
        }
    )
    rhs = pl.LazyFrame(
        {
            "right_address": ["11 High Street"],
            "right_postcode": ["AB1 2CD"],
        }
    )

    joined = _join(lhs, rhs).collect()

    assert joined["right_address"].to_list() == [None]


def test_fuzzy_join_on_postcode_rejects_low_score_same_number_matches():
    """A shared house number alone is not enough when the street text differs."""
    lhs = pl.LazyFrame(
        {
            "left_address": ["1 Example Street"],
            "left_postcode": ["AB1 2CD"],
        }
    )
    rhs = pl.LazyFrame(
        {
            "right_address": ["1 Totally Different Road"],
            "right_postcode": ["AB1 2CD"],
        }
    )

    joined = _join(lhs, rhs).collect()

    assert joined["right_address"].to_list() == [None]


def test_fuzzy_join_on_postcode_rejects_blank_and_invalid_match_keys():
    """Blank and number-only addresses never match, even against themselves."""
    lhs = pl.LazyFrame(
        {
            "left_id": ["blank", "number_only", "valid"],
            "left_address": [" ", "10", "10 High Street"],
            "left_postcode": ["AB1 2CD", "AB1 2CD", "AB1 2CD"],
        }
    )
    rhs = pl.LazyFrame(
        {
            "right_address": ["", "10", "10 High Street"],
            "right_postcode": ["AB1 2CD", "AB1 2CD", "AB1 2CD"],
        }
    )

    joined = _join(lhs, rhs).sort("left_id").collect()

    expected = [
        {"left_id": "blank", "right_address": None},
        {"left_id": "number_only", "right_address": None},
        {"left_id": "valid", "right_address": "10 High Street"},
    ]
    assert joined.select("left_id", "right_address").to_dicts() == expected


def test_fuzzy_join_rejects_mid_score_number_less_match():
    """A number-less pair scoring 88 sits below the number-less threshold of 90.

    "THE COACH HOUSE" vs "THE OLD COACH HOUSE" scores 88 via token_sort_ratio:
    above the old MIN_FUZZY_SCORE of 60 (so it used to falsely match) but below
    the number-less threshold of 90, so it must NOT match now.
    """
    lhs = pl.LazyFrame(
        {
            "left_address": ["The Coach House"],
            "left_postcode": ["AB1 2CD"],
        }
    )
    rhs = pl.LazyFrame(
        {
            "right_address": ["The Old Coach House"],
            "right_postcode": ["AB1 2CD"],
        }
    )

    joined = _join(lhs, rhs).collect()

    assert joined["right_address"].to_list() == [None]


def test_fuzzy_join_matches_numbered_pair_at_baseline_threshold():
    """A numbered pair scoring exactly 82 is accepted by the >= 82 baseline.

    "10 ACACIA AVENUE" vs "10 ACACIA AVENUE OAKHAM" scores exactly 82 and the
    house number is compatible, so the numbered baseline still matches.
    """
    lhs = pl.LazyFrame(
        {
            "left_address": ["10 Acacia Avenue"],
            "left_postcode": ["AB1 2CD"],
        }
    )
    rhs = pl.LazyFrame(
        {
            "right_address": ["10 Acacia Avenue, Oakham"],
            "right_postcode": ["AB1 2CD"],
        }
    )

    joined = _join(lhs, rhs).collect()

    assert joined["right_address"].to_list() == ["10 Acacia Avenue, Oakham"]


def test_fuzzy_join_matches_high_score_number_less_pair():
    """A number-less pair that clears the 90 threshold must still match.

    Here the pair is an exact token match (score 100).
    """
    lhs = pl.LazyFrame(
        {
            "left_address": ["The Old Rectory"],
            "left_postcode": ["AB1 2CD"],
        }
    )
    rhs = pl.LazyFrame(
        {
            "right_address": ["THE OLD RECTORY"],
            "right_postcode": ["AB1 2CD"],
        }
    )

    joined = _join(lhs, rhs).collect()

    assert joined["right_address"].to_list() == ["THE OLD RECTORY"]


def test_numbers_compatible_treats_letter_suffix_as_part_of_the_number():
    """8A, 8B and plain 8 are three different properties on the same street.

    Digit-only extraction collapsed all three to {8} and let them match.
    """
    eight_a = "8A HIGH STREET"

    assert not _numbers_compatible(eight_a, "8B HIGH STREET")
    assert not _numbers_compatible(eight_a, "8 HIGH STREET")
    assert _numbers_compatible(eight_a, "8A HIGH STREET")


def test_numbers_compatible_requires_equal_sets_not_subset():
    """Number sets must be equal; a subset relationship is not enough.

    Subset logic let the whole-building record "188 ..." absorb its flat
    "FLAT 1 188 ..." ({188} is a subset of {1, 188}).
    """
    flat = "FLAT 1 188 GREAT NORTH WAY"

    assert not _numbers_compatible(flat, "188 GREAT NORTH WAY")
    # Same numbers in a different token order are still the same set.
    assert _numbers_compatible(flat, "188 GREAT NORTH WAY FLAT 1")


def test_numbers_compatible_number_less_and_one_sided_pairs():
    """Neither side numbered -> compatible; exactly one side -> incompatible.

    The number-less case is gated by the stricter no-numbers score threshold
    instead.
    """
    unnumbered = "ROSE COTTAGE"

    assert _numbers_compatible(unnumbered, "ROSE COTTAGE")
    assert not _numbers_compatible(unnumbered, "8 HIGH STREET")


def test_numbers_compatible_gates_single_letter_flats():
    """A single flat letter is treated as a pseudo-number token.

    "FLAT D" and "FLAT F" are different flats even with identical street
    numbers; ungated they token_sort to ~96 and cross-matched. Because the
    letter is a pseudo-number token, it also blocks a flat matching the bare
    building address.
    """
    flat_d = "FLAT D 39 GERTRUDE STREET"

    assert not _numbers_compatible(flat_d, "FLAT F 39 GERTRUDE STREET")
    assert _numbers_compatible(flat_d, "39 GERTRUDE STREET FLAT D")
    assert not _numbers_compatible("FLAT B ROSE COURT", "ROSE COURT")

    # A letter glued to a number ("A3") is a unit name, not a flat letter.
    unit = "FLAT A3 CHESHAM HEIGHTS"
    assert _numbers_compatible(unit, "FLAT A3 CHESHAM HEIGHTS")


def test_admissible_variants_allows_locality_suffix_only():
    """Only locality words may differ between a variant and its primary.

    Digits and flat designators may not differ (the gate ran on the primary
    only).
    """
    primary = "12 OAK ROAD"

    kept = _admissible_variants(primary, ["12 OAK ROAD HALE", "12 OAK ROAD"])
    assert kept == ("12 OAK ROAD HALE",)

    # Dropping "FLAT 1" (digit) or "FLAT B" (flat designator) is inadmissible:
    # the variant would score a single flat as the whole building.
    no_variants = ()
    assert (
        _admissible_variants("FLAT 1 188 GREAT NORTH WAY", ["188 GREAT NORTH WAY"])
        == no_variants
    )
    assert _admissible_variants("FLAT B ROSE COURT", ["ROSE COURT"]) == no_variants
    assert _admissible_variants(primary, [None, "12 OAK ROAD"]) == no_variants

    # Substitution is never admissible: a register row whose address1
    # disagrees with the combined address must not smuggle in a different
    # street for scoring.
    assert _admissible_variants(primary, ["12 ELM ROAD"]) == no_variants
    assert (
        _admissible_variants("1 TOTALLY DIFFERENT ROAD", ["1 EXAMPLE STREET"])
        == no_variants
    )


def test_fuzzy_join_variant_recovers_locality_suffix_mismatch():
    """Variant columns rescue a match lost to a trailing locality word.

    The EPC register stores "12 Oak Road, Hale" (address1 + locality line)
    while price-paid has the bare "12 Oak Road": token_sort scores 81 < 82
    and the match was lost. The EPC's address1-only variant scores 100.
    """
    lhs = pl.LazyFrame(
        {
            "left_address": ["12 Oak Road"],
            "left_postcode": ["AB1 2CD"],
            "left_with_locality": ["12 Oak Road Hale"],
        }
    )
    rhs = pl.LazyFrame(
        {
            "right_address": ["12 Oak Road, Hale"],
            "right_postcode": ["AB1 2CD"],
            "right_address1": ["12 Oak Road"],
        }
    )

    # Without variant columns the pair falls short and stays unmatched.
    without_variants = _join(lhs, rhs).collect()
    assert without_variants["_match_score"].to_list() == [None]

    with_variants = _join(
        lhs,
        rhs,
        left_variant_cols=["left_with_locality"],
        right_variant_cols=["right_address1"],
    ).collect()
    assert with_variants["_match_score"].to_list() == [100]


def test_fuzzy_join_variant_cannot_unlock_a_flat_for_its_building():
    """A variant that drops the flat designator must be ruled inadmissible.

    The EPC's secondary line carries the flat designator; dropping it would
    score the flat's certificate 100 against the whole-building price-paid
    address, so the pair must stay unmatched.
    """
    lhs = pl.LazyFrame(
        {
            "left_address": ["188 Great North Way"],
            "left_postcode": ["AB1 2CD"],
        }
    )
    rhs = pl.LazyFrame(
        {
            "right_address": ["Flat 1, 188 Great North Way"],
            "right_postcode": ["AB1 2CD"],
            "right_address1": ["188 Great North Way"],
        }
    )

    joined = _join(lhs, rhs, right_variant_cols=["right_address1"]).collect()

    assert joined["_match_score"].to_list() == [None]


def test_fuzzy_join_variant_score_must_be_near_exact():
    """A score reached only through a variant must clear MIN_VARIANT_SCORE (90).

    "2 MYRTLE COTTAGES" vs "2 LEITH VIEW COTTAGES" type pairs scored in the
    80s via variants and were false matches.
    """
    lhs = pl.LazyFrame(
        {
            "left_address": ["2 Myrtle Cottages"],
            "left_postcode": ["AB1 2CD"],
            "left_with_locality": ["2 Myrtle Cottages Dorking"],
        }
    )
    rhs = pl.LazyFrame(
        {
            "right_address": ["2 Leith View Cottages, North Holmwood"],
            "right_postcode": ["AB1 2CD"],
            "right_address1": ["2 Leith View Cottages"],
        }
    )

    joined = _join(
        lhs,
        rhs,
        left_variant_cols=["left_with_locality"],
        right_variant_cols=["right_address1"],
    ).collect()

    assert joined["_match_score"].to_list() == [None]


def test_fuzzy_join_rejects_wrong_letter_suffix_match():
    """End-to-end guard for the 8A/8B class of wrong-property matches.

    The only candidate in the postcode bucket differs solely in the number
    suffix, so the row must stay unmatched rather than borrow the neighbour's
    record.
    """
    lhs = pl.LazyFrame(
        {
            "left_address": ["8A High Street"],
            "left_postcode": ["AB1 2CD"],
        }
    )
    rhs = pl.LazyFrame(
        {
            "right_address": ["8B High Street"],
            "right_postcode": ["AB1 2CD"],
        }
    )

    joined = _join(lhs, rhs).collect()

    assert joined["right_address"].to_list() == [None]


def test_fuzzy_join_emits_match_score_column():
    """The ``_match_score`` audit column carries the accepted match's score.

    It holds the token_sort_ratio of the accepted match: 100 for an exact
    (post-normalisation) address match, the raw fuzzy score otherwise, and
    null for unmatched rows.
    """
    lhs = pl.LazyFrame(
        {
            "left_id": ["exact", "fuzzy", "unmatched"],
            "left_address": [
                "10 High Street",
                "10 Acacia Avenue",
                "99 Other Road",
            ],
            "left_postcode": ["AB1 2CD", "EF3 4GH", "ZZ9 9ZZ"],
        }
    )
    rhs = pl.LazyFrame(
        {
            "right_address": [
                "10 HIGH STREET",
                # Scores exactly 82 against "10 Acacia Avenue" (see
                # test_fuzzy_join_matches_numbered_pair_at_baseline_threshold).
                "10 Acacia Avenue, Oakham",
            ],
            "right_postcode": ["AB1 2CD", "EF3 4GH"],
        }
    )

    joined = _join(lhs, rhs).sort("left_id").collect()

    assert joined.schema["_match_score"] == pl.UInt8

    expected = [
        {"left_id": "exact", "_match_score": 100},
        {"left_id": "fuzzy", "_match_score": 82},
        {"left_id": "unmatched", "_match_score": None},
    ]
    assert joined.select("left_id", "_match_score").to_dicts() == expected


def test_normalize_postcode_key_requires_full_postcode():
    """Only full postcodes yield a key; anything else becomes null.

    Surrounding whitespace, a hyphen separator and lower case all normalise
    to the same key, while empty, partial, numeric and free-text inputs give
    ``None``.
    """
    raw = pl.DataFrame(
        {
            "postcode": [
                " SW1A 1AA ",
                "sw1a-1aa",
                "",
                "SW1A",
                "12345",
                "not a postcode",
            ]
        }
    )

    key_expr = normalize_postcode_key(pl.col("postcode")).alias("key")
    keyed = raw.select(key_expr)

    expected = ["SW1A1AA", "SW1A1AA", None, None, None, None]
    assert keyed["key"].to_list() == expected