import polars as pl

from pipeline.utils import fuzzy_join_on_postcode, normalize_postcode_key

# Keyword arguments naming the address/postcode columns used by every fixture
# frame in this module.
_JOIN_COLUMNS = {
    "left_address_col": "left_address",
    "right_address_col": "right_address",
    "left_postcode_col": "left_postcode",
    "right_postcode_col": "right_postcode",
}


def _join(left_frame: pl.LazyFrame, right_frame: pl.LazyFrame):
    """Call ``fuzzy_join_on_postcode`` with this module's fixture column names.

    The (still un-collected) result is returned unchanged so callers can sort
    and/or collect it as they need.
    """
    return fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        **_JOIN_COLUMNS,
    )


def _joined_right_addresses(left_address: str, right_address: str) -> list:
    """Join a single left row against a single right row in postcode "AB1 2CD".

    Returns the collected ``right_address`` column as a list, i.e. ``[None]``
    when the pair was not matched.
    """
    left_frame = pl.LazyFrame(
        {
            "left_address": [left_address],
            "left_postcode": ["AB1 2CD"],
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": [right_address],
            "right_postcode": ["AB1 2CD"],
        }
    )
    joined = _join(left_frame, right_frame).collect()
    return joined["right_address"].to_list()


def test_fuzzy_join_on_postcode_matches_addresses_within_postcode():
    """Addresses pair up inside one postcode despite case, order and punctuation.

    The "unmatched" left row only has a textual twin under a different
    postcode, so it must come back with a null ``right_id``.
    """
    left_frame = pl.LazyFrame(
        {
            "left_id": ["flat", "house", "unmatched"],
            "left_address": [
                "Flat 2, 10 High Street",
                "12 High Street",
                "99 Other Road",
            ],
            "left_postcode": ["AB1 2CD", "AB1 2CD", "AB1 2CD"],
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_id": ["flat_epc", "house_epc", "other_postcode"],
            "right_address": [
                "10 HIGH STREET FLAT 2",
                "12 High-Street",
                "99 Other Road",
            ],
            "right_postcode": [" AB1 2CD ", "AB1 2CD", "ZZ9 9ZZ"],
        }
    )

    joined = _join(left_frame, right_frame).sort("left_id").collect()

    expected_pairs = [
        {"left_id": "flat", "right_id": "flat_epc"},
        {"left_id": "house", "right_id": "house_epc"},
        {"left_id": "unmatched", "right_id": None},
    ]
    assert joined.select("left_id", "right_id").to_dicts() == expected_pairs


def test_fuzzy_join_on_postcode_requires_matching_numbers():
    """Same street and postcode but a different house number is not a match."""
    matched = _joined_right_addresses("10 High Street", "11 High Street")
    assert matched == [None]


def test_fuzzy_join_on_postcode_rejects_low_score_same_number_matches():
    """A shared house number alone does not rescue dissimilar street text."""
    matched = _joined_right_addresses("1 Example Street", "1 Totally Different Road")
    assert matched == [None]


def test_fuzzy_join_on_postcode_rejects_blank_and_invalid_match_keys():
    """Blank and number-only addresses never match; a full address still does."""
    shared_postcodes = ["AB1 2CD", "AB1 2CD", "AB1 2CD"]
    left_frame = pl.LazyFrame(
        {
            "left_id": ["blank", "number_only", "valid"],
            "left_address": [" ", "10", "10 High Street"],
            "left_postcode": shared_postcodes,
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": ["", "10", "10 High Street"],
            "right_postcode": shared_postcodes,
        }
    )

    joined = _join(left_frame, right_frame).sort("left_id").collect()

    expected_rows = [
        {"left_id": "blank", "right_address": None},
        {"left_id": "number_only", "right_address": None},
        {"left_id": "valid", "right_address": "10 High Street"},
    ]
    assert joined.select("left_id", "right_address").to_dicts() == expected_rows


def test_fuzzy_join_rejects_mid_score_number_less_match():
    """A number-less pair scoring below the number-less threshold is rejected."""
    # token_sort_ratio gives "THE COACH HOUSE" vs "THE OLD COACH HOUSE" a score
    # of 88. That clears the old MIN_FUZZY_SCORE of 60 (which is why it used to
    # be a false match) yet falls short of the number-less threshold of 90, so
    # no match may be produced now.
    matched = _joined_right_addresses("The Coach House", "The Old Coach House")
    assert matched == [None]


def test_fuzzy_join_matches_numbered_pair_at_baseline_threshold():
    """A numbered pair sitting exactly on the baseline threshold still matches."""
    # "10 ACACIA AVENUE" vs "FLAT A 10 ACACIA AVENUE" scores exactly 82 and the
    # house numbers are compatible, so the numbered baseline (>= 82) accepts it.
    matched = _joined_right_addresses("10 Acacia Avenue", "Flat A, 10 Acacia Avenue")
    assert matched == ["Flat A, 10 Acacia Avenue"]


def test_fuzzy_join_matches_high_score_number_less_pair():
    """A number-less pair that clears the 90 threshold is still matched."""
    # Exact token match (score 100), differing only in letter case.
    matched = _joined_right_addresses("The Old Rectory", "THE OLD RECTORY")
    assert matched == ["THE OLD RECTORY"]


def test_normalize_postcode_key_requires_full_postcode():
    """Only complete postcodes yield a key; anything else normalises to null."""
    raw_postcodes = [
        " SW1A 1AA ",
        "sw1a-1aa",
        "",
        "SW1A",
        "12345",
        "not a postcode",
    ]
    frame = pl.DataFrame({"postcode": raw_postcodes})

    keyed = frame.select(normalize_postcode_key(pl.col("postcode")).alias("key"))

    assert keyed["key"].to_list() == ["SW1A1AA", "SW1A1AA", None, None, None, None]