import polars as pl

from pipeline.utils import fuzzy_join_on_postcode, normalize_postcode_key
from pipeline.utils.fuzzy_join import _numbers_compatible


def test_fuzzy_join_on_postcode_matches_addresses_within_postcode():
    """Left rows pair up only with candidates that share their postcode.

    The flat and the house each have a differently ordered / cased /
    punctuated spelling on the right under the same (whitespace-padded)
    postcode and must match it. "99 Other Road" has a textually identical
    candidate, but only under another postcode, so it stays unmatched.
    """
    left_rows = {
        "left_id": ["flat", "house", "unmatched"],
        "left_address": [
            "Flat 2, 10 High Street",
            "12 High Street",
            "99 Other Road",
        ],
        "left_postcode": ["AB1 2CD", "AB1 2CD", "AB1 2CD"],
    }
    right_rows = {
        "right_id": ["flat_epc", "house_epc", "other_postcode"],
        "right_address": [
            "10 HIGH STREET FLAT 2",
            "12 High-Street",
            "99 Other Road",
        ],
        "right_postcode": [" AB1 2CD ", "AB1 2CD", "ZZ9 9ZZ"],
    }
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    joined = fuzzy_join_on_postcode(
        left=pl.LazyFrame(left_rows),
        right=pl.LazyFrame(right_rows),
        **column_names,
    )
    # Sort before collecting so the comparison does not depend on join order.
    id_pairs = joined.sort("left_id").collect().select("left_id", "right_id")

    expected_pairs = [
        {"left_id": "flat", "right_id": "flat_epc"},
        {"left_id": "house", "right_id": "house_epc"},
        {"left_id": "unmatched", "right_id": None},
    ]
    assert id_pairs.to_dicts() == expected_pairs


def test_fuzzy_join_on_postcode_requires_matching_numbers():
    """Same street and postcode, but house numbers 10 vs 11 must not match."""
    left_frame = pl.LazyFrame(
        {"left_address": ["10 High Street"], "left_postcode": ["AB1 2CD"]}
    )
    right_frame = pl.LazyFrame(
        {"right_address": ["11 High Street"], "right_postcode": ["AB1 2CD"]}
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    matched_addresses = joined.collect()["right_address"].to_list()

    # The single left row is kept, with no right-hand address attached.
    assert matched_addresses == [None]


def test_fuzzy_join_on_postcode_rejects_low_score_same_number_matches():
    """A shared house number alone is not enough when the street text differs."""
    postcode = "AB1 2CD"
    left_frame = pl.LazyFrame(
        {"left_address": ["1 Example Street"], "left_postcode": [postcode]}
    )
    right_frame = pl.LazyFrame(
        {"right_address": ["1 Totally Different Road"], "right_postcode": [postcode]}
    )
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    collected = fuzzy_join_on_postcode(
        left=left_frame, right=right_frame, **column_names
    ).collect()

    # The left row survives the join but carries no right-hand address.
    assert collected["right_address"].to_list() == [None]


def test_fuzzy_join_on_postcode_rejects_blank_and_invalid_match_keys():
    """Blank and bare-number addresses never match; a full address still does.

    The right side offers an empty string and a bare "10" in the same
    postcode, so the blank and number-only left rows each have an obvious
    look-alike candidate that must nevertheless be refused.
    """
    shared_postcodes = ["AB1 2CD", "AB1 2CD", "AB1 2CD"]
    left_frame = pl.LazyFrame(
        {
            "left_id": ["blank", "number_only", "valid"],
            "left_address": ["   ", "10", "10 High Street"],
            "left_postcode": shared_postcodes,
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": ["", "10", "10 High Street"],
            "right_postcode": shared_postcodes,
        }
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    ordered = joined.sort("left_id").collect()
    outcome = ordered.select("left_id", "right_address").to_dicts()

    expected = [
        {"left_id": "blank", "right_address": None},
        {"left_id": "number_only", "right_address": None},
        {"left_id": "valid", "right_address": "10 High Street"},
    ]
    assert outcome == expected


def test_fuzzy_join_rejects_mid_score_number_less_match():
    """A number-less pair scoring below the number-less threshold is refused.

    "THE COACH HOUSE" vs "THE OLD COACH HOUSE" scores 88 via
    token_sort_ratio. That clears the old MIN_FUZZY_SCORE of 60 (so it used
    to match falsely) but falls short of the number-less threshold of 90, so
    the row must now stay unmatched.
    """
    left_frame = pl.LazyFrame(
        {"left_address": ["The Coach House"], "left_postcode": ["AB1 2CD"]}
    )
    right_frame = pl.LazyFrame(
        {"right_address": ["The Old Coach House"], "right_postcode": ["AB1 2CD"]}
    )
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    joined = fuzzy_join_on_postcode(left=left_frame, right=right_frame, **column_names)
    matched_addresses = joined.collect()["right_address"].to_list()

    assert matched_addresses == [None]


def test_fuzzy_join_matches_numbered_pair_at_baseline_threshold():
    """A numbered pair sitting exactly on the baseline score still matches.

    "10 ACACIA AVENUE" vs "FLAT A 10 ACACIA AVENUE" scores exactly 82 and the
    house number is compatible, so the numbered baseline (>= 82) accepts it.
    """
    flat_address = "Flat A, 10 Acacia Avenue"
    left_frame = pl.LazyFrame(
        {"left_address": ["10 Acacia Avenue"], "left_postcode": ["AB1 2CD"]}
    )
    right_frame = pl.LazyFrame(
        {"right_address": [flat_address], "right_postcode": ["AB1 2CD"]}
    )

    collected = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    ).collect()

    # The right-hand address comes back exactly as it was supplied.
    assert collected["right_address"].to_list() == [flat_address]


def test_fuzzy_join_matches_high_score_number_less_pair():
    """A number-less pair that clears the 90 threshold must still match.

    The two addresses differ only in letter case, i.e. an exact token match
    with score 100.
    """
    left_rows = {"left_address": ["The Old Rectory"], "left_postcode": ["AB1 2CD"]}
    right_rows = {"right_address": ["THE OLD RECTORY"], "right_postcode": ["AB1 2CD"]}
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    joined = fuzzy_join_on_postcode(
        left=pl.LazyFrame(left_rows),
        right=pl.LazyFrame(right_rows),
        **column_names,
    )
    matched_addresses = joined.collect()["right_address"].to_list()

    assert matched_addresses == ["THE OLD RECTORY"]


def test_numbers_compatible_treats_letter_suffix_as_part_of_the_number():
    """8A, 8B and plain 8 are three distinct properties on the same street.

    Digit-only extraction collapsed all three to {8} and let them match; the
    letter suffix has to count as part of the number.
    """
    number_8a = "8A HIGH STREET"
    number_8b = "8B HIGH STREET"
    number_8 = "8 HIGH STREET"

    # Different suffix, or suffix vs no suffix: not the same property.
    assert not _numbers_compatible(number_8a, number_8b)
    assert not _numbers_compatible(number_8a, number_8)
    # Identical suffix: compatible.
    assert _numbers_compatible(number_8a, number_8a)


def test_numbers_compatible_requires_equal_sets_not_subset():
    """Number sets must be equal; one being a subset of the other is not enough.

    Subset logic let the whole-building record "188 ..." absorb its flat
    "FLAT 1 188 ..." ({188} is a subset of {1, 188}).
    """
    flat_first = "FLAT 1 188 GREAT NORTH WAY"
    flat_last = "188 GREAT NORTH WAY FLAT 1"
    whole_building = "188 GREAT NORTH WAY"

    assert not _numbers_compatible(flat_first, whole_building)
    # Same numbers in a different position are still the same set.
    assert _numbers_compatible(flat_first, flat_last)


def test_numbers_compatible_number_less_and_one_sided_pairs():
    """Number-less pairs are compatible; a one-sided number is not.

    When neither side has a number the pair is gated by the stricter
    no-numbers score threshold instead, so compatibility itself holds.
    """
    named_house = "ROSE COTTAGE"
    numbered_house = "8 HIGH STREET"

    # Neither side numbered.
    assert _numbers_compatible(named_house, named_house)
    # Exactly one side numbered.
    assert not _numbers_compatible(named_house, numbered_house)


def test_fuzzy_join_rejects_wrong_letter_suffix_match():
    """End-to-end guard for the 8A/8B class of wrong-property matches.

    The only candidate in the postcode bucket differs solely in the number
    suffix, so the row must stay unmatched rather than borrow the
    neighbour's record.
    """
    postcode = "AB1 2CD"
    left_frame = pl.LazyFrame(
        {"left_address": ["8A High Street"], "left_postcode": [postcode]}
    )
    right_frame = pl.LazyFrame(
        {"right_address": ["8B High Street"], "right_postcode": [postcode]}
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    matched_addresses = joined.collect()["right_address"].to_list()

    assert matched_addresses == [None]


def test_fuzzy_join_emits_match_score_column():
    """The join output carries a ``_match_score`` audit column.

    It holds the token_sort_ratio of the accepted match: 100 for an exact
    (post-normalisation) address match, the raw fuzzy score otherwise, and
    null for unmatched rows. The column is expected to be UInt8.
    """
    left_frame = pl.LazyFrame(
        {
            "left_id": ["exact", "fuzzy", "unmatched"],
            "left_address": ["10 High Street", "10 Acacia Avenue", "99 Other Road"],
            "left_postcode": ["AB1 2CD", "EF3 4GH", "ZZ9 9ZZ"],
        }
    )
    right_frame = pl.LazyFrame(
        {
            # The flat scores exactly 82 against "10 Acacia Avenue" (see
            # test_fuzzy_join_matches_numbered_pair_at_baseline_threshold).
            "right_address": ["10 HIGH STREET", "Flat A, 10 Acacia Avenue"],
            "right_postcode": ["AB1 2CD", "EF3 4GH"],
        }
    )
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    joined = fuzzy_join_on_postcode(left=left_frame, right=right_frame, **column_names)
    collected = joined.sort("left_id").collect()

    # Check the dtype first, then the per-row scores.
    assert collected.schema["_match_score"] == pl.UInt8

    expected_scores = [
        {"left_id": "exact", "_match_score": 100},
        {"left_id": "fuzzy", "_match_score": 82},
        {"left_id": "unmatched", "_match_score": None},
    ]
    assert collected.select("left_id", "_match_score").to_dicts() == expected_scores


def test_normalize_postcode_key_requires_full_postcode():
    """Only complete postcodes yield a key; everything else becomes null.

    The padded and the hyphenated lower-case spellings both come out as
    "SW1A1AA"; the empty string, the lone outward code, the digit string
    and free text all come out as None.
    """
    raw_postcodes = [
        " SW1A 1AA ",
        "sw1a-1aa",
        "",
        "SW1A",
        "12345",
        "not a postcode",
    ]
    frame = pl.DataFrame({"postcode": raw_postcodes})

    key_expr = normalize_postcode_key(pl.col("postcode")).alias("key")
    keys = frame.select(key_expr)["key"].to_list()

    expected_keys = ["SW1A1AA", "SW1A1AA", None, None, None, None]
    assert keys == expected_keys