Fix enrich listing

This commit is contained in:
Andras Schmelczer 2026-06-11 20:15:31 +01:00
parent c2945567d7
commit cf39ad754e
3 changed files with 529 additions and 6 deletions

View file

@@ -954,6 +954,173 @@ def test_match_direct_epc_matches_by_address_in_same_postcode() -> None:
assert matches["_direct_epc_match_method"].to_list() == ["address"]
def test_match_direct_epc_street_fallback_matches_numberless_listing() -> None:
    # Street-level listing addresses (typical for Rightmove: neither a house
    # number nor a UPRN) can never satisfy the strict number gate. They must
    # nevertheless inherit street-representative EPC facts from a certificate
    # on the same street in their own postcode, and that match has to carry
    # the lower-confidence "street" method label.
    listing_frame = _listing_matches(
        [{"_listing_match_address": "EXAMPLE ROAD BROMLEY"}]
    )
    epc_frame = _direct_epc_candidates(
        [{"_direct_epc_match_address": "7 EXAMPLE ROAD"}]
    )

    result = _match_direct_epc(listing_frame, epc_frame)

    assert result.height == 1
    assert result["_direct_epc_match_method"].to_list() == ["street"]
def test_match_direct_epc_street_fallback_prefers_attribute_agreement() -> None:
    # All certificates on the same street tie on street similarity, so the
    # tie must be broken by the listing's own attributes. Here the listing's
    # floor area (78.0) is far closer to number 9 (80.0) than to number 7
    # (150.0), so number 9 is the most plausible certificate.
    listing_row = {
        "_listing_idx": 0,
        "_listing_match_address": "EXAMPLE ROAD BROMLEY",
        "_listing_match_postcode": "AA11AA",
        "_listing_uprn": None,
        "_actual_total_floor_area": 78.0,
    }
    # The shared listing schema extended with the floor-area column.
    listing_schema = {
        **_LISTING_MATCH_SCHEMA,
        "_actual_total_floor_area": pl.Float64,
    }
    listing_frame = pl.DataFrame([listing_row], schema=listing_schema)

    far_off_area = {
        "_direct_epc_match_address": "7 EXAMPLE ROAD",
        "_direct_epc_address": "7, Example Road",
        "_direct_total_floor_area": 150.0,
    }
    close_area = {
        "_direct_epc_row": 1,
        "_direct_epc_match_address": "9 EXAMPLE ROAD",
        "_direct_epc_address": "9, Example Road",
        "_direct_total_floor_area": 80.0,
    }
    epc_frame = _direct_epc_candidates([far_off_area, close_area])

    result = _match_direct_epc(listing_frame, epc_frame)

    assert result.height == 1
    assert result["_direct_epc_address"].to_list() == ["9, Example Road"]
    assert result["_direct_epc_match_method"].to_list() == ["street"]
def test_match_direct_epc_street_fallback_spans_postcodes_within_outcode() -> None:
    # A long street crosses several postcode units. When a street-only
    # listing's own postcode holds no certificate, the fallback must still
    # find a same-street certificate in a sibling postcode of the same
    # outcode.
    listing_row = {
        "_listing_match_address": "EXAMPLE ROAD BROMLEY",
        "_listing_match_postcode": "AA12ZZ",
    }
    sibling_postcode_epc = {
        "_direct_epc_match_address": "7 EXAMPLE ROAD",
        "_direct_epc_match_postcode": "AA11AA",
    }
    listing_frame = _listing_matches([listing_row])
    epc_frame = _direct_epc_candidates([sibling_postcode_epc])

    result = _match_direct_epc(listing_frame, epc_frame)

    assert result.height == 1
    assert result["_direct_epc_match_method"].to_list() == ["street"]
def test_match_direct_epc_street_fallback_prefers_own_postcode_segment() -> None:
    # On a single street, a certificate in the listing's own postcode unit is
    # the nearest segment, so it must beat an otherwise equal candidate that
    # sits further along the street.
    # NOTE(review): the listing postcode comes from the _listing_matches
    # default, presumably "AA11AA" -- confirm against the helper.
    listing_frame = _listing_matches(
        [{"_listing_match_address": "EXAMPLE ROAD BROMLEY"}]
    )

    further_along = {
        "_direct_epc_match_address": "7 EXAMPLE ROAD",
        "_direct_epc_address": "7, Example Road",
        "_direct_epc_match_postcode": "AA12ZZ",
    }
    own_segment = {
        "_direct_epc_row": 1,
        "_direct_epc_match_address": "9 EXAMPLE ROAD",
        "_direct_epc_address": "9, Example Road",
        "_direct_epc_match_postcode": "AA11AA",
    }
    epc_frame = _direct_epc_candidates([further_along, own_segment])

    result = _match_direct_epc(listing_frame, epc_frame)

    assert result.height == 1
    assert result["_direct_epc_address"].to_list() == ["9, Example Road"]
def test_match_direct_epc_street_fallback_recovers_numbered_listing() -> None:
    # The listing has a house number (17) for which no certificate exists.
    # Its number set is disjoint from the candidate's (9), so the strict gate
    # skips every candidate; the fallback must still supply a
    # street-representative certificate.
    listing_frame = _listing_matches(
        [{"_listing_match_address": "17 EXAMPLE ROAD BROMLEY"}]
    )
    epc_frame = _direct_epc_candidates(
        [{"_direct_epc_match_address": "9 EXAMPLE ROAD"}]
    )

    result = _match_direct_epc(listing_frame, epc_frame)

    assert result.height == 1
    assert result["_direct_epc_match_method"].to_list() == ["street"]
def test_match_direct_epc_street_fallback_rejects_town_only_address() -> None:
    # A listing address holding only a town ("COULSDON SURREY") overlaps with
    # nothing but the locality suffix that most street keys in the outcode
    # share. With no street name to anchor on, it must not subset-inflate
    # onto an arbitrary street.
    numbered_streets = [
        ("49", "LACKFORD ROAD"),
        ("12", "CHIPSTEAD VALLEY ROAD"),
        ("3", "WINDERMERE ROAD"),
    ]
    # One candidate per street, each with a distinct row id.
    candidate_rows = [
        {
            "_direct_epc_row": row_id,
            "_direct_epc_match_address": f"{number} {street} SURREY COULSDON",
        }
        for row_id, (number, street) in enumerate(numbered_streets)
    ]

    listing_frame = _listing_matches(
        [{"_listing_match_address": "COULSDON SURREY"}]
    )
    epc_frame = _direct_epc_candidates(candidate_rows)

    result = _match_direct_epc(listing_frame, epc_frame)

    assert result.height == 0
def test_match_direct_epc_street_fallback_rejects_single_token_query() -> None:
    # token_set_ratio returns 100 as soon as the query's tokens are a subset
    # of the candidate's, so a bare single-token name would otherwise
    # street-match anything containing it. It must match nothing.
    listing_frame = _listing_matches([{"_listing_match_address": "KINGSWOOD"}])
    epc_frame = _direct_epc_candidates(
        [{"_direct_epc_match_address": "4 KINGSWOOD ROAD"}]
    )

    result = _match_direct_epc(listing_frame, epc_frame)

    assert result.height == 0
def test_match_direct_epc_street_fallback_rejects_different_street() -> None:
    # The fallback means "same street within the postcode", not "anything in
    # the postcode": a certificate on a different street must stay unmatched.
    listing_frame = _listing_matches(
        [{"_listing_match_address": "OLDSTEAD ROAD BROMLEY"}]
    )
    epc_frame = _direct_epc_candidates(
        [{"_direct_epc_match_address": "5 CAMBRIDGE ROAD"}]
    )

    result = _match_direct_epc(listing_frame, epc_frame)

    assert result.height == 0
def test_normalize_uprn_handles_types_and_floats() -> None:
assert _normalize_uprn(None) is None
assert _normalize_uprn("") is None
@@ -1167,13 +1334,20 @@ def test_match_listing_properties_uprn_wins_dedup_tie() -> None:
assert matches["_property_match_method"].to_list() == ["uprn"]
def test_match_direct_epc_does_not_match_other_postcode_without_uprn() -> None:
# Matching is by postcode/UPRN/street — never by coordinate proximity — so a
# same-street EPC in a different postcode with no shared UPRN is skipped.
def test_match_direct_epc_does_not_match_other_outcode_without_uprn() -> None:
# Matching is by postcode/UPRN/street — never by coordinate proximity — and
# the street fallback is outcode-scoped, so a same-street EPC in a different
# OUTCODE with no shared UPRN is skipped.
matches = _match_direct_epc(
_listing_matches([{"_listing_match_postcode": "AA11AA"}]),
_direct_epc_candidates(
[{"_direct_epc_match_postcode": "BB22BB", "_direct_epc_uprn": None}]
[
{
"_direct_epc_match_postcode": "BB22BB",
"_direct_epc_outcode": "BB2",
"_direct_epc_uprn": None,
}
]
),
)