Fix enrich listing
This commit is contained in:
parent
c2945567d7
commit
cf39ad754e
3 changed files with 529 additions and 6 deletions
|
|
@ -1098,6 +1098,17 @@ def _postcode_outcode_expr(column: str) -> pl.Expr:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
_OUTCODE_RE = re.compile(r"^([A-Z]{1,2}\d[A-Z\d]?)\d[A-Z]{2}$")
|
||||||
|
|
||||||
|
|
||||||
|
def _outcode_of(postcode: str | None) -> str | None:
|
||||||
|
"""Outcode of a compact normalised postcode ("BR15RW" -> "BR1")."""
|
||||||
|
if not postcode:
|
||||||
|
return None
|
||||||
|
match = _OUTCODE_RE.match(postcode)
|
||||||
|
return match.group(1) if match else None
|
||||||
|
|
||||||
|
|
||||||
def _canonical_epc_property_type_expr() -> pl.Expr:
|
def _canonical_epc_property_type_expr() -> pl.Expr:
|
||||||
bad_built_form = pl.col("built_form").is_null() | pl.col("built_form").is_in(
|
bad_built_form = pl.col("built_form").is_null() | pl.col("built_form").is_in(
|
||||||
["NO DATA!", "Not Recorded"]
|
["NO DATA!", "Not Recorded"]
|
||||||
|
|
@ -1159,6 +1170,66 @@ def _has_number(address: str | None) -> bool:
|
||||||
return bool(address and _NUMBER_RE.search(address))
|
return bool(address and _NUMBER_RE.search(address))
|
||||||
|
|
||||||
|
|
||||||
|
def _enum_bonus(
|
||||||
|
left: str | None, right: str | None, *, exact: float, mismatch: float
|
||||||
|
) -> float:
|
||||||
|
if not left or not right:
|
||||||
|
return 0.0
|
||||||
|
return exact if left == right else mismatch
|
||||||
|
|
||||||
|
|
||||||
|
def _ratio_bonus(
|
||||||
|
left: float | int | None, right: float | int | None, pct: float, cap: float
|
||||||
|
) -> float:
|
||||||
|
if left is None or right is None:
|
||||||
|
return 0.0
|
||||||
|
try:
|
||||||
|
left_f = float(left)
|
||||||
|
right_f = float(right)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return 0.0
|
||||||
|
if left_f <= 0 or right_f <= 0:
|
||||||
|
return 0.0
|
||||||
|
rel = abs(left_f - right_f) / max(left_f, right_f)
|
||||||
|
if rel > pct:
|
||||||
|
return 0.0
|
||||||
|
return cap * (1.0 - rel / pct)
|
||||||
|
|
||||||
|
|
||||||
|
def _rooms_bonus(left: int | None, right: int | None) -> float:
|
||||||
|
if left is None or right is None:
|
||||||
|
return 0.0
|
||||||
|
try:
|
||||||
|
diff = abs(int(left) - int(right))
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return 0.0
|
||||||
|
if diff == 0:
|
||||||
|
return 4.0
|
||||||
|
if diff == 1:
|
||||||
|
return 2.0
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def _street_only_address(address: str) -> str:
    """Return the street/locality part of a normalised address.

    The address is split on whitespace and every token in which
    ``_NUMBER_RE`` finds a match (house numbers, flat numbers, including
    letter-suffixed ones like 8A) is dropped; the remaining tokens are joined
    back with single spaces.
    """
    kept = []
    for word in address.split():
        if _NUMBER_RE.search(word):
            continue
        kept.append(word)
    return " ".join(kept)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_specific_street_query(query: str) -> bool:
    """Decide whether a number-less listing address may use the street fallback.

    token_set_ratio scores 100 whenever the query's tokens are a subset of the
    candidate's, so a one-token query (a bare named house like "KINGSWOOD")
    would match any street containing that word. The query is therefore
    accepted only when it has at least two substantive tokens
    ("OLDSTEAD ROAD ..."), where substantive means at least three characters
    long and not in ``_LISTED_NAME_STOP_WORDS``.
    """
    substantive_count = 0
    for word in query.split():
        if len(word) < 3 or word in _LISTED_NAME_STOP_WORDS:
            continue
        substantive_count += 1
    return substantive_count >= 2
|
||||||
|
|
||||||
|
|
||||||
def _normalize_uprn(value: object) -> str | None:
|
def _normalize_uprn(value: object) -> str | None:
|
||||||
"""Canonical UPRN string (digits only) or None.
|
"""Canonical UPRN string (digits only) or None.
|
||||||
|
|
||||||
|
|
@ -1197,6 +1268,8 @@ def _best_listing_match(
|
||||||
`fuzzy_join._score_bucket`. A house number additionally lowers the score
|
`fuzzy_join._score_bucket`. A house number additionally lowers the score
|
||||||
threshold and (via `_address_score`) permits token_set scoring; a number-less
|
threshold and (via `_address_score`) permits token_set scoring; a number-less
|
||||||
address scores on token_sort only and must match the street almost exactly.
|
address scores on token_sort only and must match the street almost exactly.
|
||||||
|
The direct-EPC path layers a street-level fallback on top of this strict
|
||||||
|
matcher — see `_best_street_epc_fallback`.
|
||||||
|
|
||||||
``addressed_fields`` names the candidate columns to fuzzy-match against (a
|
``addressed_fields`` names the candidate columns to fuzzy-match against (a
|
||||||
candidate may carry both a register and an EPC address). Returns
|
candidate may carry both a register and an EPC address). Returns
|
||||||
|
|
@ -1243,6 +1316,120 @@ def _best_listing_match(
|
||||||
return best, float(best_score), "address", best_field
|
return best, float(best_score), "address", best_field
|
||||||
|
|
||||||
|
|
||||||
|
# Ranking bonuses for the street-level direct-EPC fallback. A certificate in
# the listing's own postcode unit is the nearest segment of the street, and a
# certificate sharing a house-number token with the listing (e.g. listing
# "751 753 Cranbrook Road" vs certificate "751 Cranbrook Road", which fails the
# strict set-equality gate) is almost certainly the right property — both
# should beat a bare attribute-agreement win.
# Both values are added to a candidate's ranking total, on the same scale as
# the street similarity score and the attribute bonuses.
_STREET_FALLBACK_SAME_POSTCODE_BONUS = 3.0
_STREET_FALLBACK_NUMBER_OVERLAP_BONUS = 8.0
|
||||||
|
|
||||||
|
|
||||||
|
def _best_street_epc_fallback(
    listing: dict,
    outcode_streets: dict[str, list[dict]] | None,
    outcode_noise_tokens: set[str],
    street_score_cache: dict[tuple[str, str], list[tuple[int, str]]],
) -> tuple[dict, float, str, None] | None:
    """Street-level direct-EPC fallback for listings the strict matcher missed.

    ~90% of scraped listings publish a street-level address only ("Oldstead
    Road, Bromley" — Rightmove never exposes the house number or UPRN), so the
    strict matcher in `_best_listing_match` cannot match them against the
    virtually-always-numbered EPC register, and their EPC-derived fields
    (energy rating, interior height, former-council-house flag, construction
    year) would all be null. Such a listing is matched instead to the best EPC
    certificate on the SAME STREET in its own OUTCODE: long streets span
    several postcode units, so postcode-only buckets missed ~43% of otherwise
    matchable listings (funnel-measured on 2026-06 data).

    Street identity is ``fuzz.token_set_ratio`` between the digit-stripped
    forms of both addresses. A street qualifies when it shares with the query
    at least one token outside ``outcode_noise_tokens`` and scores at least
    ``_LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS``. Every certificate on a
    qualifying street is then ranked by: street score, property-type
    agreement, floor-area closeness, habitable-room closeness, a
    same-postcode-unit bonus and a house-number-overlap bonus (so a numbered
    listing that failed the strict set-equality gate, e.g. "751 753" vs
    "751", still lands on the right property). The first certificate with
    the strictly highest total wins.

    The result is street-representative rather than property-exact, hence the
    distinct "street" method label so downstream consumers can tell the two
    confidence levels apart. Applied to the direct-EPC join only; the
    property-register (sale history) join stays strict because a price is
    property-exact in a way an energy band is not.

    Args:
        listing: Listing row; read via its ``_listing_*`` and ``_actual_*``
            keys.
        outcode_streets: Street key -> certificate rows for the listing's
            outcode, or ``None`` when the outcode has no index.
        outcode_noise_tokens: Tokens of this outcode that must not be the
            only thing a street shares with the query.
        street_score_cache: Memo of the per-(outcode, query-street) fuzzy
            scan over the outcode's street keys; mutated in place. Listings
            on the same street share the scan, which keeps the full-register
            run to seconds.

    Returns:
        ``(certificate, street_score, "street", None)`` for the winning
        certificate, or ``None`` when the listing has no usable address, the
        query is not specific enough, or no street qualifies.
    """
    query = listing.get("_listing_match_address")
    if not query or not outcode_streets:
        return None
    query_street = _street_only_address(query)
    if not (query_street and _is_specific_street_query(query_street)):
        return None

    outcode = listing.get("_listing_outcode")
    if not outcode:
        outcode = _outcode_of(listing.get("_listing_match_postcode")) or ""
    cache_key = (outcode, query_street)
    qualifying = street_score_cache.get(cache_key)
    if qualifying is None:
        # A qualifying street must be anchored by a shared token that is NOT a
        # locality suffix of this outcode (see _index_epc_streets), so a
        # town-only address can't subset-inflate onto an arbitrary street.
        query_tokens = set(query_street.split())
        qualifying = []
        for street in outcode_streets:
            anchors = (query_tokens & set(street.split())) - outcode_noise_tokens
            if not anchors:
                continue
            score = fuzz.token_set_ratio(query_street, street)
            if score >= _LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS:
                qualifying.append((score, street))
        street_score_cache[cache_key] = qualifying

    # Listing-side inputs are the same for every certificate; read them once.
    listing_postcode = listing.get("_listing_match_postcode")
    listing_numbers = set(_NUMBER_RE.findall(query))
    listing_type = listing.get("_actual_property_type")
    listing_area = listing.get("_actual_total_floor_area")
    listing_rooms = listing.get("_actual_number_habitable_rooms")

    winner: dict | None = None
    winner_total = float("-inf")
    winner_street_score = 0
    for street_score, street in qualifying:
        for candidate in outcode_streets[street]:
            # Bonuses are accumulated in a fixed order so float rounding is
            # reproducible from run to run.
            total = float(street_score)
            total += _enum_bonus(
                listing_type,
                candidate.get("_direct_epc_canonical_property_type"),
                exact=6.0,
                mismatch=-6.0,
            )
            total += _ratio_bonus(
                listing_area,
                candidate.get("_direct_total_floor_area"),
                pct=0.12,
                cap=8.0,
            )
            total += _rooms_bonus(
                listing_rooms,
                candidate.get("_direct_number_habitable_rooms"),
            )
            if listing_postcode:
                if candidate.get("_direct_epc_match_postcode") == listing_postcode:
                    total += _STREET_FALLBACK_SAME_POSTCODE_BONUS
            if listing_numbers:
                candidate_numbers = set(
                    _NUMBER_RE.findall(
                        candidate.get("_direct_epc_match_address") or ""
                    )
                )
                if listing_numbers & candidate_numbers:
                    total += _STREET_FALLBACK_NUMBER_OVERLAP_BONUS
            # Strict ">" keeps the earliest certificate on a tie.
            if total > winner_total:
                winner_total = total
                winner = candidate
                winner_street_score = street_score

    if winner is None:
        return None
    return winner, float(winner_street_score), "street", None
|
||||||
|
|
||||||
|
|
||||||
def _load_listings_for_merge(listings_path: Path, arcgis_path: Path) -> pl.DataFrame:
|
def _load_listings_for_merge(listings_path: Path, arcgis_path: Path) -> pl.DataFrame:
|
||||||
"""Read the listings parquet and prepare it for the wide-frame merge.
|
"""Read the listings parquet and prepare it for the wide-frame merge.
|
||||||
|
|
||||||
|
|
@ -1616,8 +1803,52 @@ def _match_listing_properties(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _index_epc_streets(
    epc_candidates: pl.DataFrame,
) -> tuple[dict[str, dict[str, list[dict]]], dict[str, set[str]]]:
    """Index EPC candidate rows for the street-level fallback.

    Returns ``(streets, noise_tokens)``:

    * ``streets`` maps outcode -> street key -> rows, where the street key is
      the digit-stripped match address. Rows with no outcode, no match
      address, or an address that is empty once digits are stripped are left
      out.
    * ``noise_tokens`` maps outcode -> the tokens that appear in at least
      ``max(2, n // 4)`` of that outcode's ``n`` street keys (roughly a
      quarter). Those are locality suffixes (LONDON, SURREY, the town name)
      rather than street names, and a fallback match must be anchored by at
      least one token that is NOT one of them — otherwise a town-only listing
      address ("COULSDON SURREY") token_set-inflates to 100 against any
      street key carrying the same locality suffix and matches an arbitrary
      street in the outcode.
    """
    streets: dict[str, dict[str, list[dict]]] = {}
    for row in epc_candidates.iter_rows(named=True):
        outcode = row.get("_direct_epc_outcode")
        address = row.get("_direct_epc_match_address")
        if outcode and address:
            street = _street_only_address(address)
            if street:
                by_street = streets.setdefault(outcode, {})
                by_street.setdefault(street, []).append(row)

    noise_tokens: dict[str, set[str]] = {}
    for outcode, by_street in streets.items():
        cutoff = max(2, len(by_street) // 4)
        # Number of distinct street keys each token occurs in; a token that
        # repeats within one key still counts once for that key.
        tally: dict[str, int] = {}
        for street in by_street:
            for token in set(street.split()):
                tally[token] = tally.get(token, 0) + 1
        noise_tokens[outcode] = {
            token for token in tally if tally[token] >= cutoff
        }
    return streets, noise_tokens
|
||||||
|
|
||||||
|
|
||||||
def _best_direct_epc_candidate(
|
def _best_direct_epc_candidate(
|
||||||
listing: dict, uprn_index: dict[str, dict], candidates: list[dict]
|
listing: dict,
|
||||||
|
uprn_index: dict[str, dict],
|
||||||
|
candidates: list[dict],
|
||||||
|
outcode_streets: dict[str, list[dict]] | None,
|
||||||
|
outcode_noise_tokens: set[str],
|
||||||
|
street_score_cache: dict[tuple[str, str], list[tuple[int, str]]],
|
||||||
) -> dict | None:
|
) -> dict | None:
|
||||||
result = _best_listing_match(
|
result = _best_listing_match(
|
||||||
listing.get("_listing_uprn"),
|
listing.get("_listing_uprn"),
|
||||||
|
|
@ -1626,6 +1857,10 @@ def _best_direct_epc_candidate(
|
||||||
candidates,
|
candidates,
|
||||||
["_direct_epc_match_address"],
|
["_direct_epc_match_address"],
|
||||||
)
|
)
|
||||||
|
if result is None:
|
||||||
|
result = _best_street_epc_fallback(
|
||||||
|
listing, outcode_streets, outcode_noise_tokens, street_score_cache
|
||||||
|
)
|
||||||
if result is None:
|
if result is None:
|
||||||
return None
|
return None
|
||||||
candidate, score, method, _field = result
|
candidate, score, method, _field = result
|
||||||
|
|
@ -1661,11 +1896,21 @@ def _match_direct_epc(
|
||||||
buckets, uprn_index = _index_candidates(
|
buckets, uprn_index = _index_candidates(
|
||||||
epc_candidates, "_direct_epc_match_postcode", "_direct_epc_uprn"
|
epc_candidates, "_direct_epc_match_postcode", "_direct_epc_uprn"
|
||||||
)
|
)
|
||||||
|
street_index, noise_tokens = _index_epc_streets(epc_candidates)
|
||||||
|
street_score_cache: dict[tuple[str, str], list[tuple[int, str]]] = {}
|
||||||
matches = []
|
matches = []
|
||||||
for listing in listing_matches.iter_rows(named=True):
|
for listing in listing_matches.iter_rows(named=True):
|
||||||
postcode = listing.get("_listing_match_postcode")
|
postcode = listing.get("_listing_match_postcode")
|
||||||
bucket = buckets.get(postcode, []) if postcode else []
|
bucket = buckets.get(postcode, []) if postcode else []
|
||||||
match = _best_direct_epc_candidate(listing, uprn_index, bucket)
|
outcode = listing.get("_listing_outcode") or _outcode_of(postcode)
|
||||||
|
match = _best_direct_epc_candidate(
|
||||||
|
listing,
|
||||||
|
uprn_index,
|
||||||
|
bucket,
|
||||||
|
street_index.get(outcode) if outcode else None,
|
||||||
|
noise_tokens.get(outcode, set()) if outcode else set(),
|
||||||
|
street_score_cache,
|
||||||
|
)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
matches.append(match)
|
matches.append(match)
|
||||||
|
|
||||||
|
|
|
||||||
104
pipeline/transform/price_estimation/test_estimate.py
Normal file
104
pipeline/transform/price_estimation/test_estimate.py
Normal file
|
|
@ -0,0 +1,104 @@
|
||||||
|
"""Tests for the floor-area-less estimate guard in estimate.py.
|
||||||
|
|
||||||
|
The per-sqm plausibility guard cannot fire when floor area is null/zero, which
|
||||||
|
let commercial blocks misfiled as dwellings keep absurd headline estimates
|
||||||
|
(e.g. a GBP 175M "Detached" in SW1W). apply_floorless_estimate_guard nulls a
|
||||||
|
floorless estimate only when it exceeds max(FLOORLESS_ESTIMATE_P99_MULT x the
|
||||||
|
district's recent p99 sale price, FLOORLESS_ESTIMATE_MIN_CAP), and leaves
|
||||||
|
rows it cannot judge (no recent district sales) alone.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from datetime import date
|
||||||
|
|
||||||
|
import polars as pl
|
||||||
|
|
||||||
|
from pipeline.transform.price_estimation.estimate import (
|
||||||
|
FLOORLESS_P99_LOOKBACK_YEARS,
|
||||||
|
apply_floorless_estimate_guard,
|
||||||
|
)
|
||||||
|
from pipeline.transform.price_estimation.utils import CURRENT_YEAR
|
||||||
|
|
||||||
|
RECENT = date(CURRENT_YEAR - 1, 6, 1) # inside the p99 look-back window
|
||||||
|
STALE = date(CURRENT_YEAR - FLOORLESS_P99_LOOKBACK_YEARS - 5, 6, 1) # outside
|
||||||
|
|
||||||
|
|
||||||
|
def _guard_input(rows):
    """Build a frame with the columns the guard reads.

    Each row is a tuple in (id, sector, estimate, floor_area, last_price,
    last_date) order. Pool rows (null estimate) only feed the per-district
    p99 reference.
    """
    guard_schema = {
        "id": pl.Int64,
        "_sector": pl.String,
        "Estimated current price": pl.Float64,
        "Total floor area (sqm)": pl.Float64,
        "Last known price": pl.Float64,
        "Date of last transaction": pl.Date,
    }
    return pl.DataFrame(rows, schema=guard_schema, orient="row")
|
||||||
|
|
||||||
|
|
||||||
|
def _estimate_for(result: pl.DataFrame, row_id: int):
    """Return the "Estimated current price" of the first row with this id."""
    matching_rows = result.filter(pl.col("id") == row_id)
    estimates = matching_rows["Estimated current price"]
    return estimates[0]
|
||||||
|
|
||||||
|
|
||||||
|
def test_floorless_guard_nulls_and_keeps_the_right_rows():
    """Floorless estimates above the district cap are nulled; the rest stay."""
    # SW1W pool: 5 recent sales at 3M -> district p99 = 3M, cap = 6M.
    sw1w_pool = [
        (100 + i, "SW1W 9", None, None, 3_000_000.0, RECENT) for i in range(5)
    ]
    # ZZ1 pool: cheap district, p99 = 500k -> cap = max(1M, 2M) = 2M.
    zz1_pool = [
        (200 + i, "ZZ1 4", None, None, 500_000.0, RECENT) for i in range(5)
    ]
    rows = [
        *sw1w_pool,
        # 175M floorless estimate, 29x the 6M cap -> nulled.
        (1, "SW1W 9", 175_000_000.0, None, None, None),
        # Zero floor area counts as floorless (psm guard can't fire) -> nulled.
        (2, "SW1W 8", 175_000_000.0, 0.0, None, None),
        # 5M floorless is under the 2 x p99 cap -> kept.
        (3, "SW1W 9", 5_000_000.0, None, None, None),
        # Floor area PRESENT: never touched by this guard, however absurd
        # (the per-sqm guard owns that case).
        (4, "SW1W 9", 175_000_000.0, 93.0, None, None),
        *zz1_pool,
        # Genuine mansion in a cheap district: above 2 x p99 but below the
        # absolute 2M floor -> kept.
        (5, "ZZ1 4", 1_500_000.0, None, None, None),
        # Above both the absolute floor and 2 x p99 -> nulled.
        (6, "ZZ1 4", 2_500_000.0, None, None, None),
        # XX9's only sale is outside the look-back window -> null p99 ->
        # cannot judge -> kept, even at 50M.
        (300, "XX9 1", None, None, 4_000_000.0, STALE),
        (7, "XX9 1", 50_000_000.0, None, None, None),
        # No sector at all -> no district reference -> kept.
        (8, None, 50_000_000.0, None, None, None),
    ]

    result = apply_floorless_estimate_guard(_guard_input(rows))

    nulled_ids = (1, 2, 6)
    kept_estimates = {
        3: 5_000_000.0,
        4: 175_000_000.0,
        5: 1_500_000.0,
        7: 50_000_000.0,
        8: 50_000_000.0,
    }
    for row_id in nulled_ids:
        assert _estimate_for(result, row_id) is None
    for row_id, expected in kept_estimates.items():
        assert _estimate_for(result, row_id) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_floorless_guard_preserves_schema_and_rows():
    """The guard adds no columns, drops no rows, and leaves non-estimate
    columns untouched (it runs in-pipeline before temp-column dropping)."""
    pool_row = (1, "SW1W 9", None, None, 3_000_000.0, RECENT)
    floorless_row = (2, "SW1W 9", 175_000_000.0, None, None, None)
    df = _guard_input([pool_row, floorless_row])

    result = apply_floorless_estimate_guard(df)

    assert result.columns == df.columns
    assert len(result) == len(df)
    assert result["id"].to_list() == df["id"].to_list()
    # Only the estimate column may differ between input and output.
    untouched_before = df.drop("Estimated current price")
    untouched_after = result.drop("Estimated current price")
    assert untouched_after.equals(untouched_before)
|
||||||
|
|
@ -954,6 +954,173 @@ def test_match_direct_epc_matches_by_address_in_same_postcode() -> None:
|
||||||
assert matches["_direct_epc_match_method"].to_list() == ["address"]
|
assert matches["_direct_epc_match_method"].to_list() == ["address"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_direct_epc_street_fallback_matches_numberless_listing() -> None:
    # A street-level listing address (the Rightmove norm: no house number, no
    # UPRN) cannot pass the strict number gate, but must still pick up
    # street-representative EPC facts from a same-street certificate in its own
    # postcode, labelled with the lower-confidence "street" method.
    listings = _listing_matches([{"_listing_match_address": "EXAMPLE ROAD BROMLEY"}])
    certificates = _direct_epc_candidates(
        [{"_direct_epc_match_address": "7 EXAMPLE ROAD"}]
    )

    matches = _match_direct_epc(listings, certificates)

    assert matches.height == 1
    assert matches["_direct_epc_match_method"].to_list() == ["street"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_direct_epc_street_fallback_prefers_attribute_agreement() -> None:
    # Every same-street certificate ties on street similarity, so the listing's
    # attributes (floor area here) must pick the most plausible one.
    listing_row = {
        "_listing_idx": 0,
        "_listing_match_address": "EXAMPLE ROAD BROMLEY",
        "_listing_match_postcode": "AA11AA",
        "_listing_uprn": None,
        "_actual_total_floor_area": 78.0,
    }
    listing_schema = {**_LISTING_MATCH_SCHEMA, "_actual_total_floor_area": pl.Float64}
    listings = pl.DataFrame([listing_row], schema=listing_schema)

    # 150 sqm is far from the listing's 78 sqm; 80 sqm is close.
    far_area_certificate = {
        "_direct_epc_match_address": "7 EXAMPLE ROAD",
        "_direct_epc_address": "7, Example Road",
        "_direct_total_floor_area": 150.0,
    }
    near_area_certificate = {
        "_direct_epc_row": 1,
        "_direct_epc_match_address": "9 EXAMPLE ROAD",
        "_direct_epc_address": "9, Example Road",
        "_direct_total_floor_area": 80.0,
    }
    certificates = _direct_epc_candidates(
        [far_area_certificate, near_area_certificate]
    )

    matches = _match_direct_epc(listings, certificates)

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["9, Example Road"]
    assert matches["_direct_epc_match_method"].to_list() == ["street"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_direct_epc_street_fallback_spans_postcodes_within_outcode() -> None:
    # Long streets cross postcode units. A street-only listing whose own
    # postcode has no certificate must still pick up a same-street certificate
    # from a sibling postcode in the same outcode.
    listing_row = {
        "_listing_match_address": "EXAMPLE ROAD BROMLEY",
        "_listing_match_postcode": "AA12ZZ",
    }
    sibling_postcode_certificate = {
        "_direct_epc_match_address": "7 EXAMPLE ROAD",
        "_direct_epc_match_postcode": "AA11AA",
    }
    listings = _listing_matches([listing_row])
    certificates = _direct_epc_candidates([sibling_postcode_certificate])

    matches = _match_direct_epc(listings, certificates)

    assert matches.height == 1
    assert matches["_direct_epc_match_method"].to_list() == ["street"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_direct_epc_street_fallback_prefers_own_postcode_segment() -> None:
    # Within one street, the certificate in the listing's own postcode unit is
    # the nearest segment and must win over an equal candidate further along.
    other_unit_certificate = {
        "_direct_epc_match_address": "7 EXAMPLE ROAD",
        "_direct_epc_address": "7, Example Road",
        "_direct_epc_match_postcode": "AA12ZZ",
    }
    own_unit_certificate = {
        "_direct_epc_row": 1,
        "_direct_epc_match_address": "9 EXAMPLE ROAD",
        "_direct_epc_address": "9, Example Road",
        "_direct_epc_match_postcode": "AA11AA",
    }
    listings = _listing_matches([{"_listing_match_address": "EXAMPLE ROAD BROMLEY"}])
    certificates = _direct_epc_candidates(
        [other_unit_certificate, own_unit_certificate]
    )

    matches = _match_direct_epc(listings, certificates)

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["9, Example Road"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_direct_epc_street_fallback_recovers_numbered_listing() -> None:
    # A numbered listing whose house number has no certificate (number sets
    # disjoint, so the strict gate skips every candidate) still picks up a
    # street-representative certificate via the fallback.
    listings = _listing_matches(
        [{"_listing_match_address": "17 EXAMPLE ROAD BROMLEY"}]
    )
    certificates = _direct_epc_candidates(
        [{"_direct_epc_match_address": "9 EXAMPLE ROAD"}]
    )

    matches = _match_direct_epc(listings, certificates)

    assert matches.height == 1
    assert matches["_direct_epc_match_method"].to_list() == ["street"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_direct_epc_street_fallback_rejects_town_only_address() -> None:
    # A town-only listing address ("COULSDON SURREY") shares only the locality
    # suffix that most street keys in the outcode carry; without a street-name
    # anchor it must not subset-inflate onto an arbitrary street.
    numbered_streets = [
        ("49", "LACKFORD ROAD"),
        ("12", "CHIPSTEAD VALLEY ROAD"),
        ("3", "WINDERMERE ROAD"),
    ]
    certificate_rows = []
    for i, (number, street) in enumerate(numbered_streets):
        certificate_rows.append(
            {
                "_direct_epc_row": i,
                "_direct_epc_match_address": f"{number} {street} SURREY COULSDON",
            }
        )
    listings = _listing_matches([{"_listing_match_address": "COULSDON SURREY"}])
    certificates = _direct_epc_candidates(certificate_rows)

    matches = _match_direct_epc(listings, certificates)

    assert matches.height == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_direct_epc_street_fallback_rejects_single_token_query() -> None:
    # token_set_ratio scores 100 whenever the query's tokens subset the
    # candidate's, so a bare one-token name must not street-match anything.
    listings = _listing_matches([{"_listing_match_address": "KINGSWOOD"}])
    certificates = _direct_epc_candidates(
        [{"_direct_epc_match_address": "4 KINGSWOOD ROAD"}]
    )

    matches = _match_direct_epc(listings, certificates)

    assert matches.height == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_direct_epc_street_fallback_rejects_different_street() -> None:
    # The fallback is street-identity within the postcode, not "anything in the
    # postcode": a certificate on another street must not match.
    listings = _listing_matches(
        [{"_listing_match_address": "OLDSTEAD ROAD BROMLEY"}]
    )
    certificates = _direct_epc_candidates(
        [{"_direct_epc_match_address": "5 CAMBRIDGE ROAD"}]
    )

    matches = _match_direct_epc(listings, certificates)

    assert matches.height == 0
|
||||||
|
|
||||||
|
|
||||||
def test_normalize_uprn_handles_types_and_floats() -> None:
|
def test_normalize_uprn_handles_types_and_floats() -> None:
|
||||||
assert _normalize_uprn(None) is None
|
assert _normalize_uprn(None) is None
|
||||||
assert _normalize_uprn("") is None
|
assert _normalize_uprn("") is None
|
||||||
|
|
@ -1167,13 +1334,20 @@ def test_match_listing_properties_uprn_wins_dedup_tie() -> None:
|
||||||
assert matches["_property_match_method"].to_list() == ["uprn"]
|
assert matches["_property_match_method"].to_list() == ["uprn"]
|
||||||
|
|
||||||
|
|
||||||
def test_match_direct_epc_does_not_match_other_postcode_without_uprn() -> None:
|
def test_match_direct_epc_does_not_match_other_outcode_without_uprn() -> None:
|
||||||
# Matching is by postcode/UPRN/street — never by coordinate proximity — so a
|
# Matching is by postcode/UPRN/street — never by coordinate proximity — and
|
||||||
# same-street EPC in a different postcode with no shared UPRN is skipped.
|
# the street fallback is outcode-scoped, so a same-street EPC in a different
|
||||||
|
# OUTCODE with no shared UPRN is skipped.
|
||||||
matches = _match_direct_epc(
|
matches = _match_direct_epc(
|
||||||
_listing_matches([{"_listing_match_postcode": "AA11AA"}]),
|
_listing_matches([{"_listing_match_postcode": "AA11AA"}]),
|
||||||
_direct_epc_candidates(
|
_direct_epc_candidates(
|
||||||
[{"_direct_epc_match_postcode": "BB22BB", "_direct_epc_uprn": None}]
|
[
|
||||||
|
{
|
||||||
|
"_direct_epc_match_postcode": "BB22BB",
|
||||||
|
"_direct_epc_outcode": "BB2",
|
||||||
|
"_direct_epc_uprn": None,
|
||||||
|
}
|
||||||
|
]
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue