This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@ -13,7 +13,12 @@ from pipeline.local_temp import local_tmp_dir
# Matches each run of consecutive digits in an address string.
_NUMBER_RE = re.compile(r"\d+")
# Pattern *string* (not a compiled regex): 1-2 capital letters, a digit, an
# optional letter/digit, a digit, then two capital letters, with no whitespace.
# NOTE(review): looks like a UK postcode with the space removed - confirm.
_POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"

# A house number is a strong disambiguator, so a numbered, number-compatible
# pair may match on a lower address-similarity score than a number-less one
# (named houses / flats by building name), which must match almost exactly to
# be trusted. Mirrors merge.py's listings convention.
MIN_FUZZY_SCORE = 82
MIN_FUZZY_SCORE_WITHOUT_NUMBERS = 90
def normalize_address_key(s: pl.Expr) -> pl.Expr:
@ -47,6 +52,7 @@ def fuzzy_join_on_postcode(
left_postcode_col: str,
right_postcode_col: str,
min_score: int = MIN_FUZZY_SCORE,
min_score_without_numbers: int = MIN_FUZZY_SCORE_WITHOUT_NUMBERS,
) -> pl.LazyFrame:
"""Fuzzy join two LazyFrames by matching addresses within postcode buckets.
@ -120,7 +126,12 @@ def fuzzy_join_on_postcode(
# Build tasks for each postcode bucket
tasks = [
(left_entries, right_by_postcode[postcode], min_score)
(
left_entries,
right_by_postcode[postcode],
min_score,
min_score_without_numbers,
)
for postcode, left_entries in left_by_postcode.items()
if postcode in right_by_postcode
]
@ -201,16 +212,23 @@ def _numbers_compatible(a: str, b: str) -> bool:
def _score_bucket(
    args: tuple[list[tuple[int, str]], list[tuple[int, str]], int, int],
) -> list[tuple[int, int, int]]:
    """Score all address pairs within a single postcode bucket.

    Args:
        args: One packed tuple ``(left_entries, right_entries, min_score,
            min_score_without_numbers)``. Each entries list holds
            ``(row, address)`` tuples. ``min_score`` is the minimum
            ``fuzz.token_sort_ratio`` score for a pair in which at least one
            address contains a digit; ``min_score_without_numbers`` is the
            minimum for a pair in which neither address does.

    Returns:
        A list of ``(score, left_row, right_row)`` tuples, one for every
        pair that passes ``_numbers_compatible`` and reaches its threshold.
    """
    left_entries, right_entries, min_score, min_score_without_numbers = args

    # Whether a right-hand address contains a digit does not depend on the
    # left entry, so work it out once per right entry instead of once per pair.
    right_has_number = [
        _NUMBER_RE.search(address) is not None for _, address in right_entries
    ]

    pairs = []
    for left_row, left_address in left_entries:
        left_has_number = _NUMBER_RE.search(left_address) is not None
        for (right_row, right_address), right_numbered in zip(
            right_entries, right_has_number
        ):
            if not _numbers_compatible(left_address, right_address):
                continue
            score = fuzz.token_sort_ratio(left_address, right_address)
            # Number-less pairs (named houses, building-name flats) lack the
            # house-number disambiguator, so require a near-exact match.
            # NOTE(review): a pair where only ONE side has a number also gets
            # the lower threshold, if _numbers_compatible admits such pairs -
            # confirm that is intended.
            threshold = (
                min_score
                if left_has_number or right_numbered
                else min_score_without_numbers
            )
            if score >= threshold:
                pairs.append((score, left_row, right_row))
    return pairs