perfect-postcode/pipeline/utils/fuzzy_join.py
Andras Schmelczer f59d01227b
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 15s
CI / Check (push) Failing after 1m58s
SPlit up
2026-06-12 21:51:37 +01:00

379 lines
16 KiB
Python

import re
import shutil
import tempfile
from collections import Counter
from collections.abc import Sequence
from concurrent.futures import ProcessPoolExecutor
from os import cpu_count
from pathlib import Path
import polars as pl
from thefuzz import fuzz
from tqdm import tqdm
from pipeline.local_temp import local_tmp_dir
from pipeline.utils.normalize import uppercase_alnum_key_expr
# A house-number token includes any letter suffix: 8A, 8B and plain 8 are
# three different properties on the same street, so digit-only extraction
# (which collapsed all three to "8") is not enough. Addresses are passed
# through normalize_address_key first, so tokens are uppercase and
# space-separated and [A-Z] suffices for the suffix.
_NUMBER_RE = re.compile(r"\d+[A-Z]?")
# A single-letter flat designator ("FLAT B", "APARTMENT C") is a house-number-
# grade disambiguator with no digit in it: without this, FLAT B and FLAT D in
# the same building scored ~96 and cross-matched.
_FLAT_LETTER_RE = re.compile(r"\b(?:FLAT|APARTMENT|APT|UNIT) ([A-Z])\b")
_POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
# A house number is a strong disambiguator, so a numbered, number-compatible
# pair may match on a lower address-similarity score than a number-less one
# (named houses / flats by building name), which must match almost exactly to
# be trusted. Mirrors merge.py's listings convention.
MIN_FUZZY_SCORE = 82
MIN_FUZZY_SCORE_WITHOUT_NUMBERS = 90
# A score reached only through an address VARIANT (locality appended /
# secondary address lines dropped) accepts a match the primary strings alone
# would reject, so it must clear a near-exact bar: in the miss audit >99% of
# genuine variant recoveries scored 100, while the rare false variant matches
# scored in the 80s.
MIN_VARIANT_SCORE = 90
# Tokens that mark a sub-unit of a building. A variant whose added/dropped
# tokens include one of these could score a single flat's certificate as if it
# were the whole building, so such variants are inadmissible.
_FLAT_TOKENS = {
"FLAT",
"FLATS",
"APARTMENT",
"APT",
"UNIT",
"MAISONETTE",
"STUDIO",
"ROOM",
}
def normalize_address_key(s: pl.Expr) -> pl.Expr:
normalized = uppercase_alnum_key_expr(s)
return pl.when(normalized.str.contains(r"[A-Z]")).then(normalized).otherwise(None)
def normalize_postcode_key(s: pl.Expr) -> pl.Expr:
normalized = (
s.cast(pl.String)
.str.to_uppercase()
.str.replace_all(r"[^A-Z0-9]+", "")
.str.strip_chars()
)
return (
pl.when(normalized.str.contains(_POSTCODE_RE)).then(normalized).otherwise(None)
)
def fuzzy_join_on_postcode(
left: pl.LazyFrame,
right: pl.LazyFrame,
left_address_col: str,
right_address_col: str,
left_postcode_col: str,
right_postcode_col: str,
min_score: int = MIN_FUZZY_SCORE,
min_score_without_numbers: int = MIN_FUZZY_SCORE_WITHOUT_NUMBERS,
left_variant_cols: Sequence[str] = (),
right_variant_cols: Sequence[str] = (),
) -> pl.LazyFrame:
"""Fuzzy join two LazyFrames by matching addresses within postcode buckets.
Sinks each side to a temporary parquet file so the upstream pipeline
executes only once. The matching phase collects just three narrow
columns (index, address, postcode) via projection pushdown, and the
final join reads the remaining columns lazily.
``left_variant_cols`` / ``right_variant_cols`` name alternative address
columns for the same property (e.g. the EPC's first address line without
its locality suffix, or the price-paid address with its locality
appended). A pair is scored as the best token_sort_ratio over all
admissible variant combinations: source registers frequently disagree
only on a trailing village/locality token, which alone drags short
addresses below the match threshold. The number-compatibility gate is
always evaluated on the primary addresses, and `_admissible_variants`
rejects any variant whose added/dropped tokens carry digits or flat
designators, so a variant can never bypass the gate or score a single
flat as its whole building. Variant-only scores must clear
``MIN_VARIANT_SCORE``.
Returns a LazyFrame with all left and right columns, plus a
``_match_score`` (UInt8) audit column holding the token_sort_ratio of
the accepted match (exact matches score 100). Unmatched rows have null
right columns and a null score.
"""
tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_", dir=local_tmp_dir())
left_path = Path(tmpdir) / "left.parquet"
right_path = Path(tmpdir) / "right.parquet"
try:
# Materialise each side exactly once, with a row index, to temp parquet.
left.with_row_index("_left_idx").sink_parquet(left_path)
right.with_row_index("_right_idx").sink_parquet(right_path)
# Collect only the narrow columns needed for matching (projection pushdown).
left_match = (
pl.scan_parquet(left_path)
.select(
"_left_idx",
normalize_address_key(pl.col(left_address_col)).alias("_left_address"),
normalize_postcode_key(pl.col(left_postcode_col)).alias(
"_left_postcode"
),
*(
normalize_address_key(pl.col(col)).alias(f"_left_variant_{i}")
for i, col in enumerate(left_variant_cols)
),
)
.collect(engine="streaming")
)
right_match = (
pl.scan_parquet(right_path)
.select(
"_right_idx",
normalize_address_key(pl.col(right_address_col)).alias(
"_right_address"
),
normalize_postcode_key(pl.col(right_postcode_col)).alias(
"_right_postcode"
),
*(
normalize_address_key(pl.col(col)).alias(f"_right_variant_{i}")
for i, col in enumerate(right_variant_cols)
),
)
.unique(subset=["_right_address", "_right_postcode"], keep="first")
.collect(engine="streaming")
)
left_variant_names = [f"_left_variant_{i}" for i in range(len(left_variant_cols))]
right_variant_names = [
f"_right_variant_{i}" for i in range(len(right_variant_cols))
]
# Group right side by postcode for fast lookup
right_by_postcode: dict[str, list[tuple[int, str, tuple[str, ...]]]] = {}
for idx, postcode, address, *variants in zip(
right_match["_right_idx"],
right_match["_right_postcode"],
right_match["_right_address"],
*(right_match[name] for name in right_variant_names),
):
if address is not None and postcode is not None:
right_by_postcode.setdefault(postcode, []).append(
(idx, address, _admissible_variants(address, variants))
)
# Group left side by postcode
left_by_postcode: dict[str, list[tuple[int, str, tuple[str, ...]]]] = {}
for idx, postcode, address, *variants in zip(
left_match["_left_idx"],
left_match["_left_postcode"],
left_match["_left_address"],
*(left_match[name] for name in left_variant_names),
):
if address is not None and postcode is not None:
left_by_postcode.setdefault(postcode, []).append(
(idx, address, _admissible_variants(address, variants))
)
del left_match, right_match
# Build tasks for each postcode bucket
tasks = [
(
left_entries,
right_by_postcode[postcode],
min_score,
min_score_without_numbers,
)
for postcode, left_entries in left_by_postcode.items()
if postcode in right_by_postcode
]
# Score all pairwise matches in parallel, then greedily assign from
# highest score downward so best pairs lock in first.
# Pair tuples are (score, exact, left_idx, right_idx); `exact` marks a
# literally-equal primary pair so it wins greedy ties against a pair
# that merely token-sorts to the same score (e.g. "APARTMENT 3 1 HIGH
# ST" vs "APARTMENT 1 3 HIGH ST" both score 100 against each other's
# certificates, but each has a literal twin).
all_pairs: list[tuple[int, int, int, int]] = []
with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
for pairs in tqdm(
executor.map(_score_bucket, tasks, chunksize=64),
total=len(tasks),
desc="Fuzzy matching",
):
all_pairs.extend(pairs)
del tasks, left_by_postcode, right_by_postcode
# Sort so the best matches are assigned first: score, then literal
# equality, then stable left-index order.
all_pairs.sort(key=lambda t: (t[0], t[1], -t[2]), reverse=True)
# Keep the score alongside each accepted pair: it is emitted as the
# _match_score audit column so downstream consumers can distinguish
# exact (100) from looser fuzzy matches.
matches: list[tuple[int, int, int]] = [] # (left_idx, right_idx, score)
matched_left: set[int] = set()
matched_right: set[int] = set()
for score, _exact, left_idx, right_idx in all_pairs:
if left_idx in matched_left or right_idx in matched_right:
continue
matches.append((left_idx, right_idx, score))
matched_left.add(left_idx)
matched_right.add(right_idx)
del all_pairs, matched_left, matched_right
# Build a small mapping LazyFrame and join back to the cached parquets.
if matches:
mapping = pl.LazyFrame(
{
"_left_idx": pl.Series([m[0] for m in matches], dtype=pl.UInt32),
"_right_idx": pl.Series([m[1] for m in matches], dtype=pl.UInt32),
"_match_score": pl.Series([m[2] for m in matches], dtype=pl.UInt8),
}
)
else:
mapping = pl.LazyFrame(
{
"_left_idx": pl.Series([], dtype=pl.UInt32),
"_right_idx": pl.Series([], dtype=pl.UInt32),
"_match_score": pl.Series([], dtype=pl.UInt8),
}
)
left_cached = pl.scan_parquet(left_path)
right_cached = pl.scan_parquet(right_path)
result = (
left_cached.join(mapping, on="_left_idx", how="left")
.join(right_cached, on="_right_idx", how="left")
.drop("_left_idx", "_right_idx")
.collect(engine="streaming")
)
finally:
shutil.rmtree(tmpdir, ignore_errors=True)
return result.lazy()
def _number_tokens(address: str) -> set[str]:
tokens = set(_NUMBER_RE.findall(address))
tokens.update(_FLAT_LETTER_RE.findall(address))
return tokens
def _numbers_compatible(a: str, b: str) -> bool:
"""Check that the number tokens (house/flat numbers, including any letter
suffix, plus single-letter flat designators) of two addresses are
IDENTICAL sets.
Equality, not subset: subset logic let "188 GREAT NORTH WAY" absorb
"FLAT 1 188 GREAT NORTH WAY" ({188} is a subset of {1, 188}), attaching a
single flat's EPC facts to the whole building — tens of thousands of
wrong-property matches. Likewise digit-only tokens made "8A" and "8B"
both look like {8} and match each other (and plain "8"), and ungated
letter flats let "FLAT D 39 X ST" cross-match "FLAT F 39 X ST" at ~96.
Precision over recall: a pair whose two sources genuinely disagree on
number tokens is safer left unmatched.
One side numbered, the other not -> incompatible. Neither numbered ->
compatible; such pairs are scored against the stricter no-numbers
threshold instead.
"""
nums_a = _number_tokens(a)
nums_b = _number_tokens(b)
if not nums_a and not nums_b:
return True
return nums_a == nums_b
def _admissible_variants(
primary: str, variants: Sequence[str | None]
) -> tuple[str, ...]:
"""Variants of ``primary`` that are safe to score against the other side.
A variant may only ADD or DROP whole tokens relative to the primary (one
word multiset must contain the other) — never substitute, so a register
row whose address lines disagree with the combined address can't smuggle
in a different street. The number gate runs on the primary addresses
only, so the added/dropped tokens must additionally carry no digits
(house numbers) and no flat designator (a "Flat 1"-style secondary line
dropped from an EPC address would otherwise let a single flat score as
the whole building). The remaining admissible difference is exactly the
harmless kind variants exist for: trailing locality/village/town words.
"""
primary_words = Counter(primary.split())
admissible: list[str] = []
for variant in variants:
if not variant or variant == primary:
continue
variant_words = Counter(variant.split())
if not (variant_words <= primary_words or primary_words <= variant_words):
continue
changed = (primary_words - variant_words) + (variant_words - primary_words)
if any(
any(ch.isdigit() for ch in token) or token in _FLAT_TOKENS
for token in changed
):
continue
admissible.append(variant)
return tuple(dict.fromkeys(admissible))
def _score_bucket(
args: tuple[
list[tuple[int, str, tuple[str, ...]]],
list[tuple[int, str, tuple[str, ...]]],
int,
int,
],
) -> list[tuple[int, int, int, int]]:
"""Score all address pairs within a single postcode bucket."""
left_entries, right_entries, min_score, min_score_without_numbers = args
pairs = []
for left_row, left_address, left_variants in left_entries:
for right_row, right_address, right_variants in right_entries:
if not _numbers_compatible(left_address, right_address):
continue
score = fuzz.token_sort_ratio(left_address, right_address)
# Variant pairs recover same-property matches where one register
# carries a locality suffix the other lacks; a variant-only score
# must clear the near-exact MIN_VARIANT_SCORE bar.
if score < 100 and (left_variants or right_variants):
for left_variant in (left_address, *left_variants):
for right_variant in (right_address, *right_variants):
if (
left_variant is left_address
and right_variant is right_address
):
continue
variant_score = fuzz.token_sort_ratio(
left_variant, right_variant
)
if variant_score >= MIN_VARIANT_SCORE and variant_score > score:
score = variant_score
# Number-less pairs (named houses, building-name flats) lack the
# house-number disambiguator, so require a near-exact match.
threshold = (
min_score
if _NUMBER_RE.search(left_address) or _NUMBER_RE.search(right_address)
else min_score_without_numbers
)
if score >= threshold:
pairs.append(
(score, int(left_address == right_address), left_row, right_row)
)
return pairs