This commit is contained in:
Andras Schmelczer 2026-05-14 08:09:19 +01:00
parent a8165249a4
commit a4103b0896
64 changed files with 5376 additions and 3832 deletions

View file

@ -10,15 +10,31 @@ from thefuzz import fuzz
from tqdm import tqdm
# Matches each run of consecutive digits in a string.
_NUMBER_RE = re.compile(r"\d+")
# Kept as a plain pattern string (not compiled) because it is handed to the
# polars ``str.contains`` expression in ``normalize_postcode_key``. It matches
# a whitespace-free, upper-case value shaped as: 1-2 letters, a digit, an
# optional letter or digit, a digit, then two letters.
_POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
# Default for the ``min_score`` parameter of ``fuzzy_join_on_postcode``;
# candidate pairs scoring below the threshold are discarded in ``_score_bucket``.
MIN_FUZZY_SCORE = 60
def _normalize(s: pl.Expr) -> pl.Expr:
return (
s.str.to_uppercase()
.str.replace_all(r"[,.\-]", " ")
def normalize_address_key(s: pl.Expr) -> pl.Expr:
    """Build a canonical comparison key from a free-text address expression.

    The value is cast to a string and upper-cased, every run of characters
    outside ``0-9A-Z`` becomes a single space, whitespace runs are collapsed
    and the ends are trimmed. A key that contains no letter at all is
    replaced by null.

    Args:
        s: Expression yielding the raw address values.

    Returns:
        Expression yielding the normalized key, or null when the key has no
        ``A-Z`` character.
    """
    upper = s.cast(pl.String).str.to_uppercase()
    spaced = upper.str.replace_all(r"[^0-9A-Z]+", " ")
    key = spaced.str.replace_all(r"\s+", " ").str.strip_chars()
    has_letter = key.str.contains(r"[A-Z]")
    return pl.when(has_letter).then(key).otherwise(None)
def normalize_postcode_key(s: pl.Expr) -> pl.Expr:
    """Build a canonical postcode key from a raw postcode expression.

    The value is cast to a string and upper-cased, every character outside
    ``A-Z0-9`` (including inner spaces) is removed and the ends are trimmed.
    Values that do not match ``_POSTCODE_RE`` afterwards are replaced by null.

    Args:
        s: Expression yielding the raw postcode values.

    Returns:
        Expression yielding the compact postcode, or null when it does not
        match the expected pattern.
    """
    upper = s.cast(pl.String).str.to_uppercase()
    compact = upper.str.replace_all(r"[^A-Z0-9]+", "").str.strip_chars()
    looks_valid = compact.str.contains(_POSTCODE_RE)
    return pl.when(looks_valid).then(compact).otherwise(None)
def fuzzy_join_on_postcode(
@ -28,6 +44,7 @@ def fuzzy_join_on_postcode(
right_address_col: str,
left_postcode_col: str,
right_postcode_col: str,
min_score: int = MIN_FUZZY_SCORE,
) -> pl.LazyFrame:
"""Fuzzy join two LazyFrames by matching addresses within postcode buckets.
@ -54,11 +71,10 @@ def fuzzy_join_on_postcode(
pl.scan_parquet(left_path)
.select(
"_left_idx",
_normalize(pl.col(left_address_col)).alias("_left_address"),
pl.col(left_postcode_col)
.str.strip_chars()
.str.to_uppercase()
.alias("_left_postcode"),
normalize_address_key(pl.col(left_address_col)).alias("_left_address"),
normalize_postcode_key(pl.col(left_postcode_col)).alias(
"_left_postcode"
),
)
.collect(engine="streaming")
)
@ -67,11 +83,12 @@ def fuzzy_join_on_postcode(
pl.scan_parquet(right_path)
.select(
"_right_idx",
_normalize(pl.col(right_address_col)).alias("_right_address"),
pl.col(right_postcode_col)
.str.strip_chars()
.str.to_uppercase()
.alias("_right_postcode"),
normalize_address_key(pl.col(right_address_col)).alias(
"_right_address"
),
normalize_postcode_key(pl.col(right_postcode_col)).alias(
"_right_postcode"
),
)
.unique(subset=["_right_address", "_right_postcode"], keep="first")
.collect(engine="streaming")
@ -101,7 +118,7 @@ def fuzzy_join_on_postcode(
# Build tasks for each postcode bucket
tasks = [
(left_entries, right_by_postcode[postcode])
(left_entries, right_by_postcode[postcode], min_score)
for postcode, left_entries in left_by_postcode.items()
if postcode in right_by_postcode
]
@ -182,15 +199,16 @@ def _numbers_compatible(a: str, b: str) -> bool:
def _score_bucket(
    args: tuple[list[tuple[int, str]], list[tuple[int, str]], int],
) -> list[tuple[int, int, int]]:
    """Score all address pairs within a single postcode bucket.

    Args:
        args: A ``(left_entries, right_entries, min_score)`` tuple, matching
            the task tuples built in ``fuzzy_join_on_postcode``. Each entry
            is a ``(row_index, address)`` pair.

    Returns:
        ``(score, left_row, right_row)`` tuples for every pair that passes
        ``_numbers_compatible`` and whose ``fuzz.token_sort_ratio`` is at
        least ``min_score``.
    """
    left_entries, right_entries, min_score = args
    pairs = []
    for left_row, left_address in left_entries:
        for right_row, right_address in right_entries:
            # Skip the fuzzy comparison when the number check already
            # rules the pair out.
            if not _numbers_compatible(left_address, right_address):
                continue
            score = fuzz.token_sort_ratio(left_address, right_address)
            # Keep only pairs that reach the caller-supplied threshold.
            if score >= min_score:
                pairs.append((score, left_row, right_row))
    return pairs