Format python

This commit is contained in:
Andras Schmelczer 2026-01-31 13:07:09 +00:00
parent 85f5770e09
commit 4c258018c3
17 changed files with 348 additions and 248 deletions

View file

@ -9,14 +9,14 @@ import polars as pl
from thefuzz import fuzz
from tqdm import tqdm
_NUMBER_RE = re.compile(r'\d+')
_NUMBER_RE = re.compile(r"\d+")
def _normalize(s: pl.Expr) -> pl.Expr:
    """Return an expression that canonicalises a string column for matching.

    The resulting expression applies, in order:

    1. upper-casing,
    2. replacement of every comma, full stop and hyphen with a space,
    3. collapsing of each whitespace run into a single space,
    4. ``strip_chars()`` with no arguments to trim the ends.

    Args:
        s: String expression to normalise.

    Returns:
        The chained string expression; nothing is evaluated here.
    """
    return (
        s.str.to_uppercase()
        # Punctuation becomes a space first so the whitespace collapse below
        # also cleans up the gaps it leaves behind.
        .str.replace_all(r"[,.\-]", " ")
        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
    )
@ -40,22 +40,25 @@ def fuzzy_join_on_postcode(
have null right columns.
"""
tmpdir = tempfile.mkdtemp(prefix='fuzzy_join_')
left_path = Path(tmpdir) / 'left.parquet'
right_path = Path(tmpdir) / 'right.parquet'
tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_")
left_path = Path(tmpdir) / "left.parquet"
right_path = Path(tmpdir) / "right.parquet"
try:
# Materialise each side exactly once, with a row index, to temp parquet.
left.with_row_index('_left_idx').sink_parquet(left_path)
right.with_row_index('_right_idx').sink_parquet(right_path)
left.with_row_index("_left_idx").sink_parquet(left_path)
right.with_row_index("_right_idx").sink_parquet(right_path)
# Collect only the narrow columns needed for matching (projection pushdown).
left_match = (
pl.scan_parquet(left_path)
.select(
'_left_idx',
_normalize(pl.col(left_address_col)).alias('_left_address'),
pl.col(left_postcode_col).str.strip_chars().str.to_uppercase().alias('_left_postcode'),
"_left_idx",
_normalize(pl.col(left_address_col)).alias("_left_address"),
pl.col(left_postcode_col)
.str.strip_chars()
.str.to_uppercase()
.alias("_left_postcode"),
)
.collect()
)
@ -63,18 +66,23 @@ def fuzzy_join_on_postcode(
right_match = (
pl.scan_parquet(right_path)
.select(
'_right_idx',
_normalize(pl.col(right_address_col)).alias('_right_address'),
pl.col(right_postcode_col).str.strip_chars().str.to_uppercase().alias('_right_postcode'),
"_right_idx",
_normalize(pl.col(right_address_col)).alias("_right_address"),
pl.col(right_postcode_col)
.str.strip_chars()
.str.to_uppercase()
.alias("_right_postcode"),
)
.unique(subset=['_right_address', '_right_postcode'], keep='first')
.unique(subset=["_right_address", "_right_postcode"], keep="first")
.collect()
)
# Group right side by postcode for fast lookup
right_by_postcode: dict[str, list[tuple[int, str]]] = {}
for idx, postcode, address in zip(
right_match['_right_idx'], right_match['_right_postcode'], right_match['_right_address']
right_match["_right_idx"],
right_match["_right_postcode"],
right_match["_right_address"],
):
if postcode is not None:
right_by_postcode.setdefault(postcode, []).append((idx, address))
@ -82,7 +90,9 @@ def fuzzy_join_on_postcode(
# Group left side by postcode
left_by_postcode: dict[str, list[tuple[int, str]]] = {}
for idx, postcode, address in zip(
left_match['_left_idx'], left_match['_left_postcode'], left_match['_left_address']
left_match["_left_idx"],
left_match["_left_postcode"],
left_match["_left_address"],
):
if address is not None and postcode is not None:
left_by_postcode.setdefault(postcode, []).append((idx, address))
@ -103,7 +113,7 @@ def fuzzy_join_on_postcode(
for pairs in tqdm(
executor.map(_score_bucket, tasks, chunksize=64),
total=len(tasks),
desc='Fuzzy matching',
desc="Fuzzy matching",
):
all_pairs.extend(pairs)
@ -127,24 +137,27 @@ def fuzzy_join_on_postcode(
# Build a small mapping LazyFrame and join back to the cached parquets.
if matches:
mapping = pl.LazyFrame({
'_left_idx': pl.Series([m[0] for m in matches], dtype=pl.UInt32),
'_right_idx': pl.Series([m[1] for m in matches], dtype=pl.UInt32),
})
mapping = pl.LazyFrame(
{
"_left_idx": pl.Series([m[0] for m in matches], dtype=pl.UInt32),
"_right_idx": pl.Series([m[1] for m in matches], dtype=pl.UInt32),
}
)
else:
mapping = pl.LazyFrame({
'_left_idx': pl.Series([], dtype=pl.UInt32),
'_right_idx': pl.Series([], dtype=pl.UInt32),
})
mapping = pl.LazyFrame(
{
"_left_idx": pl.Series([], dtype=pl.UInt32),
"_right_idx": pl.Series([], dtype=pl.UInt32),
}
)
left_cached = pl.scan_parquet(left_path)
right_cached = pl.scan_parquet(right_path)
return (
left_cached
.join(mapping, on='_left_idx', how='left')
.join(right_cached, on='_right_idx', how='left')
.drop('_left_idx', '_right_idx')
left_cached.join(mapping, on="_left_idx", how="left")
.join(right_cached, on="_right_idx", how="left")
.drop("_left_idx", "_right_idx")
)
except BaseException:
shutil.rmtree(tmpdir, ignore_errors=True)
@ -158,7 +171,9 @@ def _numbers_compatible(a: str, b: str) -> bool:
"""
nums_a = set(_NUMBER_RE.findall(a))
nums_b = set(_NUMBER_RE.findall(b))
smaller, larger = (nums_a, nums_b) if len(nums_a) <= len(nums_b) else (nums_b, nums_a)
smaller, larger = (
(nums_a, nums_b) if len(nums_a) <= len(nums_b) else (nums_b, nums_a)
)
if not smaller and larger:
return False
return smaller.issubset(larger)