Format python
This commit is contained in:
parent
85f5770e09
commit
4c258018c3
17 changed files with 348 additions and 248 deletions
|
|
@ -9,14 +9,14 @@ import polars as pl
|
|||
from thefuzz import fuzz
|
||||
from tqdm import tqdm
|
||||
|
||||
_NUMBER_RE = re.compile(r'\d+')
|
||||
_NUMBER_RE = re.compile(r"\d+")
|
||||
|
||||
|
||||
def _normalize(s: pl.Expr) -> pl.Expr:
|
||||
return (
|
||||
s.str.to_uppercase()
|
||||
.str.replace_all(r'[,.\-]', ' ')
|
||||
.str.replace_all(r'\s+', ' ')
|
||||
.str.replace_all(r"[,.\-]", " ")
|
||||
.str.replace_all(r"\s+", " ")
|
||||
.str.strip_chars()
|
||||
)
|
||||
|
||||
|
|
@ -40,22 +40,25 @@ def fuzzy_join_on_postcode(
|
|||
have null right columns.
|
||||
"""
|
||||
|
||||
tmpdir = tempfile.mkdtemp(prefix='fuzzy_join_')
|
||||
left_path = Path(tmpdir) / 'left.parquet'
|
||||
right_path = Path(tmpdir) / 'right.parquet'
|
||||
tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_")
|
||||
left_path = Path(tmpdir) / "left.parquet"
|
||||
right_path = Path(tmpdir) / "right.parquet"
|
||||
|
||||
try:
|
||||
# Materialise each side exactly once, with a row index, to temp parquet.
|
||||
left.with_row_index('_left_idx').sink_parquet(left_path)
|
||||
right.with_row_index('_right_idx').sink_parquet(right_path)
|
||||
left.with_row_index("_left_idx").sink_parquet(left_path)
|
||||
right.with_row_index("_right_idx").sink_parquet(right_path)
|
||||
|
||||
# Collect only the narrow columns needed for matching (projection pushdown).
|
||||
left_match = (
|
||||
pl.scan_parquet(left_path)
|
||||
.select(
|
||||
'_left_idx',
|
||||
_normalize(pl.col(left_address_col)).alias('_left_address'),
|
||||
pl.col(left_postcode_col).str.strip_chars().str.to_uppercase().alias('_left_postcode'),
|
||||
"_left_idx",
|
||||
_normalize(pl.col(left_address_col)).alias("_left_address"),
|
||||
pl.col(left_postcode_col)
|
||||
.str.strip_chars()
|
||||
.str.to_uppercase()
|
||||
.alias("_left_postcode"),
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
|
|
@ -63,18 +66,23 @@ def fuzzy_join_on_postcode(
|
|||
right_match = (
|
||||
pl.scan_parquet(right_path)
|
||||
.select(
|
||||
'_right_idx',
|
||||
_normalize(pl.col(right_address_col)).alias('_right_address'),
|
||||
pl.col(right_postcode_col).str.strip_chars().str.to_uppercase().alias('_right_postcode'),
|
||||
"_right_idx",
|
||||
_normalize(pl.col(right_address_col)).alias("_right_address"),
|
||||
pl.col(right_postcode_col)
|
||||
.str.strip_chars()
|
||||
.str.to_uppercase()
|
||||
.alias("_right_postcode"),
|
||||
)
|
||||
.unique(subset=['_right_address', '_right_postcode'], keep='first')
|
||||
.unique(subset=["_right_address", "_right_postcode"], keep="first")
|
||||
.collect()
|
||||
)
|
||||
|
||||
# Group right side by postcode for fast lookup
|
||||
right_by_postcode: dict[str, list[tuple[int, str]]] = {}
|
||||
for idx, postcode, address in zip(
|
||||
right_match['_right_idx'], right_match['_right_postcode'], right_match['_right_address']
|
||||
right_match["_right_idx"],
|
||||
right_match["_right_postcode"],
|
||||
right_match["_right_address"],
|
||||
):
|
||||
if postcode is not None:
|
||||
right_by_postcode.setdefault(postcode, []).append((idx, address))
|
||||
|
|
@ -82,7 +90,9 @@ def fuzzy_join_on_postcode(
|
|||
# Group left side by postcode
|
||||
left_by_postcode: dict[str, list[tuple[int, str]]] = {}
|
||||
for idx, postcode, address in zip(
|
||||
left_match['_left_idx'], left_match['_left_postcode'], left_match['_left_address']
|
||||
left_match["_left_idx"],
|
||||
left_match["_left_postcode"],
|
||||
left_match["_left_address"],
|
||||
):
|
||||
if address is not None and postcode is not None:
|
||||
left_by_postcode.setdefault(postcode, []).append((idx, address))
|
||||
|
|
@ -103,7 +113,7 @@ def fuzzy_join_on_postcode(
|
|||
for pairs in tqdm(
|
||||
executor.map(_score_bucket, tasks, chunksize=64),
|
||||
total=len(tasks),
|
||||
desc='Fuzzy matching',
|
||||
desc="Fuzzy matching",
|
||||
):
|
||||
all_pairs.extend(pairs)
|
||||
|
||||
|
|
@ -127,24 +137,27 @@ def fuzzy_join_on_postcode(
|
|||
|
||||
# Build a small mapping LazyFrame and join back to the cached parquets.
|
||||
if matches:
|
||||
mapping = pl.LazyFrame({
|
||||
'_left_idx': pl.Series([m[0] for m in matches], dtype=pl.UInt32),
|
||||
'_right_idx': pl.Series([m[1] for m in matches], dtype=pl.UInt32),
|
||||
})
|
||||
mapping = pl.LazyFrame(
|
||||
{
|
||||
"_left_idx": pl.Series([m[0] for m in matches], dtype=pl.UInt32),
|
||||
"_right_idx": pl.Series([m[1] for m in matches], dtype=pl.UInt32),
|
||||
}
|
||||
)
|
||||
else:
|
||||
mapping = pl.LazyFrame({
|
||||
'_left_idx': pl.Series([], dtype=pl.UInt32),
|
||||
'_right_idx': pl.Series([], dtype=pl.UInt32),
|
||||
})
|
||||
mapping = pl.LazyFrame(
|
||||
{
|
||||
"_left_idx": pl.Series([], dtype=pl.UInt32),
|
||||
"_right_idx": pl.Series([], dtype=pl.UInt32),
|
||||
}
|
||||
)
|
||||
|
||||
left_cached = pl.scan_parquet(left_path)
|
||||
right_cached = pl.scan_parquet(right_path)
|
||||
|
||||
return (
|
||||
left_cached
|
||||
.join(mapping, on='_left_idx', how='left')
|
||||
.join(right_cached, on='_right_idx', how='left')
|
||||
.drop('_left_idx', '_right_idx')
|
||||
left_cached.join(mapping, on="_left_idx", how="left")
|
||||
.join(right_cached, on="_right_idx", how="left")
|
||||
.drop("_left_idx", "_right_idx")
|
||||
)
|
||||
except BaseException:
|
||||
shutil.rmtree(tmpdir, ignore_errors=True)
|
||||
|
|
@ -158,7 +171,9 @@ def _numbers_compatible(a: str, b: str) -> bool:
|
|||
"""
|
||||
nums_a = set(_NUMBER_RE.findall(a))
|
||||
nums_b = set(_NUMBER_RE.findall(b))
|
||||
smaller, larger = (nums_a, nums_b) if len(nums_a) <= len(nums_b) else (nums_b, nums_a)
|
||||
smaller, larger = (
|
||||
(nums_a, nums_b) if len(nums_a) <= len(nums_b) else (nums_b, nums_a)
|
||||
)
|
||||
if not smaller and larger:
|
||||
return False
|
||||
return smaller.issubset(larger)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue