Format python

2026-01-31 13:07:09 +00:00 · 2026-01-31 13:07:09 +00:00 · 4c258018c3
commit 4c258018c3
parent 85f5770e09
17 changed files with 348 additions and 248 deletions
--- a/pipeline/utils/fuzzy_join.py
+++ b/pipeline/utils/fuzzy_join.py
@@ -9,14 +9,14 @@ import polars as pl
 from thefuzz import fuzz
 from tqdm import tqdm

-_NUMBER_RE = re.compile(r'\d+')
+_NUMBER_RE = re.compile(r"\d+")


 def _normalize(s: pl.Expr) -> pl.Expr:
    return (
        s.str.to_uppercase()
-        .str.replace_all(r'[,.\-]', ' ')
-        .str.replace_all(r'\s+', ' ')
+        .str.replace_all(r"[,.\-]", " ")
+        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
    )

@@ -40,22 +40,25 @@ def fuzzy_join_on_postcode(
    have null right columns.
    """

-    tmpdir = tempfile.mkdtemp(prefix='fuzzy_join_')
-    left_path = Path(tmpdir) / 'left.parquet'
-    right_path = Path(tmpdir) / 'right.parquet'
+    tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_")
+    left_path = Path(tmpdir) / "left.parquet"
+    right_path = Path(tmpdir) / "right.parquet"

    try:
        # Materialise each side exactly once, with a row index, to temp parquet.
-        left.with_row_index('_left_idx').sink_parquet(left_path)
-        right.with_row_index('_right_idx').sink_parquet(right_path)
+        left.with_row_index("_left_idx").sink_parquet(left_path)
+        right.with_row_index("_right_idx").sink_parquet(right_path)

        # Collect only the narrow columns needed for matching (projection pushdown).
        left_match = (
            pl.scan_parquet(left_path)
            .select(
-                '_left_idx',
-                _normalize(pl.col(left_address_col)).alias('_left_address'),
-                pl.col(left_postcode_col).str.strip_chars().str.to_uppercase().alias('_left_postcode'),
+                "_left_idx",
+                _normalize(pl.col(left_address_col)).alias("_left_address"),
+                pl.col(left_postcode_col)
+                .str.strip_chars()
+                .str.to_uppercase()
+                .alias("_left_postcode"),
            )
            .collect()
        )
@@ -63,18 +66,23 @@ def fuzzy_join_on_postcode(
        right_match = (
            pl.scan_parquet(right_path)
            .select(
-                '_right_idx',
-                _normalize(pl.col(right_address_col)).alias('_right_address'),
-                pl.col(right_postcode_col).str.strip_chars().str.to_uppercase().alias('_right_postcode'),
+                "_right_idx",
+                _normalize(pl.col(right_address_col)).alias("_right_address"),
+                pl.col(right_postcode_col)
+                .str.strip_chars()
+                .str.to_uppercase()
+                .alias("_right_postcode"),
            )
-            .unique(subset=['_right_address', '_right_postcode'], keep='first')
+            .unique(subset=["_right_address", "_right_postcode"], keep="first")
            .collect()
        )

        # Group right side by postcode for fast lookup
        right_by_postcode: dict[str, list[tuple[int, str]]] = {}
        for idx, postcode, address in zip(
-            right_match['_right_idx'], right_match['_right_postcode'], right_match['_right_address']
+            right_match["_right_idx"],
+            right_match["_right_postcode"],
+            right_match["_right_address"],
        ):
            if postcode is not None:
                right_by_postcode.setdefault(postcode, []).append((idx, address))
@@ -82,7 +90,9 @@ def fuzzy_join_on_postcode(
        # Group left side by postcode
        left_by_postcode: dict[str, list[tuple[int, str]]] = {}
        for idx, postcode, address in zip(
-            left_match['_left_idx'], left_match['_left_postcode'], left_match['_left_address']
+            left_match["_left_idx"],
+            left_match["_left_postcode"],
+            left_match["_left_address"],
        ):
            if address is not None and postcode is not None:
                left_by_postcode.setdefault(postcode, []).append((idx, address))
@@ -103,7 +113,7 @@ def fuzzy_join_on_postcode(
            for pairs in tqdm(
                executor.map(_score_bucket, tasks, chunksize=64),
                total=len(tasks),
-                desc='Fuzzy matching',
+                desc="Fuzzy matching",
            ):
                all_pairs.extend(pairs)

@@ -127,24 +137,27 @@ def fuzzy_join_on_postcode(

        # Build a small mapping LazyFrame and join back to the cached parquets.
        if matches:
-            mapping = pl.LazyFrame({
-                '_left_idx': pl.Series([m[0] for m in matches], dtype=pl.UInt32),
-                '_right_idx': pl.Series([m[1] for m in matches], dtype=pl.UInt32),
-            })
+            mapping = pl.LazyFrame(
+                {
+                    "_left_idx": pl.Series([m[0] for m in matches], dtype=pl.UInt32),
+                    "_right_idx": pl.Series([m[1] for m in matches], dtype=pl.UInt32),
+                }
+            )
        else:
-            mapping = pl.LazyFrame({
-                '_left_idx': pl.Series([], dtype=pl.UInt32),
-                '_right_idx': pl.Series([], dtype=pl.UInt32),
-            })
+            mapping = pl.LazyFrame(
+                {
+                    "_left_idx": pl.Series([], dtype=pl.UInt32),
+                    "_right_idx": pl.Series([], dtype=pl.UInt32),
+                }
+            )

        left_cached = pl.scan_parquet(left_path)
        right_cached = pl.scan_parquet(right_path)

        return (
-            left_cached
-            .join(mapping, on='_left_idx', how='left')
-            .join(right_cached, on='_right_idx', how='left')
-            .drop('_left_idx', '_right_idx')
+            left_cached.join(mapping, on="_left_idx", how="left")
+            .join(right_cached, on="_right_idx", how="left")
+            .drop("_left_idx", "_right_idx")
        )
    except BaseException:
        shutil.rmtree(tmpdir, ignore_errors=True)
@@ -158,7 +171,9 @@ def _numbers_compatible(a: str, b: str) -> bool:
    """
    nums_a = set(_NUMBER_RE.findall(a))
    nums_b = set(_NUMBER_RE.findall(b))
-    smaller, larger = (nums_a, nums_b) if len(nums_a) <= len(nums_b) else (nums_b, nums_a)
+    smaller, larger = (
+        (nums_a, nums_b) if len(nums_a) <= len(nums_b) else (nums_b, nums_a)
+    )
    if not smaller and larger:
        return False
    return smaller.issubset(larger)