This commit is contained in:
Andras Schmelczer 2026-02-15 22:39:49 +00:00
parent 03445188ea
commit 524580eb25
102 changed files with 36625 additions and 1295 deletions

View file

@@ -154,14 +154,16 @@ def fuzzy_join_on_postcode(
left_cached = pl.scan_parquet(left_path)
right_cached = pl.scan_parquet(right_path)
return (
result = (
left_cached.join(mapping, on="_left_idx", how="left")
.join(right_cached, on="_right_idx", how="left")
.drop("_left_idx", "_right_idx")
.collect(engine="streaming")
)
except BaseException:
finally:
shutil.rmtree(tmpdir, ignore_errors=True)
raise
return result.lazy()
def _numbers_compatible(a: str, b: str) -> bool:
@@ -180,7 +182,7 @@ def _numbers_compatible(a: str, b: str) -> bool:
def _score_bucket(
args: tuple[list[tuple[int, str]], list[tuple[int, str]], int],
args: tuple[list[tuple[int, str]], list[tuple[int, str]]],
) -> list[tuple[int, int, int]]:
"""Score all address pairs within a single postcode bucket."""
left_entries, right_entries = args