LGTM
This commit is contained in:
parent
a8165249a4
commit
a4103b0896
64 changed files with 5376 additions and 3832 deletions
|
|
@ -1,5 +1,9 @@
|
|||
from .download import download, extract_zip
|
||||
from .fuzzy_join import fuzzy_join_on_postcode
|
||||
from .fuzzy_join import (
|
||||
fuzzy_join_on_postcode,
|
||||
normalize_address_key,
|
||||
normalize_postcode_key,
|
||||
)
|
||||
from .haversine import haversine_km, haversine_km_expr
|
||||
from .poi_counts import count_pois_per_postcode
|
||||
from .postcode_mapping import build_postcode_mapping
|
||||
|
|
@ -8,6 +12,8 @@ __all__ = [
|
|||
"download",
|
||||
"extract_zip",
|
||||
"fuzzy_join_on_postcode",
|
||||
"normalize_address_key",
|
||||
"normalize_postcode_key",
|
||||
"haversine_km",
|
||||
"haversine_km_expr",
|
||||
"count_pois_per_postcode",
|
||||
|
|
|
|||
|
|
@ -10,15 +10,31 @@ from thefuzz import fuzz
|
|||
from tqdm import tqdm
|
||||
|
||||
_NUMBER_RE = re.compile(r"\d+")
|
||||
_POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
|
||||
MIN_FUZZY_SCORE = 60
|
||||
|
||||
|
||||
def _normalize(s: pl.Expr) -> pl.Expr:
|
||||
return (
|
||||
s.str.to_uppercase()
|
||||
.str.replace_all(r"[,.\-]", " ")
|
||||
def normalize_address_key(s: pl.Expr) -> pl.Expr:
    """Build the normalized address key used when comparing addresses.

    The input expression is cast to a string and upper-cased, every run of
    characters other than ``0-9`` / ``A-Z`` is replaced by a single space,
    whitespace runs are collapsed, and the ends are trimmed.

    Args:
        s: Expression yielding the raw address values.

    Returns:
        An expression yielding the cleaned key, or null when the cleaned
        value contains no letter ``A-Z`` (e.g. blank or digits-only input).
    """
    upper = s.cast(pl.String).str.to_uppercase()
    # Punctuation and any other non-alphanumeric run becomes a separator.
    separated = upper.str.replace_all(r"[^0-9A-Z]+", " ")
    key = separated.str.replace_all(r"\s+", " ").str.strip_chars()
    # A usable address key needs at least one letter; otherwise emit null.
    has_letter = key.str.contains(r"[A-Z]")
    return pl.when(has_letter).then(key).otherwise(None)
|
||||
|
||||
|
||||
def normalize_postcode_key(s: pl.Expr) -> pl.Expr:
    """Build the normalized postcode key used for bucketing.

    The input expression is cast to a string and upper-cased, then every
    character other than ``A-Z`` / ``0-9`` is removed (so ``" SW1A 1AA "``
    and ``"sw1a-1aa"`` both become ``"SW1A1AA"``).

    Args:
        s: Expression yielding the raw postcode values.

    Returns:
        An expression yielding the compact key, or null when the compact
        value does not match ``_POSTCODE_RE`` (empty, partial, or otherwise
        malformed postcodes).
    """
    compact = (
        s.cast(pl.String)
        .str.to_uppercase()
        .str.replace_all(r"[^A-Z0-9]+", "")
        .str.strip_chars()
    )
    # Only values shaped like a complete postcode are kept as join keys.
    is_full_postcode = compact.str.contains(_POSTCODE_RE)
    return pl.when(is_full_postcode).then(compact).otherwise(None)
|
||||
|
||||
|
||||
def fuzzy_join_on_postcode(
|
||||
|
|
@ -28,6 +44,7 @@ def fuzzy_join_on_postcode(
|
|||
right_address_col: str,
|
||||
left_postcode_col: str,
|
||||
right_postcode_col: str,
|
||||
min_score: int = MIN_FUZZY_SCORE,
|
||||
) -> pl.LazyFrame:
|
||||
"""Fuzzy join two LazyFrames by matching addresses within postcode buckets.
|
||||
|
||||
|
|
@ -54,11 +71,10 @@ def fuzzy_join_on_postcode(
|
|||
pl.scan_parquet(left_path)
|
||||
.select(
|
||||
"_left_idx",
|
||||
_normalize(pl.col(left_address_col)).alias("_left_address"),
|
||||
pl.col(left_postcode_col)
|
||||
.str.strip_chars()
|
||||
.str.to_uppercase()
|
||||
.alias("_left_postcode"),
|
||||
normalize_address_key(pl.col(left_address_col)).alias("_left_address"),
|
||||
normalize_postcode_key(pl.col(left_postcode_col)).alias(
|
||||
"_left_postcode"
|
||||
),
|
||||
)
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
|
|
@ -67,11 +83,12 @@ def fuzzy_join_on_postcode(
|
|||
pl.scan_parquet(right_path)
|
||||
.select(
|
||||
"_right_idx",
|
||||
_normalize(pl.col(right_address_col)).alias("_right_address"),
|
||||
pl.col(right_postcode_col)
|
||||
.str.strip_chars()
|
||||
.str.to_uppercase()
|
||||
.alias("_right_postcode"),
|
||||
normalize_address_key(pl.col(right_address_col)).alias(
|
||||
"_right_address"
|
||||
),
|
||||
normalize_postcode_key(pl.col(right_postcode_col)).alias(
|
||||
"_right_postcode"
|
||||
),
|
||||
)
|
||||
.unique(subset=["_right_address", "_right_postcode"], keep="first")
|
||||
.collect(engine="streaming")
|
||||
|
|
@ -101,7 +118,7 @@ def fuzzy_join_on_postcode(
|
|||
|
||||
# Build tasks for each postcode bucket
|
||||
tasks = [
|
||||
(left_entries, right_by_postcode[postcode])
|
||||
(left_entries, right_by_postcode[postcode], min_score)
|
||||
for postcode, left_entries in left_by_postcode.items()
|
||||
if postcode in right_by_postcode
|
||||
]
|
||||
|
|
@ -182,15 +199,16 @@ def _numbers_compatible(a: str, b: str) -> bool:
|
|||
|
||||
|
||||
def _score_bucket(
    args: tuple[list[tuple[int, str]], list[tuple[int, str]], int],
) -> list[tuple[int, int, int]]:
    """Score all address pairs within a single postcode bucket.

    Args:
        args: A single tuple ``(left_entries, right_entries, min_score)``.
            Each entry list holds ``(row_index, address)`` pairs and
            ``min_score`` is the lowest fuzzy score that is kept.
            NOTE(review): presumably packed into one tuple so the function
            can be mapped over the task list by a worker pool -- confirm
            against the caller.

    Returns:
        ``(score, left_row, right_row)`` tuples for every left/right pair
        that passes ``_numbers_compatible`` and whose
        ``fuzz.token_sort_ratio`` is at least ``min_score``.
    """
    left_entries, right_entries, min_score = args
    pairs: list[tuple[int, int, int]] = []
    for left_row, left_address in left_entries:
        for right_row, right_address in right_entries:
            # Skip the fuzzy comparison when the number check already
            # rules the pair out.
            if not _numbers_compatible(left_address, right_address):
                continue
            score = fuzz.token_sort_ratio(left_address, right_address)
            # Drop weak candidates here rather than returning every pair.
            if score >= min_score:
                pairs.append((score, left_row, right_row))
    return pairs
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.utils import fuzzy_join_on_postcode
|
||||
from pipeline.utils import fuzzy_join_on_postcode, normalize_postcode_key
|
||||
|
||||
|
||||
def test_fuzzy_join_on_postcode_matches_addresses_within_postcode():
|
||||
|
|
@ -132,3 +132,22 @@ def test_fuzzy_join_on_postcode_rejects_blank_and_invalid_match_keys():
|
|||
{"left_id": "number_only", "right_address": None},
|
||||
{"left_id": "valid", "right_address": "10 High Street"},
|
||||
]
|
||||
|
||||
|
||||
def test_normalize_postcode_key_requires_full_postcode():
    """Only complete postcodes yield a key; everything else becomes null."""
    raw_postcodes = [
        " SW1A 1AA ",
        "sw1a-1aa",
        "",
        "SW1A",
        "12345",
        "not a postcode",
    ]
    # The first two differ only in case, spacing and punctuation; the rest
    # are empty, partial, or not postcodes at all.
    expected_keys = ["SW1A1AA", "SW1A1AA", None, None, None, None]

    frame = pl.DataFrame({"postcode": raw_postcodes})
    key_expr = normalize_postcode_key(pl.col("postcode")).alias("key")
    keys = frame.select(key_expr)["key"].to_list()

    assert keys == expected_keys
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue