Split up
This commit is contained in:
parent
cf39ad754e
commit
f59d01227b
91 changed files with 10370 additions and 7562 deletions
|
|
@ -1,4 +1,10 @@
|
|||
from .download import download, extract_zip
|
||||
from .download import (
|
||||
ENGLAND_LSOA_COUNT_2021,
|
||||
download,
|
||||
download_arcgis_hub_export,
|
||||
download_nomis_csv,
|
||||
extract_zip,
|
||||
)
|
||||
from .fuzzy_join import (
|
||||
fuzzy_join_on_postcode,
|
||||
normalize_address_key,
|
||||
|
|
@ -10,7 +16,10 @@ from .poi_counts import count_pois_per_postcode
|
|||
from .postcode_mapping import build_postcode_mapping
|
||||
|
||||
__all__ = [
|
||||
"ENGLAND_LSOA_COUNT_2021",
|
||||
"download",
|
||||
"download_arcgis_hub_export",
|
||||
"download_nomis_csv",
|
||||
"extract_zip",
|
||||
"fuzzy_join_on_postcode",
|
||||
"normalize_address_key",
|
||||
|
|
|
|||
|
|
@ -1,11 +1,19 @@
|
|||
"""Shared download and extraction helpers for pipeline scripts."""
|
||||
|
||||
import time
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import polars as pl
|
||||
from tqdm import tqdm
|
||||
|
||||
# Census 2021 LSOAs (TYPE151) with an E prefix. The Census 2021 geography is
|
||||
# frozen, so NOMIS England-level downloads must yield exactly this many LSOAs;
|
||||
# fewer means the download was truncated.
|
||||
ENGLAND_LSOA_COUNT_2021 = 33_755
|
||||
|
||||
|
||||
def download(url: str, output_path: Path, *, timeout: float = 120) -> None:
|
||||
"""Stream-download a URL to a local file with a tqdm progress bar."""
|
||||
|
|
@ -38,3 +46,86 @@ def extract_zip(zip_path: Path, extract_dir: Path) -> None:
|
|||
extract_dir.mkdir(parents=True, exist_ok=True)
|
||||
with zipfile.ZipFile(zip_path, "r") as zf:
|
||||
zf.extractall(extract_dir)
|
||||
|
||||
|
||||
def download_nomis_csv(base_url: str, *, page_size: int = 25_000) -> pl.DataFrame:
    """Download a NOMIS CSV dataset, paging with recordoffset/RecordLimit.

    The page size is sent explicitly as ``RecordLimit``: last-page detection is
    ``rows < page_size``, so relying on NOMIS's implicit default would silently
    truncate the dataset to one page if that default ever differed.

    Args:
        base_url: Dataset URL; the paging parameters are appended with ``&``,
            so it must already carry a query string.
        page_size: Number of rows requested per page. Must be positive.

    Returns:
        Every fetched page concatenated into one DataFrame.

    Raises:
        ValueError: If ``page_size`` is not positive.
        httpx.HTTPStatusError: If a page request returns an error status.
        RuntimeError: If no page yielded any rows.
    """
    # A non-positive page size can never satisfy ``rows < page_size`` and never
    # advances the offset, so the loop below would re-fetch the same page
    # forever. Fail fast instead.
    if page_size <= 0:
        raise ValueError(f"page_size must be positive, got {page_size}")

    frames: list[pl.DataFrame] = []
    offset = 0
    while True:
        url = f"{base_url}&RecordLimit={page_size}&recordoffset={offset}"
        response = httpx.get(url, follow_redirects=True, timeout=120)
        response.raise_for_status()  # pyright: ignore[reportUnusedCallResult]
        # An empty body means there is nothing left past the previous page.
        if not response.content:
            break
        chunk = pl.read_csv(BytesIO(response.content))
        # A header-only page parses to zero rows: same meaning as an empty body.
        if chunk.height == 0:
            break
        frames.append(chunk)
        print(f" Fetched {chunk.height} rows (offset={offset})")
        # A short page is the last page.
        if chunk.height < page_size:
            break
        offset += page_size

    if not frames:
        raise RuntimeError(f"NOMIS returned no rows for {base_url}")
    return pl.concat(frames)
|
||||
|
||||
|
||||
def download_arcgis_hub_export(
    url: str,
    output_path: Path,
    *,
    expected_geometry: str | None = None,
    poll_interval_s: float = 5,
    poll_timeout_s: float = 600,
) -> int:
    """Download an ArcGIS Hub `api/download/v1` export, handling deferred jobs.

    The endpoint returns HTTP 202 with a JSON status body while the export is
    still being prepared; a plain download would save that placeholder as the
    output file with a success exit code. Poll until the file is ready, then
    validate the result with pyogrio (feature count > 0 and, optionally, a
    geometry-type substring) before moving it into place.

    The download is written to a sibling ``<stem>.tmp<suffix>`` file and only
    renamed onto ``output_path`` after validation. If anything fails, the
    temporary file is removed so a half-written or invalid export is never
    left behind.

    Args:
        url: Export URL to request.
        output_path: Final destination of the validated file.
        expected_geometry: Optional substring that must occur in the geometry
            type reported by pyogrio.
        poll_interval_s: Seconds to wait between polls while the server
            answers 202.
        poll_timeout_s: Seconds after which a still-pending export raises.

    Returns:
        The feature count reported by pyogrio.

    Raises:
        TimeoutError: If the export is still pending after ``poll_timeout_s``.
        httpx.HTTPStatusError: If the server answers with an error status.
        ValueError: If the file has no features or the geometry type does not
            contain ``expected_geometry``.
    """
    import pyogrio

    tmp_path = output_path.with_name(f"{output_path.stem}.tmp{output_path.suffix}")
    deadline = time.monotonic() + poll_timeout_s
    try:
        with httpx.Client(follow_redirects=True, timeout=300) as client:
            while True:
                with client.stream("GET", url) as response:
                    if response.status_code != 202:
                        response.raise_for_status()  # pyright: ignore[reportUnusedCallResult]
                        with tmp_path.open("wb") as fh:
                            for chunk in response.iter_bytes():
                                fh.write(chunk)
                        break
                    # Still preparing: keep the status body for the timeout
                    # message, then let the response close before waiting so
                    # the connection is not held open during the sleep.
                    response.read()
                    status_body = response.text
                if time.monotonic() > deadline:
                    raise TimeoutError(
                        f"Export did not finish within {poll_timeout_s}s: "
                        f"{status_body}"
                    )
                time.sleep(poll_interval_s)

        info = pyogrio.read_info(tmp_path)
        features = int(info.get("features", 0))
        geometry_type = str(info.get("geometry_type") or "")
        if features <= 0:
            raise ValueError(f"Downloaded file {output_path.name} contains no features")
        if expected_geometry is not None and expected_geometry not in geometry_type:
            raise ValueError(
                f"Expected {expected_geometry!r} geometry in {output_path.name}, "
                f"got {geometry_type!r}"
            )

        tmp_path.replace(output_path)
    except BaseException:
        # Cleanup only, then re-raise: never leave a partial or rejected
        # download next to the real output.
        tmp_path.unlink(missing_ok=True)
        raise
    return features
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
from collections import Counter
|
||||
from collections.abc import Sequence
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from os import cpu_count
|
||||
from pathlib import Path
|
||||
|
|
@ -10,6 +12,7 @@ from thefuzz import fuzz
|
|||
from tqdm import tqdm
|
||||
|
||||
from pipeline.local_temp import local_tmp_dir
|
||||
from pipeline.utils.normalize import uppercase_alnum_key_expr
|
||||
|
||||
# A house-number token includes any letter suffix: 8A, 8B and plain 8 are
|
||||
# three different properties on the same street, so digit-only extraction
|
||||
|
|
@ -17,6 +20,10 @@ from pipeline.local_temp import local_tmp_dir
|
|||
# through normalize_address_key first, so tokens are uppercase and
|
||||
# space-separated and [A-Z] suffices for the suffix.
|
||||
_NUMBER_RE = re.compile(r"\d+[A-Z]?")
|
||||
# A single-letter flat designator ("FLAT B", "APARTMENT C") is a house-number-
|
||||
# grade disambiguator with no digit in it: without this, FLAT B and FLAT D in
|
||||
# the same building scored ~96 and cross-matched.
|
||||
_FLAT_LETTER_RE = re.compile(r"\b(?:FLAT|APARTMENT|APT|UNIT) ([A-Z])\b")
|
||||
_POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
|
||||
# A house number is a strong disambiguator, so a numbered, number-compatible
|
||||
# pair may match on a lower address-similarity score than a number-less one
|
||||
|
|
@ -24,16 +31,30 @@ _POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
|
|||
# be trusted. Mirrors merge.py's listings convention.
|
||||
MIN_FUZZY_SCORE = 82
|
||||
MIN_FUZZY_SCORE_WITHOUT_NUMBERS = 90
|
||||
# A score reached only through an address VARIANT (locality appended /
|
||||
# secondary address lines dropped) accepts a match the primary strings alone
|
||||
# would reject, so it must clear a near-exact bar: in the miss audit >99% of
|
||||
# genuine variant recoveries scored 100, while the rare false variant matches
|
||||
# scored in the 80s.
|
||||
MIN_VARIANT_SCORE = 90
|
||||
|
||||
# Tokens that mark a sub-unit of a building. A variant whose added/dropped
|
||||
# tokens include one of these could score a single flat's certificate as if it
|
||||
# were the whole building, so such variants are inadmissible.
|
||||
_FLAT_TOKENS = {
|
||||
"FLAT",
|
||||
"FLATS",
|
||||
"APARTMENT",
|
||||
"APT",
|
||||
"UNIT",
|
||||
"MAISONETTE",
|
||||
"STUDIO",
|
||||
"ROOM",
|
||||
}
|
||||
|
||||
|
||||
def normalize_address_key(s: pl.Expr) -> pl.Expr:
    """Polars expression: normalized address key, or null if it has no letter.

    The value is normalized with ``uppercase_alnum_key_expr`` (uppercase,
    non-alphanumeric runs replaced by a single space, whitespace collapsed and
    stripped). A result containing no ``A-Z`` letter (empty, digits only, or
    null input) becomes null.
    """
    normalized = uppercase_alnum_key_expr(s)
    return pl.when(normalized.str.contains(r"[A-Z]")).then(normalized).otherwise(None)
|
||||
|
||||
|
||||
|
|
@ -58,6 +79,8 @@ def fuzzy_join_on_postcode(
|
|||
right_postcode_col: str,
|
||||
min_score: int = MIN_FUZZY_SCORE,
|
||||
min_score_without_numbers: int = MIN_FUZZY_SCORE_WITHOUT_NUMBERS,
|
||||
left_variant_cols: Sequence[str] = (),
|
||||
right_variant_cols: Sequence[str] = (),
|
||||
) -> pl.LazyFrame:
|
||||
"""Fuzzy join two LazyFrames by matching addresses within postcode buckets.
|
||||
|
||||
|
|
@ -66,6 +89,19 @@ def fuzzy_join_on_postcode(
|
|||
columns (index, address, postcode) via projection pushdown, and the
|
||||
final join reads the remaining columns lazily.
|
||||
|
||||
``left_variant_cols`` / ``right_variant_cols`` name alternative address
|
||||
columns for the same property (e.g. the EPC's first address line without
|
||||
its locality suffix, or the price-paid address with its locality
|
||||
appended). A pair is scored as the best token_sort_ratio over all
|
||||
admissible variant combinations: source registers frequently disagree
|
||||
only on a trailing village/locality token, which alone drags short
|
||||
addresses below the match threshold. The number-compatibility gate is
|
||||
always evaluated on the primary addresses, and `_admissible_variants`
|
||||
rejects any variant whose added/dropped tokens carry digits or flat
|
||||
designators, so a variant can never bypass the gate or score a single
|
||||
flat as its whole building. Variant-only scores must clear
|
||||
``MIN_VARIANT_SCORE``.
|
||||
|
||||
Returns a LazyFrame with all left and right columns, plus a
|
||||
``_match_score`` (UInt8) audit column holding the token_sort_ratio of
|
||||
the accepted match (exact matches score 100). Unmatched rows have null
|
||||
|
|
@ -90,6 +126,10 @@ def fuzzy_join_on_postcode(
|
|||
normalize_postcode_key(pl.col(left_postcode_col)).alias(
|
||||
"_left_postcode"
|
||||
),
|
||||
*(
|
||||
normalize_address_key(pl.col(col)).alias(f"_left_variant_{i}")
|
||||
for i, col in enumerate(left_variant_cols)
|
||||
),
|
||||
)
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
|
|
@ -104,30 +144,45 @@ def fuzzy_join_on_postcode(
|
|||
normalize_postcode_key(pl.col(right_postcode_col)).alias(
|
||||
"_right_postcode"
|
||||
),
|
||||
*(
|
||||
normalize_address_key(pl.col(col)).alias(f"_right_variant_{i}")
|
||||
for i, col in enumerate(right_variant_cols)
|
||||
),
|
||||
)
|
||||
.unique(subset=["_right_address", "_right_postcode"], keep="first")
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
|
||||
left_variant_names = [f"_left_variant_{i}" for i in range(len(left_variant_cols))]
|
||||
right_variant_names = [
|
||||
f"_right_variant_{i}" for i in range(len(right_variant_cols))
|
||||
]
|
||||
|
||||
# Group right side by postcode for fast lookup
|
||||
right_by_postcode: dict[str, list[tuple[int, str]]] = {}
|
||||
for idx, postcode, address in zip(
|
||||
right_by_postcode: dict[str, list[tuple[int, str, tuple[str, ...]]]] = {}
|
||||
for idx, postcode, address, *variants in zip(
|
||||
right_match["_right_idx"],
|
||||
right_match["_right_postcode"],
|
||||
right_match["_right_address"],
|
||||
*(right_match[name] for name in right_variant_names),
|
||||
):
|
||||
if address is not None and postcode is not None:
|
||||
right_by_postcode.setdefault(postcode, []).append((idx, address))
|
||||
right_by_postcode.setdefault(postcode, []).append(
|
||||
(idx, address, _admissible_variants(address, variants))
|
||||
)
|
||||
|
||||
# Group left side by postcode
|
||||
left_by_postcode: dict[str, list[tuple[int, str]]] = {}
|
||||
for idx, postcode, address in zip(
|
||||
left_by_postcode: dict[str, list[tuple[int, str, tuple[str, ...]]]] = {}
|
||||
for idx, postcode, address, *variants in zip(
|
||||
left_match["_left_idx"],
|
||||
left_match["_left_postcode"],
|
||||
left_match["_left_address"],
|
||||
*(left_match[name] for name in left_variant_names),
|
||||
):
|
||||
if address is not None and postcode is not None:
|
||||
left_by_postcode.setdefault(postcode, []).append((idx, address))
|
||||
left_by_postcode.setdefault(postcode, []).append(
|
||||
(idx, address, _admissible_variants(address, variants))
|
||||
)
|
||||
|
||||
del left_match, right_match
|
||||
|
||||
|
|
@ -145,7 +200,12 @@ def fuzzy_join_on_postcode(
|
|||
|
||||
# Score all pairwise matches in parallel, then greedily assign from
|
||||
# highest score downward so best pairs lock in first.
|
||||
all_pairs: list[tuple[int, int, int]] = [] # (score, left_idx, right_idx)
|
||||
# Pair tuples are (score, exact, left_idx, right_idx); `exact` marks a
|
||||
# literally-equal primary pair so it wins greedy ties against a pair
|
||||
# that merely token-sorts to the same score (e.g. "APARTMENT 3 1 HIGH
|
||||
# ST" vs "APARTMENT 1 3 HIGH ST" both score 100 against each other's
|
||||
# certificates, but each has a literal twin).
|
||||
all_pairs: list[tuple[int, int, int, int]] = []
|
||||
with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
|
||||
for pairs in tqdm(
|
||||
executor.map(_score_bucket, tasks, chunksize=64),
|
||||
|
|
@ -156,8 +216,9 @@ def fuzzy_join_on_postcode(
|
|||
|
||||
del tasks, left_by_postcode, right_by_postcode
|
||||
|
||||
# Sort descending by score so best matches are assigned first
|
||||
all_pairs.sort(key=lambda t: (t[0], -t[1]), reverse=True)
|
||||
# Sort so the best matches are assigned first: score, then literal
|
||||
# equality, then stable left-index order.
|
||||
all_pairs.sort(key=lambda t: (t[0], t[1], -t[2]), reverse=True)
|
||||
|
||||
# Keep the score alongside each accepted pair: it is emitted as the
|
||||
# _match_score audit column so downstream consumers can distinguish
|
||||
|
|
@ -166,7 +227,7 @@ def fuzzy_join_on_postcode(
|
|||
matched_left: set[int] = set()
|
||||
matched_right: set[int] = set()
|
||||
|
||||
for score, left_idx, right_idx in all_pairs:
|
||||
for score, _exact, left_idx, right_idx in all_pairs:
|
||||
if left_idx in matched_left or right_idx in matched_right:
|
||||
continue
|
||||
matches.append((left_idx, right_idx, score))
|
||||
|
|
@ -208,40 +269,102 @@ def fuzzy_join_on_postcode(
|
|||
return result.lazy()
|
||||
|
||||
|
||||
def _number_tokens(address: str) -> set[str]:
    """Return the set of number-grade tokens found in ``address``.

    Combines every ``_NUMBER_RE`` match (digits with an optional letter
    suffix) with every single letter captured by ``_FLAT_LETTER_RE``.
    """
    numbered = set(_NUMBER_RE.findall(address))
    flat_letters = set(_FLAT_LETTER_RE.findall(address))
    return numbered | flat_letters
|
||||
|
||||
|
||||
def _numbers_compatible(a: str, b: str) -> bool:
    """Check that the number tokens (house/flat numbers, including any letter
    suffix, plus single-letter flat designators) of two addresses are
    IDENTICAL sets.

    Equality, not subset: subset logic let "188 GREAT NORTH WAY" absorb
    "FLAT 1 188 GREAT NORTH WAY" ({188} is a subset of {1, 188}), attaching a
    single flat's EPC facts to the whole building — tens of thousands of
    wrong-property matches. Likewise digit-only tokens made "8A" and "8B"
    both look like {8} and match each other (and plain "8"), and ungated
    letter flats let "FLAT D 39 X ST" cross-match "FLAT F 39 X ST" at ~96.
    Precision over recall: a pair whose two sources genuinely disagree on
    number tokens is safer left unmatched.

    One side numbered, the other not -> incompatible. Neither numbered ->
    compatible; such pairs are scored against the stricter no-numbers
    threshold instead.
    """
    # Set equality covers every case: two empty sets compare equal (neither
    # side numbered), and an empty set never equals a non-empty one.
    return _number_tokens(a) == _number_tokens(b)
|
||||
|
||||
|
||||
def _admissible_variants(
|
||||
primary: str, variants: Sequence[str | None]
|
||||
) -> tuple[str, ...]:
|
||||
"""Variants of ``primary`` that are safe to score against the other side.
|
||||
|
||||
A variant may only ADD or DROP whole tokens relative to the primary (one
|
||||
word multiset must contain the other) — never substitute, so a register
|
||||
row whose address lines disagree with the combined address can't smuggle
|
||||
in a different street. The number gate runs on the primary addresses
|
||||
only, so the added/dropped tokens must additionally carry no digits
|
||||
(house numbers) and no flat designator (a "Flat 1"-style secondary line
|
||||
dropped from an EPC address would otherwise let a single flat score as
|
||||
the whole building). The remaining admissible difference is exactly the
|
||||
harmless kind variants exist for: trailing locality/village/town words.
|
||||
"""
|
||||
primary_words = Counter(primary.split())
|
||||
admissible: list[str] = []
|
||||
for variant in variants:
|
||||
if not variant or variant == primary:
|
||||
continue
|
||||
variant_words = Counter(variant.split())
|
||||
if not (variant_words <= primary_words or primary_words <= variant_words):
|
||||
continue
|
||||
changed = (primary_words - variant_words) + (variant_words - primary_words)
|
||||
if any(
|
||||
any(ch.isdigit() for ch in token) or token in _FLAT_TOKENS
|
||||
for token in changed
|
||||
):
|
||||
continue
|
||||
admissible.append(variant)
|
||||
return tuple(dict.fromkeys(admissible))
|
||||
|
||||
|
||||
def _score_bucket(
|
||||
args: tuple[list[tuple[int, str]], list[tuple[int, str]], int, int],
|
||||
) -> list[tuple[int, int, int]]:
|
||||
args: tuple[
|
||||
list[tuple[int, str, tuple[str, ...]]],
|
||||
list[tuple[int, str, tuple[str, ...]]],
|
||||
int,
|
||||
int,
|
||||
],
|
||||
) -> list[tuple[int, int, int, int]]:
|
||||
"""Score all address pairs within a single postcode bucket."""
|
||||
left_entries, right_entries, min_score, min_score_without_numbers = args
|
||||
pairs = []
|
||||
for left_row, left_address in left_entries:
|
||||
for right_row, right_address in right_entries:
|
||||
for left_row, left_address, left_variants in left_entries:
|
||||
for right_row, right_address, right_variants in right_entries:
|
||||
if not _numbers_compatible(left_address, right_address):
|
||||
continue
|
||||
score = fuzz.token_sort_ratio(left_address, right_address)
|
||||
# Variant pairs recover same-property matches where one register
|
||||
# carries a locality suffix the other lacks; a variant-only score
|
||||
# must clear the near-exact MIN_VARIANT_SCORE bar.
|
||||
if score < 100 and (left_variants or right_variants):
|
||||
for left_variant in (left_address, *left_variants):
|
||||
for right_variant in (right_address, *right_variants):
|
||||
if (
|
||||
left_variant is left_address
|
||||
and right_variant is right_address
|
||||
):
|
||||
continue
|
||||
variant_score = fuzz.token_sort_ratio(
|
||||
left_variant, right_variant
|
||||
)
|
||||
if variant_score >= MIN_VARIANT_SCORE and variant_score > score:
|
||||
score = variant_score
|
||||
# Number-less pairs (named houses, building-name flats) lack the
|
||||
# house-number disambiguator, so require a near-exact match.
|
||||
threshold = (
|
||||
|
|
@ -250,5 +373,7 @@ def _score_bucket(
|
|||
else min_score_without_numbers
|
||||
)
|
||||
if score >= threshold:
|
||||
pairs.append((score, left_row, right_row))
|
||||
pairs.append(
|
||||
(score, int(left_address == right_address), left_row, right_row)
|
||||
)
|
||||
return pairs
|
||||
|
|
|
|||
70
pipeline/utils/normalize.py
Normal file
70
pipeline/utils/normalize.py
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
"""Shared low-level text-normalization primitives.
|
||||
|
||||
Address matching (``pipeline.utils.fuzzy_join``, ``pipeline.transform.merge``),
|
||||
POI retailer cleanup (``pipeline.transform.transform_poi``) and school-name
|
||||
matching (``pipeline.check_school_cutoffs``) each layer domain-specific rules
|
||||
on top of these. The primitives are deliberately tiny and single-purpose so
|
||||
that composing them preserves every caller's existing output byte-for-byte.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
import polars as pl
|
||||
|
||||
# One character outside [a-z0-9 ]. Callers lowercase first; each offending
|
||||
# character becomes a single space (runs are NOT merged here — callers apply
|
||||
# word-level rules and then collapse_whitespace).
|
||||
_NON_ALNUM_LOWER_RE = re.compile(r"[^a-z0-9 ]")
|
||||
|
||||
# Any digit marks a token as number-bearing (house/flat numbers, including
|
||||
# letter-suffixed forms such as 8A, which still contain a digit).
|
||||
_DIGIT_RE = re.compile(r"\d")
|
||||
|
||||
|
||||
def collapse_whitespace(s: str) -> str:
    """Return ``s`` with each whitespace run reduced to one space, ends stripped."""
    words = s.split()
    return " ".join(words)
|
||||
|
||||
|
||||
def strip_or_empty(s: str | None) -> str:
|
||||
"""Strip leading/trailing whitespace, mapping None to ``""``.
|
||||
|
||||
Interior whitespace is preserved (unlike :func:`collapse_whitespace`) so
|
||||
the result can be looked up verbatim against curated dictionary keys.
|
||||
"""
|
||||
return "" if s is None else s.strip()
|
||||
|
||||
|
||||
def replace_non_alnum_lower(s: str) -> str:
    """Turn every character outside [a-z0-9 ] into one space.

    The input is expected to be lowercased already; uppercase letters are
    outside the allowed set and are replaced as well. Each offending character
    is replaced individually, so runs are not merged — callers collapse
    whitespace afterwards.
    """
    spaced = re.sub(_NON_ALNUM_LOWER_RE, " ", s)
    return spaced
|
||||
|
||||
|
||||
def drop_digit_tokens(s: str) -> str:
    """Remove every whitespace-separated token that contains a digit.

    ``"10A HIGH STREET" -> "HIGH STREET"``. Remaining tokens are joined with
    single spaces, which also collapses whitespace as a side effect.
    """
    kept = [word for word in s.split() if _DIGIT_RE.search(word) is None]
    return " ".join(kept)
|
||||
|
||||
|
||||
def uppercase_alnum_key_expr(s: pl.Expr) -> pl.Expr:
    """Polars expression producing an uppercase alphanumeric key.

    Steps, in order: cast to String, uppercase, replace every run of
    characters outside [0-9A-Z] with one space, collapse whitespace runs to a
    single space, strip the ends.

    Non-ASCII letters are outside [0-9A-Z] even after uppercasing, so they
    turn into spaces (``"Café 1" -> "CAF 1"``).
    """
    upper = s.cast(pl.String).str.to_uppercase()
    spaced = upper.str.replace_all(r"[^0-9A-Z]+", " ")
    collapsed = spaced.str.replace_all(r"\s+", " ")
    return collapsed.str.strip_chars()
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.utils import fuzzy_join_on_postcode, normalize_postcode_key
|
||||
from pipeline.utils.fuzzy_join import _numbers_compatible
|
||||
from pipeline.utils.fuzzy_join import _admissible_variants, _numbers_compatible
|
||||
|
||||
|
||||
def test_fuzzy_join_on_postcode_matches_addresses_within_postcode():
|
||||
|
|
@ -165,7 +165,7 @@ def test_fuzzy_join_rejects_mid_score_number_less_match():
|
|||
|
||||
|
||||
def test_fuzzy_join_matches_numbered_pair_at_baseline_threshold():
|
||||
# "10 ACACIA AVENUE" vs "FLAT A 10 ACACIA AVENUE" scores exactly 82 and the
|
||||
# "10 ACACIA AVENUE" vs "10 ACACIA AVENUE OAKHAM" scores exactly 82 and the
|
||||
# house number is compatible, so the numbered baseline (>= 82) still matches.
|
||||
left = pl.LazyFrame(
|
||||
{
|
||||
|
|
@ -175,7 +175,7 @@ def test_fuzzy_join_matches_numbered_pair_at_baseline_threshold():
|
|||
)
|
||||
right = pl.LazyFrame(
|
||||
{
|
||||
"right_address": ["Flat A, 10 Acacia Avenue"],
|
||||
"right_address": ["10 Acacia Avenue, Oakham"],
|
||||
"right_postcode": ["AB1 2CD"],
|
||||
}
|
||||
)
|
||||
|
|
@ -189,7 +189,7 @@ def test_fuzzy_join_matches_numbered_pair_at_baseline_threshold():
|
|||
right_postcode_col="right_postcode",
|
||||
).collect()
|
||||
|
||||
assert result["right_address"].to_list() == ["Flat A, 10 Acacia Avenue"]
|
||||
assert result["right_address"].to_list() == ["10 Acacia Avenue, Oakham"]
|
||||
|
||||
|
||||
def test_fuzzy_join_matches_high_score_number_less_pair():
|
||||
|
|
@ -244,6 +244,151 @@ def test_numbers_compatible_number_less_and_one_sided_pairs():
|
|||
assert not _numbers_compatible("ROSE COTTAGE", "8 HIGH STREET")
|
||||
|
||||
|
||||
def test_numbers_compatible_gates_single_letter_flats():
    # "FLAT D" and "FLAT F" are distinct flats even when the street number is
    # the same; ungated they token_sort to ~96 and cross-matched. The letter
    # acts as a pseudo-number token, so it also stops a flat from matching the
    # bare building address.
    flat_d = "FLAT D 39 GERTRUDE STREET"
    assert not _numbers_compatible(flat_d, "FLAT F 39 GERTRUDE STREET")
    assert _numbers_compatible(flat_d, "39 GERTRUDE STREET FLAT D")
    assert not _numbers_compatible("FLAT B ROSE COURT", "ROSE COURT")
    # A letter glued to a number ("A3") is a unit name, not a flat letter.
    unit_name = "FLAT A3 CHESHAM HEIGHTS"
    assert _numbers_compatible(unit_name, "FLAT A3 CHESHAM HEIGHTS")
|
||||
|
||||
|
||||
def test_admissible_variants_allows_locality_suffix_only():
    # Each case is (primary, variants, expected admissible variants).
    cases = [
        # Locality words may differ between a variant and its primary; digits
        # and flat designators may not (the gate ran on the primary only).
        ("12 OAK ROAD", ["12 OAK ROAD HALE", "12 OAK ROAD"], ("12 OAK ROAD HALE",)),
        # Dropping "FLAT 1" (digit) or "FLAT B" (flat designator) is
        # inadmissible: the variant would score a single flat as the whole
        # building.
        ("FLAT 1 188 GREAT NORTH WAY", ["188 GREAT NORTH WAY"], ()),
        ("FLAT B ROSE COURT", ["ROSE COURT"], ()),
        ("12 OAK ROAD", [None, "12 OAK ROAD"], ()),
        # Substitution is never admissible: a register row whose address1
        # disagrees with the combined address must not smuggle in a different
        # street for scoring.
        ("12 OAK ROAD", ["12 ELM ROAD"], ()),
        ("1 TOTALLY DIFFERENT ROAD", ["1 EXAMPLE STREET"], ()),
    ]
    for primary, variants, expected in cases:
        assert _admissible_variants(primary, variants) == expected
|
||||
|
||||
|
||||
def test_fuzzy_join_variant_recovers_locality_suffix_mismatch():
    # The EPC register stores "12 Oak Road, Hale" (address1 + locality line)
    # while price-paid has the bare "12 Oak Road": token_sort scores 81 < 82
    # and the match was lost. The EPC's address1-only variant scores 100.
    left_frame = pl.LazyFrame(
        {
            "left_address": ["12 Oak Road"],
            "left_postcode": ["AB1 2CD"],
            "left_with_locality": ["12 Oak Road Hale"],
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": ["12 Oak Road, Hale"],
            "right_postcode": ["AB1 2CD"],
            "right_address1": ["12 Oak Road"],
        }
    )
    join_kwargs = {
        "left": left_frame,
        "right": right_frame,
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    # Without variant columns the pair stays unmatched.
    without_variants = fuzzy_join_on_postcode(**join_kwargs).collect()
    assert without_variants["_match_score"].to_list() == [None]

    # With variant columns the same pair is recovered at a perfect score.
    with_variants = fuzzy_join_on_postcode(
        **join_kwargs,
        left_variant_cols=["left_with_locality"],
        right_variant_cols=["right_address1"],
    ).collect()
    assert with_variants["_match_score"].to_list() == [100]
|
||||
|
||||
|
||||
def test_fuzzy_join_variant_cannot_unlock_a_flat_for_its_building():
    # The EPC's secondary line carries the flat designator; dropping it would
    # score the flat's certificate 100 against the whole-building price-paid
    # address. The variant must be ruled inadmissible and the pair unmatched.
    building = pl.LazyFrame(
        {
            "left_address": ["188 Great North Way"],
            "left_postcode": ["AB1 2CD"],
        }
    )
    flat = pl.LazyFrame(
        {
            "right_address": ["Flat 1, 188 Great North Way"],
            "right_postcode": ["AB1 2CD"],
            "right_address1": ["188 Great North Way"],
        }
    )

    joined = fuzzy_join_on_postcode(
        left=building,
        right=flat,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
        right_variant_cols=["right_address1"],
    ).collect()
    scores = joined["_match_score"].to_list()

    assert scores == [None]
|
||||
|
||||
|
||||
def test_fuzzy_join_variant_score_must_be_near_exact():
    # A score reached only through a variant must clear MIN_VARIANT_SCORE
    # (90): "2 MYRTLE COTTAGES" vs "2 LEITH VIEW COTTAGES" type pairs scored
    # in the 80s via variants and were false matches.
    left_frame = pl.LazyFrame(
        {
            "left_address": ["2 Myrtle Cottages"],
            "left_postcode": ["AB1 2CD"],
            "left_with_locality": ["2 Myrtle Cottages Dorking"],
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": ["2 Leith View Cottages, North Holmwood"],
            "right_postcode": ["AB1 2CD"],
            "right_address1": ["2 Leith View Cottages"],
        }
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
        left_variant_cols=["left_with_locality"],
        right_variant_cols=["right_address1"],
    ).collect()
    scores = joined["_match_score"].to_list()

    assert scores == [None]
|
||||
|
||||
|
||||
def test_fuzzy_join_rejects_wrong_letter_suffix_match():
|
||||
# End-to-end guard for the 8A/8B class of wrong-property matches: the only
|
||||
# candidate in the postcode bucket differs solely in the number suffix, so
|
||||
|
|
@ -294,7 +439,7 @@ def test_fuzzy_join_emits_match_score_column():
|
|||
"10 HIGH STREET",
|
||||
# Scores exactly 82 against "10 Acacia Avenue" (see
|
||||
# test_fuzzy_join_matches_numbered_pair_at_baseline_threshold).
|
||||
"Flat A, 10 Acacia Avenue",
|
||||
"10 Acacia Avenue, Oakham",
|
||||
],
|
||||
"right_postcode": ["AB1 2CD", "EF3 4GH"],
|
||||
}
|
||||
|
|
|
|||
158
pipeline/utils/test_normalize.py
Normal file
158
pipeline/utils/test_normalize.py
Normal file
|
|
@ -0,0 +1,158 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.check_school_cutoffs import normalize_la, normalize_name
|
||||
from pipeline.transform.merge import _street_only_address
|
||||
from pipeline.transform.transform_poi import normalize_grocery_retailer
|
||||
from pipeline.utils.fuzzy_join import normalize_address_key
|
||||
from pipeline.utils.normalize import (
|
||||
collapse_whitespace,
|
||||
drop_digit_tokens,
|
||||
replace_non_alnum_lower,
|
||||
strip_or_empty,
|
||||
uppercase_alnum_key_expr,
|
||||
)
|
||||
|
||||
# --- Primitives -------------------------------------------------------------
|
||||
|
||||
|
||||
def test_collapse_whitespace():
    # Each case is (input, expected output).
    cases = [
        ("", ""),
        (" ", ""),
        ("a b", "a b"),
        (" a \t b \n c ", "a b c"),
        # str.split() also splits on unicode whitespace (non-breaking space).
        ("a\u00a0b", "a b"),
    ]
    for raw, expected in cases:
        assert collapse_whitespace(raw) == expected
|
||||
|
||||
|
||||
def test_strip_or_empty():
    """strip_or_empty trims the ends, maps None to "", keeps interior spaces."""
    assert strip_or_empty(None) == ""
    assert strip_or_empty("") == ""
    assert strip_or_empty(" x ") == "x"
    # Interior whitespace is preserved, unlike collapse_whitespace. The input
    # needs an interior run of more than one space, otherwise the assertion
    # cannot tell stripping from collapsing.
    assert strip_or_empty("  a  b  ") == "a  b"
|
||||
|
||||
|
||||
def test_replace_non_alnum_lower():
    """replace_non_alnum_lower swaps each disallowed character for one space."""
    assert replace_non_alnum_lower("") == ""
    assert replace_non_alnum_lower("abc 123") == "abc 123"
    # Per-character replacement: runs are not merged, so two hyphens become
    # two spaces.
    assert replace_non_alnum_lower("a--b") == "a  b"
    # Existing spaces are kept as-is; the comma adds a third space between
    # the two that were already there.
    assert replace_non_alnum_lower("a , b") == "a   b"
    # Uppercase and accented letters fall outside [a-z0-9 ].
    assert replace_non_alnum_lower("École") == " cole"
|
||||
|
||||
|
||||
def test_drop_digit_tokens():
    """Check drop_digit_tokens against a table of (input, expected) pairs."""
    cases = [
        ("", ""),
        ("10A HIGH STREET", "HIGH STREET"),
        ("8B", ""),
        ("12 34", ""),
        ("KINGSWOOD", "KINGSWOOD"),
        # Whitespace collapses as a side effect of the token rejoin.
        (" A B ", "A B"),
    ]
    for raw, expected in cases:
        assert drop_digit_tokens(raw) == expected
|
||||
|
||||
|
||||
def test_uppercase_alnum_key_expr():
    """Run uppercase_alnum_key_expr over a String column and compare row by row.

    Each row pairs a raw input with the value the expression must yield for it.
    """
    rows = [
        ("Flat 2, 10 High Street", "FLAT 2 10 HIGH STREET"),
        (" 12 High-Street ", "12 HIGH STREET"),
        ("", ""),
        (None, None),
        ("Café 1", "CAF 1"),
        ("st mary's-court", "ST MARY S COURT"),
    ]
    raw_values = [raw for raw, _ in rows]
    expected = [want for _, want in rows]

    frame = pl.DataFrame({"a": raw_values}, schema={"a": pl.String})
    keyed = frame.select(uppercase_alnum_key_expr(pl.col("a")))
    actual = keyed.to_series().to_list()

    assert actual == expected
|
||||
|
||||
|
||||
# --- Characterization of the call sites built on the primitives ------------
|
||||
# Expected values were captured from the pre-refactor implementations and
|
||||
# must never change: each wrapper's output is byte-for-byte pinned.
|
||||
|
||||
|
||||
def test_normalize_address_key_characterization():
    """Pin normalize_address_key's output for a fixed set of String inputs.

    Each row pairs a raw address with the exact key (or null) expected for it.
    """
    rows = [
        ("Flat 2, 10 High Street", "FLAT 2 10 HIGH STREET"),
        (" 12 High-Street ", "12 HIGH STREET"),
        ("123", None),  # digits only: no letter -> null
        ("", None),  # empty -> null
        (None, None),  # null in, null out
        ("Café 1", "CAF 1"),
        ("st mary's-court", "ST MARY S COURT"),
        ("ALREADY NORMAL", "ALREADY NORMAL"),
    ]
    raw_values = [raw for raw, _ in rows]
    expected = [want for _, want in rows]

    frame = pl.DataFrame({"a": raw_values}, schema={"a": pl.String})
    keyed = frame.select(normalize_address_key(pl.col("a")))
    actual = keyed.to_series().to_list()

    assert actual == expected
|
||||
|
||||
|
||||
def test_street_only_address_characterization():
    """Pin _street_only_address's output for a fixed set of addresses."""
    expected_by_address = {
        "10A HIGH STREET": "HIGH STREET",
        "FLAT 1 188 GREAT NORTH WAY": "FLAT GREAT NORTH WAY",
        "": "",
        "OLDSTEAD ROAD": "OLDSTEAD ROAD",
        " A B ": "A B",
        "12 34": "",
        "8B": "",
    }
    for address, expected in expected_by_address.items():
        assert _street_only_address(address) == expected
|
||||
|
||||
|
||||
def test_normalize_grocery_retailer_characterization():
    """Pin normalize_grocery_retailer's output for a fixed set of names."""
    cases = [
        (None, ""),
        ("", ""),
        (" Tesco Express ", "Tesco Express"),
        ("Sainsburys", "Sainsbury's"),
        ("Lincolnshire Co-operative", "Co-op"),
        # Only edge whitespace is stripped; interior whitespace must survive so
        # near-miss names fall through the exact dictionary lookups unchanged.
        ("Bob's Shop", "Bob's Shop"),
        (" Marks and Spencer ", "M&S"),
    ]
    for retailer, expected in cases:
        assert normalize_grocery_retailer(retailer) == expected
|
||||
|
||||
|
||||
def test_normalize_name_characterization():
    """Pin normalize_name's output for a fixed set of school names.

    Each case is (positional args, expected); a second positional ``True``
    is passed through exactly as the call site would pass it.
    """
    cases = [
        (("St. Mary's C of E Primary School",), "st marys primary school"),
        (("St. Mary's C of E Primary School", True), "st marys"),
        (("",), ""),
        (("Ham & High School",), "ham high school"),
        (("Ham & High School", True), "ham"),
        # Accented characters become spaces, splitting the word.
        (("École Élémentaire",), "cole l mentaire"),
        ((" THE KING'S ACADEMY ",), "kings academy"),
        (("Holy Trinity RC Voluntary Aided School",), "holy trinity school"),
        (("st. john's",), "st johns"),
    ]
    for args, expected in cases:
        assert normalize_name(*args) == expected
|
||||
|
||||
|
||||
def test_normalize_la_characterization():
    """Pin normalize_la's output for a fixed set of local-authority names."""
    cases = [
        ("City of Westminster", "westminster"),
        ("Brighton & Hove", "brighton and hove"),
        (" Kingston upon Thames ", "kingston upon thames"),
        ("", ""),
    ]
    for authority, expected in cases:
        assert normalize_la(authority) == expected
|
||||
Loading…
Add table
Add a link
Reference in a new issue