idk
This commit is contained in:
parent
a04ac2d857
commit
d43da9708c
47 changed files with 4120 additions and 573 deletions
|
|
@ -13,7 +13,12 @@ from pipeline.local_temp import local_tmp_dir
|
|||
|
||||
_NUMBER_RE = re.compile(r"\d+")
|
||||
_POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
|
||||
MIN_FUZZY_SCORE = 60
|
||||
# A house number is a strong disambiguator, so a numbered, number-compatible
|
||||
# pair may match on a lower address-similarity score than a number-less one
|
||||
# (named houses / flats by building name), which must match almost exactly to
|
||||
# be trusted. Mirrors merge.py's listings convention.
|
||||
MIN_FUZZY_SCORE = 82
|
||||
MIN_FUZZY_SCORE_WITHOUT_NUMBERS = 90
|
||||
|
||||
|
||||
def normalize_address_key(s: pl.Expr) -> pl.Expr:
|
||||
|
|
@ -47,6 +52,7 @@ def fuzzy_join_on_postcode(
|
|||
left_postcode_col: str,
|
||||
right_postcode_col: str,
|
||||
min_score: int = MIN_FUZZY_SCORE,
|
||||
min_score_without_numbers: int = MIN_FUZZY_SCORE_WITHOUT_NUMBERS,
|
||||
) -> pl.LazyFrame:
|
||||
"""Fuzzy join two LazyFrames by matching addresses within postcode buckets.
|
||||
|
||||
|
|
@ -120,7 +126,12 @@ def fuzzy_join_on_postcode(
|
|||
|
||||
# Build tasks for each postcode bucket
|
||||
tasks = [
|
||||
(left_entries, right_by_postcode[postcode], min_score)
|
||||
(
|
||||
left_entries,
|
||||
right_by_postcode[postcode],
|
||||
min_score,
|
||||
min_score_without_numbers,
|
||||
)
|
||||
for postcode, left_entries in left_by_postcode.items()
|
||||
if postcode in right_by_postcode
|
||||
]
|
||||
|
|
@ -201,16 +212,23 @@ def _numbers_compatible(a: str, b: str) -> bool:
|
|||
|
||||
|
||||
def _score_bucket(
|
||||
args: tuple[list[tuple[int, str]], list[tuple[int, str]], int],
|
||||
args: tuple[list[tuple[int, str]], list[tuple[int, str]], int, int],
|
||||
) -> list[tuple[int, int, int]]:
|
||||
"""Score all address pairs within a single postcode bucket."""
|
||||
left_entries, right_entries, min_score = args
|
||||
left_entries, right_entries, min_score, min_score_without_numbers = args
|
||||
pairs = []
|
||||
for left_row, left_address in left_entries:
|
||||
for right_row, right_address in right_entries:
|
||||
if not _numbers_compatible(left_address, right_address):
|
||||
continue
|
||||
score = fuzz.token_sort_ratio(left_address, right_address)
|
||||
if score >= min_score:
|
||||
# Number-less pairs (named houses, building-name flats) lack the
|
||||
# house-number disambiguator, so require a near-exact match.
|
||||
threshold = (
|
||||
min_score
|
||||
if _NUMBER_RE.search(left_address) or _NUMBER_RE.search(right_address)
|
||||
else min_score_without_numbers
|
||||
)
|
||||
if score >= threshold:
|
||||
pairs.append((score, left_row, right_row))
|
||||
return pairs
|
||||
|
|
|
|||
|
|
@ -6,6 +6,16 @@ import numpy as np
|
|||
import polars as pl
|
||||
from scipy.spatial import cKDTree
|
||||
|
||||
# Maximum distance (in OS National Grid metres) a terminated postcode may be from its
|
||||
# nearest active successor to be remapped. Beyond this we treat the postcode as having no
|
||||
# legitimate successor (e.g. demolished/redeveloped land) rather than re-homing it onto a
|
||||
# geometrically-nearest-but-unrelated postcode on a different street/estate/LSOA, which
|
||||
# would pollute the successor's crime/deprivation/school/noise/rent and price stats.
|
||||
# 1km is conservative: it keeps legitimate adjacent remaps while dropping gross
|
||||
# misattributions; dropped postcodes keep their terminated code and fall out at the
|
||||
# active-postcode filter downstream (the honest outcome confirmed by the merge audit).
|
||||
MAX_REMAP_DISTANCE_M = 1000.0
|
||||
|
||||
|
||||
def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
|
||||
"""Build a mapping from terminated England postcodes to their nearest active postcode.
|
||||
|
|
@ -50,18 +60,30 @@ def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
|
|||
)
|
||||
|
||||
tree = cKDTree(active_coords)
|
||||
distances, indices = tree.query(terminated_coords)
|
||||
distances, indices = tree.query(
|
||||
terminated_coords, distance_upper_bound=MAX_REMAP_DISTANCE_M
|
||||
)
|
||||
|
||||
# cKDTree returns distance=inf and index==len(active) for points with no neighbour
|
||||
# within the bound. Drop those terminated postcodes rather than gather an out-of-range
|
||||
# index; they keep their terminated code and fall out at the active-postcode filter.
|
||||
within_bound = np.isfinite(distances)
|
||||
dropped = int((~within_bound).sum())
|
||||
|
||||
active_postcodes = active["pcds"]
|
||||
mapping = pl.DataFrame(
|
||||
{
|
||||
"old_postcode": terminated["pcds"],
|
||||
"new_postcode": active_postcodes.gather(indices),
|
||||
"old_postcode": terminated["pcds"].filter(pl.Series(within_bound)),
|
||||
"new_postcode": active_postcodes.gather(indices[within_bound]),
|
||||
}
|
||||
)
|
||||
|
||||
kept_distances = distances[within_bound]
|
||||
print(
|
||||
f"Postcode mapping: max distance = {distances.max():.0f}m, median = {np.median(distances):.0f}m"
|
||||
f"Postcode mapping: {dropped} terminated postcodes dropped (> {MAX_REMAP_DISTANCE_M:.0f}m), "
|
||||
f"max distance = {kept_distances.max():.0f}m, median = {np.median(kept_distances):.0f}m"
|
||||
if kept_distances.size
|
||||
else f"Postcode mapping: {dropped} terminated postcodes dropped (> {MAX_REMAP_DISTANCE_M:.0f}m), none remapped"
|
||||
)
|
||||
|
||||
return mapping
|
||||
|
|
|
|||
|
|
@ -134,6 +134,91 @@ def test_fuzzy_join_on_postcode_rejects_blank_and_invalid_match_keys():
|
|||
]
|
||||
|
||||
|
||||
def test_fuzzy_join_rejects_mid_score_number_less_match():
    # Named-house pair with no digits on either side. "THE COACH HOUSE" vs
    # "THE OLD COACH HOUSE" scores 88 via token_sort_ratio: above the old
    # MIN_FUZZY_SCORE of 60 (so it used to falsely match) but below the
    # number-less threshold of 90, so the right side must stay unmatched.
    shared_postcode = "AB1 2CD"
    left_frame = pl.LazyFrame(
        {"left_address": ["The Coach House"], "left_postcode": [shared_postcode]}
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": ["The Old Coach House"],
            "right_postcode": [shared_postcode],
        }
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    ).collect()

    matched_addresses = joined["right_address"].to_list()
    assert matched_addresses == [None]
|
||||
|
||||
|
||||
def test_fuzzy_join_matches_numbered_pair_at_baseline_threshold():
    # Both addresses carry the house number 10. "10 ACACIA AVENUE" vs
    # "FLAT A 10 ACACIA AVENUE" scores exactly 82 and the house number is
    # compatible, so the numbered baseline (>= 82) still matches.
    shared_postcode = "AB1 2CD"
    expected_match = "Flat A, 10 Acacia Avenue"
    left_frame = pl.LazyFrame(
        {"left_address": ["10 Acacia Avenue"], "left_postcode": [shared_postcode]}
    )
    right_frame = pl.LazyFrame(
        {"right_address": [expected_match], "right_postcode": [shared_postcode]}
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    ).collect()

    matched_addresses = joined["right_address"].to_list()
    assert matched_addresses == [expected_match]
|
||||
|
||||
|
||||
def test_fuzzy_join_matches_high_score_number_less_pair():
    # Neither address contains a digit, so the stricter number-less threshold
    # of 90 applies. The pair differs only in letter case (an exact token
    # match, score 100), so it must still match.
    shared_postcode = "AB1 2CD"
    expected_match = "THE OLD RECTORY"
    left_frame = pl.LazyFrame(
        {"left_address": ["The Old Rectory"], "left_postcode": [shared_postcode]}
    )
    right_frame = pl.LazyFrame(
        {"right_address": [expected_match], "right_postcode": [shared_postcode]}
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    ).collect()

    matched_addresses = joined["right_address"].to_list()
    assert matched_addresses == [expected_match]
|
||||
|
||||
|
||||
def test_normalize_postcode_key_requires_full_postcode():
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue