Fable findings in data
This commit is contained in:
parent
b98bc6d611
commit
6a33b03fdf
20 changed files with 1502 additions and 274 deletions
|
|
@ -11,7 +11,12 @@ from tqdm import tqdm
|
|||
|
||||
from pipeline.local_temp import local_tmp_dir
|
||||
|
||||
_NUMBER_RE = re.compile(r"\d+")
|
||||
# A house-number token includes any letter suffix: 8A, 8B and plain 8 are
|
||||
# three different properties on the same street, so digit-only extraction
|
||||
# (which collapsed all three to "8") is not enough. Addresses are passed
|
||||
# through normalize_address_key first, so tokens are uppercase and
|
||||
# space-separated and [A-Z] suffices for the suffix.
|
||||
_NUMBER_RE = re.compile(r"\d+[A-Z]?")
|
||||
_POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
|
||||
# A house number is a strong disambiguator, so a numbered, number-compatible
|
||||
# pair may match on a lower address-similarity score than a number-less one
|
||||
|
|
@ -61,8 +66,10 @@ def fuzzy_join_on_postcode(
|
|||
columns (index, address, postcode) via projection pushdown, and the
|
||||
final join reads the remaining columns lazily.
|
||||
|
||||
Returns a LazyFrame with all left and right columns. Unmatched rows
|
||||
have null right columns.
|
||||
Returns a LazyFrame with all left and right columns, plus a
|
||||
``_match_score`` (UInt8) audit column holding the token_sort_ratio of
|
||||
the accepted match (exact matches score 100). Unmatched rows have null
|
||||
right columns and a null score.
|
||||
"""
|
||||
|
||||
tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_", dir=local_tmp_dir())
|
||||
|
|
@ -152,14 +159,17 @@ def fuzzy_join_on_postcode(
|
|||
# Sort descending by score (ties: lowest left index first) so best matches are assigned first
|
||||
all_pairs.sort(key=lambda t: (t[0], -t[1]), reverse=True)
|
||||
|
||||
matches: list[tuple[int, int]] = []
|
||||
# Keep the score alongside each accepted pair: it is emitted as the
|
||||
# _match_score audit column so downstream consumers can distinguish
|
||||
# exact (100) from looser fuzzy matches.
|
||||
matches: list[tuple[int, int, int]] = [] # (left_idx, right_idx, score)
|
||||
matched_left: set[int] = set()
|
||||
matched_right: set[int] = set()
|
||||
|
||||
for _score, left_idx, right_idx in all_pairs:
|
||||
for score, left_idx, right_idx in all_pairs:
|
||||
if left_idx in matched_left or right_idx in matched_right:
|
||||
continue
|
||||
matches.append((left_idx, right_idx))
|
||||
matches.append((left_idx, right_idx, score))
|
||||
matched_left.add(left_idx)
|
||||
matched_right.add(right_idx)
|
||||
|
||||
|
|
@ -171,6 +181,7 @@ def fuzzy_join_on_postcode(
|
|||
{
|
||||
"_left_idx": pl.Series([m[0] for m in matches], dtype=pl.UInt32),
|
||||
"_right_idx": pl.Series([m[1] for m in matches], dtype=pl.UInt32),
|
||||
"_match_score": pl.Series([m[2] for m in matches], dtype=pl.UInt8),
|
||||
}
|
||||
)
|
||||
else:
|
||||
|
|
@ -178,6 +189,7 @@ def fuzzy_join_on_postcode(
|
|||
{
|
||||
"_left_idx": pl.Series([], dtype=pl.UInt32),
|
||||
"_right_idx": pl.Series([], dtype=pl.UInt32),
|
||||
"_match_score": pl.Series([], dtype=pl.UInt8),
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -197,18 +209,26 @@ def fuzzy_join_on_postcode(
|
|||
|
||||
|
||||
def _numbers_compatible(a: str, b: str) -> bool:
|
||||
"""Check that numeric tokens (flat/house numbers) in the shorter set are a subset of the longer.
|
||||
"""Check that the number tokens (house/flat numbers, including any letter
|
||||
suffix) of two addresses are IDENTICAL sets.
|
||||
|
||||
Returns False if one address has numbers and the other doesn't.
|
||||
Equality, not subset: subset logic let "188 GREAT NORTH WAY" absorb
|
||||
"FLAT 1 188 GREAT NORTH WAY" ({188} is a subset of {1, 188}), attaching a
|
||||
single flat's EPC facts to the whole building — tens of thousands of
|
||||
wrong-property matches. Likewise digit-only tokens made "8A" and "8B"
|
||||
both look like {8} and match each other (and plain "8"). Precision over
|
||||
recall: a pair whose two sources genuinely disagree on number tokens is
|
||||
safer left unmatched.
|
||||
|
||||
One side numbered, the other not -> incompatible. Neither numbered ->
|
||||
compatible; such pairs are scored against the stricter no-numbers
|
||||
threshold instead.
|
||||
"""
|
||||
nums_a = set(_NUMBER_RE.findall(a))
|
||||
nums_b = set(_NUMBER_RE.findall(b))
|
||||
smaller, larger = (
|
||||
(nums_a, nums_b) if len(nums_a) <= len(nums_b) else (nums_b, nums_a)
|
||||
)
|
||||
if not smaller and larger:
|
||||
return False
|
||||
return smaller.issubset(larger)
|
||||
if not nums_a and not nums_b:
|
||||
return True
|
||||
return nums_a == nums_b
|
||||
|
||||
|
||||
def _score_bucket(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue