70 lines
2.5 KiB
Python
70 lines
2.5 KiB
Python
"""Shared low-level text-normalization primitives.
|
|
|
|
Address matching (``pipeline.utils.fuzzy_join``, ``pipeline.transform.merge``),
|
|
POI retailer cleanup (``pipeline.transform.transform_poi``) and school-name
|
|
matching (``pipeline.check_school_cutoffs``) each layer domain-specific rules
|
|
on top of these. The primitives are deliberately tiny and single-purpose so
|
|
that composing them preserves every caller's existing output byte-for-byte.
|
|
"""
|
|
|
|
import re
|
|
|
|
import polars as pl
|
|
|
|
# Matches ONE character outside [a-z0-9 ]. Callers lowercase first; each
# offending character becomes a single space. Runs are deliberately NOT
# merged here: callers apply word-level rules and only then call
# collapse_whitespace.
_NON_ALNUM_LOWER_RE = re.compile(r"[^a-z0-9 ]")

# Any digit marks a token as number-bearing (house/flat numbers, including
# letter-suffixed forms such as 8A, which still contain a digit). On ``str``
# patterns ``\d`` matches every Unicode decimal digit, not just ASCII 0-9.
_DIGIT_RE = re.compile(r"\d")
|
|
|
|
|
|
def collapse_whitespace(s: str) -> str:
    """Collapse every whitespace run to a single space and strip the ends.

    ``str.split()`` with no separator splits on runs of any whitespace and
    yields no empty leading/trailing pieces, so rejoining the pieces with a
    single space produces the collapsed form. Empty or whitespace-only
    input returns ``""``.
    """
    return " ".join(s.split())
|
|
|
|
|
|
def strip_or_empty(s: str | None) -> str:
|
|
"""Strip leading/trailing whitespace, mapping None to ``""``.
|
|
|
|
Interior whitespace is preserved (unlike :func:`collapse_whitespace`) so
|
|
the result can be looked up verbatim against curated dictionary keys.
|
|
"""
|
|
return "" if s is None else s.strip()
|
|
|
|
|
|
def replace_non_alnum_lower(s: str) -> str:
    """Replace each character outside [a-z0-9 ] with a single space.

    Expects already-lowercased input (uppercase letters are replaced too).
    Replacement is per character, not per run, so ``"a--b"`` becomes
    ``"a  b"`` (two spaces); callers collapse whitespace afterwards.
    """
    return _NON_ALNUM_LOWER_RE.sub(" ", s)
|
|
|
|
|
|
def drop_digit_tokens(s: str) -> str:
    """Drop whitespace-separated tokens that contain any digit.

    ``"10A HIGH STREET" -> "HIGH STREET"``. The surviving tokens are rejoined
    with single spaces, so whitespace collapses as a side effect. If every
    token contains a digit (or the input is empty) the result is ``""``.
    """
    return " ".join(
        token for token in s.split() if _DIGIT_RE.search(token) is None
    )
|
|
|
|
|
|
def uppercase_alnum_key_expr(s: pl.Expr) -> pl.Expr:
    """Build a Polars expression that yields an uppercase alphanumeric key.

    The expression casts to string, uppercases, replaces each run of
    characters outside ``[0-9A-Z]`` with a single space, and strips the ends.

    Characters that are still outside ``[0-9A-Z]`` after uppercasing become
    spaces (``"Café 1" -> "CAF 1"``).
    """
    return (
        s.cast(pl.String)
        .str.to_uppercase()
        # Whitespace is itself outside [0-9A-Z], so this one pass already
        # leaves only alphanumerics separated by isolated single spaces; a
        # further "\s+" -> " " pass would be a no-op and is not needed.
        .str.replace_all(r"[^0-9A-Z]+", " ")
        .str.strip_chars()
    )
|