Split up

2026-06-12 21:51:37 +01:00 · 2026-06-12 21:51:37 +01:00 · f59d01227b
commit f59d01227b
parent cf39ad754e
91 changed files with 10370 additions and 7562 deletions
--- a/pipeline/utils/normalize.py
+++ b/pipeline/utils/normalize.py
@@ -0,0 +1,70 @@
+"""Shared low-level text-normalization primitives.
+
+Address matching (``pipeline.utils.fuzzy_join``, ``pipeline.transform.merge``),
+POI retailer cleanup (``pipeline.transform.transform_poi``) and school-name
+matching (``pipeline.check_school_cutoffs``) each layer domain-specific rules
+on top of these. The primitives are deliberately tiny and single-purpose so
+that composing them preserves every caller's existing output byte-for-byte.
+"""
+
+import re
+
+import polars as pl
+
+# Matches ONE character outside [a-z0-9 ] (no quantifier, so never a run).
+# Uppercase letters match too, which is why callers lowercase first; runs are
+# NOT merged here — callers apply word-level rules, then collapse_whitespace.
+_NON_ALNUM_LOWER_RE = re.compile(r"[^a-z0-9 ]")
+
+# Matches a single digit anywhere in a token; one hit marks the token as
+# number-bearing, so letter-suffixed house/flat numbers such as 8A count too.
+_DIGIT_RE = re.compile(r"\d")
+
+
+def collapse_whitespace(s: str) -> str:
+    """Squeeze each whitespace run to one space and trim both ends."""
+    return " ".join(s.split(sep=None))
+
+
+def strip_or_empty(s: str | None) -> str:
+    """Trim only the ends of ``s``; a ``None`` input yields ``""``."""
+    if s is None:
+        return ""
+    # Interior whitespace is left untouched, unlike collapse_whitespace, so
+    # the result can still be matched verbatim against curated lookup keys.
+    return s.strip()
+
+
+def replace_non_alnum_lower(s: str) -> str:
+    """Blank out every character that is not in [a-z0-9 ].
+
+    Each offending character turns into exactly one space, so a run of N
+    such characters yields N spaces; collapsing is left to the caller.
+    Input should already be lowercase, since A-Z are replaced as well.
+    """
+    return _NON_ALNUM_LOWER_RE.sub(repl=" ", string=s)
+
+
+def drop_digit_tokens(s: str) -> str:
+    """Remove whitespace-delimited tokens that contain at least one digit.
+
+    ``"10A HIGH STREET" -> "HIGH STREET"``; kept tokens are single-spaced.
+    """
+    kept = [word for word in s.split() if _DIGIT_RE.search(word) is None]
+    return " ".join(kept)
+
+
+def uppercase_alnum_key_expr(s: pl.Expr) -> pl.Expr:
+    """Polars expression: uppercase, turn each run of characters outside
+    [0-9A-Z] into a single space, and strip the ends.
+
+    Non-ASCII letters fall outside [0-9A-Z] after uppercasing and become
+    spaces (``"Café 1" -> "CAF 1"``).
+    """
+    return (
+        s.cast(pl.String)
+        .str.to_uppercase()
+        # Whitespace is outside [0-9A-Z] too, so this single pass collapses it.
+        .str.replace_all(r"[^0-9A-Z]+", " ")
+        .str.strip_chars()
+    )