"""Shared low-level text-normalization primitives. Address matching (``pipeline.utils.fuzzy_join``, ``pipeline.transform.merge``), POI retailer cleanup (``pipeline.transform.transform_poi``) and school-name matching (``pipeline.check_school_cutoffs``) each layer domain-specific rules on top of these. The primitives are deliberately tiny and single-purpose so that composing them preserves every caller's existing output byte-for-byte. """ import re import polars as pl # One character outside [a-z0-9 ]. Callers lowercase first; each offending # character becomes a single space (runs are NOT merged here — callers apply # word-level rules and then collapse_whitespace). _NON_ALNUM_LOWER_RE = re.compile(r"[^a-z0-9 ]") # Any digit marks a token as number-bearing (house/flat numbers, including # letter-suffixed forms such as 8A, which still contain a digit). _DIGIT_RE = re.compile(r"\d") def collapse_whitespace(s: str) -> str: """Collapse every whitespace run to a single space and strip the ends.""" return " ".join(s.split()) def strip_or_empty(s: str | None) -> str: """Strip leading/trailing whitespace, mapping None to ``""``. Interior whitespace is preserved (unlike :func:`collapse_whitespace`) so the result can be looked up verbatim against curated dictionary keys. """ return "" if s is None else s.strip() def replace_non_alnum_lower(s: str) -> str: """Replace each character outside [a-z0-9 ] with a single space. Expects already-lowercased input (uppercase letters are replaced too). Replacement is per character, not per run; callers collapse whitespace afterwards. """ return _NON_ALNUM_LOWER_RE.sub(" ", s) def drop_digit_tokens(s: str) -> str: """Drop whitespace-separated tokens that contain any digit. ``"10A HIGH STREET" -> "HIGH STREET"``. The surviving tokens are rejoined with single spaces, so whitespace collapses as a side effect. """ return " ".join(token for token in s.split() if not _DIGIT_RE.search(token)) def uppercase_alnum_key_expr(s: pl.Expr) -> pl.Expr: """Polars expression: uppercase, replace each non-alphanumeric run with a single space, collapse whitespace, and strip the ends. Non-ASCII letters fall outside [0-9A-Z] after uppercasing and become spaces (``"Café 1" -> "CAF 1"``). """ return ( s.cast(pl.String) .str.to_uppercase() .str.replace_all(r"[^0-9A-Z]+", " ") .str.replace_all(r"\s+", " ") .str.strip_chars() )