perfect-postcode/pipeline/utils/test_normalize.py
Andras Schmelczer f59d01227b
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 15s
CI / Check (push) Failing after 1m58s
Split up
2026-06-12 21:51:37 +01:00

158 lines
5.5 KiB
Python

import polars as pl
from pipeline.check_school_cutoffs import normalize_la, normalize_name
from pipeline.transform.merge import _street_only_address
from pipeline.transform.transform_poi import normalize_grocery_retailer
from pipeline.utils.fuzzy_join import normalize_address_key
from pipeline.utils.normalize import (
collapse_whitespace,
drop_digit_tokens,
replace_non_alnum_lower,
strip_or_empty,
uppercase_alnum_key_expr,
)
# --- Primitives -------------------------------------------------------------
def test_collapse_whitespace():
    """Pin ``collapse_whitespace``: edges are trimmed and runs become one space."""
    cases = [
        ("", ""),
        (" ", ""),
        ("a b", "a b"),
        (" a \t b \n c ", "a b c"),
        # str.split() also splits on unicode whitespace (non-breaking space).
        ("a\u00a0b", "a b"),
    ]
    for raw, expected in cases:
        assert collapse_whitespace(raw) == expected
def test_strip_or_empty():
    """Pin ``strip_or_empty``: ``None`` maps to ``""`` and only the edges are trimmed."""
    assert strip_or_empty(None) == ""
    assert strip_or_empty("") == ""
    assert strip_or_empty(" x ") == "x"
    # Interior whitespace is preserved, unlike collapse_whitespace.
    assert strip_or_empty(" a b ") == "a b"
    # A single interior space cannot tell stripping apart from collapsing,
    # so also pin a run of interior spaces that must survive untouched.
    assert strip_or_empty("  a  b  ") == "a  b"
def test_replace_non_alnum_lower():
    """Pin ``replace_non_alnum_lower``: each char outside ``[a-z0-9 ]`` becomes one space."""
    assert replace_non_alnum_lower("") == ""
    assert replace_non_alnum_lower("abc 123") == "abc 123"
    # Per-character replacement: runs are not merged, so the two hyphens
    # turn into two spaces.
    assert replace_non_alnum_lower("a--b") == "a  b"
    # Existing spaces are kept as-is: space + replaced comma + space gives
    # three spaces between the letters.
    assert replace_non_alnum_lower("a , b") == "a   b"
    # Uppercase and accented letters fall outside [a-z0-9 ].
    assert replace_non_alnum_lower("École") == " cole"
def test_drop_digit_tokens():
    """Pin ``drop_digit_tokens``: tokens containing a digit are removed."""
    assert drop_digit_tokens("") == ""
    assert drop_digit_tokens("10A HIGH STREET") == "HIGH STREET"
    assert drop_digit_tokens("8B") == ""
    assert drop_digit_tokens("12 34") == ""
    assert drop_digit_tokens("KINGSWOOD") == "KINGSWOOD"
    # Whitespace collapses as a side effect of the token rejoin.
    assert drop_digit_tokens(" A B ") == "A B"
    # Single spaces only exercise edge trimming; a run of interior spaces
    # pins the collapse itself.
    assert drop_digit_tokens("  A   B  ") == "A B"
def test_uppercase_alnum_key_expr():
    """Pin ``uppercase_alnum_key_expr`` on a String column, row by row.

    Each case pairs a raw cell with the key expected for it; ``None`` stays
    ``None`` and the empty string stays empty.
    """
    cases = [
        ("Flat 2, 10 High Street", "FLAT 2 10 HIGH STREET"),
        (" 12 High-Street ", "12 HIGH STREET"),
        ("", ""),
        (None, None),
        ("Café 1", "CAF 1"),
        ("st mary's-court", "ST MARY S COURT"),
    ]
    raw_values = [raw for raw, _ in cases]
    expected = [want for _, want in cases]

    frame = pl.DataFrame({"a": raw_values}, schema={"a": pl.String})
    keyed = frame.select(uppercase_alnum_key_expr(pl.col("a")))

    assert keyed.to_series().to_list() == expected
# --- Characterization of the call sites built on the primitives ------------
# Expected values were captured from the pre-refactor implementations and
# must never change: each wrapper's output is byte-for-byte pinned.
def test_normalize_address_key_characterization():
    """Pin ``normalize_address_key`` output for a String column, row by row.

    Inputs that are empty, null, or digits-only are expected to come back
    as null; every other row maps to its pinned key.
    """
    cases = [
        ("Flat 2, 10 High Street", "FLAT 2 10 HIGH STREET"),
        (" 12 High-Street ", "12 HIGH STREET"),
        ("123", None),  # digits only: no letter -> null
        ("", None),  # empty -> null
        (None, None),  # null in, null out
        ("Café 1", "CAF 1"),
        ("st mary's-court", "ST MARY S COURT"),
        ("ALREADY NORMAL", "ALREADY NORMAL"),
    ]
    raw_values = [raw for raw, _ in cases]
    expected = [want for _, want in cases]

    frame = pl.DataFrame({"a": raw_values}, schema={"a": pl.String})
    keyed = frame.select(normalize_address_key(pl.col("a")))

    assert keyed.to_series().to_list() == expected
def test_street_only_address_characterization():
    """Pin ``_street_only_address``: digit-bearing tokens are dropped from the address."""
    pinned = {
        "10A HIGH STREET": "HIGH STREET",
        "FLAT 1 188 GREAT NORTH WAY": "FLAT GREAT NORTH WAY",
        "": "",
        "OLDSTEAD ROAD": "OLDSTEAD ROAD",
        " A B ": "A B",
        "12 34": "",
        "8B": "",
    }
    for address, expected in pinned.items():
        assert _street_only_address(address) == expected
def test_normalize_grocery_retailer_characterization():
    """Pin ``normalize_grocery_retailer`` for missing, known, and unknown names."""
    pinned = [
        (None, ""),
        ("", ""),
        (" Tesco Express ", "Tesco Express"),
        ("Sainsburys", "Sainsbury's"),
        ("Lincolnshire Co-operative", "Co-op"),
        # Only edge whitespace is stripped; interior whitespace must survive so
        # near-miss names fall through the exact dictionary lookups unchanged.
        ("Bob's Shop", "Bob's Shop"),
        (" Marks and Spencer ", "M&S"),
    ]
    for retailer, expected in pinned:
        assert normalize_grocery_retailer(retailer) == expected
def test_normalize_name_characterization():
    """Pin ``normalize_name`` output, with and without its second positional flag.

    Each case holds the positional arguments for one call and the exact
    string expected back.
    """
    st_marys = "St. Mary's C of E Primary School"
    pinned = [
        ((st_marys,), "st marys primary school"),
        ((st_marys, True), "st marys"),
        (("",), ""),
        (("Ham & High School",), "ham high school"),
        (("Ham & High School", True), "ham"),
        # Accented characters become spaces, splitting the word.
        (("École Élémentaire",), "cole l mentaire"),
        ((" THE KING'S ACADEMY ",), "kings academy"),
        (("Holy Trinity RC Voluntary Aided School",), "holy trinity school"),
        (("st. john's",), "st johns"),
    ]
    for call_args, expected in pinned:
        assert normalize_name(*call_args) == expected
def test_normalize_la_characterization():
    """Pin ``normalize_la`` output for a handful of local-authority names."""
    pinned = {
        "City of Westminster": "westminster",
        "Brighton & Hove": "brighton and hove",
        " Kingston upon Thames ": "kingston upon thames",
        "": "",
    }
    for authority, expected in pinned.items():
        assert normalize_la(authority) == expected