158 lines
5.5 KiB
Python
158 lines
5.5 KiB
Python
import polars as pl
|
|
|
|
from pipeline.check_school_cutoffs import normalize_la, normalize_name
|
|
from pipeline.transform.merge import _street_only_address
|
|
from pipeline.transform.transform_poi import normalize_grocery_retailer
|
|
from pipeline.utils.fuzzy_join import normalize_address_key
|
|
from pipeline.utils.normalize import (
|
|
collapse_whitespace,
|
|
drop_digit_tokens,
|
|
replace_non_alnum_lower,
|
|
strip_or_empty,
|
|
uppercase_alnum_key_expr,
|
|
)
|
|
|
|
# --- Primitives -------------------------------------------------------------
|
|
|
|
|
|
def test_collapse_whitespace():
    """Check collapse_whitespace against a table of (raw, expected) pairs.

    The table covers the empty string, a blank string, an already-clean
    string, mixed space/tab/newline runs, and a non-breaking space.
    """
    cases = [
        ("", ""),
        (" ", ""),
        ("a b", "a b"),
        (" a \t b \n c ", "a b c"),
        # str.split() also splits on unicode whitespace (non-breaking space).
        ("a\u00a0b", "a b"),
    ]
    for raw, expected in cases:
        assert collapse_whitespace(raw) == expected
|
|
|
|
|
|
def test_strip_or_empty():
    """Check strip_or_empty on None, the empty string, and padded strings.

    The final case uses a multi-space interior run: with a single interior
    space, stripping and collapsing give the same answer, so the
    "interior whitespace is preserved" claim would go untested.
    """
    assert strip_or_empty(None) == ""
    assert strip_or_empty("") == ""
    assert strip_or_empty(" x ") == "x"
    # Interior whitespace is preserved, unlike collapse_whitespace.
    assert strip_or_empty(" a b ") == "a b"
    # Build the run from an explicit count so its length stays visible.
    gap = " " * 3
    assert strip_or_empty("  a" + gap + "b  ") == "a" + gap + "b"
|
|
|
|
|
|
def test_replace_non_alnum_lower():
    """Check replace_non_alnum_lower: one space per rejected character.

    Expected strings that contain runs of spaces are assembled from explicit
    counts, so the run length is unambiguous and cannot be silently altered
    by tooling that collapses whitespace.
    """
    assert replace_non_alnum_lower("") == ""
    assert replace_non_alnum_lower("abc 123") == "abc 123"
    # Per-character replacement: runs are not merged, so the two hyphens
    # come out as two spaces.
    assert replace_non_alnum_lower("a--b") == "a" + " " * 2 + "b"
    # Existing spaces are kept as-is: space + replaced comma + space = three.
    assert replace_non_alnum_lower("a , b") == "a" + " " * 3 + "b"
    # Uppercase and accented letters fall outside [a-z0-9 ].
    assert replace_non_alnum_lower("École") == " cole"
|
|
|
|
|
|
def test_drop_digit_tokens():
    """Check drop_digit_tokens on empty, digit-bearing, and digit-free input.

    The last case adds an interior multi-space run: the padded single-space
    input only shows edge trimming, not the whitespace collapse described in
    the comment.
    """
    assert drop_digit_tokens("") == ""
    assert drop_digit_tokens("10A HIGH STREET") == "HIGH STREET"
    assert drop_digit_tokens("8B") == ""
    assert drop_digit_tokens("12 34") == ""
    assert drop_digit_tokens("KINGSWOOD") == "KINGSWOOD"
    # Whitespace collapses as a side effect of the token rejoin.
    assert drop_digit_tokens(" A B ") == "A B"
    # Same expectation with an explicit three-space interior run.
    assert drop_digit_tokens("  A" + " " * 3 + "B  ") == "A B"
|
|
|
|
|
|
def test_uppercase_alnum_key_expr():
    """Apply uppercase_alnum_key_expr to a string column and compare row by row.

    Each pair is (raw cell, expected key); the raw cells are loaded into a
    single pl.String column named "a" and the expression is evaluated once.
    """
    cases = [
        ("Flat 2, 10 High Street", "FLAT 2 10 HIGH STREET"),
        (" 12 High-Street ", "12 HIGH STREET"),
        ("", ""),
        (None, None),
        ("Café 1", "CAF 1"),
        ("st mary's-court", "ST MARY S COURT"),
    ]
    raw_cells = [raw for raw, _ in cases]
    expected_keys = [key for _, key in cases]

    frame = pl.DataFrame({"a": raw_cells}, schema={"a": pl.String})
    keyed = frame.select(uppercase_alnum_key_expr(pl.col("a")))

    assert keyed.to_series().to_list() == expected_keys
|
|
|
|
|
|
# --- Characterization of the call sites built on the primitives ------------
# Expected values were captured from the pre-refactor implementations and
# must never change: each wrapper's output is byte-for-byte pinned.
|
|
|
|
|
|
def test_normalize_address_key_characterization():
    """Pin normalize_address_key output for a fixed column of raw addresses.

    Each pair is (raw cell, pinned key); the raw cells are loaded into a
    single pl.String column named "a" and the expression is evaluated once.
    """
    pinned = [
        ("Flat 2, 10 High Street", "FLAT 2 10 HIGH STREET"),
        (" 12 High-Street ", "12 HIGH STREET"),
        ("123", None),  # digits only: no letter -> null
        ("", None),  # empty -> null
        (None, None),  # null in, null out
        ("Café 1", "CAF 1"),
        ("st mary's-court", "ST MARY S COURT"),
        ("ALREADY NORMAL", "ALREADY NORMAL"),
    ]
    raw_cells = [raw for raw, _ in pinned]
    expected_keys = [key for _, key in pinned]

    frame = pl.DataFrame({"a": raw_cells}, schema={"a": pl.String})
    keyed = frame.select(normalize_address_key(pl.col("a")))

    assert keyed.to_series().to_list() == expected_keys
|
|
|
|
|
|
def test_street_only_address_characterization():
    """Pin _street_only_address output for a table of (address, expected) pairs."""
    pinned = [
        ("10A HIGH STREET", "HIGH STREET"),
        ("FLAT 1 188 GREAT NORTH WAY", "FLAT GREAT NORTH WAY"),
        ("", ""),
        ("OLDSTEAD ROAD", "OLDSTEAD ROAD"),
        (" A B ", "A B"),
        ("12 34", ""),
        ("8B", ""),
    ]
    for address, expected in pinned:
        assert _street_only_address(address) == expected
|
|
|
|
|
|
def test_normalize_grocery_retailer_characterization():
    """Pin normalize_grocery_retailer output for None, blank, and named inputs."""
    pinned = [
        (None, ""),
        ("", ""),
        (" Tesco Express ", "Tesco Express"),
        ("Sainsburys", "Sainsbury's"),
        ("Lincolnshire Co-operative", "Co-op"),
        # Only edge whitespace is stripped; interior whitespace must survive so
        # near-miss names fall through the exact dictionary lookups unchanged.
        ("Bob's Shop", "Bob's Shop"),
        (" Marks and Spencer ", "M&S"),
    ]
    for retailer, expected in pinned:
        assert normalize_grocery_retailer(retailer) == expected
|
|
|
|
|
|
def test_normalize_name_characterization():
    """Pin normalize_name output for a table of (positional args, expected).

    Some entries pass True as a second positional argument; it is forwarded
    positionally, exactly as in a direct call.
    """
    pinned = [
        (("St. Mary's C of E Primary School",), "st marys primary school"),
        (("St. Mary's C of E Primary School", True), "st marys"),
        (("",), ""),
        (("Ham & High School",), "ham high school"),
        (("Ham & High School", True), "ham"),
        # Accented characters become spaces, splitting the word.
        (("École Élémentaire",), "cole l mentaire"),
        ((" THE KING'S ACADEMY ",), "kings academy"),
        (("Holy Trinity RC Voluntary Aided School",), "holy trinity school"),
        (("st. john's",), "st johns"),
    ]
    for call_args, expected in pinned:
        assert normalize_name(*call_args) == expected
|
|
|
|
|
|
def test_normalize_la_characterization():
    """Pin normalize_la output for a table of (authority name, expected) pairs."""
    pinned = {
        "City of Westminster": "westminster",
        "Brighton & Hove": "brighton and hove",
        " Kingston upon Thames ": "kingston upon thames",
        "": "",
    }
    for authority, expected in pinned.items():
        assert normalize_la(authority) == expected
|