"""Tests for the shared string-normalization primitives and their call sites.

The first group exercises the primitives imported from
``pipeline.utils.normalize`` directly.  The second group characterizes the
wrappers built on top of them; those expected values were captured from the
pre-refactor implementations and are pinned byte-for-byte.
"""

import polars as pl

from pipeline.check_school_cutoffs import normalize_la, normalize_name
from pipeline.transform.merge import _street_only_address
from pipeline.transform.transform_poi import normalize_grocery_retailer
from pipeline.utils.fuzzy_join import normalize_address_key
from pipeline.utils.normalize import (
    collapse_whitespace,
    drop_digit_tokens,
    replace_non_alnum_lower,
    strip_or_empty,
    uppercase_alnum_key_expr,
)


# --- Primitives -------------------------------------------------------------


def test_collapse_whitespace():
    """Whitespace runs collapse to one space and the edges are trimmed."""
    assert collapse_whitespace("") == ""
    assert collapse_whitespace(" ") == ""
    assert collapse_whitespace("a b") == "a b"
    assert collapse_whitespace(" a \t b \n c ") == "a b c"
    # str.split() also splits on unicode whitespace (non-breaking space).
    assert collapse_whitespace("a\u00a0b") == "a b"


def test_strip_or_empty():
    """None maps to the empty string; otherwise only the edges are stripped."""
    assert strip_or_empty(None) == ""
    assert strip_or_empty("") == ""
    assert strip_or_empty(" x ") == "x"
    # Interior whitespace is preserved, unlike collapse_whitespace.
    assert strip_or_empty(" a b ") == "a b"


def test_replace_non_alnum_lower():
    """Each character outside [a-z0-9 ] is replaced by exactly one space."""
    assert replace_non_alnum_lower("") == ""
    assert replace_non_alnum_lower("abc 123") == "abc 123"
    # Per-character replacement: runs are not merged, so the two hyphens
    # come back as two spaces.
    assert replace_non_alnum_lower("a--b") == "a  b"
    # Existing spaces are kept as-is: space + replaced comma + space gives
    # three spaces between the letters.
    assert replace_non_alnum_lower("a , b") == "a   b"
    # Uppercase and accented letters fall outside [a-z0-9 ].
    assert replace_non_alnum_lower("École") == " cole"


def test_drop_digit_tokens():
    """Any whitespace-separated token containing a digit is removed."""
    assert drop_digit_tokens("") == ""
    assert drop_digit_tokens("10A HIGH STREET") == "HIGH STREET"
    assert drop_digit_tokens("8B") == ""
    assert drop_digit_tokens("12 34") == ""
    assert drop_digit_tokens("KINGSWOOD") == "KINGSWOOD"
    # Whitespace collapses as a side effect of the token rejoin.
    assert drop_digit_tokens(" A B ") == "A B"


def test_uppercase_alnum_key_expr():
    """The polars expression upper-cases and space-separates alnum runs."""
    values = [
        "Flat 2, 10 High Street",
        " 12 High-Street ",
        "",
        None,
        "Café 1",
        "st mary's-court",
    ]
    out = (
        pl.DataFrame({"a": values}, schema={"a": pl.String})
        .select(uppercase_alnum_key_expr(pl.col("a")))
        .to_series()
        .to_list()
    )
    assert out == [
        "FLAT 2 10 HIGH STREET",
        "12 HIGH STREET",
        "",
        None,
        "CAF 1",
        "ST MARY S COURT",
    ]


# --- Characterization of the call sites built on the primitives ------------
# Expected values were captured from the pre-refactor implementations and
# must never change: each wrapper's output is byte-for-byte pinned.


def test_normalize_address_key_characterization():
    """Pin normalize_address_key, including the inputs that become null."""
    values = [
        "Flat 2, 10 High Street",
        " 12 High-Street ",
        "123",  # digits only: no letter -> null
        "",  # empty -> null
        None,  # null in, null out
        "Café 1",
        "st mary's-court",
        "ALREADY NORMAL",
    ]
    out = (
        pl.DataFrame({"a": values}, schema={"a": pl.String})
        .select(normalize_address_key(pl.col("a")))
        .to_series()
        .to_list()
    )
    assert out == [
        "FLAT 2 10 HIGH STREET",
        "12 HIGH STREET",
        None,
        None,
        None,
        "CAF 1",
        "ST MARY S COURT",
        "ALREADY NORMAL",
    ]


def test_street_only_address_characterization():
    """Pin _street_only_address: digit-bearing tokens are dropped."""
    assert _street_only_address("10A HIGH STREET") == "HIGH STREET"
    assert _street_only_address("FLAT 1 188 GREAT NORTH WAY") == "FLAT GREAT NORTH WAY"
    assert _street_only_address("") == ""
    assert _street_only_address("OLDSTEAD ROAD") == "OLDSTEAD ROAD"
    assert _street_only_address(" A B ") == "A B"
    assert _street_only_address("12 34") == ""
    assert _street_only_address("8B") == ""


def test_normalize_grocery_retailer_characterization():
    """Pin normalize_grocery_retailer for known brands and pass-through names."""
    assert normalize_grocery_retailer(None) == ""
    assert normalize_grocery_retailer("") == ""
    assert normalize_grocery_retailer(" Tesco Express ") == "Tesco Express"
    assert normalize_grocery_retailer("Sainsburys") == "Sainsbury's"
    assert normalize_grocery_retailer("Lincolnshire Co-operative") == "Co-op"
    # Only edge whitespace is stripped; interior whitespace must survive so
    # near-miss names fall through the exact dictionary lookups unchanged.
    assert normalize_grocery_retailer("Bob's Shop") == "Bob's Shop"
    assert normalize_grocery_retailer(" Marks and Spencer ") == "M&S"


def test_normalize_name_characterization():
    """Pin normalize_name with and without its second (truthy) argument."""
    assert normalize_name("St. Mary's C of E Primary School") == (
        "st marys primary school"
    )
    assert normalize_name("St. Mary's C of E Primary School", True) == "st marys"
    assert normalize_name("") == ""
    assert normalize_name("Ham & High School") == "ham high school"
    assert normalize_name("Ham & High School", True) == "ham"
    # Accented characters become spaces, splitting the word.
    assert normalize_name("École Élémentaire") == "cole l mentaire"
    assert normalize_name(" THE KING'S ACADEMY ") == "kings academy"
    assert normalize_name("Holy Trinity RC Voluntary Aided School") == (
        "holy trinity school"
    )
    assert normalize_name("st. john's") == "st johns"


def test_normalize_la_characterization():
    """Pin normalize_la for prefixed, ampersand, padded and empty inputs."""
    assert normalize_la("City of Westminster") == "westminster"
    assert normalize_la("Brighton & Hove") == "brighton and hove"
    assert normalize_la(" Kingston upon Thames ") == "kingston upon thames"
    assert normalize_la("") == ""