"""Tests for ``code_col_overrides`` and ``CODE_COL_PATTERN`` from ``nspl_schema``."""

import polars as pl

from pipeline.utils.nspl_schema import CODE_COL_PATTERN, code_col_overrides


def test_matches_current_and_renamed_suffixes():
    """Only the ruc/oac/imd ``*ind`` columns should map to ``pl.String``."""
    columns = ["pcd", "ruc21ind", "oac11ind", "imd20ind", "lat", "long"]
    expected = {
        "ruc21ind": pl.String,
        "oac11ind": pl.String,
        "imd20ind": pl.String,
    }

    assert code_col_overrides(columns) == expected


def test_catches_future_suffix_bump():
    """Columns with a different two-digit suffix should still be overridden."""
    # The regression: ONS bumps imd20ind -> imd25ind. A hard-coded dict would
    # no-op here; the stem match must still catch it.
    columns = ["pcd", "ruc25ind", "oac21ind", "imd25ind"]

    result = code_col_overrides(columns)

    assert set(result) == {"ruc25ind", "oac21ind", "imd25ind"}
    for dtype in result.values():
        assert dtype is pl.String


def test_is_case_insensitive():
    """Matching ignores case; returned keys keep the caller's spelling."""
    mixed_case = ["RUC21IND", "Oac11Ind", "IMD20ind"]

    result = code_col_overrides(mixed_case)

    assert set(result) == set(mixed_case)


def test_matches_suffixless_stem():
    """Column names with no digits between the stem and ``ind`` should match."""
    bare_stems = ["rucind", "oacind", "imdind"]

    result = code_col_overrides(bare_stems)

    assert set(result) == set(bare_stems)


def test_ignores_unrelated_columns():
    """Near-miss column names should produce no overrides at all."""
    near_misses = [
        "pcd",
        "laua",
        "imdscore",  # imd stem, but not an *ind column
        "indicator",  # no ruc/oac/imd stem
        "ruc21indx",  # extra trailing char -> not a code column
        "xruc21ind",  # extra leading char -> not anchored at start
    ]

    assert code_col_overrides(near_misses) == {}


def test_empty_names():
    """An empty list of names should yield an empty mapping."""
    assert code_col_overrides([]) == {}


def test_pattern_anchored():
    """``CODE_COL_PATTERN.match`` should reject extra trailing or leading text."""
    assert CODE_COL_PATTERN.match("imd25ind")

    for rejected in ("oac11indicator", "region_imd20ind"):
        assert not CODE_COL_PATTERN.match(rejected)