idgf

2026-06-02 20:14:32 +01:00 · 2026-06-02 20:14:32 +01:00 · aab85fe32e
commit aab85fe32e
parent fbfebc651c
33 changed files with 2016 additions and 283 deletions
--- a/pipeline/utils/__init__.py
+++ b/pipeline/utils/__init__.py
@@ -5,6 +5,7 @@ from .fuzzy_join import (
    normalize_postcode_key,
 )
 from .haversine import haversine_km, haversine_km_expr
+from .nspl_schema import code_col_overrides
 from .poi_counts import count_pois_per_postcode
 from .postcode_mapping import build_postcode_mapping

@@ -16,6 +17,7 @@ __all__ = [
    "normalize_postcode_key",
    "haversine_km",
    "haversine_km_expr",
+    "code_col_overrides",
    "count_pois_per_postcode",
    "build_postcode_mapping",
 ]
--- a/pipeline/utils/nspl_schema.py
+++ b/pipeline/utils/nspl_schema.py
@@ -0,0 +1,30 @@
+"""Suffix-agnostic schema overrides for ONS NSPL/NSUL classification columns.
+
+NSPL/NSUL embed the source year in classification-index column names
+(e.g. ruc21ind, oac11ind, imd20ind) and ONS bumps those suffixes with each
+release. These columns look numeric in early rows but contain string codes
+like "UN1" (Unclassified) further down, so they must be forced to String
+before scanning — otherwise polars infers Int64 and crashes mid-stream.
+
+Hard-coding the year suffix is fragile: polars silently ignores overrides for
+columns that don't exist, so a renamed suffix would no-op and reintroduce the
+crash. Match on the suffix-free stem instead.
+"""
+
+import re
+
+import polars as pl
+
+# Anchored at both ends: a ruc/oac/imd stem, an optional run of year digits,
+# then "ind" -- so ruc21ind, imd25ind and oacind all match (case-insensitive).
+CODE_COL_PATTERN = re.compile(r"^(ruc|oac|imd)\d*ind$", re.IGNORECASE)
+
+
+def code_col_overrides(names: list[str]) -> dict[str, type[pl.String]]:
+    """Map each classification-index column in *names* to ``pl.String``.
+
+    A name is selected when ``CODE_COL_PATTERN`` matches it, whatever year
+    digits it carries; all other names are left out of the returned dict.
+    """
+    code_cols = filter(CODE_COL_PATTERN.match, names)
+    return dict.fromkeys(code_cols, pl.String)
--- a/pipeline/utils/test_nspl_schema.py
+++ b/pipeline/utils/test_nspl_schema.py
@@ -0,0 +1,57 @@
+import polars as pl
+
+from pipeline.utils.nspl_schema import CODE_COL_PATTERN, code_col_overrides
+
+
+def test_matches_current_and_renamed_suffixes():
+    """Only the three classification-index columns get a String override."""
+    code_cols = ["ruc21ind", "oac11ind", "imd20ind"]
+    other_cols = ["pcd", "lat", "long"]
+    header = [other_cols[0], *code_cols, *other_cols[1:]]
+
+    expected = dict.fromkeys(code_cols, pl.String)
+    assert code_col_overrides(header) == expected
+
+
+def test_catches_future_suffix_bump():
+    """A bumped year suffix must still be caught by the stem match."""
+    # Guards the regression where a renamed column (imd20ind -> imd25ind)
+    # would slip past a dict keyed on exact column names.
+    result = code_col_overrides(["pcd", "ruc25ind", "oac21ind", "imd25ind"])
+    assert result.keys() == {"ruc25ind", "oac21ind", "imd25ind"}
+    assert not [dtype for dtype in result.values() if dtype is not pl.String]
+
+
+def test_is_case_insensitive():
+    mixed_case = ["RUC21IND", "Oac11Ind", "IMD20ind"]
+    assert set(code_col_overrides(mixed_case)) == set(mixed_case)
+
+
+def test_matches_suffixless_stem():
+    """Stems with no year digits at all are still treated as code columns."""
+    bare_stems = ["rucind", "oacind", "imdind"]
+
+    matched = set(code_col_overrides(bare_stems))
+    assert matched == set(bare_stems)
+
+
+def test_ignores_unrelated_columns():
+    """Names that merely resemble code columns produce no overrides."""
+    plain = ("pcd", "laua")
+    near_misses = (
+        "imdscore",  # right stem, but does not end in "ind"
+        "indicator",  # starts with "ind" but has no ruc/oac/imd stem
+        "ruc21indx",  # trailing character after "ind"
+        "xruc21ind",  # leading character before the stem
+    )
+    assert code_col_overrides([*plain, *near_misses]) == {}
+
+
+def test_empty_names():
+    assert code_col_overrides([]) == {}  # no names in -> no overrides out
+
+
+def test_pattern_anchored():
+    rejected = ("oac11indicator", "region_imd20ind")
+    assert CODE_COL_PATTERN.match("imd25ind") is not None
+    assert not any(CODE_COL_PATTERN.match(name) for name in rejected)