This commit is contained in:
Andras Schmelczer 2026-06-02 20:14:32 +01:00
parent fbfebc651c
commit aab85fe32e
33 changed files with 2016 additions and 283 deletions

View file

@ -5,6 +5,7 @@ from .fuzzy_join import (
normalize_postcode_key,
)
from .haversine import haversine_km, haversine_km_expr
from .nspl_schema import code_col_overrides
from .poi_counts import count_pois_per_postcode
from .postcode_mapping import build_postcode_mapping
@ -16,6 +17,7 @@ __all__ = [
"normalize_postcode_key",
"haversine_km",
"haversine_km_expr",
"code_col_overrides",
"count_pois_per_postcode",
"build_postcode_mapping",
]

View file

@ -0,0 +1,30 @@
"""Suffix-agnostic schema overrides for ONS NSPL/NSUL classification columns.
NSPL/NSUL embed the source year in classification-index column names
(e.g. ruc21ind, oac11ind, imd20ind) and ONS bumps those suffixes with each
release. These columns look numeric in early rows but contain string codes
like "UN1" (Unclassified) further down, so they must be forced to String
before scanning; otherwise polars infers Int64 and crashes mid-stream.
Hard-coding the year suffix is fragile: polars silently ignores overrides for
columns that don't exist, so a renamed suffix would no-op and reintroduce the
crash. Match on the suffix-free stem instead.
"""
import re
import polars as pl
# Recognises the RUC / OAC / IMD classification-index columns by stem, with
# an optional run of digits (the release-year suffix) before the trailing
# "ind": ruc21ind, oac11ind, imd20ind, imd25ind, oacind, ...
# Letter case is ignored; the pattern is anchored at both ends so names that
# merely contain such a column name are not picked up.
CODE_COL_PATTERN = re.compile(
    r"^(ruc|oac|imd)\d*ind$",
    flags=re.IGNORECASE,
)
def code_col_overrides(names: list[str]) -> dict[str, type[pl.String]]:
    """Return String schema overrides for the classification-index columns.

    Each entry of ``names`` is tested against ``CODE_COL_PATTERN``; every
    name that matches is mapped to ``pl.String`` in the returned dict, and
    all other names are left out. Because the pattern does not pin the year
    digits, the result follows whatever suffix the given header uses.
    """
    overrides: dict[str, type[pl.String]] = {}
    for column in names:
        if CODE_COL_PATTERN.match(column) is None:
            # Not a RUC/OAC/IMD index column: no override for it.
            continue
        overrides[column] = pl.String
    return overrides

View file

@ -0,0 +1,57 @@
import polars as pl
from pipeline.utils.nspl_schema import CODE_COL_PATTERN, code_col_overrides
def test_matches_current_and_renamed_suffixes():
    """Only the three classification-index columns get a String override."""
    code_columns = ("ruc21ind", "oac11ind", "imd20ind")
    header = ["pcd", *code_columns, "lat", "long"]

    expected = {column: pl.String for column in code_columns}

    assert code_col_overrides(header) == expected
def test_catches_future_suffix_bump():
    """Columns whose year digits changed are still overridden to String."""
    # Regression guard: ONS bumps imd20ind -> imd25ind. A dict keyed on the
    # old names would silently do nothing for this header; matching on the
    # stem has to keep catching the renamed columns.
    bumped = ["ruc25ind", "oac21ind", "imd25ind"]

    result = code_col_overrides(["pcd", *bumped])

    assert set(result) == set(bumped)
    for dtype in result.values():
        assert dtype is pl.String
def test_is_case_insensitive():
    """Upper-, mixed- and lower-case spellings are all matched as given."""
    mixed_case = ["RUC21IND", "Oac11Ind", "IMD20ind"]

    matched = code_col_overrides(mixed_case)

    # Keys come back exactly as passed in; no case normalisation is applied.
    assert set(matched) == set(mixed_case)
def test_matches_suffixless_stem():
    """A stem with no year digits at all still counts as a code column."""
    bare_stems = ["rucind", "oacind", "imdind"]

    matched = code_col_overrides(bare_stems)

    assert set(matched) == set(bare_stems)
def test_ignores_unrelated_columns():
    """Near-miss column names must not receive an override."""
    unrelated = [
        "pcd",
        "laua",
        "imdscore",  # imd stem, but does not end in "ind"
        "indicator",  # starts with "ind" but has no ruc/oac/imd stem
        "ruc21indx",  # extra character after "ind"
        "xruc21ind",  # extra character before the stem
    ]

    result = code_col_overrides(unrelated)

    assert result == {}
def test_empty_names():
    """An empty header yields an empty override dict."""
    no_columns: list[str] = []

    assert code_col_overrides(no_columns) == {}
def test_pattern_anchored():
    """The pattern accepts an exact name and rejects longer names around it."""
    assert CODE_COL_PATTERN.match("imd25ind") is not None

    # One name with trailing text after "ind", one with text before the stem.
    for rejected in ("oac11indicator", "region_imd20ind"):
        assert CODE_COL_PATTERN.match(rejected) is None