idgf

2026-06-02 20:14:32 +01:00 · 2026-06-02 20:14:32 +01:00 · aab85fe32e
commit aab85fe32e
parent fbfebc651c
33 changed files with 2016 additions and 283 deletions
--- a/pipeline/utils/nspl_schema.py
+++ b/pipeline/utils/nspl_schema.py
@@ -0,0 +1,30 @@
+"""Suffix-agnostic schema overrides for ONS NSPL/NSUL classification columns.
+
+NSPL/NSUL embed the source year in classification-index column names
+(e.g. ruc21ind, oac11ind, imd20ind) and ONS bumps those suffixes with each
+release. These columns look numeric in early rows but contain string codes
+like "UN1" (Unclassified) further down, so they must be forced to String
+before scanning — otherwise polars infers Int64 and crashes mid-stream.
+
+Hard-coding the year suffix is fragile: polars silently ignores overrides for
+columns that don't exist, so a renamed suffix would no-op and reintroduce the
+crash. Match on the suffix-free stem instead.
+"""
+
+import re
+
+import polars as pl
+
+# Matches ruc/oac/imd classification-index columns with any year digits, or
+# none: ruc21ind, oac11ind, imd20ind, imd25ind, oacind (case-insensitive).
+CODE_COL_PATTERN = re.compile(r"^(ruc|oac|imd)\d*ind$", re.IGNORECASE)
+
+
+def code_col_overrides(names: list[str]) -> dict[str, type[pl.String]]:
+    """Build a String schema-override dict for NSPL/NSUL code columns.
+
+    Given the column names of an NSPL/NSUL CSV, return ``{name: pl.String}``
+    for every name matching CODE_COL_PATTERN, whatever year suffix (if any)
+    ONS uses this release. Other names are omitted; the result may be empty.
+    """
+    return {name: pl.String for name in names if CODE_COL_PATTERN.match(name)}