# perfect-postcode/pipeline/utils/nspl_schema.py

"""Suffix-agnostic schema overrides for ONS NSPL/NSUL classification columns.

NSPL/NSUL embed the source year in classification-index column names
(e.g. ruc21ind, oac11ind, imd20ind) and ONS bumps those suffixes with each
release. These columns look numeric in early rows but contain string codes
like "UN1" (Unclassified) further down, so they must be forced to String
before scanning — otherwise polars infers Int64 and crashes mid-stream.

Hard-coding the year suffix is fragile: polars silently ignores overrides for
columns that don't exist, so a renamed suffix would no-op and reintroduce the
crash. Match on the suffix-free stem instead.
"""

import re

import polars as pl

# Matches ruc/oac/imd classification-index columns regardless of year suffix:
# ruc21ind, oac11ind, imd20ind, imd25ind, oacind, ... (case-insensitive).
CODE_COL_PATTERN = re.compile(r"^(ruc|oac|imd)\d*ind$", re.IGNORECASE)


def code_col_overrides(names: list[str]) -> dict[str, type[pl.String]]:
    """Build a String schema-override dict for NSPL/NSUL code columns.

    Given the column names of an NSPL/NSUL CSV, return overrides forcing every
    RUC/OAC/IMD classification-index column to String, independent of the year
    suffix ONS happens to use this release.
    """
    return {name: pl.String for name in names if CODE_COL_PATTERN.match(name)}