30 lines
1.3 KiB
Python
30 lines
1.3 KiB
Python
"""Suffix-agnostic schema overrides for ONS NSPL/NSUL classification columns.

NSPL/NSUL embed the source year in classification-index column names
(e.g. ruc21ind, oac11ind, imd20ind) and ONS bumps those suffixes with each
release. These columns look numeric in early rows but contain string codes
like "UN1" (Unclassified) further down, so they must be forced to String
before scanning — otherwise polars infers Int64 and crashes mid-stream.

Hard-coding the year suffix is fragile: polars silently ignores overrides for
columns that don't exist, so a renamed suffix would no-op and reintroduce the
crash. Match on the suffix-free stem instead.
"""
|
|
|
|
import re
|
|
|
|
import polars as pl
|
|
|
|
# Matches ruc/oac/imd classification-index columns regardless of year suffix:
# ruc21ind, oac11ind, imd20ind, imd25ind, oacind, ... (case-insensitive).
|
|
CODE_COL_PATTERN = re.compile(
    # Anchored: a ruc/oac/imd stem, optional year digits, then literal "ind".
    r"^(ruc|oac|imd)\d*ind$",
    flags=re.IGNORECASE,
)
|
|
|
|
|
|
def code_col_overrides(names: list[str]) -> dict[str, type[pl.String]]:
    """Map every NSPL/NSUL classification-index column to ``pl.String``.

    ``names`` is the list of column names read from an NSPL/NSUL CSV. Each
    name accepted by ``CODE_COL_PATTERN`` (a RUC/OAC/IMD index column, with
    whatever year digits the current ONS release uses) becomes a key of the
    returned schema-override dict with ``pl.String`` as its value. Names that
    do not match are left out, so the result is empty when nothing matches.
    """
    overrides: dict[str, type[pl.String]] = {}
    for column in names:
        if not CODE_COL_PATTERN.match(column):
            # Not a classification-index column: leave its dtype to inference.
            continue
        overrides[column] = pl.String
    return overrides
|