idgf
This commit is contained in:
parent
fbfebc651c
commit
aab85fe32e
33 changed files with 2016 additions and 283 deletions
30
pipeline/utils/nspl_schema.py
Normal file
30
pipeline/utils/nspl_schema.py
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
"""Suffix-agnostic schema overrides for ONS NSPL/NSUL classification columns.
|
||||
|
||||
NSPL/NSUL embed the source year in classification-index column names
|
||||
(e.g. ruc21ind, oac11ind, imd20ind) and ONS bumps those suffixes with each
|
||||
release. These columns look numeric in early rows but contain string codes
|
||||
like "UN1" (Unclassified) further down, so they must be forced to String
|
||||
before scanning — otherwise polars infers Int64 and crashes mid-stream.
|
||||
|
||||
Hard-coding the year suffix is fragile: polars silently ignores overrides for
|
||||
columns that don't exist, so a renamed suffix would no-op and reintroduce the
|
||||
crash. Match on the suffix-free stem instead.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
import polars as pl
|
||||
|
||||
# Classification-index columns for RUC, OAC and IMD, whatever year suffix ONS
# embeds in the name (ruc21ind, oac11ind, imd20ind, imd25ind) or none at all
# (oacind). Matching ignores case.
CODE_COL_PATTERN = re.compile(
    r"^(ruc|oac|imd)\d*ind$",
    flags=re.IGNORECASE,
)
|
||||
|
||||
|
||||
def code_col_overrides(names: list[str]) -> dict[str, type[pl.String]]:
    """Map each NSPL/NSUL classification-index column name to ``pl.String``.

    The year suffix ONS embeds in these column names is not relied upon;
    selection is delegated entirely to ``CODE_COL_PATTERN``.

    Args:
        names: Column names taken from the header of an NSPL/NSUL CSV.

    Returns:
        A dict with one ``name -> pl.String`` entry for every name accepted
        by ``CODE_COL_PATTERN``; all other names are left out.
    """
    overrides = {}
    for column in names:
        # Columns the pattern rejects keep whatever dtype gets inferred.
        if not CODE_COL_PATTERN.match(column):
            continue
        overrides[column] = pl.String
    return overrides
|
||||
Loading…
Add table
Add a link
Reference in a new issue