idgf
This commit is contained in:
parent
fbfebc651c
commit
aab85fe32e
33 changed files with 2016 additions and 283 deletions
|
|
@ -5,6 +5,7 @@ from .fuzzy_join import (
|
|||
normalize_postcode_key,
|
||||
)
|
||||
from .haversine import haversine_km, haversine_km_expr
|
||||
from .nspl_schema import code_col_overrides
|
||||
from .poi_counts import count_pois_per_postcode
|
||||
from .postcode_mapping import build_postcode_mapping
|
||||
|
||||
|
|
@ -16,6 +17,7 @@ __all__ = [
|
|||
"normalize_postcode_key",
|
||||
"haversine_km",
|
||||
"haversine_km_expr",
|
||||
"code_col_overrides",
|
||||
"count_pois_per_postcode",
|
||||
"build_postcode_mapping",
|
||||
]
|
||||
|
|
|
|||
30
pipeline/utils/nspl_schema.py
Normal file
30
pipeline/utils/nspl_schema.py
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
"""Suffix-agnostic schema overrides for ONS NSPL/NSUL classification columns.
|
||||
|
||||
NSPL/NSUL embed the source year in classification-index column names
|
||||
(e.g. ruc21ind, oac11ind, imd20ind) and ONS bumps those suffixes with each
|
||||
release. These columns look numeric in early rows but contain string codes
|
||||
like "UN1" (Unclassified) further down, so they must be forced to String
|
||||
before scanning — otherwise polars infers Int64 and crashes mid-stream.
|
||||
|
||||
Hard-coding the year suffix is fragile: polars silently ignores overrides for
|
||||
columns that don't exist, so a renamed suffix would no-op and reintroduce the
|
||||
crash. Match on the suffix-free stem instead.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
import polars as pl
|
||||
|
||||
# Matches the ruc/oac/imd classification-index columns regardless of the year
# suffix: ruc21ind, oac11ind, imd20ind, imd25ind, oacind, ... (case-insensitive).
#
# Two details keep the match strict:
# - ``\Z`` anchors the end instead of ``$``, because ``$`` also matches just
#   before a trailing newline and would accept "imd25ind\n".
# - ``re.ASCII`` restricts ``\d`` to 0-9 and keeps IGNORECASE from folding
#   non-ASCII characters (such as dotless "ı") onto the ASCII stem letters.
CODE_COL_PATTERN = re.compile(r"^(ruc|oac|imd)\d*ind\Z", re.IGNORECASE | re.ASCII)
|
||||
|
||||
|
||||
def code_col_overrides(names: list[str]) -> dict[str, type[pl.String]]:
    """Return String schema overrides for the NSPL/NSUL code columns in *names*.

    Every name accepted by ``CODE_COL_PATTERN`` (a RUC/OAC/IMD
    classification-index column, whatever year suffix it carries) becomes a
    key of the result, mapped to ``pl.String``. All other names are omitted,
    so the result is empty when nothing matches.
    """
    overrides = {}
    for column in names:
        # Only the classification-index columns need their dtype forced.
        if CODE_COL_PATTERN.match(column):
            overrides[column] = pl.String
    return overrides
|
||||
57
pipeline/utils/test_nspl_schema.py
Normal file
57
pipeline/utils/test_nspl_schema.py
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.utils.nspl_schema import CODE_COL_PATTERN, code_col_overrides
|
||||
|
||||
|
||||
def test_matches_current_and_renamed_suffixes():
    """Only the three classification-index columns get a String override."""
    columns = ["pcd", "ruc21ind", "oac11ind", "imd20ind", "lat", "long"]
    expected = {
        "ruc21ind": pl.String,
        "oac11ind": pl.String,
        "imd20ind": pl.String,
    }
    assert code_col_overrides(columns) == expected
|
||||
|
||||
|
||||
def test_catches_future_suffix_bump():
    """Columns with bumped year suffixes are still forced to String."""
    # Regression guard: ONS bumps imd20ind -> imd25ind. A hard-coded dict
    # would no-op on the new names; matching on the stem must still catch them.
    result = code_col_overrides(["pcd", "ruc25ind", "oac21ind", "imd25ind"])
    assert set(result) == {"ruc25ind", "oac21ind", "imd25ind"}
    for dtype in result.values():
        assert dtype is pl.String
|
||||
|
||||
|
||||
def test_is_case_insensitive():
    """Upper-, mixed- and lower-case spellings are all matched as given."""
    mixed_case = ["RUC21IND", "Oac11Ind", "IMD20ind"]
    matched = code_col_overrides(mixed_case)
    assert set(matched) == {"RUC21IND", "Oac11Ind", "IMD20ind"}
|
||||
|
||||
|
||||
def test_matches_suffixless_stem():
    """A stem with no year digits at all is still treated as a code column."""
    stems = ["rucind", "oacind", "imdind"]
    matched = code_col_overrides(stems)
    assert set(matched) == {"rucind", "oacind", "imdind"}
|
||||
|
||||
|
||||
def test_ignores_unrelated_columns():
    """Names that only resemble a code column produce no overrides."""
    unrelated = [
        "pcd",
        "laua",
        "imdscore",  # imd stem, but does not end in "ind"
        "indicator",  # no ruc/oac/imd stem at the start
        "ruc21indx",  # extra trailing character after "ind"
        "xruc21ind",  # extra leading character before the stem
    ]
    assert code_col_overrides(unrelated) == {}
|
||||
|
||||
|
||||
def test_empty_names():
    """An empty column list yields an empty override dict."""
    no_columns = []
    assert code_col_overrides(no_columns) == {}
|
||||
|
||||
|
||||
def test_pattern_anchored():
    """The pattern accepts a bare code column and rejects padded variants."""
    assert CODE_COL_PATTERN.match("imd25ind")
    # Trailing text after "ind" and a prefix before the stem must both fail.
    for rejected in ("oac11indicator", "region_imd20ind"):
        assert not CODE_COL_PATTERN.match(rejected)
|
||||
Loading…
Add table
Add a link
Reference in a new issue