This commit is contained in:
Andras Schmelczer 2026-06-02 20:14:32 +01:00
parent fbfebc651c
commit aab85fe32e
33 changed files with 2016 additions and 283 deletions

View file

@ -352,6 +352,50 @@ def _failures_for_active_postcode_boundary_match(spec: str) -> list[str]:
return failures
def _failures_for_postcode_universe(spec: str) -> list[str]:
    """Check that a postcode-features parquet holds exactly the active-English
    NSPL/ArcGIS postcode universe: nothing missing, nothing extra.

    Per-row validation cannot notice a truncated or stale postcode.parquet
    (for instance an interrupted merge that wrote only part of the rows, each
    of them individually valid), so this check compares the whole key set
    against the ArcGIS-derived universe instead.

    `spec` names the two inputs as a pair (ArcGIS parquet first, postcode
    parquet second) and is parsed by `_split_pair`.

    Returns a list of human-readable failure messages; empty when the two
    postcode sets are identical.
    """
    arcgis_path, postcodes_path = _split_pair(spec, "postcode universe")

    # Per-file parquet checks run first (ArcGIS, then postcodes); if either
    # input reports problems, return those and skip the comparison.
    arcgis_problems = _failures_for_parquet(arcgis_path)
    postcodes_problems = _failures_for_parquet(postcodes_path)
    input_problems = arcgis_problems + postcodes_problems
    if input_problems:
        return input_problems

    # Any error while loading either postcode set becomes a single failure
    # message rather than propagating to the caller.
    try:
        expected = _active_english_arcgis_postcodes(arcgis_path)
        actual = _parquet_postcodes(postcodes_path)
    except Exception as err:
        return [
            f"{arcgis_path} / {postcodes_path}: "
            f"postcode universe check failed: {err}"
        ]

    problems: list[str] = []

    # Headline size mismatch, reported ahead of the directional differences.
    actual_count, expected_count = len(actual), len(expected)
    if actual_count != expected_count:
        problems.append(
            f"{postcodes_path}: postcode count {actual_count:,} != "
            f"active-English NSPL universe {expected_count:,} "
            f"(from {arcgis_path})"
        )

    absent = expected - actual
    unexpected = actual - expected

    # Postcodes in the universe that the parquet lacks.
    if absent:
        problems.append(
            f"{postcodes_path}: {len(absent):,} active English postcodes "
            f"from {arcgis_path} are missing; sample: {_sample(absent)}"
        )

    # Postcodes in the parquet that the universe does not contain.
    if unexpected:
        problems.append(
            f"{postcodes_path}: {len(unexpected):,} postcodes are not active "
            f"English postcodes in {arcgis_path}; sample: {_sample(unexpected)}"
        )

    return problems
def _failures_for_postcode_features(path: Path) -> list[str]:
"""Validate the postcode feature output: unique Postcode, non-null lat/lon
inside the England bbox, ctry25cd == E92000001, and every '% ' column in
@ -565,6 +609,15 @@ def main() -> int:
"lat/lon in England, ctry25cd=E92000001, '% ' columns in [0,100]"
),
)
parser.add_argument(
"--postcode-universe",
action="append",
default=[],
help=(
"Require postcode parquet keys to equal the active-English NSPL "
"universe: ARCGIS::POSTCODES"
),
)
parser.add_argument(
"--properties-subset",
action="append",
@ -599,6 +652,8 @@ def main() -> int:
failures.extend(_failures_for_active_postcode_boundary_match(spec))
for path in args.postcode_features:
failures.extend(_failures_for_postcode_features(path))
for spec in args.postcode_universe:
failures.extend(_failures_for_postcode_universe(spec))
for spec in args.properties_subset:
failures.extend(_failures_for_properties_subset(spec))
for path in args.price_index: