perfect-postcode/pipeline/validate_outputs.py
2026-05-31 20:20:41 +01:00

418 lines
14 KiB
Python

"""Validate pipeline outputs before Make stamps are touched."""
from __future__ import annotations
import argparse
import json
import sys
import zipfile
from pathlib import Path
import polars as pl
from shapely.geometry import shape
from shapely.validation import explain_validity
def _failures_for_file(path: Path) -> list[str]:
    """Check that *path* is an existing, non-empty regular file.

    Returns:
        An empty list when the file passes, otherwise a single-element list
        holding a ``"<path>: <reason>"`` message.
    """
    if not path.exists():
        reason = "missing"
    elif not path.is_file():
        reason = "not a file"
    else:
        try:
            byte_count = path.stat().st_size
        except OSError as err:
            reason = f"unreadable metadata: {err}"
        else:
            # An empty reason means the file passed every check.
            reason = "empty file" if byte_count <= 0 else ""
    return [f"{path}: {reason}"] if reason else []
def _failures_for_dir(path: Path) -> list[str]:
    """Check that *path* is a directory with at least one non-hidden entry.

    Entries whose name starts with ``"."`` are ignored when deciding whether
    the directory is empty.

    Returns:
        An empty list when the directory passes, otherwise a single-element
        list holding a ``"<path>: <reason>"`` message.
    """
    if not path.exists():
        return [f"{path}: missing"]
    if not path.is_dir():
        return [f"{path}: not a directory"]
    try:
        for entry in path.iterdir():
            if not entry.name.startswith("."):
                # One visible entry is enough; stop listing immediately.
                return []
    except OSError as err:
        return [f"{path}: unreadable directory: {err}"]
    return [f"{path}: empty directory"]
def _failures_for_parquet(path: Path) -> list[str]:
    """Check that *path* is a readable parquet file containing at least one row.

    The basic file checks run first; the parquet is only opened when they pass.

    Returns:
        An empty list when the parquet passes, otherwise the failure messages.
    """
    file_failures = _failures_for_file(path)
    if file_failures:
        return file_failures
    try:
        rows = pl.scan_parquet(path).select(pl.len()).collect().item()
    except Exception as err:
        return [f"{path}: unreadable parquet: {err}"]
    return [f"{path}: parquet has no rows"] if rows <= 0 else []
def _failures_for_zip(path: Path) -> list[str]:
    """Check that *path* is a readable zip archive with at least one member.

    The basic file checks run first; the archive is only opened when they pass.

    Returns:
        An empty list when the archive passes, otherwise the failure messages.
    """
    file_failures = _failures_for_file(path)
    if file_failures:
        return file_failures
    if not zipfile.is_zipfile(path):
        return [f"{path}: unreadable zip"]
    try:
        with zipfile.ZipFile(path) as archive:
            has_members = bool(archive.namelist())
    except Exception as err:
        return [f"{path}: unreadable zip: {err}"]
    return [] if has_members else [f"{path}: zip has no members"]
def _split_glob(spec: str) -> tuple[Path, str]:
    """Split a ``BASE::PATTERN`` spec at its first ``::``.

    Returns:
        ``(Path(BASE), PATTERN)``.

    Raises:
        argparse.ArgumentTypeError: If ``::`` is absent or either side is empty.
    """
    base, separator, pattern = spec.partition("::")
    if not separator:
        raise argparse.ArgumentTypeError(
            f"{spec!r} must use BASE::PATTERN, for example data::**/*.csv"
        )
    if not (base and pattern):
        raise argparse.ArgumentTypeError(f"{spec!r} must include BASE and PATTERN")
    return Path(base), pattern
def _split_pair(spec: str, label: str) -> tuple[Path, Path]:
    """Split a ``LEFT::RIGHT`` spec at its first ``::`` into two paths.

    Args:
        spec: The raw ``LEFT::RIGHT`` string.
        label: Description of the check, used only in the error message.

    Raises:
        argparse.ArgumentTypeError: If ``::`` is absent or either side is empty.
    """
    left, separator, right = spec.partition("::")
    if not separator:
        raise argparse.ArgumentTypeError(f"{spec!r} must use LEFT::RIGHT for {label}")
    if not (left and right):
        raise argparse.ArgumentTypeError(f"{spec!r} must include both paths")
    return Path(left), Path(right)
def _canonical_postcode(value: object) -> str:
    """Normalise a postcode to upper case with one space before the last three characters.

    All whitespace is removed from ``str(value)`` first. Results shorter than
    five characters are returned compact, without an inserted space.
    """
    squeezed = "".join(str(value).split()).upper()
    if len(squeezed) < 5:
        return squeezed
    outward, inward = squeezed[:-3], squeezed[-3:]
    return f"{outward} {inward}"
def _matched_files(spec: str) -> tuple[Path, str, list[Path]]:
    """Resolve a ``BASE::PATTERN`` spec to the sorted regular files it matches.

    Returns:
        ``(base, pattern, files)``; ``files`` is empty when ``base`` does not
        exist or nothing matches.
    """
    base, pattern = _split_glob(spec)
    matches: list[Path] = []
    if base.exists():
        matches = sorted(
            candidate for candidate in base.glob(pattern) if candidate.is_file()
        )
    return base, pattern, matches
def _failures_for_glob(spec: str) -> list[str]:
    """Require at least one file matching ``BASE::PATTERN``, each of them non-empty.

    Returns:
        Failure messages for every matched file that fails the file checks, or
        a single "no files matched" message when nothing matches.
    """
    base, pattern, paths = _matched_files(spec)
    if not paths:
        return [f"{base}: no files matched {pattern!r}"]
    return [failure for path in paths for failure in _failures_for_file(path)]
def _failures_for_zip_glob(spec: str) -> list[str]:
    """Require at least one file matching ``BASE::PATTERN``, each a readable zip.

    Returns:
        Failure messages for every matched file that fails the zip checks, or
        a single "no zip files matched" message when nothing matches.
    """
    base, pattern, paths = _matched_files(spec)
    if not paths:
        return [f"{base}: no zip files matched {pattern!r}"]
    return [failure for path in paths for failure in _failures_for_zip(path)]
def _postcode_column(columns: list[str]) -> str | None:
for name in ("postcode", "Postcode", "pcds", "PCDS"):
if name in columns:
return name
return None
def _parquet_postcodes(path: Path) -> set[str]:
    """Return the distinct canonical postcodes stored in a parquet file.

    The postcode column is whichever name ``_postcode_column`` recognises in
    the file's schema. Null values are dropped, and values that canonicalise
    to an empty string are excluded.

    Raises:
        ValueError: If the schema has no recognised postcode column.
    """
    # One lazy scan serves both the schema lookup and the value query.
    frame = pl.scan_parquet(path)
    column = _postcode_column(frame.collect_schema().names())
    if column is None:
        raise ValueError(f"{path}: missing postcode column")
    values = (
        frame.select(pl.col(column).drop_nulls().unique())
        .collect()
        .get_column(column)
        .to_list()
    )
    # Canonicalise each value once, then drop the empty result if present.
    postcodes = {_canonical_postcode(value) for value in values}
    postcodes.discard("")
    return postcodes
def _active_english_arcgis_postcodes(path: Path) -> set[str]:
    """Return canonical postcodes of rows with country code E92000001 and null ``doterm``.

    Requires the columns ``pcds``, ``ctry25cd`` and ``doterm``. Rows are kept
    when ``ctry25cd`` equals ``"E92000001"`` and ``doterm`` is null; null
    ``pcds`` values and values that canonicalise to an empty string are
    excluded.

    Raises:
        ValueError: If any of the required columns is missing from the schema.
    """
    # Scan lazily (as _parquet_postcodes does) instead of reading the three
    # columns eagerly, so the filters can be applied while reading.
    frame = pl.scan_parquet(path)
    required = {"pcds", "ctry25cd", "doterm"}
    missing = sorted(required - set(frame.collect_schema().names()))
    if missing:
        raise ValueError(f"{path}: missing ArcGIS postcode columns: {missing}")
    values = (
        frame.filter(pl.col("ctry25cd") == "E92000001")
        # NOTE(review): the Utf8 cast is kept from the original; presumably it
        # normalises doterm's dtype before the null test -- confirm.
        .filter(pl.col("doterm").cast(pl.Utf8).is_null())
        .select(pl.col("pcds").drop_nulls().unique())
        .collect()
        .get_column("pcds")
        .to_list()
    )
    # Canonicalise each value once, then drop the empty result if present.
    postcodes = {_canonical_postcode(value) for value in values}
    postcodes.discard("")
    return postcodes
def _format_samples(samples: list[str]) -> str:
    """Join at most the first ten samples with ``"; "`` for a failure message."""
    shown = samples[:10]
    return "; ".join(shown)
def _boundary_postcode_scan(path: Path) -> tuple[set[str], list[str]]:
    """Scan boundary GeoJSON files, collecting postcodes and validation failures.

    Reads every ``*.geojson`` file directly inside ``path / "units"`` when that
    directory exists, otherwise directly inside *path*. For each feature the
    postcode is taken from ``properties["postcodes"]`` and canonicalised, and
    the geometry is checked for presence, polygonal type and validity.

    Files that cannot be parsed, or whose top-level structure is not an object
    with a ``features`` array, are reported as failures and skipped instead of
    aborting the scan. A feature that is not an object, or whose
    ``properties`` is not an object, is counted as missing its postcode (and,
    for a non-object feature, its geometry).

    Args:
        path: Boundary directory, optionally containing a ``units`` subdirectory.

    Returns:
        ``(postcodes, failures)``: the set of canonical postcodes found, and
        one summary message per problem category (each with a count and up to
        ten samples) plus one message per unreadable file.
    """
    units_dir = path / "units" if (path / "units").is_dir() else path
    max_samples = 10  # cap on remembered examples per problem category
    postcodes: set[str] = set()
    seen: dict[str, str] = {}  # postcode -> label of the first feature carrying it
    failures: list[str] = []
    missing_postcode_samples: list[str] = []
    missing_geometry_samples: list[str] = []
    non_polygon_samples: list[str] = []
    invalid_geometry_samples: list[str] = []
    duplicate_samples: list[str] = []
    missing_postcode_count = 0
    missing_geometry_count = 0
    non_polygon_count = 0
    invalid_geometry_count = 0
    duplicate_count = 0
    for geojson_path in sorted(units_dir.glob("*.geojson")):
        try:
            with geojson_path.open("r", encoding="utf-8") as handle:
                data = json.load(handle)
        except Exception as exc:
            failures.append(f"{geojson_path}: unreadable GeoJSON: {exc}")
            continue
        # Valid JSON can still have the wrong shape (e.g. a top-level array or
        # "features": null); report that rather than crash on .get()/iteration.
        features = data.get("features", []) if isinstance(data, dict) else None
        if not isinstance(features, list):
            failures.append(
                f"{geojson_path}: unreadable GeoJSON: expected an object with a "
                "'features' array"
            )
            continue
        for idx, feature in enumerate(features):
            label = f"{geojson_path.name} feature {idx}"
            if not isinstance(feature, dict):
                # A non-object feature has neither a postcode nor a geometry.
                feature = {}
            properties = feature.get("properties")
            if not isinstance(properties, dict):
                properties = {}
            value = properties.get("postcodes")
            postcode = _canonical_postcode(value) if value is not None else ""
            if not postcode:
                missing_postcode_count += 1
                if len(missing_postcode_samples) < max_samples:
                    missing_postcode_samples.append(label)
            else:
                if postcode in seen:
                    duplicate_count += 1
                    if len(duplicate_samples) < max_samples:
                        duplicate_samples.append(
                            f"{postcode} in {seen[postcode]} and {label}"
                        )
                else:
                    seen[postcode] = label
                postcodes.add(postcode)
            # Geometry is checked for every feature, including duplicates and
            # features without a postcode.
            geometry_data = feature.get("geometry")
            if geometry_data is None:
                missing_geometry_count += 1
                if len(missing_geometry_samples) < max_samples:
                    missing_geometry_samples.append(f"{postcode or label}")
                continue
            try:
                geom = shape(geometry_data)
            except Exception as exc:
                invalid_geometry_count += 1
                if len(invalid_geometry_samples) < max_samples:
                    invalid_geometry_samples.append(f"{postcode or label}: {exc}")
                continue
            if geom.is_empty:
                missing_geometry_count += 1
                if len(missing_geometry_samples) < max_samples:
                    missing_geometry_samples.append(f"{postcode or label}: empty")
            elif geom.geom_type not in {"Polygon", "MultiPolygon"}:
                non_polygon_count += 1
                if len(non_polygon_samples) < max_samples:
                    non_polygon_samples.append(f"{postcode or label}: {geom.geom_type}")
            elif not geom.is_valid:
                invalid_geometry_count += 1
                # explain_validity is only called while samples are still collected.
                if len(invalid_geometry_samples) < max_samples:
                    invalid_geometry_samples.append(
                        f"{postcode or label}: {explain_validity(geom)}"
                    )
    if missing_postcode_count:
        failures.append(
            f"{path}: {missing_postcode_count:,} boundary features are missing "
            f"properties.postcodes; sample: {_format_samples(missing_postcode_samples)}"
        )
    if duplicate_count:
        failures.append(
            f"{path}: {duplicate_count:,} duplicate boundary postcode features; "
            f"sample: {_format_samples(duplicate_samples)}"
        )
    if missing_geometry_count:
        failures.append(
            f"{path}: {missing_geometry_count:,} boundary features are missing or empty "
            f"geometry; sample: {_format_samples(missing_geometry_samples)}"
        )
    if non_polygon_count:
        failures.append(
            f"{path}: {non_polygon_count:,} boundary features are not polygonal; "
            f"sample: {_format_samples(non_polygon_samples)}"
        )
    if invalid_geometry_count:
        failures.append(
            f"{path}: {invalid_geometry_count:,} invalid boundary geometries; "
            f"sample: {_format_samples(invalid_geometry_samples)}"
        )
    return postcodes, failures
def _boundary_postcodes(path: Path) -> set[str]:
    """Return the boundary postcodes under *path*, raising if the scan reported problems.

    Raises:
        ValueError: Carrying every scan failure message joined by ``"; "``.
    """
    found, problems = _boundary_postcode_scan(path)
    if not problems:
        return found
    raise ValueError("; ".join(problems))
def _sample(values: set[str]) -> str:
    """Return the ten smallest values in sorted order, joined with ``", "``."""
    ordered = sorted(values)
    return ", ".join(ordered[:10])
def _failures_for_postcode_boundary_match(spec: str) -> list[str]:
    """Check that parquet postcodes and boundary GeoJSON postcodes are the same set.

    Args:
        spec: ``PARQUET::DIR`` pair naming the postcode parquet and the
            boundary directory.

    Returns:
        Failure messages: the basic parquet/directory failures if either input
        is unusable, otherwise the boundary scan failures plus one message per
        direction of mismatch. Empty when everything matches.
    """
    parquet_path, boundaries_path = _split_pair(spec, "postcode boundary matching")
    precheck = [
        *_failures_for_parquet(parquet_path),
        *_failures_for_dir(boundaries_path),
    ]
    if precheck:
        return precheck
    try:
        expected = _parquet_postcodes(parquet_path)
        found, scan_failures = _boundary_postcode_scan(boundaries_path)
    except Exception as err:
        return [
            f"{parquet_path} / {boundaries_path}: postcode match check failed: {err}"
        ]
    report = [*scan_failures]
    if not found:
        report.append(f"{boundaries_path}: no boundary postcodes found")
    # Postcodes present in the parquet but lacking a boundary feature.
    without_boundary = expected - found
    if without_boundary:
        report.append(
            f"{boundaries_path}: {len(without_boundary):,} postcodes from {parquet_path} "
            f"are missing boundaries; sample: {_sample(without_boundary)}"
        )
    # Boundary features whose postcode is not in the parquet.
    without_record = found - expected
    if without_record:
        report.append(
            f"{boundaries_path}: {len(without_record):,} boundary postcodes are absent from "
            f"{parquet_path}; sample: {_sample(without_record)}"
        )
    return report
def _failures_for_active_postcode_boundary_match(spec: str) -> list[str]:
    """Check that active English ArcGIS postcodes and boundary postcodes are the same set.

    Args:
        spec: ``ARCGIS_PARQUET::DIR`` pair naming the ArcGIS postcode parquet
            and the boundary directory.

    Returns:
        Failure messages: the basic parquet/directory failures if either input
        is unusable, otherwise the boundary scan failures plus one message per
        direction of mismatch. Empty when everything matches.
    """
    arcgis_path, boundaries_path = _split_pair(
        spec, "active postcode boundary matching"
    )
    precheck = [
        *_failures_for_parquet(arcgis_path),
        *_failures_for_dir(boundaries_path),
    ]
    if precheck:
        return precheck
    try:
        expected = _active_english_arcgis_postcodes(arcgis_path)
        found, scan_failures = _boundary_postcode_scan(boundaries_path)
    except Exception as err:
        return [
            f"{arcgis_path} / {boundaries_path}: active postcode boundary check failed: {err}"
        ]
    report = [*scan_failures]
    if not found:
        report.append(f"{boundaries_path}: no boundary postcodes found")
    # Active postcodes lacking a boundary feature.
    without_boundary = expected - found
    if without_boundary:
        report.append(
            f"{boundaries_path}: {len(without_boundary):,} active English postcodes "
            f"from {arcgis_path} are missing boundaries; sample: {_sample(without_boundary)}"
        )
    # Boundary features whose postcode is not an active English postcode.
    not_active = found - expected
    if not_active:
        report.append(
            f"{boundaries_path}: {len(not_active):,} boundary postcodes are not "
            f"active English postcodes in {arcgis_path}; sample: {_sample(not_active)}"
        )
    return report
def main() -> int:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--file", action="append", default=[], type=Path)
parser.add_argument("--dir", action="append", default=[], type=Path)
parser.add_argument("--parquet", action="append", default=[], type=Path)
parser.add_argument("--zip", action="append", default=[], type=Path)
parser.add_argument(
"--glob",
action="append",
default=[],
help="Require at least one non-empty file matching BASE::PATTERN",
)
parser.add_argument(
"--zip-glob",
action="append",
default=[],
help="Require at least one readable zip matching BASE::PATTERN",
)
parser.add_argument(
"--postcode-boundary-match",
action="append",
default=[],
help="Require postcode parquet keys to exactly match boundary GeoJSON postcodes: PARQUET::DIR",
)
parser.add_argument(
"--active-postcode-boundary-match",
action="append",
default=[],
help=(
"Require active English ArcGIS postcodes to exactly match boundary "
"GeoJSON postcodes: ARCGIS_PARQUET::DIR"
),
)
args = parser.parse_args()
failures: list[str] = []
for path in args.file:
failures.extend(_failures_for_file(path))
for path in args.dir:
failures.extend(_failures_for_dir(path))
for path in args.parquet:
failures.extend(_failures_for_parquet(path))
for path in args.zip:
failures.extend(_failures_for_zip(path))
for spec in args.glob:
failures.extend(_failures_for_glob(spec))
for spec in args.zip_glob:
failures.extend(_failures_for_zip_glob(spec))
for spec in args.postcode_boundary_match:
failures.extend(_failures_for_postcode_boundary_match(spec))
for spec in args.active_postcode_boundary_match:
failures.extend(_failures_for_active_postcode_boundary_match(spec))
if failures:
print("Output validation failed:", file=sys.stderr)
for failure in failures:
print(f" - {failure}", file=sys.stderr)
return 1
return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())