improve
This commit is contained in:
parent
8688b7475e
commit
e8345cbdc1
40 changed files with 1980 additions and 904 deletions
|
|
@ -9,6 +9,8 @@ import zipfile
|
|||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
from shapely.geometry import shape
|
||||
from shapely.validation import explain_validity
|
||||
|
||||
|
||||
def _failures_for_file(path: Path) -> list[str]:
|
||||
|
|
@ -79,9 +81,7 @@ def _split_glob(spec: str) -> tuple[Path, str]:
|
|||
|
||||
def _split_pair(spec: str, label: str) -> tuple[Path, Path]:
|
||||
if "::" not in spec:
|
||||
raise argparse.ArgumentTypeError(
|
||||
f"{spec!r} must use LEFT::RIGHT for {label}"
|
||||
)
|
||||
raise argparse.ArgumentTypeError(f"{spec!r} must use LEFT::RIGHT for {label}")
|
||||
left, right = spec.split("::", 1)
|
||||
if not left or not right:
|
||||
raise argparse.ArgumentTypeError(f"{spec!r} must include both paths")
|
||||
|
|
@ -143,22 +143,140 @@ def _parquet_postcodes(path: Path) -> set[str]:
|
|||
.get_column(column)
|
||||
.to_list()
|
||||
)
|
||||
return {_canonical_postcode(value) for value in values if _canonical_postcode(value)}
|
||||
return {
|
||||
_canonical_postcode(value) for value in values if _canonical_postcode(value)
|
||||
}
|
||||
|
||||
|
||||
def _active_english_arcgis_postcodes(path: Path) -> set[str]:
    """Return the canonical ``pcds`` values of active English postcodes.

    The ArcGIS postcode parquet at ``path`` is filtered to rows whose
    ``ctry25cd`` equals ``"E92000001"`` (presumably the ONS country code for
    England -- confirm against the data dictionary) and whose ``doterm`` is
    null. The remaining distinct, non-null ``pcds`` values are passed through
    ``_canonical_postcode``; values that canonicalize to something falsy are
    dropped.

    Args:
        path: Parquet file holding the ArcGIS postcode table.

    Returns:
        The set of canonical postcodes that survived the filters.

    Raises:
        ValueError: if ``pcds``, ``ctry25cd`` or ``doterm`` is absent from the
            parquet schema.
    """
    # One lazy frame serves both the schema check and the query, so the file
    # is opened once and the filters can be pushed down into the parquet scan
    # instead of being applied after an eager read.
    frame = pl.scan_parquet(path)
    required = {"pcds", "ctry25cd", "doterm"}
    missing = sorted(required - set(frame.collect_schema().names()))
    if missing:
        raise ValueError(f"{path}: missing ArcGIS postcode columns: {missing}")
    values = (
        frame.select("pcds", "ctry25cd", "doterm")
        .filter(pl.col("ctry25cd") == "E92000001")
        .filter(pl.col("doterm").cast(pl.Utf8).is_null())
        .select(pl.col("pcds").drop_nulls().unique())
        .collect()
        .get_column("pcds")
        .to_list()
    )
    # Canonicalize each value once, then drop the ones that come back empty.
    canonical = (_canonical_postcode(value) for value in values)
    return {postcode for postcode in canonical if postcode}
|
||||
|
||||
|
||||
def _format_samples(samples: list[str]) -> str:
|
||||
return "; ".join(samples[:10])
|
||||
|
||||
|
||||
# Maximum number of example entries remembered per issue kind.
_BOUNDARY_SAMPLE_LIMIT = 10


class _IssueTally:
    """Count one kind of boundary issue and keep its first few samples."""

    def __init__(self) -> None:
        self.count = 0
        self.samples: list[str] = []

    def has_room(self) -> bool:
        """Return True while another sample would still be kept."""
        return len(self.samples) < _BOUNDARY_SAMPLE_LIMIT

    def add(self, sample: str) -> None:
        """Record one occurrence, keeping ``sample`` only while there is room."""
        self.count += 1
        if self.has_room():
            self.samples.append(sample)


def _boundary_postcode_scan(path: Path) -> tuple[set[str], list[str]]:
    """Scan boundary GeoJSON files for postcodes and per-feature problems.

    Every ``*.geojson`` file in ``path / "units"`` (or in ``path`` itself when
    that subdirectory does not exist) is loaded, and each feature is checked
    for:

    * a ``properties.postcodes`` value that canonicalizes to a non-empty
      postcode,
    * uniqueness of that postcode across all scanned features,
    * a present, non-empty geometry,
    * a ``Polygon`` or ``MultiPolygon`` geometry type,
    * geometry validity as reported by shapely.

    Problems are aggregated: each kind yields at most one failure string
    carrying the total count and up to ten samples. Files that cannot be read
    or whose top-level structure is not a feature collection get one failure
    string each and are otherwise skipped.

    Args:
        path: Boundaries directory, optionally containing a ``units`` folder.

    Returns:
        ``(postcodes, failures)``: the set of canonical postcodes found and a
        list of human-readable failure messages (empty when all is well).
    """
    units_dir = path / "units" if (path / "units").is_dir() else path
    # Canonical postcode -> label of the first feature that carried it.
    seen: dict[str, str] = {}
    failures: list[str] = []
    missing_postcode = _IssueTally()
    duplicate = _IssueTally()
    missing_geometry = _IssueTally()
    non_polygon = _IssueTally()
    invalid_geometry = _IssueTally()

    for geojson_path in sorted(units_dir.glob("*.geojson")):
        try:
            with geojson_path.open("r", encoding="utf-8") as handle:
                data = json.load(handle)
        except Exception as exc:
            # Deliberately broad: one bad file must not abort the whole scan.
            failures.append(f"{geojson_path}: unreadable GeoJSON: {exc}")
            continue

        # Valid JSON is not necessarily a feature collection; report the file
        # instead of crashing on ``.get`` or on iterating a non-list.
        features = data.get("features", []) if isinstance(data, dict) else None
        if not isinstance(features, list):
            failures.append(
                f"{geojson_path}: unreadable GeoJSON: "
                "expected an object with a 'features' array"
            )
            continue

        for idx, feature in enumerate(features):
            label = f"{geojson_path.name} feature {idx}"
            # A malformed entry is treated as an empty feature so it is
            # tallied as missing both its postcode and its geometry.
            record = feature if isinstance(feature, dict) else {}
            properties = record.get("properties")
            if not isinstance(properties, dict):
                properties = {}
            value = properties.get("postcodes")
            postcode = _canonical_postcode(value) if value is not None else ""
            if not postcode:
                missing_postcode.add(label)
            elif postcode in seen:
                duplicate.add(f"{postcode} in {seen[postcode]} and {label}")
            else:
                seen[postcode] = label

            # Geometry samples are named by postcode when one is available.
            subject = postcode or label
            geometry_data = record.get("geometry")
            if geometry_data is None:
                missing_geometry.add(subject)
                continue
            try:
                geom = shape(geometry_data)
            except Exception as exc:
                invalid_geometry.add(f"{subject}: {exc}")
                continue
            if geom.is_empty:
                missing_geometry.add(f"{subject}: empty")
            elif geom.geom_type not in {"Polygon", "MultiPolygon"}:
                non_polygon.add(f"{subject}: {geom.geom_type}")
            elif not geom.is_valid:
                # The explanation is only computed while samples are kept.
                detail = explain_validity(geom) if invalid_geometry.has_room() else ""
                invalid_geometry.add(f"{subject}: {detail}")

    if missing_postcode.count:
        failures.append(
            f"{path}: {missing_postcode.count:,} boundary features are missing "
            f"properties.postcodes; sample: {_format_samples(missing_postcode.samples)}"
        )
    if duplicate.count:
        failures.append(
            f"{path}: {duplicate.count:,} duplicate boundary postcode features; "
            f"sample: {_format_samples(duplicate.samples)}"
        )
    if missing_geometry.count:
        failures.append(
            f"{path}: {missing_geometry.count:,} boundary features are missing or empty "
            f"geometry; sample: {_format_samples(missing_geometry.samples)}"
        )
    if non_polygon.count:
        failures.append(
            f"{path}: {non_polygon.count:,} boundary features are not polygonal; "
            f"sample: {_format_samples(non_polygon.samples)}"
        )
    if invalid_geometry.count:
        failures.append(
            f"{path}: {invalid_geometry.count:,} invalid boundary geometries; "
            f"sample: {_format_samples(invalid_geometry.samples)}"
        )
    # The keys of ``seen`` are exactly the distinct postcodes encountered.
    return set(seen), failures
|
||||
|
||||
|
||||
def _boundary_postcodes(path: Path) -> set[str]:
    """Return the canonical boundary postcodes found under ``path``.

    This is a strict wrapper around ``_boundary_postcode_scan``: the scan does
    all file reading and validation, and any failure it reports is turned into
    an exception here. No separate pass over the GeoJSON files is made, since
    its result would be discarded in favour of the scan's.

    Args:
        path: Boundaries directory passed straight to the scan.

    Returns:
        The postcode set produced by the scan.

    Raises:
        ValueError: if the scan reported any failure; the message is all
            failure strings joined with ``"; "``.
    """
    postcodes, failures = _boundary_postcode_scan(path)
    if failures:
        raise ValueError("; ".join(failures))
    return postcodes
|
||||
|
||||
|
||||
|
|
@ -174,11 +292,13 @@ def _failures_for_postcode_boundary_match(spec: str) -> list[str]:
|
|||
|
||||
try:
|
||||
parquet_postcodes = _parquet_postcodes(parquet_path)
|
||||
boundary_postcodes = _boundary_postcodes(boundaries_path)
|
||||
boundary_postcodes, boundary_failures = _boundary_postcode_scan(boundaries_path)
|
||||
except Exception as exc:
|
||||
return [f"{parquet_path} / {boundaries_path}: postcode match check failed: {exc}"]
|
||||
return [
|
||||
f"{parquet_path} / {boundaries_path}: postcode match check failed: {exc}"
|
||||
]
|
||||
|
||||
failures = []
|
||||
failures = list(boundary_failures)
|
||||
if not boundary_postcodes:
|
||||
failures.append(f"{boundaries_path}: no boundary postcodes found")
|
||||
|
||||
|
|
@ -197,6 +317,41 @@ def _failures_for_postcode_boundary_match(spec: str) -> list[str]:
|
|||
return failures
|
||||
|
||||
|
||||
def _failures_for_active_postcode_boundary_match(spec: str) -> list[str]:
    """Compare active English ArcGIS postcodes with boundary postcodes.

    ``spec`` is split by ``_split_pair`` into an ArcGIS parquet path and a
    boundaries directory. If the basic parquet or directory checks report
    anything, those failures are returned on their own. Otherwise the active
    postcode set and the boundary scan are computed; an exception from either
    step is returned as a single failure string.

    Returns:
        Failure messages: the scan's own failures, a note when no boundary
        postcodes were found, and one entry each for active postcodes without
        a boundary and boundaries without an active postcode. Empty when the
        two sets match and the scan was clean.
    """
    arcgis_path, boundaries_path = _split_pair(
        spec, "active postcode boundary matching"
    )

    # Basic input checks come first; nothing else runs if they complain.
    failures = [
        *_failures_for_parquet(arcgis_path),
        *_failures_for_dir(boundaries_path),
    ]
    if failures:
        return failures

    try:
        active_postcodes = _active_english_arcgis_postcodes(arcgis_path)
        boundary_postcodes, boundary_failures = _boundary_postcode_scan(boundaries_path)
    except Exception as exc:
        return [
            f"{arcgis_path} / {boundaries_path}: active postcode boundary check failed: {exc}"
        ]

    # Both directions of the comparison.
    missing_boundaries = active_postcodes.difference(boundary_postcodes)
    orphan_boundaries = boundary_postcodes.difference(active_postcodes)

    failures = [*boundary_failures]
    if not boundary_postcodes:
        failures.append(f"{boundaries_path}: no boundary postcodes found")
    if missing_boundaries:
        failures.append(
            f"{boundaries_path}: {len(missing_boundaries):,} active English postcodes "
            f"from {arcgis_path} are missing boundaries; sample: {_sample(missing_boundaries)}"
        )
    if orphan_boundaries:
        failures.append(
            f"{boundaries_path}: {len(orphan_boundaries):,} boundary postcodes are not "
            f"active English postcodes in {arcgis_path}; sample: {_sample(orphan_boundaries)}"
        )
    return failures
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--file", action="append", default=[], type=Path)
|
||||
|
|
@ -221,6 +376,15 @@ def main() -> int:
|
|||
default=[],
|
||||
help="Require postcode parquet keys to exactly match boundary GeoJSON postcodes: PARQUET::DIR",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--active-postcode-boundary-match",
|
||||
action="append",
|
||||
default=[],
|
||||
help=(
|
||||
"Require active English ArcGIS postcodes to exactly match boundary "
|
||||
"GeoJSON postcodes: ARCGIS_PARQUET::DIR"
|
||||
),
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
failures: list[str] = []
|
||||
|
|
@ -238,6 +402,8 @@ def main() -> int:
|
|||
failures.extend(_failures_for_zip_glob(spec))
|
||||
for spec in args.postcode_boundary_match:
|
||||
failures.extend(_failures_for_postcode_boundary_match(spec))
|
||||
for spec in args.active_postcode_boundary_match:
|
||||
failures.extend(_failures_for_active_postcode_boundary_match(spec))
|
||||
|
||||
if failures:
|
||||
print("Output validation failed:", file=sys.stderr)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue