scraping and data

This commit is contained in:
Andras Schmelczer 2026-05-31 15:36:33 +01:00
parent d98819b569
commit 8688b7475e
43 changed files with 4920 additions and 531 deletions

View file

@ -3,6 +3,7 @@
from __future__ import annotations
import argparse
import json
import sys
import zipfile
from pathlib import Path
@ -76,6 +77,24 @@ def _split_glob(spec: str) -> tuple[Path, str]:
return Path(base), pattern
def _split_pair(spec: str, label: str) -> tuple[Path, Path]:
if "::" not in spec:
raise argparse.ArgumentTypeError(
f"{spec!r} must use LEFT::RIGHT for {label}"
)
left, right = spec.split("::", 1)
if not left or not right:
raise argparse.ArgumentTypeError(f"{spec!r} must include both paths")
return Path(left), Path(right)
def _canonical_postcode(value: object) -> str:
compact = "".join(str(value).split()).upper()
if len(compact) >= 5:
return f"{compact[:-3]} {compact[-3:]}"
return compact
def _matched_files(spec: str) -> tuple[Path, str, list[Path]]:
base, pattern = _split_glob(spec)
if not base.exists():
@ -105,6 +124,79 @@ def _failures_for_zip_glob(spec: str) -> list[str]:
return failures
def _postcode_column(columns: list[str]) -> str | None:
for name in ("postcode", "Postcode", "pcds", "PCDS"):
if name in columns:
return name
return None
def _parquet_postcodes(path: Path) -> set[str]:
    """Return the distinct canonical postcodes stored in the parquet at ``path``.

    The postcode column is located with ``_postcode_column``. Nulls are
    dropped, and each remaining distinct value is normalised with
    ``_canonical_postcode``. Values that normalise to an empty string are
    left out of the result.

    Raises:
        ValueError: If the file has none of the recognised postcode columns.
    """
    # One lazy frame serves both the schema lookup and the data read.
    frame = pl.scan_parquet(path)
    column = _postcode_column(frame.collect_schema().names())
    if column is None:
        raise ValueError(f"{path}: missing postcode column")
    values = (
        frame.select(pl.col(column).drop_nulls().unique())
        .collect()
        .get_column(column)
        .to_list()
    )
    # Canonicalise every value exactly once; the only falsy result possible
    # is the empty string, so discarding it matches a truthiness filter.
    postcodes = {_canonical_postcode(value) for value in values}
    postcodes.discard("")
    return postcodes
def _boundary_postcodes(path: Path) -> set[str]:
    """Collect canonical postcodes from the boundary GeoJSON files under ``path``.

    Files are read from ``path / "units"`` when that directory exists,
    otherwise from ``path`` itself. Every ``*.geojson`` file found there is
    parsed, and the ``postcodes`` property of each feature is normalised
    with ``_canonical_postcode``. Features whose property is absent or
    ``None``, and values that normalise to an empty string, are ignored.
    """
    nested = path / "units"
    source_dir = nested if nested.is_dir() else path
    found: set[str] = set()
    for file_path in sorted(source_dir.glob("*.geojson")):
        with file_path.open("r", encoding="utf-8") as stream:
            document = json.load(stream)
        for feature in document.get("features", []):
            # A feature may carry ``"properties": null``; treat it as empty.
            raw = (feature.get("properties") or {}).get("postcodes")
            if raw is None:
                continue
            canonical = _canonical_postcode(raw)
            if canonical:
                found.add(canonical)
    return found
def _sample(values: set[str]) -> str:
return ", ".join(sorted(values)[:10])
def _failures_for_postcode_boundary_match(spec: str) -> list[str]:
    """Check that parquet postcodes and boundary postcodes form the same set.

    ``spec`` is split with ``_split_pair`` into a parquet path and a
    boundaries path. Any failures reported by ``_failures_for_parquet`` or
    ``_failures_for_dir`` for those paths are returned immediately. Otherwise
    both postcode sets are loaded and compared in both directions.

    Returns:
        Failure messages, or an empty list when the two sets match. An
        exception raised while loading either set is reported as a single
        failure message instead of propagating.
    """
    parquet_path, boundaries_path = _split_pair(spec, "postcode boundary matching")

    precheck = _failures_for_parquet(parquet_path) + _failures_for_dir(boundaries_path)
    if precheck:
        return precheck

    try:
        in_parquet = _parquet_postcodes(parquet_path)
        in_boundaries = _boundary_postcodes(boundaries_path)
    except Exception as exc:
        return [f"{parquet_path} / {boundaries_path}: postcode match check failed: {exc}"]

    problems: list[str] = []
    if not in_boundaries:
        problems.append(f"{boundaries_path}: no boundary postcodes found")

    # Postcodes present in the parquet but absent from the boundaries.
    without_boundary = in_parquet - in_boundaries
    if without_boundary:
        problems.append(
            f"{boundaries_path}: {len(without_boundary):,} postcodes from {parquet_path} "
            f"are missing boundaries; sample: {_sample(without_boundary)}"
        )

    # Postcodes present in the boundaries but absent from the parquet.
    without_row = in_boundaries - in_parquet
    if without_row:
        problems.append(
            f"{boundaries_path}: {len(without_row):,} boundary postcodes are absent from "
            f"{parquet_path}; sample: {_sample(without_row)}"
        )
    return problems
def main() -> int:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--file", action="append", default=[], type=Path)
@ -123,6 +215,12 @@ def main() -> int:
default=[],
help="Require at least one readable zip matching BASE::PATTERN",
)
parser.add_argument(
"--postcode-boundary-match",
action="append",
default=[],
help="Require postcode parquet keys to exactly match boundary GeoJSON postcodes: PARQUET::DIR",
)
args = parser.parse_args()
failures: list[str] = []
@ -138,6 +236,8 @@ def main() -> int:
failures.extend(_failures_for_glob(spec))
for spec in args.zip_glob:
failures.extend(_failures_for_zip_glob(spec))
for spec in args.postcode_boundary_match:
failures.extend(_failures_for_postcode_boundary_match(spec))
if failures:
print("Output validation failed:", file=sys.stderr)