# perfect-postcode/pipeline/validate_outputs.py

"""Validate pipeline outputs before Make stamps are touched."""

from __future__ import annotations

import argparse
import json
import sys
import zipfile
from pathlib import Path

import polars as pl
from shapely.geometry import shape
from shapely.validation import explain_validity


def _failures_for_file(path: Path) -> list[str]:
    if not path.exists():
        return [f"{path}: missing"]
    if not path.is_file():
        return [f"{path}: not a file"]
    try:
        size = path.stat().st_size
    except OSError as exc:
        return [f"{path}: unreadable metadata: {exc}"]
    if size <= 0:
        return [f"{path}: empty file"]
    return []


def _failures_for_dir(path: Path) -> list[str]:
    if not path.exists():
        return [f"{path}: missing"]
    if not path.is_dir():
        return [f"{path}: not a directory"]
    try:
        if not any(not child.name.startswith(".") for child in path.iterdir()):
            return [f"{path}: empty directory"]
    except OSError as exc:
        return [f"{path}: unreadable directory: {exc}"]
    return []


def _failures_for_parquet(path: Path) -> list[str]:
    """Check that ``path`` is a readable parquet file with at least one row.

    The basic file checks run first; their failures are returned as-is and
    the parquet is not opened. Any exception raised while counting rows is
    reported as an unreadable parquet.
    """
    file_problems = _failures_for_file(path)
    if file_problems:
        return file_problems
    try:
        counted = pl.scan_parquet(path).select(pl.len()).collect()
        n_rows = counted.item()
    except Exception as exc:
        return [f"{path}: unreadable parquet: {exc}"]
    return [f"{path}: parquet has no rows"] if n_rows <= 0 else []


def _failures_for_zip(path: Path) -> list[str]:
    """Check that ``path`` is a readable zip archive with at least one member.

    The basic file checks run first; their failures are returned as-is.
    Only the archive's member listing is inspected here.
    """
    file_problems = _failures_for_file(path)
    if file_problems:
        return file_problems
    if not zipfile.is_zipfile(path):
        return [f"{path}: unreadable zip"]
    try:
        with zipfile.ZipFile(path) as archive:
            member_names = archive.namelist()
    except Exception as exc:
        return [f"{path}: unreadable zip: {exc}"]
    if not member_names:
        return [f"{path}: zip has no members"]
    return []


def _split_glob(spec: str) -> tuple[Path, str]:
    if "::" not in spec:
        raise argparse.ArgumentTypeError(
            f"{spec!r} must use BASE::PATTERN, for example data::**/*.csv"
        )
    base, pattern = spec.split("::", 1)
    if not base or not pattern:
        raise argparse.ArgumentTypeError(f"{spec!r} must include BASE and PATTERN")
    return Path(base), pattern


def _split_pair(spec: str, label: str) -> tuple[Path, Path]:
    if "::" not in spec:
        raise argparse.ArgumentTypeError(f"{spec!r} must use LEFT::RIGHT for {label}")
    left, right = spec.split("::", 1)
    if not left or not right:
        raise argparse.ArgumentTypeError(f"{spec!r} must include both paths")
    return Path(left), Path(right)


def _canonical_postcode(value: object) -> str:
    compact = "".join(str(value).split()).upper()
    if len(compact) >= 5:
        return f"{compact[:-3]} {compact[-3:]}"
    return compact


def _matched_files(spec: str) -> tuple[Path, str, list[Path]]:
    """Resolve a ``BASE::PATTERN`` spec to the sorted regular files it matches.

    Returns the parsed base, the pattern, and the matches. The match list is
    empty when the base does not exist; non-file matches are ignored.
    """
    base, pattern = _split_glob(spec)
    matches: list[Path] = []
    if base.exists():
        matches = [candidate for candidate in base.glob(pattern) if candidate.is_file()]
        matches.sort()
    return base, pattern, matches


def _failures_for_glob(spec: str) -> list[str]:
    """Require at least one file matching ``BASE::PATTERN``, each non-empty.

    Returns a single "no files matched" failure when nothing matches,
    otherwise the concatenated per-file failures in sorted path order.
    """
    base, pattern, matched = _matched_files(spec)
    if not matched:
        return [f"{base}: no files matched {pattern!r}"]
    return [problem for match in matched for problem in _failures_for_file(match)]


def _failures_for_zip_glob(spec: str) -> list[str]:
    """Require at least one file matching ``BASE::PATTERN``, each a valid zip.

    Returns a single "no zip files matched" failure when nothing matches,
    otherwise the concatenated per-archive failures in sorted path order.
    """
    base, pattern, matched = _matched_files(spec)
    if not matched:
        return [f"{base}: no zip files matched {pattern!r}"]
    return [problem for match in matched for problem in _failures_for_zip(match)]


def _postcode_column(columns: list[str]) -> str | None:
    for name in ("postcode", "Postcode", "pcds", "PCDS"):
        if name in columns:
            return name
    return None


def _parquet_postcodes(path: Path) -> set[str]:
    """Return the canonical, non-empty postcodes stored in a parquet file.

    The postcode column is located with ``_postcode_column``. Null values are
    dropped, and values that canonicalise to an empty string (for example
    whitespace-only cells) are excluded from the result.

    Raises:
        ValueError: if the file has no recognised postcode column.
    """
    scan = pl.scan_parquet(path)
    column = _postcode_column(scan.collect_schema().names())
    if column is None:
        raise ValueError(f"{path}: missing postcode column")
    values = (
        scan.select(pl.col(column).drop_nulls().unique())
        .collect()
        .get_column(column)
        .to_list()
    )
    # Canonicalise each value exactly once, then drop the empty result,
    # instead of canonicalising once to filter and again to store.
    postcodes = {_canonical_postcode(value) for value in values}
    postcodes.discard("")
    return postcodes


def _active_english_arcgis_postcodes(path: Path) -> set[str]:
    """Return canonical postcodes that are English and have a null ``doterm``.

    Rows are kept when ``ctry25cd`` equals ``"E92000001"`` and ``doterm`` is
    null. Null ``pcds`` values are dropped, and values that canonicalise to
    an empty string are excluded from the result.

    Raises:
        ValueError: if any of ``pcds``, ``ctry25cd`` or ``doterm`` is absent.
    """
    required = ("pcds", "ctry25cd", "doterm")
    present = set(pl.scan_parquet(path).collect_schema().names())
    missing = sorted(set(required) - present)
    if missing:
        raise ValueError(f"{path}: missing ArcGIS postcode columns: {missing}")
    values = (
        pl.read_parquet(path, columns=list(required))
        .lazy()
        .filter(pl.col("ctry25cd") == "E92000001")
        # NOTE(review): presumably a null doterm (termination date) marks a
        # live postcode; an empty-string doterm would NOT count as null here
        # -- confirm against how the ArcGIS parquet is written.
        .filter(pl.col("doterm").cast(pl.Utf8).is_null())
        .select(pl.col("pcds").drop_nulls().unique())
        .collect()
        .get_column("pcds")
        .to_list()
    )
    # Canonicalise each value exactly once, then drop the empty result,
    # instead of canonicalising once to filter and again to store.
    postcodes = {_canonical_postcode(value) for value in values}
    postcodes.discard("")
    return postcodes


def _format_samples(samples: list[str]) -> str:
    return "; ".join(samples[:10])


def _boundary_postcode_scan(path: Path) -> tuple[set[str], list[str]]:
    """Scan boundary GeoJSON files, collecting postcodes and data defects.

    Every ``*.geojson`` file directly inside ``path / "units"`` is read when
    that directory exists, otherwise every ``*.geojson`` directly inside
    ``path``. For each feature the ``properties.postcodes`` value is
    canonicalised and the geometry is checked.

    Defects are counted per category (missing postcode, duplicate postcode,
    missing or empty geometry, non-polygonal geometry, invalid geometry) and
    at most ten samples per category are kept for the message.

    Returns:
        ``(postcodes, failures)``: the canonical postcodes seen, and one
        summary message per defect category plus one message per file that
        could not be parsed.

    Note:
        Only file reading/parsing errors are caught here. Errors raised while
        walking the parsed document (for example a feature that is not a
        mapping) propagate to the caller.
    """
    # Prefer the "units" subdirectory when present; otherwise scan `path` itself.
    units_dir = path / "units" if (path / "units").is_dir() else path
    postcodes: set[str] = set()
    # Maps each postcode to the label of the first feature that carried it,
    # so duplicate reports can name both occurrences.
    seen: dict[str, str] = {}
    failures: list[str] = []
    missing_postcode_samples: list[str] = []
    missing_geometry_samples: list[str] = []
    non_polygon_samples: list[str] = []
    invalid_geometry_samples: list[str] = []
    duplicate_samples: list[str] = []
    missing_postcode_count = 0
    missing_geometry_count = 0
    non_polygon_count = 0
    invalid_geometry_count = 0
    duplicate_count = 0

    # Sorted so that feature labels and collected samples are deterministic.
    for geojson_path in sorted(units_dir.glob("*.geojson")):
        try:
            with geojson_path.open("r", encoding="utf-8") as handle:
                data = json.load(handle)
        except Exception as exc:
            # An unreadable file is reported and skipped; the scan continues.
            failures.append(f"{geojson_path}: unreadable GeoJSON: {exc}")
            continue

        for idx, feature in enumerate(data.get("features", [])):
            label = f"{geojson_path.name} feature {idx}"
            # `or {}` also covers an explicit null "properties" member.
            properties = feature.get("properties") or {}
            value = properties.get("postcodes")
            postcode = _canonical_postcode(value) if value is not None else ""
            if not postcode:
                missing_postcode_count += 1
                if len(missing_postcode_samples) < 10:
                    missing_postcode_samples.append(label)
            else:
                if postcode in seen:
                    duplicate_count += 1
                    if len(duplicate_samples) < 10:
                        duplicate_samples.append(
                            f"{postcode} in {seen[postcode]} and {label}"
                        )
                else:
                    seen[postcode] = label
                postcodes.add(postcode)

            # The geometry is checked even when the postcode was missing; the
            # sample then falls back to the feature label.
            geometry_data = feature.get("geometry")
            if geometry_data is None:
                missing_geometry_count += 1
                if len(missing_geometry_samples) < 10:
                    missing_geometry_samples.append(f"{postcode or label}")
                continue
            try:
                geom = shape(geometry_data)
            except Exception as exc:
                # Geometry that cannot be constructed at all counts as invalid.
                invalid_geometry_count += 1
                if len(invalid_geometry_samples) < 10:
                    invalid_geometry_samples.append(f"{postcode or label}: {exc}")
                continue
            # At most one geometry defect is recorded per feature, checked in
            # this order: empty, non-polygonal, invalid.
            if geom.is_empty:
                missing_geometry_count += 1
                if len(missing_geometry_samples) < 10:
                    missing_geometry_samples.append(f"{postcode or label}: empty")
            elif geom.geom_type not in {"Polygon", "MultiPolygon"}:
                non_polygon_count += 1
                if len(non_polygon_samples) < 10:
                    non_polygon_samples.append(f"{postcode or label}: {geom.geom_type}")
            elif not geom.is_valid:
                invalid_geometry_count += 1
                if len(invalid_geometry_samples) < 10:
                    invalid_geometry_samples.append(
                        f"{postcode or label}: {explain_validity(geom)}"
                    )

    # One summary message per defect category that occurred.
    if missing_postcode_count:
        failures.append(
            f"{path}: {missing_postcode_count:,} boundary features are missing "
            f"properties.postcodes; sample: {_format_samples(missing_postcode_samples)}"
        )
    if duplicate_count:
        failures.append(
            f"{path}: {duplicate_count:,} duplicate boundary postcode features; "
            f"sample: {_format_samples(duplicate_samples)}"
        )
    if missing_geometry_count:
        failures.append(
            f"{path}: {missing_geometry_count:,} boundary features are missing or empty "
            f"geometry; sample: {_format_samples(missing_geometry_samples)}"
        )
    if non_polygon_count:
        failures.append(
            f"{path}: {non_polygon_count:,} boundary features are not polygonal; "
            f"sample: {_format_samples(non_polygon_samples)}"
        )
    if invalid_geometry_count:
        failures.append(
            f"{path}: {invalid_geometry_count:,} invalid boundary geometries; "
            f"sample: {_format_samples(invalid_geometry_samples)}"
        )
    return postcodes, failures


def _boundary_postcodes(path: Path) -> set[str]:
    """Return the boundary postcodes under ``path``, raising on any defect.

    Raises:
        ValueError: if the scan reported failures; the message is those
            failures joined with ``"; "``.
    """
    found, problems = _boundary_postcode_scan(path)
    if not problems:
        return found
    raise ValueError("; ".join(problems))


def _sample(values: set[str]) -> str:
    return ", ".join(sorted(values)[:10])


def _failures_for_postcode_boundary_match(spec: str) -> list[str]:
    """Require a parquet's postcodes to exactly match the boundary postcodes.

    ``spec`` is ``PARQUET::DIR``. Both inputs must pass their basic checks
    before the comparison runs. The result combines the boundary scan's own
    failures with a message for postcodes lacking a boundary and a message
    for boundaries lacking a parquet row.
    """
    parquet_path, boundaries_path = _split_pair(spec, "postcode boundary matching")
    precheck = [
        *_failures_for_parquet(parquet_path),
        *_failures_for_dir(boundaries_path),
    ]
    if precheck:
        return precheck

    try:
        expected = _parquet_postcodes(parquet_path)
        actual, scan_problems = _boundary_postcode_scan(boundaries_path)
    except Exception as exc:
        return [
            f"{parquet_path} / {boundaries_path}: postcode match check failed: {exc}"
        ]

    problems = [*scan_problems]
    if not actual:
        problems.append(f"{boundaries_path}: no boundary postcodes found")

    # Postcodes in the parquet that no boundary feature covers.
    without_boundary = expected - actual
    if without_boundary:
        problems.append(
            f"{boundaries_path}: {len(without_boundary):,} postcodes from {parquet_path} "
            f"are missing boundaries; sample: {_sample(without_boundary)}"
        )

    # Boundary features whose postcode is not in the parquet.
    without_row = actual - expected
    if without_row:
        problems.append(
            f"{boundaries_path}: {len(without_row):,} boundary postcodes are absent from "
            f"{parquet_path}; sample: {_sample(without_row)}"
        )
    return problems


def _failures_for_active_postcode_boundary_match(spec: str) -> list[str]:
    """Require active English ArcGIS postcodes to exactly match the boundaries.

    ``spec`` is ``ARCGIS_PARQUET::DIR``. Both inputs must pass their basic
    checks before the comparison runs. The result combines the boundary
    scan's own failures with a message for active postcodes lacking a
    boundary and a message for boundaries that are not active postcodes.
    """
    arcgis_path, boundaries_path = _split_pair(
        spec, "active postcode boundary matching"
    )
    precheck = [
        *_failures_for_parquet(arcgis_path),
        *_failures_for_dir(boundaries_path),
    ]
    if precheck:
        return precheck

    try:
        expected = _active_english_arcgis_postcodes(arcgis_path)
        actual, scan_problems = _boundary_postcode_scan(boundaries_path)
    except Exception as exc:
        return [
            f"{arcgis_path} / {boundaries_path}: active postcode boundary check failed: {exc}"
        ]

    problems = [*scan_problems]
    if not actual:
        problems.append(f"{boundaries_path}: no boundary postcodes found")

    # Active postcodes that no boundary feature covers.
    without_boundary = expected - actual
    if without_boundary:
        problems.append(
            f"{boundaries_path}: {len(without_boundary):,} active English postcodes "
            f"from {arcgis_path} are missing boundaries; sample: {_sample(without_boundary)}"
        )

    # Boundary features whose postcode is not an active English postcode.
    not_active = actual - expected
    if not_active:
        problems.append(
            f"{boundaries_path}: {len(not_active):,} boundary postcodes are not "
            f"active English postcodes in {arcgis_path}; sample: {_sample(not_active)}"
        )
    return problems


def _failures_for_postcode_universe(spec: str) -> list[str]:
    """Require a postcode parquet to hold exactly the active-English universe.

    ``spec`` is ``ARCGIS::POSTCODES``. The set of postcodes in the second
    parquet must equal the active English postcodes derived from the first.
    This catches a truncated or stale postcode file whose rows are each valid
    on their own: per-row validation cannot notice that rows are missing.

    Reports a count mismatch, the postcodes that are missing, and the
    postcodes that should not be there.
    """
    arcgis_path, postcodes_path = _split_pair(spec, "postcode universe")
    precheck = [
        *_failures_for_parquet(arcgis_path),
        *_failures_for_parquet(postcodes_path),
    ]
    if precheck:
        return precheck

    try:
        universe = _active_english_arcgis_postcodes(arcgis_path)
        present = _parquet_postcodes(postcodes_path)
    except Exception as exc:
        return [
            f"{arcgis_path} / {postcodes_path}: postcode universe check failed: {exc}"
        ]

    problems: list[str] = []
    n_present, n_universe = len(present), len(universe)
    if n_present != n_universe:
        problems.append(
            f"{postcodes_path}: postcode count {n_present:,} != active-English NSPL "
            f"universe {n_universe:,} (from {arcgis_path})"
        )

    absent = universe - present
    if absent:
        problems.append(
            f"{postcodes_path}: {len(absent):,} active English postcodes from "
            f"{arcgis_path} are missing; sample: {_sample(absent)}"
        )

    unexpected = present - universe
    if unexpected:
        problems.append(
            f"{postcodes_path}: {len(unexpected):,} postcodes are not active English "
            f"postcodes in {arcgis_path}; sample: {_sample(unexpected)}"
        )
    return problems


def _failures_for_postcode_features(path: Path) -> list[str]:
    """Validate the postcode feature output: unique Postcode, non-null lat/lon
    inside the England bbox, ctry25cd == E92000001, and every '% ' column in
    [0, 100]. Mirrors the in-build invariant (merge._validate_postcode_feature_output)
    so a stale/contaminated file on disk cannot pass `make`.

    A null ``ctry25cd`` is treated as a country failure: it is not equal to
    ``E92000001``, and a plain ``!=`` comparison would evaluate to null and
    be dropped by ``filter``, letting such rows through unnoticed.

    Returns a list of failure messages; empty when the file is valid.
    """
    failures = _failures_for_parquet(path)
    if failures:
        return failures

    try:
        names = pl.scan_parquet(path).collect_schema().names()
        required = {"Postcode", "lat", "lon", "ctry25cd"}
        missing = sorted(required - set(names))
        if missing:
            return [f"{path}: postcode features missing required columns: {missing}"]

        pct_cols = [c for c in names if c.startswith("% ")]
        df = (
            pl.scan_parquet(path)
            .select(["Postcode", "lat", "lon", "ctry25cd", *pct_cols])
            .collect()
        )
    except Exception as exc:
        return [f"{path}: postcode features validation failed: {exc}"]

    duplicate_rows = df.height - df["Postcode"].n_unique()
    if duplicate_rows:
        failures.append(
            f"{path}: Postcode is not unique "
            f"({duplicate_rows:,} duplicate rows)"
        )

    def _postcode_sample(frame: pl.DataFrame) -> list[str]:
        # str() keeps a null Postcode from breaking the "; ".join in
        # _format_samples; it is a no-op for ordinary string postcodes.
        return [str(p) for p in frame.get_column("Postcode").head(10).to_list()]

    # England bounding box (generous): lat 49.5-60N, lon -8 to 2.5E.
    bad_coords = df.filter(
        pl.col("lat").is_null()
        | pl.col("lon").is_null()
        | ~pl.col("lat").is_between(49.5, 60.0)
        | ~pl.col("lon").is_between(-8.0, 2.5)
    )
    if bad_coords.height:
        failures.append(
            f"{path}: {bad_coords.height:,} rows have null or out-of-England "
            f"lat/lon; sample: {_format_samples(_postcode_sample(bad_coords))}"
        )

    # The explicit is_null() is required: `null != "E92000001"` is null, and
    # filter() only keeps rows whose predicate is true.
    bad_country = df.filter(
        pl.col("ctry25cd").is_null() | (pl.col("ctry25cd") != "E92000001")
    )
    if bad_country.height:
        failures.append(
            f"{path}: {bad_country.height:,} rows have ctry25cd != 'E92000001' "
            f"(non-England contamination); sample: "
            f"{_format_samples(_postcode_sample(bad_country))}"
        )

    # Null percentages are allowed; only present values are range-checked.
    for col in pct_cols:
        out_of_range = df.filter(
            pl.col(col).is_not_null() & ~pl.col(col).is_between(0.0, 100.0)
        ).height
        if out_of_range:
            failures.append(
                f"{path}: {col!r} has {out_of_range:,} values outside [0, 100]"
            )

    return failures


def _failures_for_properties_subset(spec: str) -> list[str]:
    """Require property postcodes to be a subset of the postcode feature keys.

    ``spec`` is ``PROPERTIES::POSTCODE``. Besides the orphan check, every
    numeric column of the properties file whose name contains "price" or
    "rent" must hold only positive values (nulls are allowed).
    """
    properties_path, postcode_path = _split_pair(spec, "properties subset")
    problems = [
        *_failures_for_parquet(properties_path),
        *_failures_for_parquet(postcode_path),
    ]
    if problems:
        return problems

    try:
        known = _parquet_postcodes(postcode_path)
        referenced = _parquet_postcodes(properties_path)
    except Exception as exc:
        return [f"{properties_path} / {postcode_path}: subset check failed: {exc}"]

    unknown = referenced - known
    if unknown:
        problems.append(
            f"{properties_path}: {len(unknown):,} property postcodes are absent from "
            f"{postcode_path}; sample: {_sample(unknown)}"
        )

    # Only plain numeric columns are checked for positivity; nested/list
    # columns such as historical_prices match on name but not on dtype.
    try:
        numeric_dtypes = {
            pl.Int8, pl.Int16, pl.Int32, pl.Int64,
            pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
            pl.Float32, pl.Float64,
        }
        schema = pl.scan_parquet(properties_path).collect_schema()
        checked_columns = []
        for column, dtype in schema.items():
            lowered = column.lower()
            # NOTE(review): substring matching also selects names such as
            # "current_*" or "parent_*" via "rent" -- confirm that is intended.
            if ("price" in lowered or "rent" in lowered) and dtype in numeric_dtypes:
                checked_columns.append(column)

        for column in checked_columns:
            is_non_positive = pl.col(column).is_not_null() & (pl.col(column) <= 0)
            non_positive = (
                pl.scan_parquet(properties_path)
                .filter(is_non_positive)
                .select(pl.len())
                .collect()
                .item()
            )
            if non_positive:
                problems.append(
                    f"{properties_path}: {column!r} has {non_positive:,} non-positive values"
                )
    except Exception as exc:
        problems.append(f"{properties_path}: price positivity check failed: {exc}")

    return problems


def _failures_for_price_index(path: Path) -> list[str]:
    """Check the structural integrity of price_index.parquet.

    Verifies that the required columns exist, that ``log_index`` is never
    null and never non-finite, and that ``(sector, type_group, year)``
    identifies each row uniquely.

    Rows with ``n_pairs == 0`` are deliberately accepted: they are legitimate
    hedonic/shrinkage fallbacks for sectors with too few repeat-sale pairs.
    """
    problems = _failures_for_parquet(path)
    if problems:
        return problems

    key_columns = ("sector", "type_group", "year")
    try:
        available = set(pl.scan_parquet(path).collect_schema().names())
        absent = sorted({*key_columns, "log_index", "n_pairs"} - available)
        if absent:
            return [f"{path}: price index missing required columns: {absent}"]

        log_index = pl.col("log_index")
        summary = (
            pl.scan_parquet(path)
            .select(
                pl.len().alias("n"),
                log_index.null_count().alias("null_log"),
                (~log_index.is_finite()).sum().alias("nonfinite_log"),
                pl.struct(*key_columns).n_unique().alias("unique_keys"),
            )
            .collect()
            .row(0, named=True)
        )
    except Exception as exc:
        return [f"{path}: price index validation failed: {exc}"]

    null_logs = summary["null_log"]
    if null_logs:
        problems.append(f"{path}: {null_logs:,} rows have null log_index")

    nonfinite_logs = summary["nonfinite_log"]
    if nonfinite_logs:
        problems.append(
            f"{path}: {nonfinite_logs:,} rows have non-finite log_index"
        )

    total_rows, distinct_keys = summary["n"], summary["unique_keys"]
    if distinct_keys != total_rows:
        problems.append(
            f"{path}: (sector, type_group, year) is not unique "
            f"({total_rows - distinct_keys:,} duplicate rows)"
        )

    return problems


def main() -> int:
    """Parse the command line, run every requested check, and report failures.

    Each option may be repeated. Checks run grouped by option in the order
    the options are declared below, and all failures are collected before
    anything is printed.

    Returns:
        0 when every check passes, 1 after printing the failures to stderr.

    Raises:
        SystemExit: with status 2 (via ``parser.error``) when the command
            line is invalid, including a malformed ``A::B`` spec.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--file", action="append", default=[], type=Path)
    parser.add_argument("--dir", action="append", default=[], type=Path)
    parser.add_argument("--parquet", action="append", default=[], type=Path)
    parser.add_argument("--zip", action="append", default=[], type=Path)
    parser.add_argument(
        "--glob",
        action="append",
        default=[],
        help="Require at least one non-empty file matching BASE::PATTERN",
    )
    parser.add_argument(
        "--zip-glob",
        action="append",
        default=[],
        help="Require at least one readable zip matching BASE::PATTERN",
    )
    parser.add_argument(
        "--postcode-boundary-match",
        action="append",
        default=[],
        help="Require postcode parquet keys to exactly match boundary GeoJSON postcodes: PARQUET::DIR",
    )
    parser.add_argument(
        "--active-postcode-boundary-match",
        action="append",
        default=[],
        help=(
            "Require active English ArcGIS postcodes to exactly match boundary "
            "GeoJSON postcodes: ARCGIS_PARQUET::DIR"
        ),
    )
    parser.add_argument(
        "--postcode-features",
        action="append",
        default=[],
        type=Path,
        help=(
            "Validate a postcode feature parquet: unique Postcode, non-null "
            "lat/lon in England, ctry25cd=E92000001, '% ' columns in [0,100]"
        ),
    )
    parser.add_argument(
        "--postcode-universe",
        action="append",
        default=[],
        help=(
            "Require postcode parquet keys to equal the active-English NSPL "
            "universe: ARCGIS::POSTCODES"
        ),
    )
    parser.add_argument(
        "--properties-subset",
        action="append",
        default=[],
        help="Require properties postcodes to be a subset of postcode keys: PROPERTIES::POSTCODE",
    )
    parser.add_argument(
        "--price-index",
        action="append",
        default=[],
        type=Path,
        help="Validate price_index.parquet: finite log_index and unique (sector,type_group,year)",
    )
    args = parser.parse_args()

    failures: list[str] = []
    try:
        for path in args.file:
            failures.extend(_failures_for_file(path))
        for path in args.dir:
            failures.extend(_failures_for_dir(path))
        for path in args.parquet:
            failures.extend(_failures_for_parquet(path))
        for path in args.zip:
            failures.extend(_failures_for_zip(path))
        for spec in args.glob:
            failures.extend(_failures_for_glob(spec))
        for spec in args.zip_glob:
            failures.extend(_failures_for_zip_glob(spec))
        for spec in args.postcode_boundary_match:
            failures.extend(_failures_for_postcode_boundary_match(spec))
        for spec in args.active_postcode_boundary_match:
            failures.extend(_failures_for_active_postcode_boundary_match(spec))
        for path in args.postcode_features:
            failures.extend(_failures_for_postcode_features(path))
        for spec in args.postcode_universe:
            failures.extend(_failures_for_postcode_universe(spec))
        for spec in args.properties_subset:
            failures.extend(_failures_for_properties_subset(spec))
        for path in args.price_index:
            failures.extend(_failures_for_price_index(path))
    except argparse.ArgumentTypeError as exc:
        # The A::B specs are split inside the checks, after parse_args() has
        # returned, so argparse cannot turn this into a usage error itself.
        # Report it the same way instead of letting a traceback escape.
        parser.error(str(exc))

    if failures:
        print("Output validation failed:", file=sys.stderr)
        for failure in failures:
            print(f"  - {failure}", file=sys.stderr)
        return 1

    return 0


if __name__ == "__main__":
    raise SystemExit(main())