Improve data pipeline

2026-06-01 20:10:03 +01:00 · 2026-06-01 20:10:03 +01:00 · f99bd4e5c9
commit f99bd4e5c9
parent e8345cbdc1
36 changed files with 966 additions and 129 deletions
--- a/pipeline/validate_outputs.py
+++ b/pipeline/validate_outputs.py
@ -352,6 +352,176 @@ def _failures_for_active_postcode_boundary_match(spec: str) -> list[str]:
    return failures


+def _failures_for_postcode_features(path: Path) -> list[str]:
+    """Validate the postcode feature parquet at *path*.
+
+    Anything reported by `_failures_for_parquet` is returned immediately.
+    Otherwise the following are checked and reported together:
+
+    * the required columns Postcode, lat, lon and ctry25cd exist (a missing
+      column returns early, since nothing else can be checked);
+    * Postcode is unique;
+    * lat/lon are non-null and inside a generous England bounding box;
+    * ctry25cd is non-null and equal to E92000001;
+    * every column whose name starts with '% ' holds only values in
+      [0, 100] (nulls in those columns are not reported).
+
+    Mirrors the in-build invariant (merge._validate_postcode_feature_output)
+    so a stale/contaminated file on disk cannot pass `make`.
+
+    Returns a list of human-readable failure messages; empty when valid.
+    """
+    failures = _failures_for_parquet(path)
+    if failures:
+        return failures
+
+    try:
+        names = pl.scan_parquet(path).collect_schema().names()
+        required = {"Postcode", "lat", "lon", "ctry25cd"}
+        missing = sorted(required - set(names))
+        if missing:
+            return [f"{path}: postcode features missing required columns: {missing}"]
+
+        # Only load the columns that are actually validated.
+        pct_cols = [c for c in names if c.startswith("% ")]
+        df = (
+            pl.scan_parquet(path)
+            .select(["Postcode", "lat", "lon", "ctry25cd", *pct_cols])
+            .collect()
+        )
+    except Exception as exc:
+        # Any read/schema error is reported as a validation failure rather
+        # than aborting the remaining validators.
+        return [f"{path}: postcode features validation failed: {exc}"]
+
+    # Compute the distinct count once; it is needed for both the test and
+    # the message.
+    duplicate_rows = df.height - df["Postcode"].n_unique()
+    if duplicate_rows:
+        failures.append(
+            f"{path}: Postcode is not unique "
+            f"({duplicate_rows:,} duplicate rows)"
+        )
+
+    # England bounding box (generous): lat 49.5-60N, lon -8 to 2.5E.
+    # Nulls are tested explicitly: a null comparison yields null, and filter()
+    # drops rows whose predicate is null.
+    bad_coords = df.filter(
+        pl.col("lat").is_null()
+        | pl.col("lon").is_null()
+        | ~pl.col("lat").is_between(49.5, 60.0)
+        | ~pl.col("lon").is_between(-8.0, 2.5)
+    )
+    if bad_coords.height:
+        sample = bad_coords.get_column("Postcode").head(10).to_list()
+        failures.append(
+            f"{path}: {bad_coords.height:,} rows have null or out-of-England "
+            f"lat/lon; sample: {_format_samples(sample)}"
+        )
+
+    # A null country code must fail too. Without the is_null() arm the
+    # `!=` comparison evaluates to null for those rows and filter() would
+    # silently let them through.
+    bad_country = df.filter(
+        pl.col("ctry25cd").is_null() | (pl.col("ctry25cd") != "E92000001")
+    )
+    if bad_country.height:
+        sample = bad_country.get_column("Postcode").head(10).to_list()
+        failures.append(
+            f"{path}: {bad_country.height:,} rows have null ctry25cd or "
+            f"ctry25cd != 'E92000001' (non-England contamination); "
+            f"sample: {_format_samples(sample)}"
+        )
+
+    for col in pct_cols:
+        # Nulls are deliberately excluded here: only present values are
+        # range-checked.
+        out_of_range = df.filter(
+            pl.col(col).is_not_null() & ~pl.col(col).is_between(0.0, 100.0)
+        ).height
+        if out_of_range:
+            failures.append(
+                f"{path}: {col!r} has {out_of_range:,} values outside [0, 100]"
+            )
+
+    return failures
+
+
+def _failures_for_properties_subset(spec: str) -> list[str]:
+    """Check a properties parquet against its postcode feature parquet.
+
+    *spec* is split by `_split_pair` into a properties path and a postcode
+    path. Two kinds of problem are reported: property postcodes that do not
+    appear in the postcode table (orphans), and non-positive values in
+    numeric columns whose name contains "price" or "rent".
+
+    Returns a list of failure messages; empty when everything passes.
+    """
+    properties_path, postcode_path = _split_pair(spec, "properties subset")
+
+    failures = _failures_for_parquet(properties_path)
+    failures = failures + _failures_for_parquet(postcode_path)
+    if failures:
+        return failures
+
+    # Load both key sets up front; any read error aborts the whole check.
+    try:
+        known_postcodes = _parquet_postcodes(postcode_path)
+        property_postcodes = _parquet_postcodes(properties_path)
+    except Exception as exc:
+        return [f"{properties_path} / {postcode_path}: subset check failed: {exc}"]
+
+    missing_keys = property_postcodes - known_postcodes
+    if missing_keys:
+        orphan_message = (
+            f"{properties_path}: {len(missing_keys):,} property postcodes are absent from "
+            f"{postcode_path}; sample: {_sample(missing_keys)}"
+        )
+        failures.append(orphan_message)
+
+    # Only true numeric dtypes are checked for positivity, so nested/list
+    # columns such as historical_prices (which also contain "price" in the
+    # name) are skipped.
+    numeric_dtypes = {
+        pl.Float64, pl.Float32,
+        pl.UInt64, pl.UInt32, pl.UInt16, pl.UInt8,
+        pl.Int64, pl.Int32, pl.Int16, pl.Int8,
+    }
+    try:
+        properties = pl.scan_parquet(properties_path)
+        for column, dtype in properties.collect_schema().items():
+            lowered = column.lower()
+            if "price" not in lowered and "rent" not in lowered:
+                continue
+            if dtype not in numeric_dtypes:
+                continue
+
+            # Nulls are not counted; only present values <= 0 are.
+            non_positive = pl.col(column).is_not_null() & (pl.col(column) <= 0)
+            bad = properties.filter(non_positive).select(pl.len()).collect().item()
+            if bad:
+                failures.append(
+                    f"{properties_path}: {column!r} has {bad:,} non-positive values"
+                )
+    except Exception as exc:
+        failures.append(f"{properties_path}: price positivity check failed: {exc}")
+
+    return failures
+
+
+def _failures_for_price_index(path: Path) -> list[str]:
+    """Check the structural integrity of price_index.parquet.
+
+    Reports missing required columns, null log_index values, non-finite
+    log_index values, and duplicate (sector, type_group, year) keys.
+
+    Rows with n_pairs == 0 are deliberately accepted: they are legitimate
+    hedonic/shrinkage fallbacks for sectors with too few repeat-sale pairs.
+
+    Returns a list of failure messages; empty when the file is valid.
+    """
+    failures = _failures_for_parquet(path)
+    if failures:
+        return failures
+
+    key_columns = ("sector", "type_group", "year")
+    try:
+        lazy = pl.scan_parquet(path)
+        present = set(lazy.collect_schema().names())
+        missing = sorted({*key_columns, "log_index", "n_pairs"} - present)
+        if missing:
+            return [f"{path}: price index missing required columns: {missing}"]
+
+        # Gather every statistic in a single pass over the file.
+        log_index = pl.col("log_index")
+        summary = lazy.select(
+            pl.len().alias("n"),
+            log_index.null_count().alias("null_log"),
+            (~log_index.is_finite()).sum().alias("nonfinite_log"),
+            pl.struct(*key_columns).n_unique().alias("unique_keys"),
+        ).collect()
+        n_rows, n_null, n_nonfinite, n_keys = summary.row(0)
+    except Exception as exc:
+        return [f"{path}: price index validation failed: {exc}"]
+
+    if n_null:
+        failures.append(f"{path}: {n_null:,} rows have null log_index")
+
+    if n_nonfinite:
+        failures.append(
+            f"{path}: {n_nonfinite:,} rows have non-finite log_index"
+        )
+
+    # Fewer distinct keys than rows means at least one key is repeated.
+    if n_keys != n_rows:
+        duplicates = n_rows - n_keys
+        failures.append(
+            f"{path}: (sector, type_group, year) is not unique "
+            f"({duplicates:,} duplicate rows)"
+        )
+
+    return failures
+
+
 def main() -> int:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--file", action="append", default=[], type=Path)
@ -385,6 +555,29 @@ def main() -> int:
            "GeoJSON postcodes: ARCGIS_PARQUET::DIR"
        ),
    )
+    parser.add_argument(
+        "--postcode-features",
+        action="append",
+        default=[],
+        type=Path,
+        help=(
+            "Validate a postcode feature parquet: unique Postcode, non-null "
+            "lat/lon in England, ctry25cd=E92000001, '% ' columns in [0,100]"
+        ),
+    )
+    parser.add_argument(
+        "--properties-subset",
+        action="append",
+        default=[],
+        help="Require properties postcodes to be a subset of postcode keys: PROPERTIES::POSTCODE",
+    )
+    parser.add_argument(
+        "--price-index",
+        action="append",
+        default=[],
+        type=Path,
+        help="Validate price_index.parquet: finite log_index and unique (sector,type_group,year)",
+    )
    args = parser.parse_args()

    failures: list[str] = []
@ -404,6 +597,12 @@ def main() -> int:
        failures.extend(_failures_for_postcode_boundary_match(spec))
    for spec in args.active_postcode_boundary_match:
        failures.extend(_failures_for_active_postcode_boundary_match(spec))
+    for path in args.postcode_features:
+        failures.extend(_failures_for_postcode_features(path))
+    for spec in args.properties_subset:
+        failures.extend(_failures_for_properties_subset(spec))
+    for path in args.price_index:
+        failures.extend(_failures_for_price_index(path))

    if failures:
        print("Output validation failed:", file=sys.stderr)