Fix data pipelines once and for all

2026-06-10 21:27:32 +01:00 · 2026-06-10 21:27:32 +01:00 · 4012e4e047
commit 4012e4e047
parent 08560476c5
46 changed files with 4508 additions and 855 deletions
--- a/pipeline/check_school_cutoffs.py
+++ b/pipeline/check_school_cutoffs.py
@ -0,0 +1,297 @@
+"""Evaluate modelled school catchment radii against published cutoffs.
+
+Local authorities publish each school's "last distance offered" in their
+yearly allocation reports; ``property-data/ground_truth/cutoffs_*.json``
+holds a scraped sample of those figures (see the collection notes in each
+file's ``source_url`` fields). This script matches them to the per-school
+radii emitted by ``pipeline.transform.school_catchments --schools-output``
+and reports how well the model reproduces reality, so the preference-bonus
+constants can be calibrated.
+
+Headline metrics use non-faith schools whose published cutoff was a binding
+distance. Faith schools are reported separately (their distance criterion
+applies within faith priority, so published figures aren't comparable), as
+are "all applicants offered" schools, where the model should ideally show no
+binding cutoff.
+"""
+
+import argparse
+import difflib
+import json
+import re
+from pathlib import Path
+
+import numpy as np
+import polars as pl
+
+# Articles, conjunctions and denominational/governance qualifiers that are
+# removed from a name before it is compared.
+_NOISE_WORDS = re.compile(
+    r"\b(the|of|and|c\s*of\s*e|cofe|ce|rc|voluntary|aided|controlled|va|vc)\b"
+)
+# Anything other than lowercase ASCII letters, digits and spaces.
+_NON_ALNUM = re.compile(r"[^a-z0-9 ]")
+# School-type words, removed only when ``strip_school_words`` is requested.
+_SCHOOL_WORDS = re.compile(
+    r"\b(school|academy|primary|secondary|junior|infant|community|college|high)\b"
+)
+# Straight and typographic apostrophes (and the backtick), deleted outright so
+# that "St Mary's" and "St Mary\u2019s" both normalize to "st marys" instead of
+# the typographic form turning into "st mary s".
+_APOSTROPHES = str.maketrans("", "", "'\u2019\u2018`")
+
+
+def normalize_name(name: str, strip_school_words: bool = False) -> str:
+    """Return a lowercase, punctuation-free key for comparing school names.
+
+    ``&`` is spelled out as ``and``, apostrophes of any style are deleted,
+    every other non-alphanumeric character becomes a space, the words matched
+    by ``_NOISE_WORDS`` are removed and whitespace is collapsed.
+
+    Args:
+        name: school name as published.
+        strip_school_words: also remove the words matched by
+            ``_SCHOOL_WORDS`` ("school", "primary", ...), for looser matching.
+
+    Returns:
+        The normalized name; may be empty if every word was removed.
+    """
+    s = name.lower().replace("&", " and ").replace("st.", "st ")
+    s = s.translate(_APOSTROPHES)
+    s = _NON_ALNUM.sub(" ", s)
+    s = _NOISE_WORDS.sub(" ", s)
+    if strip_school_words:
+        s = _SCHOOL_WORDS.sub(" ", s)
+    return " ".join(s.split())
+
+
+def normalize_la(la: str) -> str:
+    """Return a lowercase, punctuation-free key for a local-authority name.
+
+    ``&`` is spelled out as ``and``, remaining non-alphanumeric characters
+    become spaces, the phrase ``city of`` is dropped and whitespace is
+    collapsed.
+    """
+    lowered = la.lower().replace("&", " and ")
+    cleaned = _NON_ALNUM.sub(" ", lowered).replace("city of", "")
+    return " ".join(cleaned.split())
+
+
+def load_ground_truth(directory: Path) -> pl.DataFrame:
+    """Load every ``cutoffs_*.json`` file under *directory* into one frame.
+
+    Each file is parsed as a JSON array of objects. ``school_name``, ``la``
+    and ``phase`` are required keys; the other fields fall back to defaults
+    (``entry_year`` 0, ``cutoff_km`` and ``school_postcode`` null, both flags
+    ``False``, ``source_url`` empty).
+
+    Args:
+        directory: folder containing the ``cutoffs_*.json`` files.
+
+    Returns:
+        One row per JSON object, files read in sorted path order.
+
+    Raises:
+        SystemExit: if no file yields any row.
+        KeyError: if an object lacks one of the required keys.
+    """
+    rows = [
+        {
+            "school_name": row["school_name"],
+            "la": row["la"],
+            "phase": row["phase"],
+            "entry_year": int(row.get("entry_year") or 0),
+            "cutoff_km": (
+                float(row["cutoff_km"]) if row.get("cutoff_km") is not None else None
+            ),
+            "all_offered": bool(row.get("all_offered", False)),
+            "faith_school": bool(row.get("faith_school", False)),
+            "school_postcode": row.get("school_postcode"),
+            "source_url": row.get("source_url", ""),
+        }
+        for path in sorted(directory.glob("cutoffs_*.json"))
+        # JSON is UTF-8 by specification; don't depend on the locale default.
+        for row in json.loads(path.read_text(encoding="utf-8"))
+    ]
+    if not rows:
+        raise SystemExit(f"No cutoffs_*.json files with rows under {directory}")
+    df = pl.DataFrame(
+        rows,
+        # Pin both nullable columns: dtype inference only samples the leading
+        # rows, and a run of nulls there would otherwise decide the dtype.
+        schema_overrides={"cutoff_km": pl.Float64, "school_postcode": pl.Utf8},
+    )
+    print(f"Ground truth rows: {len(df)} from {directory}")
+    return df
+
+
+def match_schools(truth: pl.DataFrame, gias: pl.DataFrame) -> pl.DataFrame:
+    """Attach GIAS URNs to ground-truth rows by postcode, then name.
+
+    Four passes run in order, each only over the rows earlier passes left
+    unmatched: unique postcode, exact normalized (name, LA), name with
+    school-type words stripped, and finally a fuzzy name match within the LA.
+    A one-line summary of the per-pass counts is printed.
+
+    Args:
+        truth: ground-truth rows with ``school_name``, ``la``, ``phase`` and
+            ``school_postcode`` columns.
+        gias: register rows with ``urn``, ``name``, ``local_authority`` and
+            ``postcode`` columns.
+
+    Returns:
+        The matched ground-truth rows (at most one per input row) carrying the
+        helper columns ``_row_id``, ``_name_norm``, ``_name_stripped``,
+        ``_la_norm`` and ``_pc`` plus the matched ``urn``. Unmatched rows are
+        omitted; the frame is empty (but keeps its schema) if nothing matched.
+    """
+    def stripped(name: str) -> str:
+        return normalize_name(name, strip_school_words=True)
+
+    gias = gias.with_columns(
+        pl.col("name")
+        .map_elements(normalize_name, return_dtype=pl.Utf8)
+        .alias("_name_norm"),
+        pl.col("name")
+        .map_elements(stripped, return_dtype=pl.Utf8)
+        .alias("_name_stripped"),
+        pl.col("local_authority")
+        .map_elements(normalize_la, return_dtype=pl.Utf8)
+        .alias("_la_norm"),
+        pl.col("postcode").str.replace_all(" ", "").str.to_uppercase().alias("_pc"),
+    )
+    truth = truth.with_columns(
+        pl.col("school_name")
+        .map_elements(normalize_name, return_dtype=pl.Utf8)
+        .alias("_name_norm"),
+        pl.col("school_name")
+        .map_elements(stripped, return_dtype=pl.Utf8)
+        .alias("_name_stripped"),
+        pl.col("la")
+        .map_elements(normalize_la, return_dtype=pl.Utf8)
+        .alias("_la_norm"),
+        pl.col("school_postcode")
+        .str.replace_all(" ", "")
+        .str.to_uppercase()
+        .alias("_pc"),
+    ).with_row_index("_row_id")
+
+    # 1. Exact postcode match (unique postcodes only — site-sharing schools
+    #    would mismatch phases otherwise; those fall through to name matching).
+    pc_unique = gias.filter(pl.col("_pc").is_not_null()).unique(
+        subset="_pc", keep="none"
+    )
+    by_pc = truth.filter(pl.col("_pc").is_not_null()).join(
+        pc_unique.select("_pc", "urn"), on="_pc", how="inner"
+    )
+    matched_ids = set(by_pc["_row_id"].to_list())
+
+    # 2. Exact normalized (name, LA) match, unique on the GIAS side.
+    gias_named = gias.unique(subset=["_name_norm", "_la_norm"], keep="none")
+    remaining = truth.filter(~pl.col("_row_id").is_in(list(matched_ids)))
+    by_name = remaining.join(
+        gias_named.select("_name_norm", "_la_norm", "urn"),
+        on=["_name_norm", "_la_norm"],
+        how="inner",
+    )
+    matched_ids |= set(by_name["_row_id"].to_list())
+
+    # 3. Reports often print informal names ("Ashmole Primary" for "Ashmole
+    #    Primary School"): match on names with school-type words stripped,
+    #    unique on both sides so site-sharing infant/junior pairs fall through.
+    gias_stripped = gias.filter(pl.col("_name_stripped") != "").unique(
+        subset=["_name_stripped", "_la_norm"], keep="none"
+    )
+    remaining = truth.filter(
+        (~pl.col("_row_id").is_in(list(matched_ids))) & (pl.col("_name_stripped") != "")
+    ).unique(subset=["_name_stripped", "_la_norm", "phase"], keep="none")
+    by_stripped = remaining.join(
+        gias_stripped.select("_name_stripped", "_la_norm", "urn"),
+        on=["_name_stripped", "_la_norm"],
+        how="inner",
+    )
+    matched_ids |= set(by_stripped["_row_id"].to_list())
+
+    # 4. Fuzzy name match within the LA: unique best candidate >= 0.87.
+    remaining = truth.filter(~pl.col("_row_id").is_in(list(matched_ids)))
+    fuzzy_ids: list = []
+    fuzzy_urns: list = []
+    # Candidate (names, urns) per LA, materialised once per authority rather
+    # than once per ground-truth row.
+    candidates_by_la: dict = {}
+    for row in remaining.iter_rows(named=True):
+        name = row["_name_norm"]
+        if not name:
+            # A null name cannot be scored, and an empty one would score 1.0
+            # against any candidate whose name also normalizes to "".
+            continue
+        la = row["_la_norm"]
+        if la not in candidates_by_la:
+            in_la = gias.filter(
+                (pl.col("_la_norm") == la) & pl.col("_name_norm").is_not_null()
+            )
+            candidates_by_la[la] = (
+                in_la["_name_norm"].to_list(),
+                in_la["urn"].to_list(),
+            )
+        cand_names, cand_urns = candidates_by_la[la]
+        if not cand_names:
+            continue
+        scores = [
+            difflib.SequenceMatcher(None, name, cand).ratio()
+            for cand in cand_names
+        ]
+        order = np.argsort(scores)[::-1]
+        # Accept only a clear winner: the runner-up must trail by > 0.04.
+        if scores[order[0]] >= 0.87 and (
+            len(order) == 1 or scores[order[1]] < scores[order[0]] - 0.04
+        ):
+            fuzzy_ids.append(row["_row_id"])
+            fuzzy_urns.append(cand_urns[int(order[0])])
+
+    # Join the (row id, urn) pairs back onto the truth rows with dtypes pinned
+    # to the source frames. Rebuilding the frame from row dicts would re-infer
+    # every dtype (an all-null column comes back as Null, urn as Int64) and
+    # could make the concat below fail on a schema mismatch.
+    fuzzy_pairs = pl.DataFrame(
+        {"_row_id": fuzzy_ids, "urn": fuzzy_urns},
+        schema={"_row_id": truth.schema["_row_id"], "urn": gias.schema["urn"]},
+    )
+    by_fuzzy = remaining.join(fuzzy_pairs, on="_row_id", how="inner")
+
+    out_columns = truth.columns + ["urn"]
+    parts = [p.select(out_columns) for p in (by_pc, by_name, by_stripped, by_fuzzy)]
+    non_empty = [p for p in parts if not p.is_empty()]
+    # pl.concat rejects an empty list; when no pass matched anything, return
+    # an empty frame that still carries the expected columns.
+    matched = (
+        pl.concat(non_empty).unique(subset="_row_id", keep="first")
+        if non_empty
+        else parts[0]
+    )
+    print(
+        f"Matched {len(matched)}/{len(truth)} ground-truth rows to GIAS URNs "
+        f"(postcode {len(by_pc)}, exact name {len(by_name)}, "
+        f"stripped {len(by_stripped)}, fuzzy {len(by_fuzzy)})"
+    )
+    return matched
+
+
+def evaluate(matched: pl.DataFrame, radii: pl.DataFrame) -> pl.DataFrame:
+    """Print accuracy metrics for modelled radii against published cutoffs.
+
+    Args:
+        matched: ground-truth rows with ``urn``, ``phase``, ``cutoff_km``,
+            ``all_offered`` and ``faith_school`` columns.
+        radii: modelled rows with ``urn``, ``phase``, ``radius_km`` and
+            ``filled`` columns.
+
+    Returns:
+        The "binding" subset of the join: rows that were not all-offered and
+        whose published cutoff lies between 0.05 and 20 km.
+    """
+    joined = matched.join(radii, on=["urn", "phase"], how="inner")
+    print(f"Joined to modelled radii: {len(joined)} rows")
+
+    # Published figures occasionally include non-typical admits (a child who
+    # moved mid-process can print as hundreds of km); cap at distances a
+    # distance criterion can plausibly produce.
+    binding = joined.filter(
+        ~pl.col("all_offered")
+        & pl.col("cutoff_km").is_between(0.05, 20.0)
+    )
+
+    def report(df: pl.DataFrame, label: str) -> None:
+        """Print bias, error, within-factor-2 and rank-correlation for *df*."""
+        if df.is_empty():
+            print(f"\n{label}: no rows")
+            return
+        truth_km = df["cutoff_km"].to_numpy()
+        model_km = df["radius_km"].cast(pl.Float64).to_numpy()
+        # log2(model/truth) is undefined for a missing, NaN or non-positive
+        # modelled radius; one such row would turn every median into nan/-inf.
+        usable = np.isfinite(model_km) & (model_km > 0)
+        skipped = int((~usable).sum())
+        if skipped:
+            print(
+                f"\n{label}: skipped {skipped} rows without a positive "
+                f"modelled radius"
+            )
+            truth_km, model_km = truth_km[usable], model_km[usable]
+        if truth_km.size == 0:
+            print(f"\n{label}: no rows")
+            return
+        log_ratio = np.log2(model_km / truth_km)
+        # |log2 ratio| <= 1 means the model is within a factor of two.
+        within2 = float(np.mean(np.abs(log_ratio) <= 1))
+        rank = (
+            pl.DataFrame({"t": truth_km, "m": model_km})
+            .select(pl.corr("t", "m", method="spearman"))
+            .item()
+        )
+        # Guard the format spec in case the correlation comes back as None.
+        rank_text = "n/a" if rank is None else f"{rank:.2f}"
+        print(
+            f"\n{label} (n={truth_km.size}):\n"
+            f"  median bias (log2 model/truth): {np.median(log_ratio):+.2f} "
+            f"(x{2 ** np.median(log_ratio):.2f})\n"
+            f"  median |log2 error|: {np.median(np.abs(log_ratio)):.2f} "
+            f"(x{2 ** np.median(np.abs(log_ratio)):.2f})\n"
+            f"  within factor 2: {within2:.0%}\n"
+            f"  Spearman rank corr: {rank_text}"
+        )
+
+    for phase in ("primary", "secondary"):
+        report(
+            binding.filter((pl.col("phase") == phase) & ~pl.col("faith_school")),
+            f"BINDING, non-faith, {phase}",
+        )
+    report(binding.filter(pl.col("faith_school")), "BINDING, faith (informational)")
+
+    offered = joined.filter(pl.col("all_offered"))
+    if not offered.is_empty():
+        # Share of all-offered schools the model also leaves unfilled.
+        unbound_share = float((~offered["filled"]).mean())
+        print(
+            f"\nALL-OFFERED schools (n={len(offered)}): model agrees no binding "
+            f"cutoff for {unbound_share:.0%}; median modelled radius "
+            f"{offered['radius_km'].median():.2f} km"
+        )
+    return binding
+
+
+def main() -> None:
+    """Parse CLI arguments, run the comparison and optionally dump a CSV.
+
+    Raises:
+        SystemExit: if there is no ground truth, or no binding cutoff could
+            be matched to a modelled radius.
+    """
+    parser = argparse.ArgumentParser(
+        description="Compare modelled catchment radii with published cutoffs"
+    )
+    parser.add_argument(
+        "--ground-truth-dir",
+        type=Path,
+        default=Path("property-data/ground_truth"),
+    )
+    parser.add_argument(
+        "--radii",
+        type=Path,
+        default=Path("property-data/school_catchment_radii.parquet"),
+        help="Per-school radii parquet from school_catchments --schools-output",
+    )
+    parser.add_argument("--gias", type=Path, default=Path("property-data/gias.parquet"))
+    parser.add_argument(
+        "--matched-out",
+        type=Path,
+        default=None,
+        help="Optional CSV of matched rows for inspection",
+    )
+    args = parser.parse_args()
+
+    truth = load_ground_truth(args.ground_truth_dir)
+    # One row per school+phase: keep the most recent entry year.
+    truth = (
+        truth.sort("entry_year", descending=True)
+        .unique(subset=["school_name", "la", "phase"], keep="first")
+    )
+    gias = pl.read_parquet(args.gias).select(
+        "urn", "name", "postcode", "local_authority", "religious_character"
+    )
+    radii = pl.read_parquet(args.radii).unique(subset=["urn", "phase"], keep="first")
+
+    matched = match_schools(truth, gias.drop("religious_character"))
+    # GIAS religious character is authoritative; the scraped faith_school
+    # flag is kept only as a fallback where the GIAS value is null.
+    matched = matched.join(
+        gias.select("urn", "religious_character"), on="urn", how="left"
+    ).with_columns(
+        pl.when(pl.col("religious_character").is_not_null())
+        .then(~pl.col("religious_character").is_in(["None", "Does not apply"]))
+        .otherwise(pl.col("faith_school"))
+        .alias("faith_school")
+    )
+    binding = evaluate(matched, radii)
+
+    if args.matched_out is not None:
+        # Drop every matching helper column, including the stripped-name key,
+        # so the CSV holds only published and modelled fields.
+        out = matched.join(radii, on=["urn", "phase"], how="inner").drop(
+            "_row_id", "_name_norm", "_name_stripped", "_la_norm", "_pc"
+        )
+        args.matched_out.parent.mkdir(parents=True, exist_ok=True)
+        out.write_csv(args.matched_out)
+        print(f"\nWrote matched rows to {args.matched_out}")
+
+    # Checked last so the inspection CSV is still written on failure.
+    if binding.is_empty():
+        raise SystemExit("No binding, matchable cutoffs — nothing to calibrate on")
+
+
+# Run the comparison only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()