# perfect-postcode/pipeline/check_school_cutoffs.py

"""Evaluate modelled school catchment radii against published cutoffs.

Local authorities publish each school's "last distance offered" in their
yearly allocation reports; ``property-data/ground_truth/cutoffs_*.json``
holds a scraped sample of those figures (see the collection notes in each
file's ``source_url`` fields). This script matches them to the per-school
radii emitted by ``pipeline.transform.school_catchments --schools-output``
and reports how well the model reproduces reality, so the preference-bonus
constants can be calibrated.

Headline metrics use non-faith schools whose published cutoff was a binding
distance. Faith schools are reported separately (their distance criterion
applies within faith priority, so published figures aren't comparable), as
are "all applicants offered" schools, where the model should ideally show no
binding cutoff.
"""

import argparse
import difflib
import json
import re
from pathlib import Path

import numpy as np
import polars as pl

from pipeline.utils.normalize import collapse_whitespace, replace_non_alnum_lower

# Words removed from every school name by normalize_name() before names are
# compared. The pattern is lowercase-only (no IGNORECASE flag), so it must be
# applied to already-lowercased text.
_NOISE_WORDS = re.compile(
    r"\b(the|of|and|c\s*of\s*e|cofe|ce|rc|voluntary|aided|controlled|va|vc)\b"
)
# School-type words, removed by normalize_name() only when it is called with
# strip_school_words=True. Also lowercase-only.
_SCHOOL_WORDS = re.compile(
    r"\b(school|academy|primary|secondary|junior|infant|community|college|high)\b"
)


def normalize_name(name: str, strip_school_words: bool = False) -> str:
    """Return a canonical lowercase form of a school name for matching.

    The name is lowercased, ``&`` becomes ``and``, ``st.`` becomes ``st``,
    apostrophes are deleted, and the words in ``_NOISE_WORDS`` are removed.
    The result is passed through the project helpers
    ``replace_non_alnum_lower`` and ``collapse_whitespace``.

    Args:
        name: School name as printed in a report or in GIAS.
        strip_school_words: When true, also remove the school-type words in
            ``_SCHOOL_WORDS`` so informal names can match their full form.

    Returns:
        The normalized name; may be empty if every word was removed.
    """
    s = name.lower().replace("&", " and ").replace("st.", "st ")
    # Delete apostrophes outright so possessives stay one word. Scraped
    # reports often carry typographic quotes (U+2019 / U+2018) where GIAS has
    # the ASCII one; treating them alike keeps both sides comparable.
    for apostrophe in ("'", "\u2019", "\u2018"):
        s = s.replace(apostrophe, "")
    s = replace_non_alnum_lower(s)
    s = _NOISE_WORDS.sub(" ", s)
    if strip_school_words:
        s = _SCHOOL_WORDS.sub(" ", s)
    return collapse_whitespace(s)


def normalize_la(la: str) -> str:
    """Return a canonical lowercase form of a local-authority name.

    ``&`` is spelled out as ``and`` and the phrase ``city of`` is removed
    wherever it occurs, after the project helper ``replace_non_alnum_lower``
    has been applied. The result is passed through ``collapse_whitespace``.
    """
    spelled_out = la.lower().replace("&", " and ")
    cleaned = replace_non_alnum_lower(spelled_out)
    without_city_of = cleaned.replace("city of", "")
    return collapse_whitespace(without_city_of)


def load_ground_truth(directory: Path) -> pl.DataFrame:
    """Load every ``cutoffs_*.json`` file under ``directory`` into one frame.

    Each file must hold a JSON array of objects. ``school_name``, ``la`` and
    ``phase`` are required keys; the remaining keys are optional and default
    to 0 (``entry_year``), null (``cutoff_km``, ``school_postcode``), false
    (``all_offered``, ``faith_school``) or ``""`` (``source_url``).

    Args:
        directory: Folder searched (non-recursively) for ``cutoffs_*.json``.

    Returns:
        One row per JSON object, files read in sorted path order.

    Raises:
        SystemExit: If no file contributed any row.
        KeyError: If an object lacks one of the required keys.
    """
    rows = []
    for path in sorted(directory.glob("cutoffs_*.json")):
        # JSON is UTF-8 by specification; do not depend on the platform's
        # locale encoding (school names contain non-ASCII punctuation).
        for row in json.loads(path.read_text(encoding="utf-8")):
            cutoff = row.get("cutoff_km")
            rows.append(
                {
                    "school_name": row["school_name"],
                    "la": row["la"],
                    "phase": row["phase"],
                    "entry_year": int(row.get("entry_year") or 0),
                    "cutoff_km": float(cutoff) if cutoff is not None else None,
                    "all_offered": bool(row.get("all_offered", False)),
                    "faith_school": bool(row.get("faith_school", False)),
                    "school_postcode": row.get("school_postcode"),
                    "source_url": row.get("source_url", ""),
                }
            )
    if not rows:
        raise SystemExit(f"No cutoffs_*.json files with rows under {directory}")
    # Pin the dtypes of the nullable columns: schema inference only inspects
    # the leading rows, so a run of nulls (e.g. many "all offered" schools
    # with no cutoff) would otherwise produce a Null-typed column.
    df = pl.DataFrame(
        rows,
        schema_overrides={"school_postcode": pl.Utf8, "cutoff_km": pl.Float64},
    )
    print(f"Ground truth rows: {len(df)} from {directory}")
    return df


def match_schools(truth: pl.DataFrame, gias: pl.DataFrame) -> pl.DataFrame:
    """Attach GIAS URNs to ground-truth rows by postcode, then name.

    Matching is a cascade; each stage only considers rows that no earlier
    stage matched:

    1. exact postcode, using GIAS postcodes that occur exactly once;
    2. exact normalized (name, LA);
    3. (name with school-type words stripped, LA);
    4. fuzzy name similarity within the LA.

    Args:
        truth: Ground-truth rows with ``school_name``, ``la``, ``phase`` and
            ``school_postcode`` columns.
        gias: GIAS rows with ``urn``, ``name``, ``postcode`` and
            ``local_authority`` columns.

    Returns:
        The matched rows of ``truth`` with a ``urn`` column plus the helper
        columns ``_row_id``, ``_name_norm``, ``_name_stripped``, ``_la_norm``
        and ``_pc``. Unmatched rows are dropped; the frame is empty (with the
        same schema) when nothing matched.
    """

    def stripped(name: str) -> str:
        return normalize_name(name, strip_school_words=True)

    def with_match_keys(
        df: pl.DataFrame, name_col: str, la_col: str, pc_col: str
    ) -> pl.DataFrame:
        # Same four helper keys on both sides so the joins below line up.
        return df.with_columns(
            pl.col(name_col)
            .map_elements(normalize_name, return_dtype=pl.Utf8)
            .alias("_name_norm"),
            pl.col(name_col)
            .map_elements(stripped, return_dtype=pl.Utf8)
            .alias("_name_stripped"),
            pl.col(la_col)
            .map_elements(normalize_la, return_dtype=pl.Utf8)
            .alias("_la_norm"),
            pl.col(pc_col).str.replace_all(" ", "").str.to_uppercase().alias("_pc"),
        )

    gias = with_match_keys(gias, "name", "local_authority", "postcode")
    truth = with_match_keys(
        truth, "school_name", "la", "school_postcode"
    ).with_row_index("_row_id")

    # A blank postcode is "missing", not a key two schools can share.
    has_postcode = pl.col("_pc").is_not_null() & (pl.col("_pc") != "")

    # 1. Exact postcode match (unique postcodes only — site-sharing schools
    #    would mismatch phases otherwise; those fall through to name matching).
    pc_unique = gias.filter(has_postcode).unique(subset="_pc", keep="none")
    by_pc = truth.filter(has_postcode).join(
        pc_unique.select("_pc", "urn"), on="_pc", how="inner"
    )
    matched_ids = set(by_pc["_row_id"].to_list())

    # 2. Exact normalized (name, LA) match against GIAS pairs that occur once.
    gias_named = gias.unique(subset=["_name_norm", "_la_norm"], keep="none")
    remaining = truth.filter(~pl.col("_row_id").is_in(list(matched_ids)))
    by_name = remaining.join(
        gias_named.select("_name_norm", "_la_norm", "urn"),
        on=["_name_norm", "_la_norm"],
        how="inner",
    )
    matched_ids.update(by_name["_row_id"].to_list())

    # 3. Reports often print informal names ("Ashmole Primary" for "Ashmole
    #    Primary School"): match on names with school-type words stripped,
    #    unique on both sides so site-sharing infant/junior pairs fall through.
    gias_stripped = gias.filter(pl.col("_name_stripped") != "").unique(
        subset=["_name_stripped", "_la_norm"], keep="none"
    )
    remaining = truth.filter(
        (~pl.col("_row_id").is_in(list(matched_ids))) & (pl.col("_name_stripped") != "")
    ).unique(subset=["_name_stripped", "_la_norm", "phase"], keep="none")
    by_stripped = remaining.join(
        gias_stripped.select("_name_stripped", "_la_norm", "urn"),
        on=["_name_stripped", "_la_norm"],
        how="inner",
    )
    matched_ids.update(by_stripped["_row_id"].to_list())

    # 4. Fuzzy name match within the LA: accept the best candidate only if it
    #    scores >= 0.87 and beats the runner-up by more than 0.04.
    remaining = truth.filter(~pl.col("_row_id").is_in(list(matched_ids)))
    fuzzy_ids: list = []
    fuzzy_urns: list = []
    candidates_by_la: dict = {}
    for row in remaining.iter_rows(named=True):
        la = row["_la_norm"]
        if la not in candidates_by_la:
            in_la = gias.filter(
                (pl.col("_la_norm") == la) & pl.col("_name_norm").is_not_null()
            )
            candidates_by_la[la] = (
                in_la["_name_norm"].to_list(),
                in_la["urn"].to_list(),
            )
        names, urns = candidates_by_la[la]
        if not names or row["_name_norm"] is None:
            continue
        scores = [
            difflib.SequenceMatcher(None, row["_name_norm"], cand).ratio()
            for cand in names
        ]
        order = np.argsort(scores)[::-1]
        best = scores[order[0]]
        if best >= 0.87 and (len(order) == 1 or scores[order[1]] < best - 0.04):
            fuzzy_ids.append(row["_row_id"])
            fuzzy_urns.append(urns[int(order[0])])

    # Join the (row id, urn) pairs back onto the typed frame instead of
    # rebuilding rows from dicts: re-inferring dtypes from Python values can
    # yield Null-typed columns (all-None postcodes) or a different urn dtype,
    # which makes the vertical concat below fail.
    fuzzy_pairs = pl.DataFrame(
        {"_row_id": fuzzy_ids, "urn": fuzzy_urns},
        schema={"_row_id": truth.schema["_row_id"], "urn": gias.schema["urn"]},
    )
    by_fuzzy = remaining.join(fuzzy_pairs, on="_row_id", how="inner")

    # Every part shares one schema, so empty parts concatenate fine and an
    # all-empty result is an empty frame rather than an error.
    out_columns = truth.columns + ["urn"]
    matched = pl.concat(
        [part.select(out_columns) for part in (by_pc, by_name, by_stripped, by_fuzzy)]
    ).unique(subset="_row_id", keep="first", maintain_order=True)
    print(
        f"Matched {len(matched)}/{len(truth)} ground-truth rows to GIAS URNs "
        f"(postcode {len(by_pc)}, exact name {len(by_name)}, "
        f"stripped {len(by_stripped)}, fuzzy {len(by_fuzzy)})"
    )
    return matched


def evaluate(matched: pl.DataFrame, radii: pl.DataFrame) -> pl.DataFrame:
    """Print accuracy summaries of modelled radii and return the binding rows.

    ``matched`` is inner-joined to ``radii`` on ``urn`` and ``phase``. The
    joined frame must carry ``radius_km`` and ``filled`` (presumably supplied
    by ``radii`` — confirm against the radii parquet schema).

    Summaries are printed for binding non-faith schools per phase, for binding
    faith schools, and for "all offered" schools.

    Returns:
        The joined rows that are not "all offered" and whose published cutoff
        lies between 0.05 and 20.0 km inclusive.
    """
    joined = matched.join(radii, on=["urn", "phase"], how="inner")
    print(f"Joined to modelled radii: {len(joined)} rows")

    # Published figures occasionally include non-typical admits (a child who
    # moved mid-process can print as hundreds of km); keep only distances a
    # distance criterion can plausibly produce.
    plausible_cutoff = pl.col("cutoff_km").is_between(0.05, 20.0)
    binding = joined.filter(~pl.col("all_offered") & plausible_cutoff)

    def report(df: pl.DataFrame, label: str) -> None:
        """Print bias, error and rank-correlation statistics for ``df``."""
        if df.is_empty():
            print(f"\n{label}: no rows")
            return
        published = df["cutoff_km"].to_numpy()
        modelled = df["radius_km"].to_numpy()
        # log2 of model/truth: 0 is perfect, +/-1 is a factor of two.
        log_ratio = np.log2(modelled / published)
        abs_log_ratio = np.abs(log_ratio)
        median_bias = np.median(log_ratio)
        median_abs = np.median(abs_log_ratio)
        within2 = float(np.mean(abs_log_ratio <= 1))
        rank = (
            pl.DataFrame({"t": published, "m": modelled})
            .select(pl.corr("t", "m", method="spearman"))
            .item()
        )
        summary = [
            f"\n{label} (n={len(df)}):",
            f"  median bias (log2 model/truth): {median_bias:+.2f} "
            f"(x{2 ** median_bias:.2f})",
            f"  median |log2 error|: {median_abs:.2f} (x{2 ** median_abs:.2f})",
            f"  within factor 2: {within2:.0%}",
            f"  Spearman rank corr: {rank:.2f}",
        ]
        print("\n".join(summary))

    is_faith = pl.col("faith_school")
    for phase in ("primary", "secondary"):
        non_faith_phase = binding.filter((pl.col("phase") == phase) & ~is_faith)
        report(non_faith_phase, f"BINDING, non-faith, {phase}")
    report(binding.filter(is_faith), "BINDING, faith (informational)")

    offered = joined.filter(pl.col("all_offered"))
    if offered.is_empty():
        return binding

    # Share of all-offered schools the model also treats as not filled.
    unbound_share = float((~offered["filled"]).mean())
    print(
        f"\nALL-OFFERED schools (n={len(offered)}): model agrees no binding "
        f"cutoff for {unbound_share:.0%}; median modelled radius "
        f"{offered['radius_km'].median():.2f} km"
    )
    return binding


def main() -> None:
    """Command-line entry point: load inputs, match, evaluate, optionally dump.

    Reads the ground-truth JSON files, the GIAS parquet and the modelled-radii
    parquet, matches ground-truth rows to URNs, prints the evaluation, and
    writes the matched rows to ``--matched-out`` when given.

    Raises:
        SystemExit: If no ground-truth rows were found, or if no binding
            cutoff could be matched to a modelled radius.
    """
    parser = argparse.ArgumentParser(
        description="Compare modelled catchment radii with published cutoffs"
    )
    parser.add_argument(
        "--ground-truth-dir",
        type=Path,
        default=Path("property-data/ground_truth"),
    )
    parser.add_argument(
        "--radii",
        type=Path,
        default=Path("property-data/school_catchment_radii.parquet"),
        help="Per-school radii parquet from school_catchments --schools-output",
    )
    parser.add_argument("--gias", type=Path, default=Path("property-data/gias.parquet"))
    parser.add_argument(
        "--matched-out",
        type=Path,
        default=None,
        help="Optional CSV of matched rows for inspection",
    )
    args = parser.parse_args()

    truth = load_ground_truth(args.ground_truth_dir)
    # One row per school+phase: keep the most recent entry year.
    # maintain_order keeps the sorted order, so the output is deterministic.
    truth = truth.sort("entry_year", descending=True).unique(
        subset=["school_name", "la", "phase"], keep="first", maintain_order=True
    )
    gias = pl.read_parquet(args.gias).select(
        "urn", "name", "postcode", "local_authority", "religious_character"
    )
    radii = pl.read_parquet(args.radii).unique(subset=["urn", "phase"], keep="first")

    matched = match_schools(truth, gias.drop("religious_character"))
    # GIAS religious character is authoritative; the scraped flag is kept
    # only where GIAS has no religious character recorded for the URN.
    matched = matched.join(
        gias.select("urn", "religious_character"), on="urn", how="left"
    ).with_columns(
        pl.when(pl.col("religious_character").is_not_null())
        .then(~pl.col("religious_character").is_in(["None", "Does not apply"]))
        .otherwise(pl.col("faith_school"))
        .alias("faith_school")
    )
    binding = evaluate(matched, radii)

    if args.matched_out is not None:
        # Drop every matching helper column added by match_schools,
        # including _name_stripped, so the CSV holds only real data.
        out = matched.join(radii, on=["urn", "phase"], how="inner").drop(
            "_row_id", "_name_norm", "_name_stripped", "_la_norm", "_pc"
        )
        args.matched_out.parent.mkdir(parents=True, exist_ok=True)
        out.write_csv(args.matched_out)
        print(f"\nWrote matched rows to {args.matched_out}")

    if binding.is_empty():
        raise SystemExit("No binding, matchable cutoffs — nothing to calibrate on")


# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()