"""Evaluate modelled school catchment radii against published cutoffs. Local authorities publish each school's "last distance offered" in their yearly allocation reports; ``property-data/ground_truth/cutoffs_*.json`` holds a scraped sample of those figures (see the collection notes in each file's ``source_url`` fields). This script matches them to the per-school radii emitted by ``pipeline.transform.school_catchments --schools-output`` and reports how well the model reproduces reality, so the preference-bonus constants can be calibrated. Headline metrics use non-faith schools whose published cutoff was a binding distance. Faith schools are reported separately (their distance criterion applies within faith priority, so published figures aren't comparable), as are "all applicants offered" schools, where the model should ideally show no binding cutoff. """ import argparse import difflib import json import re from pathlib import Path import numpy as np import polars as pl _NOISE_WORDS = re.compile( r"\b(the|of|and|c\s*of\s*e|cofe|ce|rc|voluntary|aided|controlled|va|vc)\b" ) _NON_ALNUM = re.compile(r"[^a-z0-9 ]") _SCHOOL_WORDS = re.compile( r"\b(school|academy|primary|secondary|junior|infant|community|college|high)\b" ) def normalize_name(name: str, strip_school_words: bool = False) -> str: s = name.lower().replace("&", " and ").replace("st.", "st ").replace("'", "") s = _NON_ALNUM.sub(" ", s) s = _NOISE_WORDS.sub(" ", s) if strip_school_words: s = _SCHOOL_WORDS.sub(" ", s) return " ".join(s.split()) def normalize_la(la: str) -> str: s = _NON_ALNUM.sub(" ", la.lower().replace("&", " and ")) return " ".join(s.replace("city of", "").split()) def load_ground_truth(directory: Path) -> pl.DataFrame: rows = [] for path in sorted(directory.glob("cutoffs_*.json")): for row in json.loads(path.read_text()): rows.append( { "school_name": row["school_name"], "la": row["la"], "phase": row["phase"], "entry_year": int(row.get("entry_year") or 0), "cutoff_km": ( float(row["cutoff_km"]) if row.get("cutoff_km") is not None else None ), "all_offered": bool(row.get("all_offered", False)), "faith_school": bool(row.get("faith_school", False)), "school_postcode": row.get("school_postcode"), "source_url": row.get("source_url", ""), } ) if not rows: raise SystemExit(f"No cutoffs_*.json files with rows under {directory}") df = pl.DataFrame(rows, schema_overrides={"school_postcode": pl.Utf8}) print(f"Ground truth rows: {len(df)} from {directory}") return df def match_schools(truth: pl.DataFrame, gias: pl.DataFrame) -> pl.DataFrame: """Attach GIAS URNs to ground-truth rows by postcode, then name.""" def stripped(name: str) -> str: return normalize_name(name, strip_school_words=True) gias = gias.with_columns( pl.col("name") .map_elements(normalize_name, return_dtype=pl.Utf8) .alias("_name_norm"), pl.col("name") .map_elements(stripped, return_dtype=pl.Utf8) .alias("_name_stripped"), pl.col("local_authority") .map_elements(normalize_la, return_dtype=pl.Utf8) .alias("_la_norm"), pl.col("postcode").str.replace_all(" ", "").str.to_uppercase().alias("_pc"), ) truth = truth.with_columns( pl.col("school_name") .map_elements(normalize_name, return_dtype=pl.Utf8) .alias("_name_norm"), pl.col("school_name") .map_elements(stripped, return_dtype=pl.Utf8) .alias("_name_stripped"), pl.col("la") .map_elements(normalize_la, return_dtype=pl.Utf8) .alias("_la_norm"), pl.col("school_postcode") .str.replace_all(" ", "") .str.to_uppercase() .alias("_pc"), ).with_row_index("_row_id") # 1. Exact postcode match (unique postcodes only — site-sharing schools # would mismatch phases otherwise; those fall through to name matching). pc_unique = gias.filter(pl.col("_pc").is_not_null()).unique( subset="_pc", keep="none" ) by_pc = truth.filter(pl.col("_pc").is_not_null()).join( pc_unique.select("_pc", "urn"), on="_pc", how="inner" ) matched_ids = set(by_pc["_row_id"].to_list()) # 2. Exact normalized (name, LA) match, unique on both sides. gias_named = gias.unique(subset=["_name_norm", "_la_norm"], keep="none") remaining = truth.filter(~pl.col("_row_id").is_in(list(matched_ids))) by_name = remaining.join( gias_named.select("_name_norm", "_la_norm", "urn"), on=["_name_norm", "_la_norm"], how="inner", ) matched_ids |= set(by_name["_row_id"].to_list()) # 3. Reports often print informal names ("Ashmole Primary" for "Ashmole # Primary School"): match on names with school-type words stripped, # unique on both sides so site-sharing infant/junior pairs fall through. gias_stripped = gias.filter(pl.col("_name_stripped") != "").unique( subset=["_name_stripped", "_la_norm"], keep="none" ) remaining = truth.filter( (~pl.col("_row_id").is_in(list(matched_ids))) & (pl.col("_name_stripped") != "") ).unique(subset=["_name_stripped", "_la_norm", "phase"], keep="none") by_stripped = remaining.join( gias_stripped.select("_name_stripped", "_la_norm", "urn"), on=["_name_stripped", "_la_norm"], how="inner", ) matched_ids |= set(by_stripped["_row_id"].to_list()) # 4. Fuzzy name match within the LA: unique best candidate >= 0.87. remaining = truth.filter(~pl.col("_row_id").is_in(list(matched_ids))) fuzzy_rows = [] gias_by_la: dict[str, pl.DataFrame] = {} for row in remaining.iter_rows(named=True): la = row["_la_norm"] if la not in gias_by_la: gias_by_la[la] = gias.filter(pl.col("_la_norm") == la) candidates = gias_by_la[la] if candidates.is_empty(): continue scores = [ difflib.SequenceMatcher(None, row["_name_norm"], cand).ratio() for cand in candidates["_name_norm"].to_list() ] order = np.argsort(scores)[::-1] if scores[order[0]] >= 0.87 and ( len(order) == 1 or scores[order[1]] < scores[order[0]] - 0.04 ): fuzzy_rows.append({**row, "urn": candidates["urn"][int(order[0])]}) by_fuzzy = ( pl.DataFrame(fuzzy_rows).with_columns(pl.col("_row_id").cast(pl.UInt32)) if fuzzy_rows else None ) parts = [by_pc, by_name, by_stripped] + ([by_fuzzy] if by_fuzzy is not None else []) matched = pl.concat( [p.select(truth.columns + ["urn"]) for p in parts if not p.is_empty()] ).unique(subset="_row_id", keep="first") print( f"Matched {len(matched)}/{len(truth)} ground-truth rows to GIAS URNs " f"(postcode {len(by_pc)}, exact name {len(by_name)}, " f"stripped {len(by_stripped)}, fuzzy {0 if by_fuzzy is None else len(by_fuzzy)})" ) return matched def evaluate(matched: pl.DataFrame, radii: pl.DataFrame) -> pl.DataFrame: joined = matched.join(radii, on=["urn", "phase"], how="inner") print(f"Joined to modelled radii: {len(joined)} rows") # Published figures occasionally include non-typical admits (a child who # moved mid-process can print as hundreds of km); cap at distances a # distance criterion can plausibly produce. binding = joined.filter( ~pl.col("all_offered") & pl.col("cutoff_km").is_between(0.05, 20.0) ) def report(df: pl.DataFrame, label: str) -> None: if df.is_empty(): print(f"\n{label}: no rows") return truth_km = df["cutoff_km"].to_numpy() model_km = df["radius_km"].to_numpy() log_ratio = np.log2(model_km / truth_km) within2 = float(np.mean(np.abs(log_ratio) <= 1)) rank = ( pl.DataFrame({"t": truth_km, "m": model_km}) .select(pl.corr("t", "m", method="spearman")) .item() ) print( f"\n{label} (n={len(df)}):\n" f" median bias (log2 model/truth): {np.median(log_ratio):+.2f} " f"(x{2 ** np.median(log_ratio):.2f})\n" f" median |log2 error|: {np.median(np.abs(log_ratio)):.2f} " f"(x{2 ** np.median(np.abs(log_ratio)):.2f})\n" f" within factor 2: {within2:.0%}\n" f" Spearman rank corr: {rank:.2f}" ) for phase in ("primary", "secondary"): report( binding.filter((pl.col("phase") == phase) & ~pl.col("faith_school")), f"BINDING, non-faith, {phase}", ) report(binding.filter(pl.col("faith_school")), "BINDING, faith (informational)") offered = joined.filter(pl.col("all_offered")) if not offered.is_empty(): unbound_share = float((~offered["filled"]).mean()) print( f"\nALL-OFFERED schools (n={len(offered)}): model agrees no binding " f"cutoff for {unbound_share:.0%}; median modelled radius " f"{offered['radius_km'].median():.2f} km" ) return binding def main() -> None: parser = argparse.ArgumentParser( description="Compare modelled catchment radii with published cutoffs" ) parser.add_argument( "--ground-truth-dir", type=Path, default=Path("property-data/ground_truth"), ) parser.add_argument( "--radii", type=Path, default=Path("property-data/school_catchment_radii.parquet"), help="Per-school radii parquet from school_catchments --schools-output", ) parser.add_argument("--gias", type=Path, default=Path("property-data/gias.parquet")) parser.add_argument( "--matched-out", type=Path, default=None, help="Optional CSV of matched rows for inspection", ) args = parser.parse_args() truth = load_ground_truth(args.ground_truth_dir) # One row per school+phase: keep the most recent entry year. truth = ( truth.sort("entry_year", descending=True) .unique(subset=["school_name", "la", "phase"], keep="first") ) gias = pl.read_parquet(args.gias).select( "urn", "name", "postcode", "local_authority", "religious_character" ) radii = pl.read_parquet(args.radii).unique(subset=["urn", "phase"], keep="first") matched = match_schools(truth, gias.drop("religious_character")) # GIAS religious character is authoritative; the scraped name-based flag # only covers rows that failed to match. matched = matched.join( gias.select("urn", "religious_character"), on="urn", how="left" ).with_columns( pl.when(pl.col("religious_character").is_not_null()) .then(~pl.col("religious_character").is_in(["None", "Does not apply"])) .otherwise(pl.col("faith_school")) .alias("faith_school") ) binding = evaluate(matched, radii) if args.matched_out is not None: out = matched.join(radii, on=["urn", "phase"], how="inner").drop( "_row_id", "_name_norm", "_la_norm", "_pc" ) args.matched_out.parent.mkdir(parents=True, exist_ok=True) out.write_csv(args.matched_out) print(f"\nWrote matched rows to {args.matched_out}") if binding.is_empty(): raise SystemExit("No binding, matchable cutoffs — nothing to calibrate on") if __name__ == "__main__": main()