297 lines
12 KiB
Python
297 lines
12 KiB
Python
"""Evaluate modelled school catchment radii against published cutoffs.
|
|
|
|
Local authorities publish each school's "last distance offered" in their
|
|
yearly allocation reports; ``property-data/ground_truth/cutoffs_*.json``
|
|
holds a scraped sample of those figures (see the collection notes in each
|
|
file's ``source_url`` fields). This script matches them to the per-school
|
|
radii emitted by ``pipeline.transform.school_catchments --schools-output``
|
|
and reports how well the model reproduces reality, so the preference-bonus
|
|
constants can be calibrated.
|
|
|
|
Headline metrics use non-faith schools whose published cutoff was a binding
|
|
distance. Faith schools are reported separately (their distance criterion
|
|
applies within faith priority, so published figures aren't comparable), as
|
|
are "all applicants offered" schools, where the model should ideally show no
|
|
binding cutoff.
|
|
"""
|
|
|
|
import argparse
|
|
import difflib
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import polars as pl
|
|
|
|
# Words that carry no identifying information in a school name (articles,
# conjunctions and governance/denomination abbreviations).
_NOISE_WORDS = re.compile(
    r"\b(the|of|and|c\s*of\s*e|cofe|ce|rc|voluntary|aided|controlled|va|vc)\b"
)
# Anything that is not a lowercase ASCII letter, a digit or a space.
_NON_ALNUM = re.compile(r"[^a-z0-9 ]")
# School-type words that reports frequently omit or abbreviate.
_SCHOOL_WORDS = re.compile(
    r"\b(school|academy|primary|secondary|junior|infant|community|college|high)\b"
)
# Apostrophe look-alikes are deleted (not spaced) so that "Mary's" and
# "Mary’s" both become "marys" instead of "marys" vs "mary s".
_APOSTROPHES = str.maketrans("", "", "'\u2018\u2019\u02bc`")


def normalize_name(name: str, strip_school_words: bool = False) -> str:
    """Return a canonical, lowercase form of a school name for matching.

    The name is lowercased, ``&`` is spelled out, apostrophes (straight or
    typographic) are removed, every other non-alphanumeric character becomes
    a space, noise words are dropped and whitespace is collapsed.

    Args:
        name: Raw school name as published.
        strip_school_words: Also drop school-type words ("school",
            "primary", ...) so informal names compare equal to formal ones.

    Returns:
        The normalized name; may be empty if nothing identifying remains.
    """
    s = name.lower().replace("&", " and ").translate(_APOSTROPHES)
    # Remaining punctuation (including the dot in "St.") becomes a space.
    s = _NON_ALNUM.sub(" ", s)
    s = _NOISE_WORDS.sub(" ", s)
    if strip_school_words:
        s = _SCHOOL_WORDS.sub(" ", s)
    return " ".join(s.split())
|
|
|
|
|
|
def normalize_la(la: str) -> str:
    """Return a canonical, lowercase form of a local-authority name.

    ``&`` is spelled out, non-alphanumeric characters become spaces, the
    phrase "city of" is removed and whitespace is collapsed.
    """
    lowered = la.lower().replace("&", " and ")
    cleaned = _NON_ALNUM.sub(" ", lowered).replace("city of", "")
    tokens = cleaned.split()
    return " ".join(tokens)
|
|
|
|
|
|
def load_ground_truth(directory: Path) -> pl.DataFrame:
    """Load every ``cutoffs_*.json`` file under *directory* into one frame.

    Each file is expected to hold a JSON array of objects. ``school_name``,
    ``la`` and ``phase`` are required keys; the remaining keys are optional
    and fall back to ``0`` (entry year), ``None`` (cutoff, postcode),
    ``False`` (flags) or ``""`` (source URL).

    Args:
        directory: Directory containing the scraped ground-truth files.

    Returns:
        One row per published figure.

    Raises:
        SystemExit: If no file under *directory* contributes any row.
    """
    rows = []
    for path in sorted(directory.glob("cutoffs_*.json")):
        # JSON is UTF-8 by specification; do not depend on the locale default,
        # which mangles non-ASCII school names on some platforms.
        for row in json.loads(path.read_text(encoding="utf-8")):
            cutoff = row.get("cutoff_km")
            rows.append(
                {
                    "school_name": row["school_name"],
                    "la": row["la"],
                    "phase": row["phase"],
                    "entry_year": int(row.get("entry_year") or 0),
                    "cutoff_km": float(cutoff) if cutoff is not None else None,
                    "all_offered": bool(row.get("all_offered", False)),
                    "faith_school": bool(row.get("faith_school", False)),
                    "school_postcode": row.get("school_postcode"),
                    "source_url": row.get("source_url", ""),
                }
            )
    if not rows:
        raise SystemExit(f"No cutoffs_*.json files with rows under {directory}")
    # Pin the dtypes of the nullable columns: if every sampled value is None
    # polars would otherwise infer a Null column and later string/numeric
    # expressions on it would fail.
    df = pl.DataFrame(
        rows,
        schema_overrides={"school_postcode": pl.Utf8, "cutoff_km": pl.Float64},
    )
    print(f"Ground truth rows: {len(df)} from {directory}")
    return df
|
|
|
|
|
|
def match_schools(truth: pl.DataFrame, gias: pl.DataFrame) -> pl.DataFrame:
    """Attach GIAS URNs to ground-truth rows by postcode, then name.

    Matching runs in four passes; each pass only considers rows that the
    earlier passes left unmatched:

    1. exact postcode, using only GIAS postcodes held by a single school;
    2. exact normalized (name, LA), using only pairs unique in GIAS;
    3. name with school-type words stripped, unique on both sides;
    4. fuzzy name similarity within the LA (best score >= 0.87 and clearly
       ahead of the runner-up).

    Args:
        truth: Ground-truth rows with ``school_name``, ``la``, ``phase`` and
            ``school_postcode`` columns.
        gias: GIAS rows with ``urn``, ``name``, ``postcode`` and
            ``local_authority`` columns.

    Returns:
        The matched subset of *truth* with a ``urn`` column added, plus the
        helper columns ``_row_id``, ``_name_norm``, ``_name_stripped``,
        ``_la_norm`` and ``_pc``. The frame is empty (with the same schema)
        when nothing matched.
    """

    def stripped(name: str) -> str:
        return normalize_name(name, strip_school_words=True)

    def match_keys(name_col: str, la_col: str, pc_col: str) -> list[pl.Expr]:
        # The same helper columns are derived on both sides of the match.
        return [
            pl.col(name_col)
            .map_elements(normalize_name, return_dtype=pl.Utf8)
            .alias("_name_norm"),
            pl.col(name_col)
            .map_elements(stripped, return_dtype=pl.Utf8)
            .alias("_name_stripped"),
            pl.col(la_col)
            .map_elements(normalize_la, return_dtype=pl.Utf8)
            .alias("_la_norm"),
            pl.col(pc_col).str.replace_all(" ", "").str.to_uppercase().alias("_pc"),
        ]

    gias = gias.with_columns(match_keys("name", "local_authority", "postcode"))
    truth = truth.with_columns(
        match_keys("school_name", "la", "school_postcode")
    ).with_row_index("_row_id")

    matched_ids: set[int] = set()

    def unmatched() -> pl.DataFrame:
        return truth.filter(~pl.col("_row_id").is_in(list(matched_ids)))

    # 1. Exact postcode match (unique postcodes only — site-sharing schools
    # would mismatch phases otherwise; those fall through to name matching).
    pc_unique = gias.filter(pl.col("_pc").is_not_null()).unique(
        subset="_pc", keep="none"
    )
    by_pc = truth.filter(pl.col("_pc").is_not_null()).join(
        pc_unique.select("_pc", "urn"), on="_pc", how="inner"
    )
    matched_ids.update(by_pc["_row_id"].to_list())

    # 2. Exact normalized (name, LA) match, unique on the GIAS side.
    gias_named = gias.unique(subset=["_name_norm", "_la_norm"], keep="none")
    by_name = unmatched().join(
        gias_named.select("_name_norm", "_la_norm", "urn"),
        on=["_name_norm", "_la_norm"],
        how="inner",
    )
    matched_ids.update(by_name["_row_id"].to_list())

    # 3. Reports often print informal names ("Ashmole Primary" for "Ashmole
    # Primary School"): match on names with school-type words stripped,
    # unique on both sides so site-sharing infant/junior pairs fall through.
    gias_stripped = gias.filter(pl.col("_name_stripped") != "").unique(
        subset=["_name_stripped", "_la_norm"], keep="none"
    )
    by_stripped = (
        unmatched()
        .filter(pl.col("_name_stripped") != "")
        .unique(subset=["_name_stripped", "_la_norm", "phase"], keep="none")
        .join(
            gias_stripped.select("_name_stripped", "_la_norm", "urn"),
            on=["_name_stripped", "_la_norm"],
            how="inner",
        )
    )
    matched_ids.update(by_stripped["_row_id"].to_list())

    # 4. Fuzzy name match within the LA: unique best candidate >= 0.87.
    fuzzy_ids: list[int] = []
    fuzzy_urns: list = []
    # Per-LA cache of (candidate names, candidate URNs), built on first use.
    candidates_by_la: dict[str, tuple[list[str], list]] = {}
    for row in unmatched().iter_rows(named=True):
        name = row["_name_norm"]
        if not name:
            # Two empty strings score a perfect 1.0 — never match on nothing.
            continue
        la = row["_la_norm"]
        if la not in candidates_by_la:
            in_la = gias.filter(
                (pl.col("_la_norm") == la) & pl.col("_name_norm").is_not_null()
            )
            candidates_by_la[la] = (
                in_la["_name_norm"].to_list(),
                in_la["urn"].to_list(),
            )
        cand_names, cand_urns = candidates_by_la[la]
        if not cand_names:
            continue
        scores = [
            difflib.SequenceMatcher(None, name, cand).ratio() for cand in cand_names
        ]
        order = np.argsort(scores)[::-1]
        best = scores[order[0]]
        # Require a clear winner: the runner-up must trail by more than 0.04.
        if best >= 0.87 and (len(order) == 1 or scores[order[1]] < best - 0.04):
            fuzzy_ids.append(row["_row_id"])
            fuzzy_urns.append(cand_urns[int(order[0])])

    # Join the (row id, URN) pairs back onto ``truth`` rather than rebuilding
    # whole rows from Python dicts: dtype inference on rebuilt rows turns an
    # all-None column into a Null column, which the vertical concat rejects.
    fuzzy_pairs = pl.DataFrame(
        {"_row_id": fuzzy_ids, "urn": fuzzy_urns},
        schema={"_row_id": truth.schema["_row_id"], "urn": gias.schema["urn"]},
    )
    by_fuzzy = truth.join(fuzzy_pairs, on="_row_id", how="inner")

    # Every part shares one schema, so empty parts concatenate safely and an
    # all-empty result is an empty frame instead of a ``pl.concat([])`` error.
    out_columns = truth.columns + ["urn"]
    matched = pl.concat(
        [part.select(out_columns) for part in (by_pc, by_name, by_stripped, by_fuzzy)]
    ).unique(subset="_row_id", keep="first", maintain_order=True)
    print(
        f"Matched {len(matched)}/{len(truth)} ground-truth rows to GIAS URNs "
        f"(postcode {len(by_pc)}, exact name {len(by_name)}, "
        f"stripped {len(by_stripped)}, fuzzy {len(by_fuzzy)})"
    )
    return matched
|
|
|
|
|
|
def evaluate(matched: pl.DataFrame, radii: pl.DataFrame) -> pl.DataFrame:
    """Join matched ground truth to modelled radii and print accuracy metrics.

    Args:
        matched: Ground-truth rows carrying ``urn``, ``phase``, ``cutoff_km``,
            ``all_offered`` and ``faith_school`` columns.
        radii: Modelled rows carrying ``urn``, ``phase``, ``radius_km`` and
            ``filled`` columns.

    Returns:
        The joined rows whose published cutoff was a binding distance within
        the 0.05-20 km plausibility window.
    """

    def report(df: pl.DataFrame, label: str) -> None:
        """Print the error metrics for one slice of the binding rows."""
        if df.is_empty():
            print(f"\n{label}: no rows")
            return
        truth_km = df["cutoff_km"].to_numpy()
        model_km = df["radius_km"].to_numpy()
        # Errors are measured in doublings: +1 means the model is 2x too big.
        log_ratio = np.log2(model_km / truth_km)
        abs_log_ratio = np.abs(log_ratio)
        median_bias = np.median(log_ratio)
        median_abs_error = np.median(abs_log_ratio)
        within2 = float(np.mean(abs_log_ratio <= 1))
        pairs = pl.DataFrame({"t": truth_km, "m": model_km})
        rank = pairs.select(pl.corr("t", "m", method="spearman")).item()
        print(
            f"\n{label} (n={len(df)}):\n"
            f" median bias (log2 model/truth): {median_bias:+.2f} "
            f"(x{2 ** median_bias:.2f})\n"
            f" median |log2 error|: {median_abs_error:.2f} "
            f"(x{2 ** median_abs_error:.2f})\n"
            f" within factor 2: {within2:.0%}\n"
            f" Spearman rank corr: {rank:.2f}"
        )

    joined = matched.join(radii, on=["urn", "phase"], how="inner")
    print(f"Joined to modelled radii: {len(joined)} rows")

    all_offered = pl.col("all_offered")
    is_faith = pl.col("faith_school")

    # Published figures occasionally include non-typical admits (a child who
    # moved mid-process can print as hundreds of km); cap at distances a
    # distance criterion can plausibly produce.
    plausible_cutoff = pl.col("cutoff_km").is_between(0.05, 20.0)
    binding = joined.filter(~all_offered & plausible_cutoff)

    for phase in ("primary", "secondary"):
        non_faith_in_phase = binding.filter((pl.col("phase") == phase) & ~is_faith)
        report(non_faith_in_phase, f"BINDING, non-faith, {phase}")
    report(binding.filter(is_faith), "BINDING, faith (informational)")

    offered = joined.filter(all_offered)
    if offered.is_empty():
        return binding

    # Share of all-offered schools the model also leaves unfilled.
    unbound_share = float((~offered["filled"]).mean())
    print(
        f"\nALL-OFFERED schools (n={len(offered)}): model agrees no binding "
        f"cutoff for {unbound_share:.0%}; median modelled radius "
        f"{offered['radius_km'].median():.2f} km"
    )
    return binding
|
|
|
|
|
|
def main() -> None:
    """Parse CLI arguments, match ground truth to GIAS and print the metrics.

    Optionally writes the matched rows joined to the modelled radii as a CSV
    (``--matched-out``).

    Raises:
        SystemExit: If no ground-truth rows are found, or if no binding,
            matchable cutoff remains to calibrate on.
    """
    parser = argparse.ArgumentParser(
        description="Compare modelled catchment radii with published cutoffs"
    )
    parser.add_argument(
        "--ground-truth-dir",
        type=Path,
        default=Path("property-data/ground_truth"),
    )
    parser.add_argument(
        "--radii",
        type=Path,
        default=Path("property-data/school_catchment_radii.parquet"),
        help="Per-school radii parquet from school_catchments --schools-output",
    )
    parser.add_argument("--gias", type=Path, default=Path("property-data/gias.parquet"))
    parser.add_argument(
        "--matched-out",
        type=Path,
        default=None,
        help="Optional CSV of matched rows for inspection",
    )
    args = parser.parse_args()

    truth = load_ground_truth(args.ground_truth_dir)
    # One row per school+phase: keep the most recent entry year.
    truth = (
        truth.sort("entry_year", descending=True)
        .unique(subset=["school_name", "la", "phase"], keep="first")
    )
    gias = pl.read_parquet(args.gias).select(
        "urn", "name", "postcode", "local_authority", "religious_character"
    )
    radii = pl.read_parquet(args.radii).unique(subset=["urn", "phase"], keep="first")

    matched = match_schools(truth, gias.drop("religious_character"))
    # GIAS religious character is authoritative; the scraped name-based flag
    # only covers rows that failed to match.
    matched = matched.join(
        gias.select("urn", "religious_character"), on="urn", how="left"
    ).with_columns(
        pl.when(pl.col("religious_character").is_not_null())
        .then(~pl.col("religious_character").is_in(["None", "Does not apply"]))
        .otherwise(pl.col("faith_school"))
        .alias("faith_school")
    )
    binding = evaluate(matched, radii)

    if args.matched_out is not None:
        # Drop every matching helper column, including ``_name_stripped``,
        # so the CSV only carries published and modelled data.
        out = matched.join(radii, on=["urn", "phase"], how="inner").drop(
            "_row_id", "_name_norm", "_name_stripped", "_la_norm", "_pc"
        )
        args.matched_out.parent.mkdir(parents=True, exist_ok=True)
        out.write_csv(args.matched_out)
        print(f"\nWrote matched rows to {args.matched_out}")

    # Checked last so the inspection CSV is still written on an empty result.
    if binding.is_empty():
        raise SystemExit("No binding, matchable cutoffs — nothing to calibrate on")
|
|
|
|
|
|
# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|