Fix data pipelines once and for all
This commit is contained in:
parent
08560476c5
commit
4012e4e047
46 changed files with 4508 additions and 855 deletions
297
pipeline/check_school_cutoffs.py
Normal file
297
pipeline/check_school_cutoffs.py
Normal file
|
|
@ -0,0 +1,297 @@
|
|||
"""Evaluate modelled school catchment radii against published cutoffs.
|
||||
|
||||
Local authorities publish each school's "last distance offered" in their
|
||||
yearly allocation reports; ``property-data/ground_truth/cutoffs_*.json``
|
||||
holds a scraped sample of those figures (see the collection notes in each
|
||||
file's ``source_url`` fields). This script matches them to the per-school
|
||||
radii emitted by ``pipeline.transform.school_catchments --schools-output``
|
||||
and reports how well the model reproduces reality, so the preference-bonus
|
||||
constants can be calibrated.
|
||||
|
||||
Headline metrics use non-faith schools whose published cutoff was a binding
|
||||
distance. Faith schools are reported separately (their distance criterion
|
||||
applies within faith priority, so published figures aren't comparable), as
|
||||
are "all applicants offered" schools, where the model should ideally show no
|
||||
binding cutoff.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import difflib
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
# Words removed from every name before comparison: articles/conjunctions and
# denomination/governance qualifiers ("C of E", "VA", ...).
_NOISE_WORDS = re.compile(
    r"\b(the|of|and|c\s*of\s*e|cofe|ce|rc|voluntary|aided|controlled|va|vc)\b"
)
# Anything that is not a lowercase ASCII letter, digit or space.
_NON_ALNUM = re.compile(r"[^a-z0-9 ]")
# School-type words, removed only when ``strip_school_words`` is requested.
_SCHOOL_WORDS = re.compile(
    r"\b(school|academy|primary|secondary|junior|infant|community|college|high)\b"
)
# Apostrophe variants are deleted (not turned into spaces) so that
# "St Mary's" and "St Mary’s" both become "st marys".
_APOSTROPHES = str.maketrans("", "", "'\u2019\u2018\u02bc`\u00b4")


def normalize_name(name: str, strip_school_words: bool = False) -> str:
    """Return a lowercase, punctuation-free key for comparing school names.

    Steps, in order: lowercase; spell out ``&`` as ``and``; delete apostrophes
    (straight and typographic); replace every other non-alphanumeric character
    with a space; drop the words in ``_NOISE_WORDS``; optionally drop the words
    in ``_SCHOOL_WORDS``; collapse runs of whitespace.

    Args:
        name: School name as printed in a report or in GIAS.
        strip_school_words: Also remove school-type words ("school",
            "primary", "academy", ...), for matching informal names.

    Returns:
        The normalised name; may be the empty string if nothing is left.
    """
    s = name.lower().replace("&", " and ").translate(_APOSTROPHES)
    # Dots (e.g. "St.") and all other punctuation become word separators here.
    s = _NON_ALNUM.sub(" ", s)
    s = _NOISE_WORDS.sub(" ", s)
    if strip_school_words:
        s = _SCHOOL_WORDS.sub(" ", s)
    return " ".join(s.split())
|
||||
|
||||
|
||||
def normalize_la(la: str) -> str:
    """Return a lowercase, punctuation-free key for comparing LA names.

    ``&`` is spelled out as ``and``, non-alphanumeric characters become
    spaces, and the qualifiers "city of" / "county of" are removed wherever
    they appear, so that "City of Bristol", "Bristol, City of" and "Bristol"
    all map to ``"bristol"``.

    Args:
        la: Local-authority name as printed in a report or in GIAS.

    Returns:
        The normalised name with single spaces between words.
    """
    s = _NON_ALNUM.sub(" ", la.lower().replace("&", " and "))
    # Whole-word match only: a plain substring replace would also eat the tail
    # of words that merely end in "city"/"county".
    s = re.sub(r"\b(?:city|county)\s+of\b", " ", s)
    return " ".join(s.split())
|
||||
|
||||
|
||||
def load_ground_truth(directory: Path) -> pl.DataFrame:
    """Load every ``cutoffs_*.json`` file under *directory* into one frame.

    Each file is expected to hold a JSON array of objects. ``school_name``,
    ``la`` and ``phase`` are required keys; ``entry_year`` (defaults to 0),
    ``cutoff_km`` (nullable), ``all_offered`` / ``faith_school`` (default
    False), ``school_postcode`` (nullable) and ``source_url`` (default "")
    are optional.

    Args:
        directory: Folder containing the ground-truth JSON files.

    Returns:
        One row per JSON object, with the columns listed above.

    Raises:
        SystemExit: If no matching file contributes any row.
        KeyError: If an object lacks one of the required keys.
    """
    rows = []
    for path in sorted(directory.glob("cutoffs_*.json")):
        # JSON is UTF-8 by specification; do not depend on the locale default.
        for row in json.loads(path.read_text(encoding="utf-8")):
            cutoff = row.get("cutoff_km")
            rows.append(
                {
                    "school_name": row["school_name"],
                    "la": row["la"],
                    "phase": row["phase"],
                    "entry_year": int(row.get("entry_year") or 0),
                    "cutoff_km": float(cutoff) if cutoff is not None else None,
                    "all_offered": bool(row.get("all_offered", False)),
                    "faith_school": bool(row.get("faith_school", False)),
                    "school_postcode": row.get("school_postcode"),
                    "source_url": row.get("source_url", ""),
                }
            )
    if not rows:
        raise SystemExit(f"No cutoffs_*.json files with rows under {directory}")
    # Pin the dtypes of the nullable columns: schema inference only samples
    # the leading rows, and those may all be null (e.g. all-offered schools).
    df = pl.DataFrame(
        rows,
        schema_overrides={"school_postcode": pl.Utf8, "cutoff_km": pl.Float64},
    )
    print(f"Ground truth rows: {len(df)} from {directory}")
    return df
|
||||
|
||||
|
||||
def _add_match_keys(
    df: pl.DataFrame, name_col: str, la_col: str, postcode_col: str
) -> pl.DataFrame:
    """Add the ``_name_norm``, ``_name_stripped``, ``_la_norm`` and ``_pc`` keys."""

    def stripped(name: str) -> str:
        return normalize_name(name, strip_school_words=True)

    return df.with_columns(
        pl.col(name_col)
        .map_elements(normalize_name, return_dtype=pl.Utf8)
        .alias("_name_norm"),
        pl.col(name_col)
        .map_elements(stripped, return_dtype=pl.Utf8)
        .alias("_name_stripped"),
        pl.col(la_col)
        .map_elements(normalize_la, return_dtype=pl.Utf8)
        .alias("_la_norm"),
        pl.col(postcode_col).str.replace_all(" ", "").str.to_uppercase().alias("_pc"),
    )


def _fuzzy_match(
    remaining: pl.DataFrame, gias: pl.DataFrame, schema: dict
) -> pl.DataFrame:
    """Match *remaining* rows to the single clearly-best GIAS name in the same LA.

    A candidate is accepted when its ``difflib`` ratio is >= 0.87 and the
    runner-up (if any) scores more than 0.04 below it. The result always has
    exactly *schema*, even when no row is accepted.
    """
    # Per-LA (names, urns) lists, materialised once per LA rather than per row.
    candidates_by_la: dict[str, tuple[list, list]] = {}
    accepted = []
    for row in remaining.iter_rows(named=True):
        la = row["_la_norm"]
        if la not in candidates_by_la:
            in_la = gias.filter(
                (pl.col("_la_norm") == la) & pl.col("_name_norm").is_not_null()
            )
            candidates_by_la[la] = (
                in_la["_name_norm"].to_list(),
                in_la["urn"].to_list(),
            )
        names, urns = candidates_by_la[la]
        if not names:
            continue
        scores = [
            difflib.SequenceMatcher(None, row["_name_norm"], cand).ratio()
            for cand in names
        ]
        order = np.argsort(scores)[::-1]
        best = scores[order[0]]
        if best >= 0.87 and (len(order) == 1 or scores[order[1]] < best - 0.04):
            accepted.append({**row, "urn": urns[int(order[0])]})
    # An explicit schema keeps this frame concat-compatible with the join
    # results (inference could pick Null for all-null columns or a different
    # integer width for ``urn``/``_row_id``).
    return pl.DataFrame(accepted, schema=schema)


def match_schools(truth: pl.DataFrame, gias: pl.DataFrame) -> pl.DataFrame:
    """Attach GIAS URNs to ground-truth rows by postcode, then name.

    Matching runs in four passes; a row matched by an earlier pass is not
    considered by later ones:

    1. exact postcode, using only postcodes that occur once in GIAS;
    2. exact normalised (name, LA), using only pairs that occur once in GIAS;
    3. (name with school-type words stripped, LA), unique on both sides;
    4. fuzzy name match within the LA (see ``_fuzzy_match``).

    Args:
        truth: Ground-truth rows with at least ``school_name``, ``la``,
            ``phase`` and ``school_postcode`` columns.
        gias: GIAS rows with ``urn``, ``name``, ``postcode`` and
            ``local_authority`` columns.

    Returns:
        The matched subset of *truth* with the helper columns ``_row_id``,
        ``_name_norm``, ``_name_stripped``, ``_la_norm`` and ``_pc`` plus a
        ``urn`` column. Unmatched rows are omitted; the frame is empty (not an
        error) when nothing matches.
    """
    gias = _add_match_keys(gias, "name", "local_authority", "postcode")
    truth = _add_match_keys(
        truth, "school_name", "la", "school_postcode"
    ).with_row_index("_row_id")

    matched_ids: set = set()

    def unmatched() -> pl.DataFrame:
        return truth.filter(~pl.col("_row_id").is_in(list(matched_ids)))

    # 1. Exact postcode match (unique postcodes only — site-sharing schools
    #    would mismatch phases otherwise; those fall through to name matching).
    pc_unique = gias.filter(pl.col("_pc").is_not_null()).unique(
        subset="_pc", keep="none"
    )
    by_pc = truth.filter(pl.col("_pc").is_not_null()).join(
        pc_unique.select("_pc", "urn"), on="_pc", how="inner"
    )
    matched_ids |= set(by_pc["_row_id"].to_list())

    # 2. Exact normalized (name, LA) match; ambiguous GIAS pairs are excluded.
    gias_named = gias.unique(subset=["_name_norm", "_la_norm"], keep="none")
    by_name = unmatched().join(
        gias_named.select("_name_norm", "_la_norm", "urn"),
        on=["_name_norm", "_la_norm"],
        how="inner",
    )
    matched_ids |= set(by_name["_row_id"].to_list())

    # 3. Reports often print informal names ("Ashmole Primary" for "Ashmole
    #    Primary School"): match on names with school-type words stripped,
    #    unique on both sides so site-sharing infant/junior pairs fall through.
    gias_stripped = gias.filter(pl.col("_name_stripped") != "").unique(
        subset=["_name_stripped", "_la_norm"], keep="none"
    )
    remaining = (
        unmatched()
        .filter(pl.col("_name_stripped") != "")
        .unique(subset=["_name_stripped", "_la_norm", "phase"], keep="none")
    )
    by_stripped = remaining.join(
        gias_stripped.select("_name_stripped", "_la_norm", "urn"),
        on=["_name_stripped", "_la_norm"],
        how="inner",
    )
    matched_ids |= set(by_stripped["_row_id"].to_list())

    # 4. Fuzzy name match within the LA: unique best candidate >= 0.87.
    out_schema = {**truth.schema, "urn": gias.schema["urn"]}
    by_fuzzy = _fuzzy_match(unmatched(), gias, out_schema)

    # Empty parts are kept on purpose: they carry the right schema, and
    # ``pl.concat`` raises on an empty list when nothing matched at all.
    out_columns = truth.columns + ["urn"]
    matched = pl.concat(
        [p.select(out_columns) for p in (by_pc, by_name, by_stripped, by_fuzzy)]
    ).unique(subset="_row_id", keep="first")
    print(
        f"Matched {len(matched)}/{len(truth)} ground-truth rows to GIAS URNs "
        f"(postcode {len(by_pc)}, exact name {len(by_name)}, "
        f"stripped {len(by_stripped)}, fuzzy {len(by_fuzzy)})"
    )
    return matched
|
||||
|
||||
|
||||
def evaluate(matched: pl.DataFrame, radii: pl.DataFrame) -> pl.DataFrame:
    """Print accuracy metrics of modelled radii against published cutoffs.

    Args:
        matched: Ground-truth rows with ``urn``, ``phase``, ``cutoff_km``,
            ``all_offered`` and ``faith_school`` columns.
        radii: Modelled rows keyed by ``urn`` and ``phase`` with ``radius_km``
            and boolean ``filled`` columns.

    Returns:
        The joined rows that are not all-offered and whose published cutoff
        lies between 0.05 and 20 km (faith and non-faith, both phases).
    """
    joined = matched.join(radii, on=["urn", "phase"], how="inner")
    print(f"Joined to modelled radii: {len(joined)} rows")

    # Published figures occasionally include non-typical admits (a child who
    # moved mid-process can print as hundreds of km); cap at distances a
    # distance criterion can plausibly produce.
    binding = joined.filter(
        ~pl.col("all_offered") & pl.col("cutoff_km").is_between(0.05, 20.0)
    )

    def report(df: pl.DataFrame, label: str) -> None:
        if df.is_empty():
            print(f"\n{label}: no rows")
            return
        truth_km = df["cutoff_km"].to_numpy().astype(float)
        model_km = df["radius_km"].to_numpy().astype(float)
        # A null/zero/negative radius has no defined log-ratio and would turn
        # every median below into NaN, so such rows are excluded and counted.
        usable = (
            np.isfinite(truth_km)
            & np.isfinite(model_km)
            & (truth_km > 0)
            & (model_km > 0)
        )
        skipped = int(len(df) - usable.sum())
        truth_km, model_km = truth_km[usable], model_km[usable]
        if truth_km.size == 0:
            print(f"\n{label}: no rows with a positive modelled radius")
            return
        log_ratio = np.log2(model_km / truth_km)
        within2 = float(np.mean(np.abs(log_ratio) <= 1))
        rank = (
            pl.DataFrame({"t": truth_km, "m": model_km})
            .select(pl.corr("t", "m", method="spearman"))
            .item()
        )
        # Undefined for a single row or a constant column.
        rank_text = "n/a" if rank is None or np.isnan(rank) else f"{rank:.2f}"
        skipped_text = (
            f"\n  skipped (no positive modelled radius): {skipped}" if skipped else ""
        )
        print(
            f"\n{label} (n={truth_km.size}):\n"
            f"  median bias (log2 model/truth): {np.median(log_ratio):+.2f} "
            f"(x{2 ** np.median(log_ratio):.2f})\n"
            f"  median |log2 error|:            {np.median(np.abs(log_ratio)):.2f} "
            f"(x{2 ** np.median(np.abs(log_ratio)):.2f})\n"
            f"  within factor 2:                {within2:.0%}\n"
            f"  Spearman rank corr:             {rank_text}"
            f"{skipped_text}"
        )

    for phase in ("primary", "secondary"):
        report(
            binding.filter((pl.col("phase") == phase) & ~pl.col("faith_school")),
            f"BINDING, non-faith, {phase}",
        )
    report(binding.filter(pl.col("faith_school")), "BINDING, faith (informational)")

    offered = joined.filter(pl.col("all_offered"))
    if not offered.is_empty():
        # Both aggregates are None when the column is entirely null.
        unbound_share = (~offered["filled"]).mean()
        median_radius = offered["radius_km"].median()
        share_text = "n/a" if unbound_share is None else f"{float(unbound_share):.0%}"
        radius_text = "n/a" if median_radius is None else f"{median_radius:.2f} km"
        print(
            f"\nALL-OFFERED schools (n={len(offered)}): model agrees no binding "
            f"cutoff for {share_text}; median modelled radius {radius_text}"
        )
    return binding
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: load inputs, match schools, print metrics.

    Reads the ground-truth JSON files, the GIAS parquet and the per-school
    radii parquet, prints the evaluation report, and optionally writes the
    matched rows to a CSV.

    Raises:
        SystemExit: If no ground-truth rows are found, or if no binding cutoff
            could be joined to a modelled radius.
    """
    parser = argparse.ArgumentParser(
        description="Compare modelled catchment radii with published cutoffs"
    )
    parser.add_argument(
        "--ground-truth-dir",
        type=Path,
        default=Path("property-data/ground_truth"),
    )
    parser.add_argument(
        "--radii",
        type=Path,
        default=Path("property-data/school_catchment_radii.parquet"),
        help="Per-school radii parquet from school_catchments --schools-output",
    )
    parser.add_argument("--gias", type=Path, default=Path("property-data/gias.parquet"))
    parser.add_argument(
        "--matched-out",
        type=Path,
        default=None,
        help="Optional CSV of matched rows for inspection",
    )
    args = parser.parse_args()

    truth = load_ground_truth(args.ground_truth_dir)
    # One row per school+phase: keep the most recent entry year.
    # maintain_order keeps the result (and the row ids derived from it later)
    # deterministic between runs.
    truth = truth.sort("entry_year", descending=True).unique(
        subset=["school_name", "la", "phase"], keep="first", maintain_order=True
    )
    gias = pl.read_parquet(args.gias).select(
        "urn", "name", "postcode", "local_authority", "religious_character"
    )
    radii = pl.read_parquet(args.radii).unique(subset=["urn", "phase"], keep="first")

    matched = match_schools(truth, gias.drop("religious_character"))
    # GIAS religious character is authoritative; the scraped flag is only
    # used for matched schools whose GIAS religious_character is null.
    matched = matched.join(
        gias.select("urn", "religious_character"), on="urn", how="left"
    ).with_columns(
        pl.when(pl.col("religious_character").is_not_null())
        .then(~pl.col("religious_character").is_in(["None", "Does not apply"]))
        .otherwise(pl.col("faith_school"))
        .alias("faith_school")
    )
    binding = evaluate(matched, radii)

    if args.matched_out is not None:
        # Drop every internal matching key, including ``_name_stripped``.
        out = matched.join(radii, on=["urn", "phase"], how="inner").drop(
            "_row_id", "_name_norm", "_name_stripped", "_la_norm", "_pc"
        )
        args.matched_out.parent.mkdir(parents=True, exist_ok=True)
        out.write_csv(args.matched_out)
        print(f"\nWrote matched rows to {args.matched_out}")

    if binding.is_empty():
        raise SystemExit("No binding, matchable cutoffs — nothing to calibrate on")
|
||||
|
||||
|
||||
# Run the comparison only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue