"""Compute Ofsted-rated school proximity counts per postcode."""
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
import polars as pl
|
|
|
|
from pipeline.utils.poi_counts import count_pois_per_postcode
|
|
|
|
# Group name -> the school ``category`` labels that count towards that group.
# The "good_*" groups mean good-or-better, so they list the outstanding
# category as well; the "outstanding_*" groups list only the outstanding one.
SCHOOL_GROUPS = {
    "good_primary": ["good_primary", "outstanding_primary"],
    "good_secondary": ["good_secondary", "outstanding_secondary"],
    "outstanding_primary": ["outstanding_primary"],
    "outstanding_secondary": ["outstanding_secondary"],
}
|
|
|
|
|
|
# Age thresholds for deciding which phase(s) a school serves. A school serves
# PRIMARY-age children if its statutory lowest age is <= PRIMARY_MAX_AGE, and
# SECONDARY-age children if its statutory highest age is >= SECONDARY_MIN_AGE.
# All-through (e.g. 3-18) and middle-deemed-secondary (e.g. 9-13) schools
# satisfy BOTH and so are counted in both the primary and the secondary
# proximity metrics. The coarse "Ofsted phase" column labels such schools as
# just "Secondary", which previously hid them from every postcode's
# primary-school count.
PRIMARY_MAX_AGE = 10
SECONDARY_MIN_AGE = 12
|
|
|
|
|
|
def classify_good_plus_schools(
    ofsted: pl.DataFrame, open_urns: set[int] | None = None
) -> pl.DataFrame:
    """Label good+/outstanding primary & secondary schools for proximity counts.

    Derives a grade ("1" = outstanding, "2" = good) and one or two proximity
    ``category`` rows per school, returning a ``(postcode, category)`` frame.

    Schools with a recent GRADED inspection carry a 1-4 grade in "Latest OEIF
    overall effectiveness" (OEIF = the previous Ofsted Education Inspection
    Framework). A large and growing share of schools were last inspected under an
    UNGRADED (Section 8) inspection or the post-2024 report-card framework, so
    that column is null/"Not judged" for them even when they are demonstrably
    good; their status lives in "Ungraded inspection overall outcome" ("School
    remains Good"/"School remains Outstanding"). Filtering on the graded column
    alone dropped ~7,000 genuinely good/outstanding schools. We fall back to the
    ungraded outcome, but ONLY when there is no usable graded result
    (null/"Not judged"), so a genuine grade 3/4 is never overridden.

    Outcomes flagged "(Concerns)" are NOT treated as good+: a "remains Good
    (Concerns)" outcome signals inspectors found issues warranting an earlier
    graded re-inspection, so marketing it as a good+ school is misleading.

    Phase assignment uses the statutory age range when available (so all-through
    and middle schools count toward BOTH primary and secondary), falling back to
    the coarse "Ofsted phase" label when age columns are absent or a school's
    age value is null. Only rows whose "Ofsted phase" is "Primary" or
    "Secondary" are considered at all.

    Args:
        ofsted: Ofsted inspection frame. Must contain "Ofsted phase", "Latest
            OEIF overall effectiveness", "Ungraded inspection overall outcome"
            and "Postcode"; "URN", "Statutory lowest age" and "Statutory
            highest age" are used when present.
        open_urns: Optional URNs from the current GIAS open register. When
            given (and the frame has a "URN" column), schools whose URN is not
            in the set are dropped so closed/merged schools are not counted.

    Returns:
        A frame with columns ``postcode`` and ``category``, where ``category``
        is one of "good_primary", "outstanding_primary", "good_secondary" or
        "outstanding_secondary". A school serving both phases yields two rows.
    """
    # Cast to Utf8 so the string predicates below are well-defined even if a
    # column happens to be entirely null (read back as a Null dtype).
    oeif = pl.col("Latest OEIF overall effectiveness").cast(pl.Utf8, strict=False)
    ungraded = pl.col("Ungraded inspection overall outcome").cast(
        pl.Utf8, strict=False
    )
    no_usable_grade = oeif.is_null() | (oeif == "Not judged")
    has_concern = ungraded.str.contains(r"\(Concerns\)")
    remains_outstanding = (
        ungraded.str.starts_with("School remains Outstanding") & ~has_concern
    )
    remains_good = ungraded.str.starts_with("School remains Good") & ~has_concern

    # A graded 1/2 wins outright; the ungraded outcome is consulted only when
    # no usable grade exists. Everything else gets a null grade and is dropped.
    graded = (
        ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
        .with_columns(
            pl.when(oeif.is_in(["1", "2"]))
            .then(oeif)
            .when(no_usable_grade & remains_outstanding)
            .then(pl.lit("1"))
            .when(no_usable_grade & remains_good)
            .then(pl.lit("2"))
            .otherwise(None)
            .alias("_ofsted_grade")
        )
        .filter(pl.col("_ofsted_grade").is_not_null())
    )

    # Drop schools no longer open (closed/merged) when the GIAS open register is
    # provided, so stale Ofsted "latest inspection" rows are not counted.
    if open_urns is not None and "URN" in graded.columns:
        # Compare as integers: the URN column may have been read back as a
        # string, which would not match the integer URNs in ``open_urns``.
        # Any stray None in the set is ignored.
        open_urn_values = [urn for urn in open_urns if urn is not None]
        graded = graded.filter(
            pl.col("URN").cast(pl.Int64, strict=False).is_in(open_urn_values)
        )

    # Decide which phase(s) each school serves.
    if {"Statutory lowest age", "Statutory highest age"} <= set(graded.columns):
        low = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
        high = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
        # Per-row fallback to the Ofsted phase label when an age is missing or
        # not castable to an integer.
        serves_primary = (
            pl.when(low.is_not_null())
            .then(low <= PRIMARY_MAX_AGE)
            .otherwise(pl.col("Ofsted phase") == "Primary")
        )
        serves_secondary = (
            pl.when(high.is_not_null())
            .then(high >= SECONDARY_MIN_AGE)
            .otherwise(pl.col("Ofsted phase") == "Secondary")
        )
    else:
        serves_primary = pl.col("Ofsted phase") == "Primary"
        serves_secondary = pl.col("Ofsted phase") == "Secondary"

    graded = graded.with_columns(
        serves_primary.alias("_serves_primary"),
        serves_secondary.alias("_serves_secondary"),
    )

    # Grade "1" maps to the outstanding category, anything else (i.e. "2") to
    # the good category. A school can yield up to two rows (primary and
    # secondary).
    primary = graded.filter(pl.col("_serves_primary")).with_columns(
        pl.when(pl.col("_ofsted_grade") == "1")
        .then(pl.lit("outstanding_primary"))
        .otherwise(pl.lit("good_primary"))
        .alias("category")
    )
    secondary = graded.filter(pl.col("_serves_secondary")).with_columns(
        pl.when(pl.col("_ofsted_grade") == "1")
        .then(pl.lit("outstanding_secondary"))
        .otherwise(pl.lit("good_secondary"))
        .alias("category")
    )
    return pl.concat([primary, secondary]).select(
        pl.col("Postcode").alias("postcode"),
        "category",
    )
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: count good+/outstanding schools near postcodes.

    Reads the Ofsted inspection parquet (and optionally the GIAS open-school
    parquet), classifies good+/outstanding schools, attaches coordinates via
    the ArcGIS postcode parquet, counts schools within 5 km and 2 km of every
    postcode, and writes the joined counts to ``--output``.

    Raises:
        ValueError: If no good+ schools are found, or if none of them match a
            postcode in the ArcGIS data.
    """
    parser = argparse.ArgumentParser(
        description="Count good+ and outstanding primary/secondary schools near each postcode"
    )
    parser.add_argument(
        "--ofsted", type=Path, required=True, help="Ofsted inspection parquet"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
    )
    parser.add_argument(
        "--gias",
        type=Path,
        default=None,
        help="GIAS open-school parquet; if given, only currently-open schools are counted",
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet path"
    )
    args = parser.parse_args()

    open_urns: set[int] | None = None
    if args.gias is not None:
        # Cast BEFORE dropping nulls: the non-strict cast turns unparseable
        # URNs into nulls, which would otherwise end up as None in the set and
        # inflate the reported count.
        gias_urns = (
            pl.read_parquet(args.gias)
            .select("urn")
            .to_series()
            .cast(pl.Int64, strict=False)
            .drop_nulls()
        )
        open_urns = set(gias_urns.to_list())
        print(f"GIAS open register: {len(open_urns):,} open school URNs")

    ofsted = classify_good_plus_schools(
        pl.read_parquet(args.ofsted), open_urns=open_urns
    )
    if ofsted.is_empty():
        raise ValueError("No good+ primary/secondary Ofsted schools found")

    # NOTE: ``ofsted`` has one row per (school, phase), so a school serving
    # both phases is counted twice in these totals.
    print(f"Good+ schools: {len(ofsted):,}")
    print(
        "Outstanding schools: "
        f"{ofsted.filter(pl.col('category').str.starts_with('outstanding')).height:,}"
    )

    # Join with arcgis to get lat/lng for each school's postcode
    arcgis = pl.read_parquet(args.arcgis).select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lng"),
    )

    schools = ofsted.join(arcgis, on="postcode", how="inner")
    if schools.is_empty():
        raise ValueError("No Ofsted schools matched ArcGIS postcode coordinates")
    print(f"Schools with coordinates: {len(schools):,}")

    # Every ArcGIS postcode is a counting origin.
    # NOTE(review): postcodes use "lon" while schools keep "lng" - presumably
    # what count_pois_per_postcode expects; confirm against the helper.
    postcodes = arcgis.rename({"lng": "lon"})

    counts_5km = count_pois_per_postcode(
        postcodes, schools, radius_km=5, groups=SCHOOL_GROUPS
    )
    counts_2km = count_pois_per_postcode(
        postcodes, schools, radius_km=2, groups=SCHOOL_GROUPS
    )

    # NOTE(review): assumes the helper names its count columns per radius;
    # otherwise this join would suffix the 2 km columns - confirm.
    result = counts_5km.join(counts_2km, on="postcode")

    args.output.parent.mkdir(parents=True, exist_ok=True)
    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB)")
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|