perfect-postcode/pipeline/transform/school_proximity.py

134 lines
5 KiB
Python

"""Compute Ofsted-rated school proximity counts per postcode."""
import argparse
from pathlib import Path
import polars as pl
from pipeline.utils.poi_counts import count_pois_per_postcode
# Output group name -> the school ``category`` labels counted toward that group.
# "good" groups are really "good or better": they also take the outstanding
# label of the same phase, whereas "outstanding" groups take only their own.
SCHOOL_GROUPS = {
    f"{rating}_{phase}": [f"{member}_{phase}" for member in members]
    for rating, members in (
        ("good", ("good", "outstanding")),
        ("outstanding", ("outstanding",)),
    )
    for phase in ("primary", "secondary")
}
def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
    """Pick out good/outstanding primary and secondary schools and label them.

    Each retained school gets a grade ("1" = outstanding, "2" = good) and, from
    that plus its phase, a proximity ``category`` (``good_primary``,
    ``outstanding_primary``, ``good_secondary`` or ``outstanding_secondary``).

    Where the grade comes from:

    * A school with a recent GRADED inspection has a 1-4 grade in
      "Latest OEIF overall effectiveness" (OEIF = the previous Ofsted Education
      Inspection Framework). Grades "1" and "2" are taken as-is.
    * Many schools were last seen under an UNGRADED (Section 8) inspection or
      the post-2024 report-card framework; for those the graded column is
      null/"Not judged" even though the school is demonstrably good. Their
      status is in "Ungraded inspection overall outcome" as
      "School remains Good" / "School remains Outstanding" (including the
      "(Concerns)" / "(Improving)" variants, hence the prefix match). Relying
      on the graded column alone dropped ~7,000 genuinely good/outstanding
      schools.
    * The ungraded outcome is consulted ONLY when the graded column is null or
      "Not judged", so a real grade 3/4 is never overridden.

    Args:
        ofsted: Ofsted inspection frame; the columns "Ofsted phase",
            "Latest OEIF overall effectiveness",
            "Ungraded inspection overall outcome" and "Postcode" are read.

    Returns:
        A two-column frame ``(postcode, category)`` with one row per school
        that is Primary/Secondary and resolved to grade "1" or "2".
    """
    # Force both source columns to strings so comparisons and ``str`` methods
    # behave even when a column was read back as an all-null (Null dtype) one.
    graded_col = pl.col("Latest OEIF overall effectiveness").cast(
        pl.Utf8, strict=False
    )
    ungraded_col = pl.col("Ungraded inspection overall outcome").cast(
        pl.Utf8, strict=False
    )

    # The ungraded outcome may only speak when there is no usable graded result.
    lacks_grade = graded_col.is_null() | (graded_col == "Not judged")
    remains_outstanding = lacks_grade & ungraded_col.str.starts_with(
        "School remains Outstanding"
    )
    remains_good = lacks_grade & ungraded_col.str.starts_with("School remains Good")

    grade_expr = (
        pl.when(graded_col.is_in(["1", "2"]))
        .then(graded_col)
        .when(remains_outstanding)
        .then(pl.lit("1"))
        .when(remains_good)
        .then(pl.lit("2"))
        .otherwise(None)
    )

    in_scope = ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
    with_grade = in_scope.with_columns(grade_expr.alias("_ofsted_grade"))
    rated = with_grade.filter(pl.col("_ofsted_grade").is_not_null())

    # After the two filters above, phase is Primary/Secondary and grade is
    # "1"/"2", so the four branches below are exhaustive.
    is_primary = pl.col("Ofsted phase") == "Primary"
    is_outstanding = pl.col("_ofsted_grade") == "1"
    category_expr = (
        pl.when(is_primary & is_outstanding)
        .then(pl.lit("outstanding_primary"))
        .when(is_primary)
        .then(pl.lit("good_primary"))
        .when(is_outstanding)
        .then(pl.lit("outstanding_secondary"))
        .otherwise(pl.lit("good_secondary"))
    )

    return rated.select(
        pl.col("Postcode").alias("postcode"),
        category_expr.alias("category"),
    )
def main():
    """CLI entry point: count good+/outstanding schools around every postcode.

    Reads the Ofsted parquet (``--ofsted``) and the ArcGIS postcode parquet
    (``--arcgis``), classifies the schools, attaches coordinates to them via
    their postcode, runs ``count_pois_per_postcode`` at 5 km and 2 km, joins
    the two results on ``postcode`` and writes them to ``--output``.

    Raises:
        ValueError: if no good+ primary/secondary school is found, or if none
            of those schools matches an ArcGIS postcode.
    """
    cli = argparse.ArgumentParser(
        description="Count good+ and outstanding primary/secondary schools near each postcode"
    )
    path_options = (
        ("--ofsted", "Ofsted inspection parquet"),
        ("--arcgis", "ArcGIS postcode parquet"),
        ("--output", "Output parquet path"),
    )
    for flag, help_text in path_options:
        cli.add_argument(flag, type=Path, required=True, help=help_text)
    opts = cli.parse_args()

    rated_schools = classify_good_plus_schools(pl.read_parquet(opts.ofsted))
    if rated_schools.height == 0:
        raise ValueError("No good+ primary/secondary Ofsted schools found")
    print(f"Good+ schools: {rated_schools.height:,}")
    outstanding_only = rated_schools.filter(
        pl.col("category").str.starts_with("outstanding")
    )
    print(f"Outstanding schools: {outstanding_only.height:,}")

    # The ArcGIS frame doubles as the coordinate lookup for school postcodes
    # and (below) as the full list of postcodes to count around.
    postcode_coords = pl.read_parquet(opts.arcgis).select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lng"),
    )
    located_schools = rated_schools.join(postcode_coords, on="postcode", how="inner")
    if located_schools.height == 0:
        raise ValueError("No Ofsted schools matched ArcGIS postcode coordinates")
    print(f"Schools with coordinates: {located_schools.height:,}")

    # NOTE(review): postcodes are passed with a ``lon`` column while schools
    # keep ``lng`` — presumably the schema count_pois_per_postcode expects;
    # verify against pipeline.utils.poi_counts.
    all_postcodes = postcode_coords.rename({"lng": "lon"})
    counts_by_radius = {
        radius: count_pois_per_postcode(
            all_postcodes, located_schools, radius_km=radius, groups=SCHOOL_GROUPS
        )
        for radius in (5, 2)
    }
    # NOTE(review): assumes the helper names its count columns per radius so
    # the join does not produce clashing column names — confirm.
    combined = counts_by_radius[5].join(counts_by_radius[2], on="postcode")

    out_path = opts.output
    out_path.parent.mkdir(parents=True, exist_ok=True)
    combined.write_parquet(out_path)
    megabytes = out_path.stat().st_size / (1024 * 1024)
    print(f"Wrote {out_path} ({megabytes:.1f} MB)")
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()