"""Compute Ofsted-rated school proximity counts per postcode.""" import argparse from pathlib import Path import polars as pl from pipeline.utils.poi_counts import count_pois_per_postcode SCHOOL_GROUPS = { "good_primary": ["good_primary", "outstanding_primary"], "good_secondary": ["good_secondary", "outstanding_secondary"], "outstanding_primary": ["outstanding_primary"], "outstanding_secondary": ["outstanding_secondary"], } # Age thresholds for deciding which phase(s) a school serves. A school serves # PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age # children if its statutory highest age is >= 12. All-through (e.g. 3-18) and # middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in # both the primary and the secondary proximity metrics — Ofsted's coarse "Ofsted # phase" labels such schools as just "Secondary", which previously hid them from # every postcode's primary-school count. PRIMARY_MAX_AGE = 10 SECONDARY_MIN_AGE = 12 def classify_good_plus_schools( ofsted: pl.DataFrame, open_urns: set[int] | None = None ) -> pl.DataFrame: """Label good+/outstanding primary & secondary schools for proximity counts. Derives a grade ("1" = outstanding, "2" = good) and one or two proximity ``category`` rows per school, returning a ``(postcode, category)`` frame. Schools with a recent GRADED inspection carry a 1-4 grade in "Latest OEIF overall effectiveness" (OEIF = the previous Ofsted Education Inspection Framework). A large and growing share of schools were last inspected under an UNGRADED (Section 8) inspection or the post-2024 report-card framework, so that column is null/"Not judged" for them even when they are demonstrably good — their status lives in "Ungraded inspection overall outcome" ("School remains Good"/"School remains Outstanding"). Filtering on the graded column alone dropped ~7,000 genuinely good/outstanding schools. We fall back to the ungraded outcome, but ONLY when there is no usable graded result (null/"Not judged"), so a genuine grade 3/4 is never overridden. Outcomes flagged "(Concerns)" are NOT treated as good+: a "remains Good (Concerns)" outcome signals inspectors found issues warranting an earlier graded re-inspection, so marketing it as a good+ school is misleading. Phase assignment uses the statutory age range when available (so all-through and middle schools count toward BOTH primary and secondary), falling back to the coarse "Ofsted phase" label when age columns are absent. When ``open_urns`` is given, schools whose URN is not in the current GIAS open register are dropped so closed/merged schools are not counted. """ # Cast to Utf8 so the string predicates below are well-defined even if a # column happens to be entirely null (read back as a Null dtype). oeif = pl.col("Latest OEIF overall effectiveness").cast(pl.Utf8, strict=False) ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False) no_usable_grade = oeif.is_null() | (oeif == "Not judged") has_concern = ungraded.str.contains(r"\(Concerns\)") remains_outstanding = ( ungraded.str.starts_with("School remains Outstanding") & ~has_concern ) remains_good = ungraded.str.starts_with("School remains Good") & ~has_concern graded = ( ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"])) .with_columns( pl.when(oeif.is_in(["1", "2"])) .then(oeif) .when(no_usable_grade & remains_outstanding) .then(pl.lit("1")) .when(no_usable_grade & remains_good) .then(pl.lit("2")) .otherwise(None) .alias("_ofsted_grade") ) .filter(pl.col("_ofsted_grade").is_not_null()) ) # Drop schools no longer open (closed/merged) when the GIAS open register is # provided, so stale Ofsted "latest inspection" rows are not counted. if open_urns is not None and "URN" in graded.columns: graded = graded.filter(pl.col("URN").is_in(list(open_urns))) # Decide which phase(s) each school serves. if {"Statutory lowest age", "Statutory highest age"} <= set(graded.columns): low = pl.col("Statutory lowest age").cast(pl.Int64, strict=False) high = pl.col("Statutory highest age").cast(pl.Int64, strict=False) serves_primary = ( pl.when(low.is_not_null()) .then(low <= PRIMARY_MAX_AGE) .otherwise(pl.col("Ofsted phase") == "Primary") ) serves_secondary = ( pl.when(high.is_not_null()) .then(high >= SECONDARY_MIN_AGE) .otherwise(pl.col("Ofsted phase") == "Secondary") ) else: serves_primary = pl.col("Ofsted phase") == "Primary" serves_secondary = pl.col("Ofsted phase") == "Secondary" graded = graded.with_columns( serves_primary.alias("_serves_primary"), serves_secondary.alias("_serves_secondary"), ) # Good+ groups include both grade variants; outstanding groups count grade 1. # A school can yield up to two rows (primary and secondary). primary = graded.filter(pl.col("_serves_primary")).with_columns( pl.when(pl.col("_ofsted_grade") == "1") .then(pl.lit("outstanding_primary")) .otherwise(pl.lit("good_primary")) .alias("category") ) secondary = graded.filter(pl.col("_serves_secondary")).with_columns( pl.when(pl.col("_ofsted_grade") == "1") .then(pl.lit("outstanding_secondary")) .otherwise(pl.lit("good_secondary")) .alias("category") ) return pl.concat([primary, secondary]).select( pl.col("Postcode").alias("postcode"), "category", ) def main(): parser = argparse.ArgumentParser( description="Count good+ and outstanding primary/secondary schools near each postcode" ) parser.add_argument( "--ofsted", type=Path, required=True, help="Ofsted inspection parquet" ) parser.add_argument( "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet" ) parser.add_argument( "--gias", type=Path, default=None, help="GIAS open-school parquet; if given, only currently-open schools are counted", ) parser.add_argument( "--output", type=Path, required=True, help="Output parquet path" ) args = parser.parse_args() open_urns: set[int] | None = None if args.gias is not None: gias_urns = pl.read_parquet(args.gias).select("urn").to_series().drop_nulls() open_urns = set(gias_urns.cast(pl.Int64, strict=False).to_list()) print(f"GIAS open register: {len(open_urns):,} open school URNs") ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted), open_urns=open_urns) if ofsted.is_empty(): raise ValueError("No good+ primary/secondary Ofsted schools found") print(f"Good+ schools: {len(ofsted):,}") print( "Outstanding schools: " f"{ofsted.filter(pl.col('category').str.starts_with('outstanding')).height:,}" ) # Join with arcgis to get lat/lng for each school's postcode arcgis = pl.read_parquet(args.arcgis).select( pl.col("pcds").alias("postcode"), "lat", pl.col("long").alias("lng"), ) schools = ofsted.join(arcgis, on="postcode", how="inner") if schools.is_empty(): raise ValueError("No Ofsted schools matched ArcGIS postcode coordinates") print(f"Schools with coordinates: {len(schools):,}") # Load all postcodes for proximity counting postcodes = arcgis.rename({"lng": "lon"}) counts_5km = count_pois_per_postcode( postcodes, schools, radius_km=5, groups=SCHOOL_GROUPS ) counts_2km = count_pois_per_postcode( postcodes, schools, radius_km=2, groups=SCHOOL_GROUPS ) result = counts_5km.join(counts_2km, on="postcode") args.output.parent.mkdir(parents=True, exist_ok=True) result.write_parquet(args.output) size_mb = args.output.stat().st_size / (1024 * 1024) print(f"Wrote {args.output} ({size_mb:.1f} MB)") if __name__ == "__main__": main()