# perfect-postcode/pipeline/transform/school_proximity.py

"""Compute Ofsted-rated school proximity counts per postcode."""

import argparse
from pathlib import Path

import polars as pl

from pipeline.utils.poi_counts import count_pois_per_postcode

# Output count group -> school categories that contribute to it.
# A "good_*" group is good-or-better, so it sums the good AND outstanding
# categories of that phase; an "outstanding_*" group counts outstanding only.
# Insertion order (good first, primary before secondary) is kept as declared.
SCHOOL_GROUPS = {
    group: list(categories)
    for group, categories in (
        ("good_primary", ("good_primary", "outstanding_primary")),
        ("good_secondary", ("good_secondary", "outstanding_secondary")),
        ("outstanding_primary", ("outstanding_primary",)),
        ("outstanding_secondary", ("outstanding_secondary",)),
    )
}


def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
    """Pick out good/outstanding primary and secondary schools and tag them.

    Only rows whose "Ofsted phase" is "Primary" or "Secondary" are considered.
    Each such row is assigned an internal grade ("1" = outstanding,
    "2" = good) and rows without one are dropped. The result has two columns:
    ``postcode`` (the input "Postcode") and ``category``, one of
    ``outstanding_primary``, ``good_primary``, ``outstanding_secondary`` or
    ``good_secondary``.

    How the grade is derived:

    * "Latest OEIF overall effectiveness" (OEIF = the previous Ofsted Education
      Inspection Framework) holds a 1-4 grade for schools with a recent GRADED
      inspection. A value of "1" or "2" is used as-is.
    * Many schools were last seen under an UNGRADED (Section 8) inspection or
      the post-2024 report-card framework, which leaves that column null or
      "Not judged" even though the school is good. For those rows only, the
      grade is read from "Ungraded inspection overall outcome": values starting
      with "School remains Outstanding" map to "1" and values starting with
      "School remains Good" map to "2" (prefix matching also accepts the
      "(Concerns)"/"(Improving)" variants).
    * The fallback is deliberately restricted to rows with no usable graded
      result, so a real grade 3/4 is never overridden by an ungraded outcome.

    Relying on the graded column alone previously lost ~7,000 genuinely
    good/outstanding schools.
    """
    # Force both source columns to strings: a column that is entirely null can
    # come back with a Null dtype, on which string predicates are ill-defined.
    graded_result = pl.col("Latest OEIF overall effectiveness").cast(
        pl.Utf8, strict=False
    )
    ungraded_outcome = pl.col("Ungraded inspection overall outcome").cast(
        pl.Utf8, strict=False
    )

    has_good_plus_grade = graded_result.is_in(["1", "2"])
    lacks_graded_result = graded_result.is_null() | (graded_result == "Not judged")
    remains_outstanding = lacks_graded_result & ungraded_outcome.str.starts_with(
        "School remains Outstanding"
    )
    remains_good = lacks_graded_result & ungraded_outcome.str.starts_with(
        "School remains Good"
    )

    # Graded result wins; the ungraded outcome is consulted only as a fallback.
    grade = (
        pl.when(has_good_plus_grade)
        .then(graded_result)
        .when(remains_outstanding)
        .then(pl.lit("1"))
        .when(remains_good)
        .then(pl.lit("2"))
        .otherwise(None)
        .alias("_ofsted_grade")
    )

    # Grade "1" -> outstanding_*, anything else that survived -> good_*.
    # Non-primary rows are secondary because of the phase filter below.
    is_primary = pl.col("Ofsted phase") == "Primary"
    is_outstanding = pl.col("_ofsted_grade") == "1"
    category = (
        pl.when(is_primary & is_outstanding)
        .then(pl.lit("outstanding_primary"))
        .when(is_primary)
        .then(pl.lit("good_primary"))
        .when(is_outstanding)
        .then(pl.lit("outstanding_secondary"))
        .otherwise(pl.lit("good_secondary"))
        .alias("category")
    )

    in_scope = ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
    good_plus = in_scope.with_columns(grade).filter(
        pl.col("_ofsted_grade").is_not_null()
    )
    return good_plus.select(pl.col("Postcode").alias("postcode"), category)


def main() -> None:
    """Command-line entry point: write school proximity counts per postcode.

    Steps:
      1. Parse the required ``--ofsted``, ``--arcgis`` and ``--output`` paths.
      2. Reduce the Ofsted parquet to ``(postcode, category)`` rows via
         ``classify_good_plus_schools``.
      3. Give each school coordinates by inner-joining on postcode with the
         ArcGIS postcode table (schools whose postcode has no match are
         dropped).
      4. Call ``count_pois_per_postcode`` twice (``radius_km=5`` and
         ``radius_km=2``) with ``SCHOOL_GROUPS``.
      5. Join the two count frames on ``postcode`` and write them to
         ``--output`` as parquet, creating parent directories if needed.

    Progress figures and the output size are printed to stdout.

    Raises:
        ValueError: if no good+ primary/secondary school is found, or if none
            of them matches a postcode in the ArcGIS table.
    """
    parser = argparse.ArgumentParser(
        description="Count good+ and outstanding primary/secondary schools near each postcode"
    )
    parser.add_argument(
        "--ofsted", type=Path, required=True, help="Ofsted inspection parquet"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet path"
    )
    args = parser.parse_args()

    ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted))
    if ofsted.is_empty():
        raise ValueError("No good+ primary/secondary Ofsted schools found")

    print(f"Good+ schools: {len(ofsted):,}")
    print(
        "Outstanding schools: "
        f"{ofsted.filter(pl.col('category').str.starts_with('outstanding')).height:,}"
    )

    # Only three columns are needed from the postcode table, so project them at
    # read time instead of loading every column and discarding the rest.
    arcgis = pl.read_parquet(args.arcgis, columns=["pcds", "lat", "long"]).select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lng"),
    )

    # Join with arcgis to get lat/lng for each school's postcode.
    schools = ofsted.join(arcgis, on="postcode", how="inner")
    if schools.is_empty():
        raise ValueError("No Ofsted schools matched ArcGIS postcode coordinates")
    print(f"Schools with coordinates: {len(schools):,}")

    # Every postcode is a counting origin. The postcode frame carries its
    # longitude as "lon" while the schools frame keeps "lng".
    postcodes = arcgis.rename({"lng": "lon"})

    counts_5km = count_pois_per_postcode(
        postcodes, schools, radius_km=5, groups=SCHOOL_GROUPS
    )
    counts_2km = count_pois_per_postcode(
        postcodes, schools, radius_km=2, groups=SCHOOL_GROUPS
    )

    # NOTE(review): assumes count_pois_per_postcode names its count columns
    # distinctly per radius; if not, polars would suffix the clashing
    # right-hand columns with "_right" — confirm against the helper.
    result = counts_5km.join(counts_2km, on="postcode")

    args.output.parent.mkdir(parents=True, exist_ok=True)
    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB)")


# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()