Improve data pipeline

2026-06-01 20:10:03 +01:00 · 2026-06-01 20:10:03 +01:00 · f99bd4e5c9
commit f99bd4e5c9
parent e8345cbdc1
36 changed files with 966 additions and 129 deletions
--- a/pipeline/transform/school_proximity.py
+++ b/pipeline/transform/school_proximity.py
@ -15,6 +15,66 @@ SCHOOL_GROUPS = {
 }


+def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
+    """Label good+/outstanding primary & secondary schools for proximity counts.
+
+    Keeps only "Ofsted phase" Primary/Secondary rows that resolve to grade "1"
+    (outstanding) or "2" (good). Returns one row per kept input row with
+    ``postcode`` (renamed from "Postcode") and ``category``: grade "1" gives
+    outstanding_primary/outstanding_secondary, "2" gives good_primary/good_secondary.
+
+    Schools with a recent GRADED inspection carry a 1-4 grade in "Latest OEIF
+    overall effectiveness" (OEIF = the previous Ofsted Education Inspection
+    Framework). Many schools were last inspected under an UNGRADED (Section 8)
+    inspection or the post-2024 report-card framework, so that column is
+    null/"Not judged" for them; their status lives in "Ungraded inspection
+    overall outcome" ("School remains Good"/"School remains Outstanding", incl.
+    "(Concerns)"/"(Improving)" variants). Filtering on the graded column alone
+    dropped ~7,000 such schools. The ungraded outcome is used ONLY when there
+    is no usable graded result, so a genuine grade 3/4 is never overridden.
+    """
+    # Cast to Utf8 so the string predicates below are well-defined even if a
+    # column happens to be entirely null (read back as a Null dtype).
+    oeif = pl.col("Latest OEIF overall effectiveness").cast(pl.Utf8, strict=False)
+    ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
+    no_usable_grade = oeif.is_null() | (oeif == "Not judged")
+    graded = (
+        ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
+        .with_columns(
+            pl.when(oeif.is_in(["1", "2"]))
+            .then(oeif)
+            .when(
+                no_usable_grade
+                & ungraded.str.starts_with("School remains Outstanding")
+            )
+            .then(pl.lit("1"))
+            .when(no_usable_grade & ungraded.str.starts_with("School remains Good"))
+            .then(pl.lit("2"))
+            .otherwise(None)
+            .alias("_ofsted_grade")
+        )
+        .filter(pl.col("_ofsted_grade").is_not_null())
+    )
+    # Good+ groups include both grade variants; outstanding groups count grade 1.
+    return graded.with_columns(
+        pl.when(pl.col("Ofsted phase") == "Primary")
+        .then(
+            pl.when(pl.col("_ofsted_grade") == "1")
+            .then(pl.lit("outstanding_primary"))
+            .otherwise(pl.lit("good_primary"))
+        )
+        .otherwise(
+            pl.when(pl.col("_ofsted_grade") == "1")
+            .then(pl.lit("outstanding_secondary"))
+            .otherwise(pl.lit("good_secondary"))
+        )
+        .alias("category")
+    ).select(
+        pl.col("Postcode").alias("postcode"),
+        "category",
+    )
+
+
 def main():
    parser = argparse.ArgumentParser(
        description="Count good+ and outstanding primary/secondary schools near each postcode"
@ -30,42 +90,14 @@ def main():
    )
    args = parser.parse_args()

-    # Load Ofsted data: filter to good+ (1, 2) primary/secondary schools.
-    # Post-2025 reform the single "Overall effectiveness" grade was retired;
-    # the legacy 1–4 scale is now carried forward under "Latest OEIF overall
-    # effectiveness" (OEIF = the previous Ofsted Education Inspection
-    # Framework). The new report-card columns use text judgements instead.
-    ofsted = pl.read_parquet(args.ofsted).filter(
-        pl.col("Ofsted phase").is_in(["Primary", "Secondary"])
-        & pl.col("Latest OEIF overall effectiveness").is_in(["1", "2"])
-    )
+    ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted))
    if ofsted.is_empty():
        raise ValueError("No good+ primary/secondary Ofsted schools found")

    print(f"Good+ schools: {len(ofsted):,}")
    print(
        "Outstanding schools: "
-        f"{ofsted.filter(pl.col('Latest OEIF overall effectiveness') == '1').height:,}"
-    )
-
-    # Assign category based on phase and rating. Good+ groups include both
-    # category variants; outstanding groups count grade 1 only.
-    ofsted = ofsted.with_columns(
-        pl.when(pl.col("Ofsted phase") == "Primary")
-        .then(
-            pl.when(pl.col("Latest OEIF overall effectiveness") == "1")
-            .then(pl.lit("outstanding_primary"))
-            .otherwise(pl.lit("good_primary"))
-        )
-        .otherwise(
-            pl.when(pl.col("Latest OEIF overall effectiveness") == "1")
-            .then(pl.lit("outstanding_secondary"))
-            .otherwise(pl.lit("good_secondary"))
-        )
-        .alias("category")
-    ).select(
-        pl.col("Postcode").alias("postcode"),
-        "category",
+        f"{ofsted.filter(pl.col('category').str.starts_with('outstanding')).height:,}"
    )

    # Join with arcgis to get lat/lng for each school's postcode