Improve data pipeline
This commit is contained in:
parent
e8345cbdc1
commit
f99bd4e5c9
36 changed files with 966 additions and 129 deletions
|
|
@ -15,6 +15,66 @@ SCHOOL_GROUPS = {
|
|||
}
|
||||
|
||||
|
||||
def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
    """Tag good and outstanding primary/secondary schools with a proximity category.

    Only rows whose "Ofsted phase" is "Primary" or "Secondary" are considered.
    Each remaining school gets an internal grade ("1" = outstanding,
    "2" = good), resolved in this order:

    1. "Latest OEIF overall effectiveness" when it is already "1" or "2"
       (OEIF = the previous Ofsted Education Inspection Framework).
    2. Otherwise, and ONLY when that graded column is null or "Not judged",
       "Ungraded inspection overall outcome" is consulted: values starting
       with "School remains Outstanding" give "1" and values starting with
       "School remains Good" give "2". Prefix matching means suffixed
       variants such as "(Concerns)" / "(Improving)" are accepted too.

    Because the fallback is gated on the graded column being unusable, a real
    graded result of 3 or 4 is never replaced by an ungraded outcome. Schools
    for which no grade can be resolved are dropped.

    Returns a frame with two columns: ``postcode`` (taken from "Postcode") and
    ``category``, one of "outstanding_primary", "good_primary",
    "outstanding_secondary" or "good_secondary".
    """
    # Casting to Utf8 keeps the string comparisons valid even when a column
    # is entirely null and therefore comes back with a Null dtype.
    oeif_grade = pl.col("Latest OEIF overall effectiveness").cast(pl.Utf8, strict=False)
    ungraded_outcome = pl.col("Ungraded inspection overall outcome").cast(
        pl.Utf8, strict=False
    )

    # The ungraded outcome may only be used when no graded result exists.
    lacks_graded_result = oeif_grade.is_null() | (oeif_grade == "Not judged")
    stays_outstanding = lacks_graded_result & ungraded_outcome.str.starts_with(
        "School remains Outstanding"
    )
    stays_good = lacks_graded_result & ungraded_outcome.str.starts_with(
        "School remains Good"
    )

    resolved_grade = (
        pl.when(oeif_grade.is_in(["1", "2"]))
        .then(oeif_grade)
        .when(stays_outstanding)
        .then(pl.lit("1"))
        .when(stays_good)
        .then(pl.lit("2"))
        .otherwise(None)
        .alias("_ofsted_grade")
    )

    in_scope = ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
    with_grade = in_scope.with_columns(resolved_grade)
    good_plus = with_grade.filter(pl.col("_ofsted_grade").is_not_null())

    # Grade "1" maps to the outstanding_* label and anything else (i.e. "2")
    # to good_*; any phase other than "Primary" is labelled as secondary.
    is_primary = pl.col("Ofsted phase") == "Primary"
    is_outstanding = pl.col("_ofsted_grade") == "1"
    category = (
        pl.when(is_primary & is_outstanding)
        .then(pl.lit("outstanding_primary"))
        .when(is_primary)
        .then(pl.lit("good_primary"))
        .when(is_outstanding)
        .then(pl.lit("outstanding_secondary"))
        .otherwise(pl.lit("good_secondary"))
        .alias("category")
    )

    labelled = good_plus.with_columns(category)
    return labelled.select(
        pl.col("Postcode").alias("postcode"),
        pl.col("category"),
    )
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Count good+ and outstanding primary/secondary schools near each postcode"
|
||||
|
|
@ -30,42 +90,14 @@ def main():
|
|||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load Ofsted data: filter to good+ (1, 2) primary/secondary schools.
|
||||
# Post-2025 reform the single "Overall effectiveness" grade was retired;
|
||||
# the legacy 1–4 scale is now carried forward under "Latest OEIF overall
|
||||
# effectiveness" (OEIF = the previous Ofsted Education Inspection
|
||||
# Framework). The new report-card columns use text judgements instead.
|
||||
ofsted = pl.read_parquet(args.ofsted).filter(
|
||||
pl.col("Ofsted phase").is_in(["Primary", "Secondary"])
|
||||
& pl.col("Latest OEIF overall effectiveness").is_in(["1", "2"])
|
||||
)
|
||||
ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted))
|
||||
if ofsted.is_empty():
|
||||
raise ValueError("No good+ primary/secondary Ofsted schools found")
|
||||
|
||||
print(f"Good+ schools: {len(ofsted):,}")
|
||||
print(
|
||||
"Outstanding schools: "
|
||||
f"{ofsted.filter(pl.col('Latest OEIF overall effectiveness') == '1').height:,}"
|
||||
)
|
||||
|
||||
# Assign category based on phase and rating. Good+ groups include both
|
||||
# category variants; outstanding groups count grade 1 only.
|
||||
ofsted = ofsted.with_columns(
|
||||
pl.when(pl.col("Ofsted phase") == "Primary")
|
||||
.then(
|
||||
pl.when(pl.col("Latest OEIF overall effectiveness") == "1")
|
||||
.then(pl.lit("outstanding_primary"))
|
||||
.otherwise(pl.lit("good_primary"))
|
||||
)
|
||||
.otherwise(
|
||||
pl.when(pl.col("Latest OEIF overall effectiveness") == "1")
|
||||
.then(pl.lit("outstanding_secondary"))
|
||||
.otherwise(pl.lit("good_secondary"))
|
||||
)
|
||||
.alias("category")
|
||||
).select(
|
||||
pl.col("Postcode").alias("postcode"),
|
||||
"category",
|
||||
f"{ofsted.filter(pl.col('category').str.starts_with('outstanding')).height:,}"
|
||||
)
|
||||
|
||||
# Join with arcgis to get lat/lng for each school's postcode
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue