Improve data pipeline

This commit is contained in:
Andras Schmelczer 2026-06-01 20:10:03 +01:00
parent e8345cbdc1
commit f99bd4e5c9
36 changed files with 966 additions and 129 deletions

View file

@@ -0,0 +1,82 @@
import polars as pl
from pipeline.transform.school_proximity import classify_good_plus_schools
def _school(phase, oeif, ungraded, postcode="AA1 1AA"):
return {
"Postcode": postcode,
"Ofsted phase": phase,
"Latest OEIF overall effectiveness": oeif,
"Ungraded inspection overall outcome": ungraded,
}
def _classify(rows):
    """Run the classifier over ``rows`` and summarise its output as a set.

    Args:
        rows: List of row dicts (as built by ``_school``) used to construct
            the input ``pl.DataFrame``.

    Returns:
        The set of ``(postcode, category)`` pairs read from the classifier's
        output frame. Because the result is a set, row order and duplicate
        output rows are not distinguished by the tests.
    """
    result = classify_good_plus_schools(pl.DataFrame(rows))
    return {(r["postcode"], r["category"]) for r in result.to_dicts()}
def test_legacy_oeif_grades_1_and_2_are_kept():
    """OEIF grades "1" and "2" yield outstanding/good categories per phase."""
    rows = [
        _school("Primary", "1", None, "AA1 1AA"),
        _school("Primary", "2", None, "AA1 1AB"),
        _school("Secondary", "1", None, "AA1 1AC"),
        _school("Secondary", "2", None, "AA1 1AD"),
    ]
    # Distinct postcodes let each expected pair be traced back to its row.
    assert _classify(rows) == {
        ("AA1 1AA", "outstanding_primary"),
        ("AA1 1AB", "good_primary"),
        ("AA1 1AC", "outstanding_secondary"),
        ("AA1 1AD", "good_secondary"),
    }
def test_grades_3_and_4_are_excluded():
    """Rows with OEIF grade "3" or "4" produce no classified output."""
    rows = [_school("Primary", "3", None), _school("Primary", "4", None)]
    assert _classify(rows) == set()
def test_ungraded_remains_good_is_recovered_when_no_graded_result():
    """Without a graded result, a "remains Good/Outstanding" outcome counts."""
    # Null and "Not judged" OEIF fall back to the ungraded outcome.
    rows = [
        _school("Primary", None, "School remains Good", "AA1 1AA"),
        _school("Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"),
        # "(Concerns)"/"(Improving)" variants are still good+.
        _school("Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"),
        _school(
            "Secondary",
            None,
            "School remains Outstanding (Concerns) - S5 Next",
            "AA1 1AD",
        ),
    ]
    assert _classify(rows) == {
        ("AA1 1AA", "good_primary"),
        ("AA1 1AB", "outstanding_secondary"),
        ("AA1 1AC", "good_primary"),
        ("AA1 1AD", "outstanding_secondary"),
    }
def test_ungraded_non_good_outcomes_are_excluded():
    """Ungraded outcomes other than "remains Good/Outstanding" are dropped."""
    rows = [
        _school("Primary", None, "Some aspects not as strong"),
        _school("Primary", None, "Standards maintained"),
        # No graded result and no ungraded outcome at all.
        _school("Primary", None, None),
    ]
    assert _classify(rows) == set()
def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
    """A graded "3" takes precedence over an ungraded "remains Good"."""
    # A real grade 3 must not be promoted by an ungraded "remains Good".
    rows = [_school("Primary", "3", "School remains Good")]
    assert _classify(rows) == set()
def test_non_primary_secondary_phases_excluded():
    """Phases other than Primary/Secondary are dropped even with grade 1 or 2."""
    rows = [
        _school("Nursery", "1", None),
        _school("Not applicable", "2", None),
    ]
    assert _classify(rows) == set()