Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@@ -0,0 +1,354 @@
import numpy as np
import polars as pl
from pipeline.transform.school_catchments import (
capacity_fill_radii,
children_per_postcode,
classify_good_plus_schools,
count_covering_catchments,
equilibrium_cutoffs,
phase_intakes,
school_preference_bonuses,
)
def _school(phase, oeif, ungraded, urn=100000):
    """Build one inspection-record row for the classification tests.

    The postcode is a fixed placeholder; every other column is taken from
    the arguments.
    """
    row = {"URN": urn, "Postcode": "AA1 1AA"}
    row["Ofsted phase"] = phase
    row["Latest OEIF overall effectiveness"] = oeif
    row["Ungraded inspection overall outcome"] = ungraded
    return row
def _classify(rows):
    """Classify *rows* and return the result as a set of (urn, category) pairs."""
    classified = classify_good_plus_schools(pl.DataFrame(rows))
    pairs = set()
    for record in classified.to_dicts():
        pairs.add((record["urn"], record["category"]))
    return pairs
def test_legacy_oeif_grades_1_and_2_are_kept():
    """OEIF grades 1 and 2 map to outstanding/good in the school's own phase."""
    # (phase, OEIF grade, urn, expected category)
    cases = [
        ("Primary", "1", 1, "outstanding_primary"),
        ("Primary", "2", 2, "good_primary"),
        ("Secondary", "1", 3, "outstanding_secondary"),
        ("Secondary", "2", 4, "good_secondary"),
    ]
    rows = [_school(phase, grade, None, urn) for phase, grade, urn, _ in cases]
    expected = {(urn, category) for _, _, urn, category in cases}

    assert _classify(rows) == expected
def test_grades_3_and_4_are_excluded():
    """OEIF grades 3 and 4 produce no good+ category at all."""
    rows = [_school("Primary", grade, None) for grade in ("3", "4")]
    nothing = set()
    assert _classify(rows) == nothing
def test_ungraded_remains_good_is_recovered_when_no_graded_result():
    """Without a graded result, the ungraded "remains ..." outcome decides."""
    # The OEIF value is null or "Not judged" in every row, so the category
    # has to come from the ungraded inspection outcome.
    null_oeif_primary = _school("Primary", None, "School remains Good", 1)
    not_judged_secondary = _school(
        "Secondary", "Not judged", "School remains Outstanding", 2
    )
    # An "(Improving)" suffix still counts as good+.
    improving_primary = _school(
        "Primary", None, "School remains Good (Improving) - S5 Next", 3
    )

    categories = _classify(
        [null_oeif_primary, not_judged_secondary, improving_primary]
    )

    assert categories == {
        (1, "good_primary"),
        (2, "outstanding_secondary"),
        (3, "good_primary"),
    }
def test_ungraded_concerns_are_not_good_plus():
    """Ungraded "(Concerns)" outcomes must not be classified as good+."""
    # "(Concerns)" flags issues that warrant an earlier re-inspection, so
    # neither the Good nor the Outstanding variant may count.
    concerns_primary = _school(
        "Primary", None, "School remains Good (Concerns) - S5 Next", 1
    )
    concerns_secondary = _school(
        "Secondary", None, "School remains Outstanding (Concerns) - S5 Next", 2
    )

    categories = _classify([concerns_primary, concerns_secondary])

    assert categories == set()
def test_ungraded_non_good_outcomes_are_excluded():
    """Ungraded outcomes other than "remains Good/Outstanding" yield nothing."""
    outcomes = ("Some aspects not as strong", "Standards maintained", None)
    rows = [_school("Primary", None, outcome) for outcome in outcomes]
    nothing = set()
    assert _classify(rows) == nothing
def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
    """A graded 3 wins over an ungraded "remains Good" on the same school."""
    # The ungraded outcome must not promote a school with a real grade 3.
    graded_three = _school("Primary", "3", "School remains Good")
    categories = _classify([graded_three])
    assert categories == set()
def test_non_primary_secondary_phases_excluded():
    """Good+ grades in phases other than Primary/Secondary are dropped."""
    nursery = _school("Nursery", "1", None)
    not_applicable = _school("Not applicable", "2", None)
    categories = _classify([nursery, not_applicable])
    assert categories == set()
def _aged_school(phase, oeif, low, high, urn=100000):
    """Build an inspection row that also carries the statutory age range.

    The ungraded outcome is always None and the postcode is a fixed
    placeholder; *low* and *high* fill the statutory lowest/highest age.
    """
    row = {"URN": urn, "Postcode": "AA1 1AA"}
    row["Ofsted phase"] = phase
    row["Latest OEIF overall effectiveness"] = oeif
    row["Ungraded inspection overall outcome"] = None
    row["Statutory lowest age"] = low
    row["Statutory highest age"] = high
    return row
def test_all_through_school_counts_toward_both_primary_and_secondary():
    """An age 3-18 school is counted in both the primary and secondary metric."""
    # Ofsted labels this school "Secondary", but its age range also covers
    # primary-age children, so both categories are expected.
    all_through = _aged_school("Secondary", "2", 3, 18, 1)

    categories = _classify([all_through])

    assert categories == {(1, "good_primary"), (1, "good_secondary")}
def test_age_ranges_assign_single_phase_for_standard_schools():
    """Standard age ranges give one phase; a 9-13 middle school gives both."""
    primary_only = _aged_school("Primary", "1", 4, 11, 1)
    secondary_only = _aged_school("Secondary", "2", 11, 16, 2)
    middle = _aged_school("Secondary", "1", 9, 13, 3)  # spans both phases

    categories = _classify([primary_only, secondary_only, middle])

    expected = {
        (1, "outstanding_primary"),
        (2, "good_secondary"),
        (3, "outstanding_primary"),
        (3, "outstanding_secondary"),
    }
    assert categories == expected
def test_closed_schools_excluded_when_open_register_given():
    """Schools whose URN is missing from ``open_urns`` are dropped."""
    listed_primary = _aged_school("Primary", "1", 4, 11, 111)
    unlisted_secondary = _aged_school("Secondary", "2", 11, 16, 222)
    frame = pl.DataFrame([listed_primary, unlisted_secondary])

    classified = classify_good_plus_schools(frame, open_urns={111})
    found = {(row["urn"], row["category"]) for row in classified.to_dicts()}

    # URN 222 is absent from the open register, so only URN 111 survives.
    assert found == {(111, "outstanding_primary")}
def _gias_row(
    urn,
    type_group="Academies",
    age_range="411",
    pupils=210,
    capacity=None,
    admissions_policy=None,
):
    """Build one school-register row for the phase-intake tests.

    The name is derived from *urn* and the coordinates are fixed; the
    remaining columns come straight from the keyword arguments.
    """
    return dict(
        urn=urn,
        name=f"School {urn}",
        lat=51.5,
        lng=-0.1,
        type_group=type_group,
        age_range=age_range,
        pupils=pupils,
        capacity=capacity,
        admissions_policy=admissions_policy,
    )
def test_phase_intakes_prorates_fill_target_over_weighted_cohorts():
    """The fill target is split between phases by weighted cohort counts.

    The expected values are prorated by fractional cohort weights (for
    example 5 / 6.2), so they are compared with a tight relative tolerance
    rather than exact float equality, which would depend on the order of
    operations inside ``phase_intakes``.
    """
    schools = pl.DataFrame(
        [
            # 4-11 = cohorts 4..10, all 7 primary: full fill target.
            _gias_row(1, age_range="411", pupils=210),
            # 11-16 = cohorts 11..15, all 5 secondary.
            _gias_row(2, age_range="1116", pupils=500),
            # 3-11 = cohorts 3..10; nursery year weighs 0.5, so primary
            # gets 7 of 7.5 cohort weights.
            _gias_row(3, age_range="311", pupils=240),
            # All-through 4-16 = cohorts 4..15: 7/12 primary, 5/12 secondary.
            _gias_row(4, age_range="416", pupils=1200),
            # 11-18 = cohorts 11..17; sixth-form years weigh 0.6 each, so
            # secondary gets 5 of 6.2 cohort weights.
            _gias_row(5, age_range="1118", pupils=1240),
        ]
    )

    intakes = phase_intakes(schools).sort("urn")

    # atol stays at its default of 0, so the expected zeros must be exact.
    np.testing.assert_allclose(
        intakes["primary_intake"].to_list(),
        [210.0, 0.0, 224.0, 700.0, 0.0],
        rtol=1e-9,
    )
    np.testing.assert_allclose(
        intakes["secondary_intake"].to_list(),
        [0.0, 500.0, 0.0, 500.0, 1000.0],
        rtol=1e-9,
    )
def test_phase_intakes_excludes_non_state_and_selective_schools():
    """Only URNs 5, 8 and 9 survive the exclusions exercised here."""
    rows = [
        _gias_row(1, type_group="Independent schools"),
        _gias_row(2, type_group="Special schools"),
        _gias_row(3, type_group="Welsh schools"),
        # Grammar school intakes are test-based and region-wide; a
        # distance catchment would be fabricated.
        _gias_row(4, admissions_policy="Selective"),
        # No pupil count: expected intake equals the capacity (300).
        _gias_row(5, pupils=None, capacity=300),
        _gias_row(6, pupils=None, capacity=None),  # no usable headcount
        _gias_row(7, age_range=None),  # no parsable cohorts
        # Over-full school keeps its demonstrated size (350, not 300).
        _gias_row(8, pupils=350, capacity=300),
        _gias_row(9, admissions_policy="Non-selective"),
    ]

    intakes = phase_intakes(pl.DataFrame(rows)).sort("urn")
    surviving_urns = intakes["urn"].to_list()
    primary = intakes["primary_intake"].to_list()

    assert surviving_urns == [5, 8, 9]
    assert primary == [300.0, 350.0, 210.0]
def test_school_preference_bonuses_follow_derived_grade():
    """Each school's bonus follows its derived grade, in km."""
    # (OEIF grade, ungraded outcome) per school; URNs are assigned 1..6.
    inspections = [
        ("1", None),
        ("2", None),
        ("3", None),
        ("4", None),
        (None, "Some aspects not as strong"),  # unrated
        ("Not judged", "School remains Good"),
    ]
    rows = [
        _school("Primary", oeif, ungraded, urn)
        for urn, (oeif, ungraded) in enumerate(inspections, start=1)
    ]

    table = school_preference_bonuses(
        pl.DataFrame(rows), bonus_outstanding_km=1.0, bonus_good_km=0.5
    )
    bonuses = {urn: bonus for urn, bonus in table.iter_rows()}

    expected = {1: 1.0, 2: 0.5, 3: -0.5, 4: -1.0, 5: 0.0, 6: 0.5}
    assert bonuses == expected
def test_children_per_postcode_prorates_bands_and_splits_lsoa_evenly():
    """Age bands are prorated per phase and shared evenly within an LSOA.

    The expected counts are sums of 0.2- and 0.8-weighted bands, which are
    not exactly representable in binary floating point (``0.2 * 30`` is
    already ``6.000000000000001``). They are therefore compared with a
    tight relative tolerance instead of exact equality.
    """
    postcodes = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "AA1 1AB", "BB2 2BB"],
            "lat": [51.5, 51.5, 52.0],
            "lng": [-0.1, -0.1, -0.2],
            "lsoa21cd": ["E01000001", "E01000001", "E01000002"],
        }
    )
    lsoa_children = pl.DataFrame(
        {
            "lsoa21": ["E01000001", "E01000002"],
            "aged_0_4": [100, 30],
            "aged_5_9": [100, 10],
            "aged_10_14": [100, 20],
            "aged_15_19": [100, 40],
        }
    )

    result = children_per_postcode(postcodes, lsoa_children).sort("postcode")

    # Primary 4-10 = 0.2*aged_0_4 + aged_5_9 + 0.2*aged_10_14: 140 split across
    # the LSOA's 2 postcodes; 20 for the single-postcode LSOA.
    np.testing.assert_allclose(
        result["primary_children"].to_list(), [70.0, 70.0, 20.0], rtol=1e-9
    )
    # Secondary 11-15 = 0.8*aged_10_14 + 0.2*aged_15_19: 100 split across 2; 24.
    np.testing.assert_allclose(
        result["secondary_children"].to_list(), [50.0, 50.0, 24.0], rtol=1e-9
    )
def test_equilibrium_cutoff_tightens_to_marginal_admitted_distance():
    """A full school's cutoff is the distance of the last admitted postcode."""
    # A single school with 10 places sits at the origin.
    school_xy = np.array([[0.0, 0.0]])
    places = np.array([10.0])
    bonus_km = np.array([0.0])
    # Postcodes at 1km, 2km and 3km hold 5 children each.
    postcode_xy = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]])
    children = np.array([5.0, 5.0, 5.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonus_km, postcode_xy, children, tau_km=0.0
    )

    # The two nearest postcodes exactly fill the school, so the cutoff lands
    # on the 2km postcode and the 3km one is shut out.
    assert cutoffs.tolist() == [2.0]
def test_equilibrium_rejected_demand_cascades_to_next_school():
    """Children turned away by a full school fall through to the next one."""
    # School A is at the origin and school B at 10km; each has 5 places.
    school_xy = np.array([[0.0, 0.0], [10.0, 0.0]])
    places = np.array([5.0, 5.0])
    bonus_km = np.array([0.0, 0.0])
    # P1 (1km) and P2 (1.5km) hold 5 children each and both prefer A.
    postcode_xy = np.array([[1.0, 0.0], [1.5, 0.0]])
    children = np.array([5.0, 5.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonus_km, postcode_xy, children, tau_km=0.0
    )
    cutoff_a, cutoff_b = cutoffs[0], cutoffs[1]

    # A fills with P1 and tightens to 1km, which pushes P2 out to B.
    assert cutoff_a == 1.0
    # B never exceeds its target, so its cutoff stays non-binding.
    assert np.isinf(cutoff_b)
def test_equilibrium_preference_bonus_steers_demand_to_better_school():
    """With equal distances, the preference bonus decides where demand goes."""
    # Both schools are 1km from the only postcode; school A carries a 0.5km
    # bonus for its better rating and school B carries none.
    school_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
    places = np.array([5.0, 5.0])
    bonus_km = np.array([0.5, 0.0])
    postcode_xy = np.array([[1.0, 0.0]])
    children = np.array([10.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonus_km, postcode_xy, children, tau_km=0.0
    )
    cutoff_a, cutoff_b = cutoffs[0], cutoffs[1]

    # Every child picks A, so A binds at the postcode; B attracts nobody.
    assert cutoff_a == 1.0
    assert np.isinf(cutoff_b)
def test_equilibrium_logit_choice_smears_demand_across_schools():
    """A positive temperature spreads one postcode's demand over both schools."""
    # Both schools are 1km from the single postcode and carry the same
    # (zero) bonus, so they have equal utility.
    school_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
    places = np.array([4.0, 4.0])
    bonus_km = np.array([0.0, 0.0])
    postcode_xy = np.array([[1.0, 0.0]])
    children = np.array([10.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonus_km, postcode_xy, children, tau_km=1.0
    )

    # Each school receives half of the 10 children, which exceeds both fill
    # targets of 4, so both cutoffs bind at the postcode's distance.
    assert cutoffs.tolist() == [1.0, 1.0]
def test_capacity_fill_radii_covers_fill_target_population():
    """The radius grows until enough children are covered, up to the cap."""
    # Two schools share the origin: one needs 6 children, the other 1000.
    school_xy = np.array([[0.0, 0.0], [0.0, 0.0]])
    children_needed = np.array([6.0, 1000.0])
    # Postcodes at 1km, 2km and 3km hold 5 children each.
    postcode_xy = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]])
    children = np.array([5.0, 5.0, 5.0])

    radii = capacity_fill_radii(
        school_xy, children_needed, postcode_xy, children, max_radius_km=25.0
    )

    # The first school passes its target of 6 at the 2km postcode. The second
    # needs more children than exist within the cap, so it keeps the cap.
    assert radii.tolist() == [2.0, 25.0]
def test_count_covering_catchments_respects_radius_and_validity():
    """Counts include only in-radius schools, and invalid postcodes get 0."""
    postcode_xy = np.array([[0.0, 0.0], [3.0, 0.0], [10.0, 0.0], [0.5, 0.0]])
    postcode_ok = np.array([True, True, True, False])
    schools = np.array([[0.0, 0.0], [2.0, 0.0]])
    reach = np.array([4.0, 1.5])

    # NOTE(review): the last positional argument (4) equals the number of
    # postcodes here; its exact meaning is not visible in this file.
    counts = count_covering_catchments(postcode_xy, postcode_ok, schools, reach, 4)

    # Postcode 0 is inside school 0 only (school 1 is 2km away > 1.5km radius);
    # postcode 1 is inside both; postcode 2 is inside neither; postcode 3 is
    # invalid, so it counts 0 despite its proximity.
    assert counts.tolist() == [1, 2, 0, 0]
def test_count_covering_catchments_empty_schools():
    """With no schools at all, every postcode is covered by zero catchments."""
    postcode_xy = np.zeros((2, 2))
    postcode_ok = np.array([True, True])
    no_schools = np.empty((0, 2))
    no_radii = np.empty(0)

    counts = count_covering_catchments(
        postcode_xy, postcode_ok, no_schools, no_radii, 2
    )

    assert counts.tolist() == [0, 0]