Fix data pipelines once and for all

2026-06-10 21:27:32 +01:00 · 2026-06-10 21:27:32 +01:00 · 4012e4e047
commit 4012e4e047
parent 08560476c5
46 changed files with 4508 additions and 855 deletions
--- a/pipeline/transform/test_school_catchments.py
+++ b/pipeline/transform/test_school_catchments.py
@ -0,0 +1,354 @@
+import numpy as np
+import polars as pl
+
+from pipeline.transform.school_catchments import (
+    capacity_fill_radii,
+    children_per_postcode,
+    classify_good_plus_schools,
+    count_covering_catchments,
+    equilibrium_cutoffs,
+    phase_intakes,
+    school_preference_bonuses,
+)
+
+
+def _school(phase, oeif, ungraded, urn=100000):
+    """Build one school row in the column layout the classification tests use.
+
+    Args:
+        phase: Value for the "Ofsted phase" column.
+        oeif: Value for "Latest OEIF overall effectiveness" (may be None).
+        ungraded: Value for "Ungraded inspection overall outcome" (may be None).
+        urn: School identifier; defaults to 100000.
+
+    Returns:
+        A dict with a fixed "AA1 1AA" postcode plus the supplied values.
+    """
+    row = {"URN": urn, "Postcode": "AA1 1AA"}
+    row["Ofsted phase"] = phase
+    row["Latest OEIF overall effectiveness"] = oeif
+    row["Ungraded inspection overall outcome"] = ungraded
+    return row
+
+
+def _classify(rows):
+    """Run the classifier on ``rows`` and return its (urn, category) pairs as a set."""
+    classified = classify_good_plus_schools(pl.DataFrame(rows))
+    pairs = set()
+    for record in classified.to_dicts():
+        pairs.add((record["urn"], record["category"]))
+    return pairs
+
+
+def test_legacy_oeif_grades_1_and_2_are_kept():
+    """OEIF grades "1" and "2" are kept as outstanding/good in both phases."""
+    # (phase, OEIF grade, urn, expected category)
+    cases = [
+        ("Primary", "1", 1, "outstanding_primary"),
+        ("Primary", "2", 2, "good_primary"),
+        ("Secondary", "1", 3, "outstanding_secondary"),
+        ("Secondary", "2", 4, "good_secondary"),
+    ]
+    rows = [_school(phase, grade, None, urn) for phase, grade, urn, _ in cases]
+    expected = {(urn, category) for _, _, urn, category in cases}
+    assert _classify(rows) == expected
+
+
+def test_grades_3_and_4_are_excluded():
+    """OEIF grades "3" and "4" produce no good+ classification at all."""
+    rows = [_school("Primary", grade, None) for grade in ("3", "4")]
+    found = _classify(rows)
+    assert found == set()
+
+
+def test_ungraded_remains_good_is_recovered_when_no_graded_result():
+    """A null or "Not judged" OEIF grade falls back to the ungraded outcome."""
+    remains_good = _school("Primary", None, "School remains Good", 1)
+    remains_outstanding = _school(
+        "Secondary", "Not judged", "School remains Outstanding", 2
+    )
+    # An "(Improving)" suffix still counts as good+.
+    improving = _school(
+        "Primary", None, "School remains Good (Improving) - S5 Next", 3
+    )
+    found = _classify([remains_good, remains_outstanding, improving])
+    expected = {
+        (1, "good_primary"),
+        (2, "outstanding_secondary"),
+        (3, "good_primary"),
+    }
+    assert found == expected
+
+
+def test_ungraded_concerns_are_not_good_plus():
+    """Ungraded "(Concerns)" outcomes never count as good+.
+
+    Such outcomes signal issues warranting earlier re-inspection, so neither
+    the "Good" nor the "Outstanding" variant may be classified.
+    """
+    outcomes = {
+        ("Primary", 1): "School remains Good (Concerns) - S5 Next",
+        ("Secondary", 2): "School remains Outstanding (Concerns) - S5 Next",
+    }
+    rows = [
+        _school(phase, None, outcome, urn)
+        for (phase, urn), outcome in outcomes.items()
+    ]
+    assert _classify(rows) == set()
+
+
+def test_ungraded_non_good_outcomes_are_excluded():
+    """Ungraded outcomes that do not say "remains Good/Outstanding" are dropped."""
+    outcomes = ("Some aspects not as strong", "Standards maintained", None)
+    rows = [_school("Primary", None, outcome) for outcome in outcomes]
+    found = _classify(rows)
+    assert found == set()
+
+
+def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
+    """A real grade 3 is not promoted by an ungraded "remains Good" outcome."""
+    graded_three = _school("Primary", "3", "School remains Good")
+    found = _classify([graded_three])
+    assert found == set()
+
+
+def test_non_primary_secondary_phases_excluded():
+    """Phases other than Primary/Secondary are ignored even with good grades."""
+    phase_and_grade = [("Nursery", "1"), ("Not applicable", "2")]
+    rows = [_school(phase, grade, None) for phase, grade in phase_and_grade]
+    assert _classify(rows) == set()
+
+
+def _aged_school(phase, oeif, low, high, urn=100000):
+    """Build a school row that also carries the statutory age range.
+
+    Args:
+        phase: Value for the "Ofsted phase" column.
+        oeif: Value for "Latest OEIF overall effectiveness".
+        low: Value for "Statutory lowest age".
+        high: Value for "Statutory highest age".
+        urn: School identifier; defaults to 100000.
+
+    Returns:
+        A dict with a fixed "AA1 1AA" postcode and a None ungraded outcome.
+    """
+    row = {"URN": urn, "Postcode": "AA1 1AA", "Ofsted phase": phase}
+    row.update(
+        {
+            "Latest OEIF overall effectiveness": oeif,
+            "Ungraded inspection overall outcome": None,
+            "Statutory lowest age": low,
+            "Statutory highest age": high,
+        }
+    )
+    return row
+
+
+def test_all_through_school_counts_toward_both_primary_and_secondary():
+    """An all-through school is counted in both the primary and secondary metric.
+
+    Ofsted labels the age 3-18 school "Secondary", but it serves primary-age
+    children too, so one URN must yield two categories.
+    """
+    all_through = _aged_school("Secondary", "2", 3, 18, 1)
+    expected = {(1, category) for category in ("good_primary", "good_secondary")}
+    assert _classify([all_through]) == expected
+
+
+def test_age_ranges_assign_single_phase_for_standard_schools():
+    """Statutory age ranges decide which phase(s) each school is counted in."""
+    primary_only = _aged_school("Primary", "1", 4, 11, 1)
+    secondary_only = _aged_school("Secondary", "2", 11, 16, 2)
+    # A 9-13 middle school straddles the phases and is counted in both.
+    middle = _aged_school("Secondary", "1", 9, 13, 3)
+    found = _classify([primary_only, secondary_only, middle])
+    expected = {
+        (1, "outstanding_primary"),
+        (2, "good_secondary"),
+        (3, "outstanding_primary"),
+        (3, "outstanding_secondary"),
+    }
+    assert found == expected
+
+
+def test_closed_schools_excluded_when_open_register_given():
+    """Schools missing from ``open_urns`` are dropped from the classification."""
+    frame = pl.DataFrame(
+        [
+            _aged_school("Primary", "1", 4, 11, 111),
+            _aged_school("Secondary", "2", 11, 16, 222),
+        ]
+    )
+    classified = classify_good_plus_schools(frame, open_urns={111})
+    kept = set()
+    for record in classified.to_dicts():
+        kept.add((record["urn"], record["category"]))
+    # URN 222 is absent from the open register, so only URN 111 survives.
+    assert kept == {(111, "outstanding_primary")}
+
+
+def _gias_row(
+    urn,
+    type_group="Academies",
+    age_range="4–11",
+    pupils=210,
+    capacity=None,
+    admissions_policy=None,
+):
+    """Build one school record for the ``phase_intakes`` tests.
+
+    The name is derived from ``urn`` and the location is fixed at
+    (51.5, -0.1); every other field comes straight from the arguments, whose
+    defaults describe a 4–11 academy with 210 pupils.
+    """
+    row = {"urn": urn, "name": f"School {urn}", "lat": 51.5, "lng": -0.1}
+    row["type_group"] = type_group
+    row["age_range"] = age_range
+    row["pupils"] = pupils
+    row["capacity"] = capacity
+    row["admissions_policy"] = admissions_policy
+    return row
+
+
+def test_phase_intakes_prorates_fill_target_over_weighted_cohorts():
+    """Each school's headcount is split by phase in proportion to cohort weights.
+
+    Expected intakes are compared with a tight tolerance rather than ``==``:
+    the prorated figures (e.g. 240 * 7 / 7.5 or 1240 * 5 / 6.2) come out of
+    float division, so their last bits depend on the order of operations
+    inside ``phase_intakes`` and exact equality would be brittle.
+    """
+    schools = pl.DataFrame(
+        [
+            # 4-11 = cohorts 4..10, all 7 primary: full fill target.
+            _gias_row(1, age_range="4–11", pupils=210),
+            # 11-16 = cohorts 11..15, all 5 secondary.
+            _gias_row(2, age_range="11–16", pupils=500),
+            # 3-11 = cohorts 3..10; nursery year weighs 0.5, so primary
+            # gets 7 of 7.5 cohort weights.
+            _gias_row(3, age_range="3–11", pupils=240),
+            # All-through 4-16 = cohorts 4..15: 7/12 primary, 5/12 secondary.
+            _gias_row(4, age_range="4–16", pupils=1200),
+            # 11-18 = cohorts 11..17; sixth-form years weigh 0.6 each, so
+            # secondary gets 5 of 6.2 cohort weights.
+            _gias_row(5, age_range="11–18", pupils=1240),
+        ]
+    )
+    intakes = phase_intakes(schools).sort("urn")
+    # atol covers the exact-zero entries, where a relative tolerance is void.
+    np.testing.assert_allclose(
+        intakes["primary_intake"].to_list(),
+        [210.0, 0.0, 224.0, 700.0, 0.0],
+        rtol=1e-9,
+        atol=1e-9,
+    )
+    np.testing.assert_allclose(
+        intakes["secondary_intake"].to_list(),
+        [0.0, 500.0, 0.0, 500.0, 1000.0],
+        rtol=1e-9,
+        atol=1e-9,
+    )
+
+
+def test_phase_intakes_excludes_non_state_and_selective_schools():
+    """Only schools with a usable catchment and headcount get an intake row.
+
+    Of the nine rows below, just URNs 5, 8 and 9 are expected back.
+    """
+    rows = [
+        # Dropped on type group alone.
+        _gias_row(1, type_group="Independent schools"),
+        _gias_row(2, type_group="Special schools"),
+        _gias_row(3, type_group="Welsh schools"),
+        # Grammar school intakes are test-based and region-wide; a
+        # distance catchment would be fabricated.
+        _gias_row(4, admissions_policy="Selective"),
+        # No pupil count: the 300 capacity is expected as the intake.
+        _gias_row(5, pupils=None, capacity=300),
+        # No usable headcount at all.
+        _gias_row(6, pupils=None, capacity=None),
+        # No parsable cohorts.
+        _gias_row(7, age_range=None),
+        # Over-full school keeps its demonstrated size.
+        _gias_row(8, pupils=350, capacity=300),
+        _gias_row(9, admissions_policy="Non-selective"),
+    ]
+    kept = phase_intakes(pl.DataFrame(rows)).sort("urn")
+    surviving_urns = kept["urn"].to_list()
+    primary = kept["primary_intake"].to_list()
+    assert surviving_urns == [5, 8, 9]
+    assert primary == [300.0, 350.0, 210.0]
+
+
+def test_school_preference_bonuses_follow_derived_grade():
+    """Each URN's bonus follows the grade derived from its inspection data.
+
+    With a 1.0km outstanding bonus and a 0.5km good bonus, grades 1-4 are
+    expected to map to 1.0, 0.5, -0.5 and -1.0, an unrated school to 0.0, and
+    a "Not judged" school that "remains Good" to the good bonus.
+    """
+    # (OEIF grade, ungraded outcome); URNs are assigned 1..6 in this order.
+    inspections = [
+        ("1", None),
+        ("2", None),
+        ("3", None),
+        ("4", None),
+        (None, "Some aspects not as strong"),  # unrated
+        ("Not judged", "School remains Good"),
+    ]
+    rows = [
+        _school("Primary", oeif, ungraded, urn)
+        for urn, (oeif, ungraded) in enumerate(inspections, start=1)
+    ]
+    bonus_frame = school_preference_bonuses(
+        pl.DataFrame(rows), bonus_outstanding_km=1.0, bonus_good_km=0.5
+    )
+    bonuses = dict(bonus_frame.iter_rows())
+    assert bonuses == {1: 1.0, 2: 0.5, 3: -0.5, 4: -1.0, 5: 0.0, 6: 0.5}
+
+
+def test_children_per_postcode_prorates_bands_and_splits_lsoa_evenly():
+    """LSOA age bands are prorated to phases, then split evenly over postcodes.
+
+    The expected counts are built from fractional band weights (0.2 and 0.8),
+    so they are compared with a tight tolerance instead of ``==`` to stay
+    independent of float rounding inside ``children_per_postcode``.
+    """
+    postcodes = pl.DataFrame(
+        {
+            "postcode": ["AA1 1AA", "AA1 1AB", "BB2 2BB"],
+            "lat": [51.5, 51.5, 52.0],
+            "lng": [-0.1, -0.1, -0.2],
+            "lsoa21cd": ["E01000001", "E01000001", "E01000002"],
+        }
+    )
+    lsoa_children = pl.DataFrame(
+        {
+            "lsoa21": ["E01000001", "E01000002"],
+            "aged_0_4": [100, 30],
+            "aged_5_9": [100, 10],
+            "aged_10_14": [100, 20],
+            "aged_15_19": [100, 40],
+        }
+    )
+    result = children_per_postcode(postcodes, lsoa_children).sort("postcode")
+    # Primary 4-10 = 0.2*aged_0_4 + aged_5_9 + 0.2*aged_10_14: 140 split across
+    # the LSOA's 2 postcodes; 20 for the single-postcode LSOA.
+    np.testing.assert_allclose(
+        result["primary_children"].to_list(),
+        [70.0, 70.0, 20.0],
+        rtol=1e-9,
+        atol=1e-9,
+    )
+    # Secondary 11-15 = 0.8*aged_10_14 + 0.2*aged_15_19: 100 split across 2; 24.
+    np.testing.assert_allclose(
+        result["secondary_children"].to_list(),
+        [50.0, 50.0, 24.0],
+        rtol=1e-9,
+        atol=1e-9,
+    )
+
+
+def test_equilibrium_cutoff_tightens_to_marginal_admitted_distance():
+    """A full school's cutoff is the distance of the last admitted postcode.
+
+    One school with 10 places sits at the origin; postcodes 1km, 2km and 3km
+    away hold 5 children each. The two nearest fill the school exactly, so
+    the cutoff is 2km and the 3km postcode is shut out.
+    """
+    school_xy = np.array([[0.0, 0.0]])
+    places = np.array([10.0])
+    bonus_km = np.array([0.0])
+    postcode_xy = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]])
+    children = np.array([5.0, 5.0, 5.0])
+    cutoffs = equilibrium_cutoffs(
+        school_xy, places, bonus_km, postcode_xy, children, tau_km=0.0
+    )
+    assert cutoffs.tolist() == [2.0]
+
+
+def test_equilibrium_rejected_demand_cascades_to_next_school():
+    """Children rejected by a full school fall through to the next one.
+
+    School A (5 places) is at the origin and school B (5 places) at 10km.
+    P1 (1km, 5 children) and P2 (1.5km, 5 children) both prefer A. A fills
+    with P1 and tightens its cutoff to 1km, pushing P2 out to B, which never
+    exceeds its target and therefore keeps an infinite (non-binding) cutoff.
+    """
+    school_xy = np.array([[0.0, 0.0], [10.0, 0.0]])
+    places = np.array([5.0, 5.0])
+    bonus_km = np.array([0.0, 0.0])
+    postcode_xy = np.array([[1.0, 0.0], [1.5, 0.0]])
+    children = np.array([5.0, 5.0])
+    cutoffs = equilibrium_cutoffs(
+        school_xy, places, bonus_km, postcode_xy, children, tau_km=0.0
+    )
+    assert cutoffs[0] == 1.0
+    assert np.isinf(cutoffs[1])
+
+
+def test_equilibrium_preference_bonus_steers_demand_to_better_school():
+    """A preference bonus breaks the tie between two equidistant schools.
+
+    Both schools are 1km from the only postcode, but school A carries a
+    0.5km bonus, so all children choose it: A's cutoff binds at 1km while
+    B keeps an infinite (non-binding) cutoff.
+    """
+    school_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
+    places = np.array([5.0, 5.0])
+    bonus_km = np.array([0.5, 0.0])
+    postcode_xy = np.array([[1.0, 0.0]])
+    children = np.array([10.0])
+    cutoffs = equilibrium_cutoffs(
+        school_xy, places, bonus_km, postcode_xy, children, tau_km=0.0
+    )
+    assert cutoffs[0] == 1.0
+    assert np.isinf(cutoffs[1])
+
+
+def test_equilibrium_logit_choice_smears_demand_across_schools():
+    """A positive temperature spreads one postcode's demand over both schools.
+
+    The two schools are equidistant (1km) from the only postcode and carry
+    equal bonuses, so with ``tau_km=1.0`` each is expected to attract half of
+    the 10 children. Five applicants exceed each school's 4-place fill
+    target, so both cutoffs bind at the postcode's 1km distance.
+    """
+    school_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
+    places = np.array([4.0, 4.0])
+    bonus_km = np.array([0.0, 0.0])
+    postcode_xy = np.array([[1.0, 0.0]])
+    children = np.array([10.0])
+    cutoffs = equilibrium_cutoffs(
+        school_xy, places, bonus_km, postcode_xy, children, tau_km=1.0
+    )
+    assert cutoffs.tolist() == [1.0, 1.0]
+
+
+def test_capacity_fill_radii_covers_fill_target_population():
+    """The fill radius grows until enough children are covered, up to the cap.
+
+    Two schools share the origin. The first needs 6 children: postcodes at
+    1km (5) and 2km (5) only cumulate past that target at 2km. The second
+    needs more children than exist within the cap, so it keeps the 25km cap.
+    """
+    school_xy = np.array([[0.0, 0.0], [0.0, 0.0]])
+    fill_targets = np.array([6.0, 1000.0])
+    postcode_xy = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]])
+    children = np.array([5.0, 5.0, 5.0])
+    radii = capacity_fill_radii(
+        school_xy, fill_targets, postcode_xy, children, max_radius_km=25.0
+    )
+    assert radii.tolist() == [2.0, 25.0]
+
+
+def test_count_covering_catchments_respects_radius_and_validity():
+    """Only valid postcodes inside a school's radius add to the count."""
+    schools = np.array([[0.0, 0.0], [2.0, 0.0]])
+    reach = np.array([4.0, 1.5])
+    # (postcode xy, valid flag, expected number of covering schools)
+    cases = [
+        # Inside school 0 only (school 1 is 2km away > 1.5km radius).
+        ((0.0, 0.0), True, 1),
+        # Inside both schools.
+        ((3.0, 0.0), True, 2),
+        # Inside neither.
+        ((10.0, 0.0), True, 0),
+        # Invalid postcode: 0 despite its proximity.
+        ((0.5, 0.0), False, 0),
+    ]
+    postcodes = np.array([xy for xy, _, _ in cases])
+    valid = np.array([flag for _, flag, _ in cases])
+    # NOTE(review): the trailing 4 presumably is the postcode count - confirm.
+    counts = count_covering_catchments(postcodes, valid, schools, reach, 4)
+    assert counts.tolist() == [want for _, _, want in cases]
+
+
+def test_count_covering_catchments_empty_schools():
+    """With no schools at all, every postcode gets a count of zero."""
+    postcodes = np.zeros((2, 2))
+    all_valid = np.array([True, True])
+    no_schools = np.empty((0, 2))
+    no_radii = np.empty(0)
+    counts = count_covering_catchments(
+        postcodes, all_valid, no_schools, no_radii, 2
+    )
+    assert counts.tolist() == [0, 0]