perfect-postcode/pipeline/transform/test_school_catchments.py
Andras Schmelczer f59d01227b
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 15s
CI / Check (push) Failing after 1m58s
Split up
2026-06-12 21:51:37 +01:00

376 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import numpy as np
import polars as pl
from pipeline.transform.school_catchments import (
capacity_fill_radii,
children_per_postcode,
classify_good_plus_schools,
count_covering_catchments,
equilibrium_cutoffs,
phase_intakes,
school_preference_bonuses,
)
def _school(phase, oeif, ungraded, urn=100000):
    """Build one raw Ofsted-style school row for the classification tests.

    Args:
        phase: Value for the "Ofsted phase" column (e.g. "Primary").
        oeif: Value for "Latest OEIF overall effectiveness"; the tests pass
            grade strings such as "1", the text "Not judged", or None.
        ungraded: Value for "Ungraded inspection overall outcome", or None.
        urn: School identifier; defaults to a fixed placeholder so tests that
            do not care about identity can omit it.

    Returns:
        A dict keyed by the raw column names, always with the same postcode.
    """
    return {
        "URN": urn,
        "Postcode": "AA1 1AA",
        "Ofsted phase": phase,
        "Latest OEIF overall effectiveness": oeif,
        "Ungraded inspection overall outcome": ungraded,
    }
def _classify(rows):
    """Run ``classify_good_plus_schools`` on ``rows`` and return its pairs.

    Args:
        rows: List of row dicts (as produced by ``_school`` / ``_aged_school``).

    Returns:
        A set of ``(urn, category)`` tuples, so assertions do not depend on
        the row order of the classified frame.
    """
    result = classify_good_plus_schools(pl.DataFrame(rows))
    return {(r["urn"], r["category"]) for r in result.to_dicts()}
def test_legacy_oeif_grades_1_and_2_are_kept():
    """OEIF grade "1" is outstanding and "2" is good, for both phases."""
    rows = [
        _school("Primary", "1", None, 1),
        _school("Primary", "2", None, 2),
        _school("Secondary", "1", None, 3),
        _school("Secondary", "2", None, 4),
    ]
    assert _classify(rows) == {
        (1, "outstanding_primary"),
        (2, "good_primary"),
        (3, "outstanding_secondary"),
        (4, "good_secondary"),
    }
def test_grades_3_and_4_are_excluded():
    """OEIF grades "3" and "4" produce no good+ category at all."""
    rows = [_school("Primary", "3", None), _school("Primary", "4", None)]
    assert _classify(rows) == set()
def test_ungraded_remains_good_is_recovered_when_no_graded_result():
    """Without a usable OEIF grade, the ungraded outcome decides the category."""
    # Null and "Not judged" OEIF fall back to the ungraded outcome.
    rows = [
        _school("Primary", None, "School remains Good", 1),
        _school("Secondary", "Not judged", "School remains Outstanding", 2),
        # "(Improving)" is still good+ ...
        _school("Primary", None, "School remains Good (Improving) - S5 Next", 3),
    ]
    assert _classify(rows) == {
        (1, "good_primary"),
        (2, "outstanding_secondary"),
        (3, "good_primary"),
    }
def test_ungraded_concerns_are_not_good_plus():
    """Ungraded "(Concerns)" outcomes are dropped for both phases."""
    # "(Concerns)" outcomes signal issues warranting earlier re-inspection and
    # must NOT be counted as good+ schools.
    rows = [
        _school("Primary", None, "School remains Good (Concerns) - S5 Next", 1),
        _school(
            "Secondary",
            None,
            "School remains Outstanding (Concerns) - S5 Next",
            2,
        ),
    ]
    assert _classify(rows) == set()
def test_ungraded_non_good_outcomes_are_excluded():
    """Ungraded outcomes other than "remains Good/Outstanding" classify nothing.

    Also covers the row with neither a graded nor an ungraded result.
    """
    rows = [
        _school("Primary", None, "Some aspects not as strong"),
        _school("Primary", None, "Standards maintained"),
        _school("Primary", None, None),
    ]
    assert _classify(rows) == set()
def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
    """A graded result takes precedence over the ungraded outcome."""
    # A real grade 3 must not be promoted by an ungraded "remains Good".
    rows = [_school("Primary", "3", "School remains Good")]
    assert _classify(rows) == set()
def test_non_primary_secondary_phases_excluded():
    """Good+ grades on phases other than Primary/Secondary are ignored."""
    rows = [
        _school("Nursery", "1", None),
        _school("Not applicable", "2", None),
    ]
    assert _classify(rows) == set()
def _aged_school(phase, oeif, low, high, urn=100000):
    """Build a raw school row that also carries statutory age bounds.

    Same shape as the rows from ``_school`` (with the ungraded outcome fixed
    to None) plus the two statutory age columns.

    Args:
        phase: Value for the "Ofsted phase" column.
        oeif: Value for "Latest OEIF overall effectiveness".
        low: Value for "Statutory lowest age".
        high: Value for "Statutory highest age".
        urn: School identifier; defaults to a fixed placeholder.

    Returns:
        A dict keyed by the raw column names.
    """
    return {
        "URN": urn,
        "Postcode": "AA1 1AA",
        "Ofsted phase": phase,
        "Latest OEIF overall effectiveness": oeif,
        "Ungraded inspection overall outcome": None,
        "Statutory lowest age": low,
        "Statutory highest age": high,
    }
def test_all_through_school_counts_toward_both_primary_and_secondary():
    """A 3-18 school yields both a primary and a secondary category."""
    # An all-through school (age 3-18) is labelled "Secondary" by Ofsted phase
    # but serves primary-age children too, so it must count in BOTH metrics.
    rows = [_aged_school("Secondary", "2", 3, 18, 1)]
    assert _classify(rows) == {
        (1, "good_primary"),
        (1, "good_secondary"),
    }
def test_age_ranges_assign_single_phase_for_standard_schools():
    """Statutory ages decide the phase(s): 4-11 and 11-16 get one, 9-13 both."""
    rows = [
        _aged_school("Primary", "1", 4, 11, 1),  # primary only
        _aged_school("Secondary", "2", 11, 16, 2),  # secondary only
        _aged_school("Secondary", "1", 9, 13, 3),  # middle -> both
    ]
    assert _classify(rows) == {
        (1, "outstanding_primary"),
        (2, "good_secondary"),
        (3, "outstanding_primary"),
        (3, "outstanding_secondary"),
    }
def test_closed_schools_excluded_when_open_register_given():
    """Passing ``open_urns`` restricts the result to URNs in that set."""
    rows = [
        _aged_school("Primary", "1", 4, 11, 111),
        _aged_school("Secondary", "2", 11, 16, 222),
    ]
    result = classify_good_plus_schools(pl.DataFrame(rows), open_urns={111})
    pairs = {(r["urn"], r["category"]) for r in result.to_dicts()}
    # URN 222 is not in the open register, so it is dropped.
    assert pairs == {(111, "outstanding_primary")}
def _gias_row(
    urn,
    type_group="Academies",
    # NOTE(review): the low/high separator was lost in extraction (the default
    # read "411"); it is restored here as an en dash (U+2013), written as an
    # escape so it cannot be confused with an ASCII hyphen. Confirm against
    # gias._format_age_range.
    age_range="4\N{EN DASH}11",
    pupils=210,
    capacity=None,
    admissions_policy=None,
):
    """Build one GIAS-style school row for the ``phase_intakes`` tests.

    The defaults are the ones the tests rely on when they vary a single
    field: an academy for ages 4 to 11 with 210 pupils, no recorded capacity
    and no admissions policy.

    Args:
        urn: School identifier; also used to derive the ``name`` field.
        type_group: Value for the ``type_group`` column.
        age_range: Formatted age-range string, or None.
        pupils: Headcount, or None.
        capacity: Recorded capacity, or None.
        admissions_policy: Value for ``admissions_policy``, or None.

    Returns:
        A dict with fixed coordinates (lat 51.5, lng -0.1) and the fields above.
    """
    return {
        "urn": urn,
        "name": f"School {urn}",
        "lat": 51.5,
        "lng": -0.1,
        "type_group": type_group,
        "age_range": age_range,
        "pupils": pupils,
        "capacity": capacity,
        "admissions_policy": admissions_policy,
    }
def test_phase_intakes_prorates_fill_target_over_weighted_cohorts():
    """Each school's pupils are split between phases by weighted cohort share."""
    # NOTE(review): the age-range separator was lost in extraction ("411",
    # "1116", ...), which contradicts the per-row comments below. It is
    # restored as an en dash (U+2013) via an escape so it cannot be mistaken
    # for an ASCII hyphen. Confirm against gias._format_age_range.
    intakes = phase_intakes(
        pl.DataFrame(
            [
                # 4-11 = cohorts 4..10, all 7 primary: full fill target.
                _gias_row(1, age_range="4\N{EN DASH}11", pupils=210),
                # 11-16 = cohorts 11..15, all 5 secondary.
                _gias_row(2, age_range="11\N{EN DASH}16", pupils=500),
                # 3-11 = cohorts 3..10; nursery year weighs 0.5, so primary
                # gets 7 of 7.5 cohort weights.
                _gias_row(3, age_range="3\N{EN DASH}11", pupils=240),
                # All-through 4-16 = cohorts 4..15: 7/12 primary, 5/12 secondary.
                _gias_row(4, age_range="4\N{EN DASH}16", pupils=1200),
                # 11-18 = cohorts 11..17; sixth-form years weigh 0.6 each, so
                # secondary gets 5 of 6.2 cohort weights.
                _gias_row(5, age_range="11\N{EN DASH}18", pupils=1240),
            ]
        )
    ).sort("urn")
    assert intakes["primary_intake"].to_list() == [210.0, 0.0, 224.0, 700.0, 0.0]
    assert intakes["secondary_intake"].to_list() == [0.0, 500.0, 0.0, 500.0, 1000.0]
def test_phase_intakes_parses_one_sided_age_ranges():
    """One-sided age ranges are parsed rather than dropped.

    gias._format_age_range emits "up to {high}" and "{low}+" when a
    statutory age is missing; those schools must stay in the catchment supply
    instead of being silently dropped by a two-number parse.
    """
    intakes = phase_intakes(
        pl.DataFrame(
            [
                # "up to 11" = assumed cohorts 2..10: nursery years 2-3 weigh
                # 0.5 each, primary 4..10 weighs 7 -> primary 210 * 7/8.
                _gias_row(1, age_range="up to 11", pupils=210),
                # "16+" = assumed cohorts 16..18, all sixth form: no
                # primary/secondary intake, so the school contributes nothing
                # but must not crash the parse.
                _gias_row(2, age_range="16+", pupils=400),
            ]
        )
    ).sort("urn")
    # Both rows survive; the sixth-form-only school just has zero intake.
    assert intakes["urn"].to_list() == [1, 2]
    assert intakes["primary_intake"].to_list() == [210.0 * 7 / 8, 0.0]
    assert intakes["secondary_intake"].to_list() == [0.0, 0.0]
def test_phase_intakes_excludes_non_state_and_selective_schools():
    """Only rows 5, 8 and 9 survive the supply filters.

    Rows 1-4 are dropped by type group or admissions policy, row 6 has no
    headcount and row 7 no age range. Row 5 falls back to its capacity and
    row 8 keeps its (larger) pupil count.
    """
    intakes = phase_intakes(
        pl.DataFrame(
            [
                _gias_row(1, type_group="Independent schools"),
                _gias_row(2, type_group="Special schools"),
                _gias_row(3, type_group="Welsh schools"),
                # Grammar school intakes are test-based and region-wide; a
                # distance catchment would be fabricated.
                _gias_row(4, admissions_policy="Selective"),
                _gias_row(5, pupils=None, capacity=300),
                _gias_row(6, pupils=None, capacity=None),  # no usable headcount
                _gias_row(7, age_range=None),  # no parsable cohorts
                # Over-full school keeps its demonstrated size.
                _gias_row(8, pupils=350, capacity=300),
                _gias_row(9, admissions_policy="Non-selective"),
            ]
        )
    ).sort("urn")
    assert intakes["urn"].to_list() == [5, 8, 9]
    assert intakes["primary_intake"].to_list() == [300.0, 350.0, 210.0]
def test_school_preference_bonuses_follow_derived_grade():
    """Each URN's bonus matches its derived grade.

    With an outstanding bonus of 1.0 km and a good bonus of 0.5 km, the
    expected values are +1.0 / +0.5 for grades 1 / 2, -0.5 / -1.0 for grades
    3 / 4, 0.0 for an unrated school, and +0.5 for a school whose grade comes
    from the ungraded "remains Good" outcome.
    """
    # _school already returns a fresh dict per call, so no defensive copy
    # (``{**_school(...)}``) is needed before building the frame.
    rows = [
        _school("Primary", "1", None, 1),
        _school("Primary", "2", None, 2),
        _school("Primary", "3", None, 3),
        _school("Primary", "4", None, 4),
        _school("Primary", None, "Some aspects not as strong", 5),  # unrated
        _school("Primary", "Not judged", "School remains Good", 6),
    ]
    # iter_rows() yields 2-tuples here, which dict() turns into a mapping that
    # the assertion compares against {urn: bonus}.
    bonuses = dict(
        school_preference_bonuses(
            pl.DataFrame(rows), bonus_outstanding_km=1.0, bonus_good_km=0.5
        ).iter_rows()
    )
    assert bonuses == {1: 1.0, 2: 0.5, 3: -0.5, 4: -1.0, 5: 0.0, 6: 0.5}
def test_children_per_postcode_prorates_bands_and_splits_lsoa_evenly():
    """LSOA age bands are prorated to phase ages, then shared across postcodes."""
    # Two postcodes share LSOA E01000001; the third is alone in E01000002.
    postcodes = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "AA1 1AB", "BB2 2BB"],
            "lat": [51.5, 51.5, 52.0],
            "lng": [-0.1, -0.1, -0.2],
            "lsoa21cd": ["E01000001", "E01000001", "E01000002"],
        }
    )
    lsoa_children = pl.DataFrame(
        {
            "lsoa21": ["E01000001", "E01000002"],
            "aged_0_4": [100, 30],
            "aged_5_9": [100, 10],
            "aged_10_14": [100, 20],
            "aged_15_19": [100, 40],
        }
    )
    result = children_per_postcode(postcodes, lsoa_children).sort("postcode")
    # Primary 4-10 = 0.2*aged_0_4 + aged_5_9 + 0.2*aged_10_14: 140 split across
    # the LSOA's 2 postcodes; 20 for the single-postcode LSOA.
    assert result["primary_children"].to_list() == [70.0, 70.0, 20.0]
    # Secondary 11-15 = 0.8*aged_10_14 + 0.2*aged_15_19: 100 split across 2; 24.
    assert result["secondary_children"].to_list() == [50.0, 50.0, 24.0]
def test_equilibrium_cutoff_tightens_to_marginal_admitted_distance():
    """A school that fills exactly gets the last admitted distance as cutoff."""
    # One school with 10 places; postcodes at 1km, 2km and 3km with 5 children
    # each. The two nearest postcodes exactly fill it, so the cutoff is the
    # marginal admitted child's distance and the 3km postcode is shut out.
    cutoffs = equilibrium_cutoffs(
        np.array([[0.0, 0.0]]),
        np.array([10.0]),
        np.array([0.0]),
        np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]]),
        np.array([5.0, 5.0, 5.0]),
        tau_km=0.0,
    )
    assert cutoffs.tolist() == [2.0]
def test_equilibrium_rejected_demand_cascades_to_next_school():
    """Children rejected by a full school move on to the next one."""
    # School A (5 places) at the origin, school B (5 places) at 10km.
    # P1 (1km, 5 children) and P2 (1.5km, 5 children) both prefer A; A fills
    # with P1 and tightens its cutoff to 1km, pushing P2 out to B. B never
    # exceeds its target, so it keeps no binding cutoff.
    cutoffs = equilibrium_cutoffs(
        np.array([[0.0, 0.0], [10.0, 0.0]]),
        np.array([5.0, 5.0]),
        np.array([0.0, 0.0]),
        np.array([[1.0, 0.0], [1.5, 0.0]]),
        np.array([5.0, 5.0]),
        tau_km=0.0,
    )
    assert cutoffs[0] == 1.0
    # A non-binding cutoff is reported as infinity.
    assert np.isinf(cutoffs[1])
def test_equilibrium_preference_bonus_steers_demand_to_better_school():
    """With equal distances, the preference bonus decides which school is chosen."""
    # Two schools equidistant from the only postcode; school A is rated
    # better (0.5km bonus) so all children choose it; B attracts nobody.
    cutoffs = equilibrium_cutoffs(
        np.array([[0.0, 0.0], [2.0, 0.0]]),
        np.array([5.0, 5.0]),
        np.array([0.5, 0.0]),
        np.array([[1.0, 0.0]]),
        np.array([10.0]),
        tau_km=0.0,
    )
    # A is oversubscribed (10 children, 5 places), so its cutoff binds at the
    # postcode's 1km distance; B stays unbounded.
    assert cutoffs[0] == 1.0
    assert np.isinf(cutoffs[1])
def test_equilibrium_logit_choice_smears_demand_across_schools():
    """A positive ``tau_km`` spreads one postcode's demand over both schools."""
    # With a positive temperature some families prefer the further school, so
    # both schools receive applications: the near school still fills and keeps
    # a binding cutoff, and the far school now attracts mass it would never
    # see under deterministic choice.
    cutoffs = equilibrium_cutoffs(
        np.array([[0.0, 0.0], [2.0, 0.0]]),
        np.array([4.0, 4.0]),
        np.array([0.0, 0.0]),
        np.array([[1.0, 0.0]]),
        np.array([10.0]),
        tau_km=1.0,
    )
    # Each school gets half the 10 children (equidistant, equal utility),
    # exceeding both fill targets: both cutoffs bind at the postcode.
    assert cutoffs.tolist() == [1.0, 1.0]
def test_capacity_fill_radii_covers_fill_target_population():
    """The radius grows until the target is covered, capped at the maximum."""
    # Unfilled school needs 6 children: postcodes at 1km (5) and 2km (5)
    # cumulate past the target at 2km. A school needing more children than
    # exist within the cap keeps the cap.
    radii = capacity_fill_radii(
        np.array([[0.0, 0.0], [0.0, 0.0]]),
        np.array([6.0, 1000.0]),
        np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]]),
        np.array([5.0, 5.0, 5.0]),
        max_radius_km=25.0,
    )
    assert radii.tolist() == [2.0, 25.0]
def test_count_covering_catchments_respects_radius_and_validity():
    """Counts use per-school radii and are zero for invalid postcodes."""
    pc_xy = np.array([[0.0, 0.0], [3.0, 0.0], [10.0, 0.0], [0.5, 0.0]])
    pc_valid = np.array([True, True, True, False])
    school_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
    radii = np.array([4.0, 1.5])
    # NOTE(review): the trailing 4 presumably is the number of postcodes (it
    # equals len(pc_xy) and the length of the result) — confirm against the
    # function signature.
    counts = count_covering_catchments(pc_xy, pc_valid, school_xy, radii, 4)
    # pc0 is inside school 0 only (school 1 is 2km away > 1.5km radius);
    # pc1 inside both; pc2 inside neither; pc3 invalid -> 0 despite proximity.
    assert counts.tolist() == [1, 2, 0, 0]
def test_count_covering_catchments_empty_schools():
    """With no schools at all, every postcode gets a count of zero."""
    counts = count_covering_catchments(
        np.zeros((2, 2)), np.array([True, True]), np.empty((0, 2)), np.empty(0), 2
    )
    assert counts.tolist() == [0, 0]