# perfect-postcode/pipeline/transform/test_school_catchments.py

import numpy as np
import polars as pl

from pipeline.transform.school_catchments import (
    capacity_fill_radii,
    children_per_postcode,
    classify_good_plus_schools,
    count_covering_catchments,
    equilibrium_cutoffs,
    phase_intakes,
    school_preference_bonuses,
)


def _school(phase, oeif, ungraded, urn=100000):
    """Build one raw school row for the classification tests.

    Only the phase, the graded OEIF result, the ungraded outcome and the URN
    vary between tests; the postcode is a fixed placeholder.
    """
    row = {"URN": urn, "Postcode": "AA1 1AA"}
    row["Ofsted phase"] = phase
    row["Latest OEIF overall effectiveness"] = oeif
    row["Ungraded inspection overall outcome"] = ungraded
    return row


def _classify(rows):
    """Run the classifier over ``rows`` and return its (urn, category) pairs.

    The rows are wrapped in a polars DataFrame before classification; the
    result is flattened to a set so tests can compare without caring about
    row order.
    """
    classified = classify_good_plus_schools(pl.DataFrame(rows))
    pairs = set()
    for record in classified.to_dicts():
        pairs.add((record["urn"], record["category"]))
    return pairs


def test_legacy_oeif_grades_1_and_2_are_kept():
    """Graded results "1" and "2" are kept for both primary and secondary."""
    # (phase, graded result, urn, expected category)
    cases = [
        ("Primary", "1", 1, "outstanding_primary"),
        ("Primary", "2", 2, "good_primary"),
        ("Secondary", "1", 3, "outstanding_secondary"),
        ("Secondary", "2", 4, "good_secondary"),
    ]
    rows = [_school(phase, grade, None, urn) for phase, grade, urn, _ in cases]
    expected = {(urn, category) for _, _, urn, category in cases}
    assert _classify(rows) == expected


def test_grades_3_and_4_are_excluded():
    """Graded results "3" and "4" produce no good+ classification."""
    rows = [_school("Primary", grade, None) for grade in ("3", "4")]
    classified = _classify(rows)
    assert classified == set()


def test_ungraded_remains_good_is_recovered_when_no_graded_result():
    """A missing graded result falls back to the ungraded "remains" outcome."""
    # (phase, graded result, ungraded outcome, urn, expected category)
    cases = [
        # A null graded result defers to the ungraded outcome ...
        ("Primary", None, "School remains Good", 1, "good_primary"),
        # ... and so does an explicit "Not judged".
        (
            "Secondary",
            "Not judged",
            "School remains Outstanding",
            2,
            "outstanding_secondary",
        ),
        # An "(Improving)" suffix still counts as good+.
        (
            "Primary",
            None,
            "School remains Good (Improving) - S5 Next",
            3,
            "good_primary",
        ),
    ]
    rows = [
        _school(phase, grade, outcome, urn)
        for phase, grade, outcome, urn, _ in cases
    ]
    expected = {(urn, category) for _, _, _, urn, category in cases}
    assert _classify(rows) == expected


def test_ungraded_concerns_are_not_good_plus():
    """ "(Concerns)" ungraded outcomes must not be counted as good+ schools.

    These outcomes signal issues warranting earlier re-inspection, so neither
    the "Good" nor the "Outstanding" variant should be classified.
    """
    concern_cases = (
        ("Primary", "School remains Good (Concerns) - S5 Next", 1),
        ("Secondary", "School remains Outstanding (Concerns) - S5 Next", 2),
    )
    rows = []
    for phase, outcome, urn in concern_cases:
        rows.append(_school(phase, None, outcome, urn))
    assert _classify(rows) == set()


def test_ungraded_non_good_outcomes_are_excluded():
    """Ungraded outcomes other than "remains Good/Outstanding" are dropped."""
    # The last entry covers a school with neither a graded nor an ungraded result.
    outcomes = ("Some aspects not as strong", "Standards maintained", None)
    rows = [_school("Primary", None, outcome) for outcome in outcomes]
    assert _classify(rows) == set()


def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
    """A real graded "3" wins over an ungraded "remains Good" outcome."""
    # The ungraded outcome must not promote a school that has a graded 3.
    graded_three = _school("Primary", "3", "School remains Good")
    assert _classify([graded_three]) == set()


def test_non_primary_secondary_phases_excluded():
    """Phases other than Primary/Secondary are dropped even with grades 1-2."""
    other_phases = (("Nursery", "1"), ("Not applicable", "2"))
    rows = [_school(phase, grade, None) for phase, grade in other_phases]
    assert _classify(rows) == set()


def _aged_school(phase, oeif, low, high, urn=100000):
    """Build one raw school row that also carries statutory age bounds.

    The ungraded outcome is always ``None`` and the postcode is a fixed
    placeholder; ``low`` and ``high`` fill the statutory lowest/highest age
    columns.
    """
    columns = (
        "URN",
        "Postcode",
        "Ofsted phase",
        "Latest OEIF overall effectiveness",
        "Ungraded inspection overall outcome",
        "Statutory lowest age",
        "Statutory highest age",
    )
    values = (urn, "AA1 1AA", phase, oeif, None, low, high)
    return dict(zip(columns, values))


def test_all_through_school_counts_toward_both_primary_and_secondary():
    """An all-through school is classified under both phases.

    A school covering ages 3-18 carries the "Secondary" Ofsted phase but also
    serves primary-age children, so it must appear in both metrics.
    """
    all_through = _aged_school("Secondary", "2", 3, 18, 1)
    expected = {(1, "good_primary"), (1, "good_secondary")}
    assert _classify([all_through]) == expected


def test_age_ranges_assign_single_phase_for_standard_schools():
    """Age bounds decide which phase(s) a school is classified under."""
    primary_only = _aged_school("Primary", "1", 4, 11, 1)
    secondary_only = _aged_school("Secondary", "2", 11, 16, 2)
    # A 9-13 middle school straddles both phases.
    middle = _aged_school("Secondary", "1", 9, 13, 3)

    expected = {
        (1, "outstanding_primary"),
        (2, "good_secondary"),
        (3, "outstanding_primary"),
        (3, "outstanding_secondary"),
    }
    assert _classify([primary_only, secondary_only, middle]) == expected


def test_closed_schools_excluded_when_open_register_given():
    """Schools whose URN is absent from ``open_urns`` are dropped."""
    frame = pl.DataFrame(
        [
            _aged_school("Primary", "1", 4, 11, 111),
            _aged_school("Secondary", "2", 11, 16, 222),
        ]
    )
    classified = classify_good_plus_schools(frame, open_urns={111})
    pairs = set()
    for record in classified.to_dicts():
        pairs.add((record["urn"], record["category"]))
    # Only URN 111 is on the open register; URN 222 must be gone.
    assert pairs == {(111, "outstanding_primary")}


def _gias_row(
    urn,
    type_group="Academies",
    age_range="4–11",
    pupils=210,
    capacity=None,
    admissions_policy=None,
):
    """Build one school row for the ``phase_intakes`` tests.

    The name is derived from the URN and the coordinates are fixed; every
    other field comes straight from the arguments, whose defaults describe a
    4–11 academy with 210 pupils and no capacity or admissions policy set.
    """
    return dict(
        urn=urn,
        name=f"School {urn}",
        lat=51.5,
        lng=-0.1,
        type_group=type_group,
        age_range=age_range,
        pupils=pupils,
        capacity=capacity,
        admissions_policy=admissions_policy,
    )


def test_phase_intakes_prorates_fill_target_over_weighted_cohorts():
    """Pupils are split between phases in proportion to weighted cohorts."""
    # (urn, age range, pupils)
    schools = [
        # Cohorts 4..10: seven primary years, so primary gets everything.
        (1, "4–11", 210),
        # Cohorts 11..15: five secondary years, so secondary gets everything.
        (2, "11–16", 500),
        # Cohorts 3..10: the nursery year weighs 0.5, leaving primary with
        # 7 of 7.5 cohort weights.
        (3, "3–11", 240),
        # All-through, cohorts 4..15: 7/12 primary and 5/12 secondary.
        (4, "4–16", 1200),
        # Cohorts 11..17: each sixth-form year weighs 0.6, leaving secondary
        # with 5 of 6.2 cohort weights.
        (5, "11–18", 1240),
    ]
    frame = pl.DataFrame(
        [
            _gias_row(urn, age_range=age_range, pupils=pupils)
            for urn, age_range, pupils in schools
        ]
    )
    intakes = phase_intakes(frame).sort("urn")

    expected_primary = [210.0, 0.0, 224.0, 700.0, 0.0]
    expected_secondary = [0.0, 500.0, 0.0, 500.0, 1000.0]
    assert intakes["primary_intake"].to_list() == expected_primary
    assert intakes["secondary_intake"].to_list() == expected_secondary


def test_phase_intakes_parses_one_sided_age_ranges():
    """One-sided age ranges are parsed rather than silently dropped.

    gias._format_age_range emits "up to {high}" and "{low}+" when a statutory
    age is missing; such schools must stay in the catchment supply instead of
    being lost to a two-number parse.
    """
    # "up to 11" is taken as cohorts 2..10: nursery years 2-3 weigh 0.5 each
    # and primary years 4..10 weigh 7, so primary receives 210 * 7/8.
    upper_bounded = _gias_row(1, age_range="up to 11", pupils=210)
    # "16+" is taken as cohorts 16..18, all sixth form: the school yields no
    # primary or secondary intake but must still parse without crashing.
    lower_bounded = _gias_row(2, age_range="16+", pupils=400)

    frame = pl.DataFrame([upper_bounded, lower_bounded])
    intakes = phase_intakes(frame).sort("urn")

    assert intakes["urn"].to_list() == [1, 2]
    assert intakes["primary_intake"].to_list() == [210.0 * 7 / 8, 0.0]
    assert intakes["secondary_intake"].to_list() == [0.0, 0.0]


def test_phase_intakes_excludes_non_state_and_selective_schools():
    """Only schools with a usable type, admissions policy and size survive."""
    rows = [
        # Dropped: excluded establishment type groups.
        _gias_row(1, type_group="Independent schools"),
        _gias_row(2, type_group="Special schools"),
        _gias_row(3, type_group="Welsh schools"),
        # Dropped: grammar school intakes are test-based and region-wide, so
        # a distance catchment would be fabricated.
        _gias_row(4, admissions_policy="Selective"),
        # Kept: no pupil count, so the capacity of 300 is used.
        _gias_row(5, pupils=None, capacity=300),
        # Dropped: neither pupils nor capacity gives a usable headcount.
        _gias_row(6, pupils=None, capacity=None),
        # Dropped: no age range means no parsable cohorts.
        _gias_row(7, age_range=None),
        # Kept: an over-full school keeps its demonstrated size of 350.
        _gias_row(8, pupils=350, capacity=300),
        # Kept: an explicit non-selective policy is fine.
        _gias_row(9, admissions_policy="Non-selective"),
    ]
    frame = pl.DataFrame(rows)
    intakes = phase_intakes(frame).sort("urn")

    assert intakes["urn"].to_list() == [5, 8, 9]
    assert intakes["primary_intake"].to_list() == [300.0, 350.0, 210.0]


def test_school_preference_bonuses_follow_derived_grade():
    """Each school's bonus follows its derived grade, scaled by the km inputs."""
    # (graded result, ungraded outcome, urn, expected bonus in km)
    cases = [
        ("1", None, 1, 1.0),
        ("2", None, 2, 0.5),
        ("3", None, 3, -0.5),
        ("4", None, 4, -1.0),
        # No usable grade at all: treated as unrated.
        (None, "Some aspects not as strong", 5, 0.0),
        # "Not judged" falls back to the ungraded "remains Good".
        ("Not judged", "School remains Good", 6, 0.5),
    ]
    frame = pl.DataFrame(
        [_school("Primary", grade, outcome, urn) for grade, outcome, urn, _ in cases]
    )
    bonus_frame = school_preference_bonuses(
        frame, bonus_outstanding_km=1.0, bonus_good_km=0.5
    )
    bonuses = dict(bonus_frame.iter_rows())
    assert bonuses == {1: 1.0, 2: 0.5, 3: -0.5, 4: -1.0, 5: 0.0, 6: 0.5}


def test_children_per_postcode_prorates_bands_and_splits_lsoa_evenly():
    """Age bands are prorated per phase and shared equally within an LSOA."""
    # Two postcodes share LSOA E01000001; E01000002 has a single postcode.
    postcode_frame = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "AA1 1AB", "BB2 2BB"],
            "lat": [51.5, 51.5, 52.0],
            "lng": [-0.1, -0.1, -0.2],
            "lsoa21cd": ["E01000001", "E01000001", "E01000002"],
        }
    )
    band_frame = pl.DataFrame(
        {
            "lsoa21": ["E01000001", "E01000002"],
            "aged_0_4": [100, 30],
            "aged_5_9": [100, 10],
            "aged_10_14": [100, 20],
            "aged_15_19": [100, 40],
        }
    )
    per_postcode = children_per_postcode(postcode_frame, band_frame)
    per_postcode = per_postcode.sort("postcode")

    # Primary (ages 4-10) = 0.2*aged_0_4 + aged_5_9 + 0.2*aged_10_14, giving
    # 140 shared by the two-postcode LSOA and 20 for the lone postcode.
    expected_primary = [70.0, 70.0, 20.0]
    assert per_postcode["primary_children"].to_list() == expected_primary
    # Secondary (ages 11-15) = 0.8*aged_10_14 + 0.2*aged_15_19, giving 100
    # shared by two postcodes and 24 for the lone postcode.
    expected_secondary = [50.0, 50.0, 24.0]
    assert per_postcode["secondary_children"].to_list() == expected_secondary


def test_equilibrium_cutoff_tightens_to_marginal_admitted_distance():
    """A full school's cutoff is the distance of its last admitted child."""
    # A single school with 10 places and no preference bonus.
    school_xy = np.array([[0.0, 0.0]])
    places = np.array([10.0])
    bonuses_km = np.array([0.0])
    # Postcodes at 1km, 2km and 3km with 5 children each: the nearest two
    # fill the school exactly, so the 3km postcode is shut out.
    postcode_xy = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]])
    children = np.array([5.0, 5.0, 5.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonuses_km, postcode_xy, children, tau_km=0.0
    )
    assert cutoffs.tolist() == [2.0]


def test_equilibrium_rejected_demand_cascades_to_next_school():
    """Children shut out of a full school fall through to the next one."""
    # School A sits at the origin and school B at 10km, 5 places each.
    school_xy = np.array([[0.0, 0.0], [10.0, 0.0]])
    places = np.array([5.0, 5.0])
    bonuses_km = np.array([0.0, 0.0])
    # P1 (1km) and P2 (1.5km) hold 5 children each and both prefer A. A fills
    # with P1 and tightens to 1km, which pushes P2 out to B.
    postcode_xy = np.array([[1.0, 0.0], [1.5, 0.0]])
    children = np.array([5.0, 5.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonuses_km, postcode_xy, children, tau_km=0.0
    )
    assert cutoffs[0] == 1.0
    # B never exceeds its target, so its cutoff stays unbounded.
    assert np.isinf(cutoffs[1])


def test_equilibrium_preference_bonus_steers_demand_to_better_school():
    """A preference bonus breaks a distance tie in favour of the better school."""
    # Both schools are 1km from the only postcode; school A carries a 0.5km
    # bonus, so every child picks it and B attracts nobody.
    school_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
    places = np.array([5.0, 5.0])
    bonuses_km = np.array([0.5, 0.0])
    postcode_xy = np.array([[1.0, 0.0]])
    children = np.array([10.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonuses_km, postcode_xy, children, tau_km=0.0
    )
    assert cutoffs[0] == 1.0
    assert np.isinf(cutoffs[1])


def test_equilibrium_logit_choice_smears_demand_across_schools():
    """With a positive temperature both schools receive applications."""
    # Two schools with 4 places each and no bonus, both 1km from the single
    # postcode holding 10 children.
    school_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
    places = np.array([4.0, 4.0])
    bonuses_km = np.array([0.0, 0.0])
    postcode_xy = np.array([[1.0, 0.0]])
    children = np.array([10.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonuses_km, postcode_xy, children, tau_km=1.0
    )
    # Equal distance and equal utility give each school half of the 10
    # children, which exceeds both fill targets, so both cutoffs bind at the
    # postcode's distance.
    assert cutoffs.tolist() == [1.0, 1.0]


def test_capacity_fill_radii_covers_fill_target_population():
    """The fill radius grows until enough children are covered, up to the cap."""
    # Two schools at the origin: one needs 6 children, the other far more
    # than exist within the cap.
    school_xy = np.array([[0.0, 0.0], [0.0, 0.0]])
    targets = np.array([6.0, 1000.0])
    # Postcodes at 1km, 2km and 3km with 5 children each.
    postcode_xy = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]])
    children = np.array([5.0, 5.0, 5.0])

    radii = capacity_fill_radii(
        school_xy, targets, postcode_xy, children, max_radius_km=25.0
    )
    # The first school passes its target of 6 at 2km; the second can never
    # be satisfied and keeps the 25km cap.
    assert radii.tolist() == [2.0, 25.0]


def test_count_covering_catchments_respects_radius_and_validity():
    """Counts honour each school's radius and skip invalid postcodes."""
    postcode_xy = np.array([[0.0, 0.0], [3.0, 0.0], [10.0, 0.0], [0.5, 0.0]])
    valid_mask = np.array([True, True, True, False])
    # School 0 at the origin with a 4km radius; school 1 at 2km with 1.5km.
    catchment_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
    catchment_radii = np.array([4.0, 1.5])

    counts = count_covering_catchments(
        postcode_xy, valid_mask, catchment_xy, catchment_radii, 4
    )
    # Postcode 0 lies inside school 0 only (school 1 is 2km away, beyond its
    # 1.5km radius); postcode 1 lies inside both; postcode 2 lies inside
    # neither; postcode 3 is invalid and scores 0 despite being close.
    assert counts.tolist() == [1, 2, 0, 0]


def test_count_covering_catchments_empty_schools():
    """With no schools at all, every postcode gets a count of zero."""
    postcode_xy = np.zeros((2, 2))
    valid_mask = np.array([True, True])
    no_school_xy = np.empty((0, 2))
    no_radii = np.empty(0)

    counts = count_covering_catchments(
        postcode_xy, valid_mask, no_school_xy, no_radii, 2
    )
    assert counts.tolist() == [0, 0]