Fix data pipelines once and for all

2026-06-10 21:27:32 +01:00 · 2026-06-10 21:27:32 +01:00 · 4012e4e047
commit 4012e4e047
parent 08560476c5
46 changed files with 4508 additions and 855 deletions
--- a/pipeline/transform/join_epc_pp.py
+++ b/pipeline/transform/join_epc_pp.py
@ -109,6 +109,27 @@ def _clean_number(column: str, dtype: pl.DataType) -> pl.Expr:
    return _clean_string(column).cast(dtype, strict=False)


+def _join_address_parts(*columns: str) -> pl.Expr:
+    """Join address components into one display address, single-spaced.
+
+    Price-paid SAON/PAON/STREET are EMPTY STRINGS (not null) when absent —
+    saon is "" on ~88% of rows — and ``concat_str(..., ignore_nulls=True)``
+    skips only nulls, so empty components still contributed their separator
+    (``' 10 PALACE GREEN'``, doubled spaces when a middle part was empty).
+    Convert ``''``→null per component so ignore_nulls works as intended, then
+    defensively collapse residual whitespace runs and strip the result. A
+    fully-empty address becomes null (dropped by the downstream
+    ``pp_address.is_not_null()`` filter) instead of whitespace junk.
+    """
+    joined = pl.concat_str(
+        [_clean_string(column) for column in columns],
+        separator=" ",
+        ignore_nulls=True,
+    )
+    cleaned = joined.str.replace_all(r"\s+", " ").str.strip_chars()
+    return pl.when(cleaned == "").then(None).otherwise(cleaned)
+
+
 def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
    return (
        raw.select(
@ -436,11 +457,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
        )
        .filter(pl.col("pp_property_type") != "Other")
        .with_columns(
-            pl.concat_str(
-                [pl.col("saon"), pl.col("paon"), pl.col("street")],
-                separator=" ",
-                ignore_nulls=True,
-            ).alias("pp_address"),
+            _join_address_parts("saon", "paon", "street").alias("pp_address"),
        )
        .with_columns(
            normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -102,15 +102,11 @@ _AREA_COLUMNS = [
    # is postcode-grain: it belongs in the area output (one value per postcode,
    # covering property-less postcodes too) rather than duplicated per property.
    TREE_DENSITY_FEATURE,
-    # Schools
-    "Good+ primary schools within 5km",
-    "Good+ secondary schools within 5km",
-    "Good+ primary schools within 2km",
-    "Good+ secondary schools within 2km",
-    "Outstanding primary schools within 5km",
-    "Outstanding secondary schools within 5km",
-    "Outstanding primary schools within 2km",
-    "Outstanding secondary schools within 2km",
+    # Schools (modelled historical catchment areas covering the postcode)
+    "Good+ primary school catchments",
+    "Good+ secondary school catchments",
+    "Outstanding primary school catchments",
+    "Outstanding secondary school catchments",
    # Demographics
    "Median age",
    # Politics
@ -172,14 +168,10 @@ _FINAL_RENAME_COLUMNS = {
    "latest_price": "Last known price",
    "number_habitable_rooms": "Number of bedrooms & living rooms",
    "noise_lden_db": "Noise (dB)",
-    "good_primary_5km": "Good+ primary schools within 5km",
-    "good_secondary_5km": "Good+ secondary schools within 5km",
-    "good_primary_2km": "Good+ primary schools within 2km",
-    "good_secondary_2km": "Good+ secondary schools within 2km",
-    "outstanding_primary_5km": "Outstanding primary schools within 5km",
-    "outstanding_secondary_5km": "Outstanding secondary schools within 5km",
-    "outstanding_primary_2km": "Outstanding primary schools within 2km",
-    "outstanding_secondary_2km": "Outstanding secondary schools within 2km",
+    "good_primary_catchments": "Good+ primary school catchments",
+    "good_secondary_catchments": "Good+ secondary school catchments",
+    "outstanding_primary_catchments": "Outstanding primary school catchments",
+    "outstanding_secondary_catchments": "Outstanding secondary school catchments",
    "max_download_speed": "Max available download speed (Mbps)",
    "serious_crime_avg_yr": "Serious crime (avg/yr)",
    "minor_crime_avg_yr": "Minor crime (avg/yr)",
@ -874,7 +866,7 @@ def _join_area_side_tables(
    election: pl.LazyFrame,
    poi_counts: pl.LazyFrame,
    noise: pl.LazyFrame,
-    school_proximity: pl.LazyFrame,
+    school_catchments: pl.LazyFrame,
    conservation_areas: pl.LazyFrame,
    tree_density: pl.LazyFrame | None,
    broadband: pl.LazyFrame,
@ -905,7 +897,7 @@ def _join_area_side_tables(
    base = base.join(election, on="pcon", how="left")
    base = base.join(poi_counts, on="postcode", how="left")
    base = base.join(noise, on="postcode", how="left")
-    base = base.join(school_proximity, on="postcode", how="left")
+    base = base.join(school_catchments, on="postcode", how="left")
    base = base.join(conservation_areas, on="postcode", how="left").with_columns(
        pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
    )
@ -1970,7 +1962,7 @@ def _build(
    ethnicity_path: Path,
    crime_path: Path,
    noise_path: Path,
-    school_proximity_path: Path,
+    school_catchments_path: Path,
    broadband_path: Path,
    conservation_areas_path: Path,
    rental_prices_path: Path,
@ -2080,7 +2072,7 @@ def _build(
        )
        .select("postcode", "noise_lden_db")
    )
-    school_proximity = pl.scan_parquet(school_proximity_path)
+    school_catchments = pl.scan_parquet(school_catchments_path)
    conservation_areas = _conservation_area_by_postcode(
        arcgis.select("postcode", "lat", "lon"), conservation_areas_path
    )
@ -2120,7 +2112,7 @@ def _build(
        "election": election,
        "poi_counts": poi_counts,
        "noise": noise,
-        "school_proximity": school_proximity,
+        "school_catchments": school_catchments,
        "conservation_areas": conservation_areas,
        "tree_density": tree_density,
        "broadband": broadband,
@ -2267,10 +2259,10 @@ def main():
        "--noise", type=Path, required=True, help="Road noise by postcode parquet file"
    )
    parser.add_argument(
-        "--school-proximity",
+        "--school-catchments",
        type=Path,
        required=True,
-        help="School proximity counts parquet file",
+        help="School catchment counts parquet file",
    )
    parser.add_argument(
        "--broadband",
@ -2376,7 +2368,7 @@ def main():
        ethnicity_path=args.ethnicity,
        crime_path=args.crime,
        noise_path=args.noise,
-        school_proximity_path=args.school_proximity,
+        school_catchments_path=args.school_catchments,
        broadband_path=args.broadband,
        conservation_areas_path=args.conservation_areas,
        rental_prices_path=args.rental_prices,
--- a/pipeline/transform/poi_proximity.py
+++ b/pipeline/transform/poi_proximity.py
@ -25,11 +25,30 @@ POI_GROUPS_2KM = {
 # Greengrocer, ...) and the GEOLYTIX brand categories (Tesco, Aldi, ...).
 GROCERIES_GROUP = "Groceries"

+# Groceries categories EXCLUDED from the static "Number of grocery shops and
+# supermarkets within 2km" metric. Bakeries, butchers, delis and off-licences
+# are speciality food retail, not somewhere you do a grocery shop; together
+# they were ~a third of the group and inflated the headline count. The metric
+# keeps Supermarket, Convenience Store, Greengrocer and every GEOLYTIX brand.
+GROCERY_STATIC_EXCLUDED_CATEGORIES = {
+    "Bakery",
+    "Butcher & Fishmonger",
+    "Deli & Specialty",
+    "Off-Licence",
+}
+
 # OS Open Greenspace function types used for park counts and distance calculation.
 # Uses the authoritative OS dataset instead of OSM point POIs for better coverage
 # of green spaces that are only mapped as polygons in OSM.
+# Scope: "Public Park Or Garden" is the core park function. "Playing Field"
+# (open public recreation grounds) is borderline but kept: outside big cities
+# the local rec ground is the de facto park. "Play Space" (playgrounds) is
+# excluded — a playground is not a park, and "Playground" is already its own
+# OSM-derived category. The remaining functions (Religious Grounds, Golf
+# Course, Cemetery, Allotments, Bowling Green, Tennis Court, Other Sports
+# Facility) are clearly not parks.
 GREENSPACE_PARK_FUNCTIONS = {
-    "parks": ["Public Park Or Garden", "Playing Field", "Play Space"],
+    "parks": ["Public Park Or Garden", "Playing Field"],
 }

 GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
@ -50,17 +69,22 @@ def _poi_category_slug(category: str) -> str:


 def _groceries_categories(pois: pl.DataFrame) -> list[str]:
-    """Return the distinct `category` values for the Groceries group.
+    """Return the distinct `category` values for the static groceries metric.

    `count_pois_per_postcode` matches POIs on `category`, but the authoritative
    GEOLYTIX grocery dataset stores the brand name there (e.g. "Tesco", "Aldi")
    with group "Groceries"; it never emits the literal "Supermarket". Collecting
    every Groceries category captures both the OSM strings and the brand names.
+    Speciality food retail (bakeries, butchers, delis, off-licences) is
+    excluded — see GROCERY_STATIC_EXCLUDED_CATEGORIES.
    """
    if "group" not in pois.columns:
        raise ValueError("POI dataframe must include a 'group' column")
    return (
-        pois.filter(pl.col("group") == GROCERIES_GROUP)
+        pois.filter(
+            (pl.col("group") == GROCERIES_GROUP)
+            & ~pl.col("category").is_in(list(GROCERY_STATIC_EXCLUDED_CATEGORIES))
+        )
        .select("category")
        .unique()
        .sort("category")
@ -109,6 +133,40 @@ def _build_poi_category_groups(
    return groups, display_names


+def _greenspace_count_frame(greenspace: pl.DataFrame) -> pl.DataFrame:
+    """Collapse the greenspace frame to ONE representative row per site.
+
+    os_greenspace.parquet is one row per ACCESS POINT (park gate), which is the
+    right grain for nearest-distance (the nearest gate is what matters) but
+    wildly over-counts "Number of amenities (Park) within Xkm" — a large park
+    with 30 gates counted as 30 parks. Counting uses one row per site at the
+    site centroid (falling back to the first access point when no centroid is
+    available); rows with a null `site_id` pass through as-is. A legacy parquet
+    without `site_id` is returned unchanged (gate-grain counts), not a crash.
+    """
+    if "site_id" not in greenspace.columns:
+        print(
+            "WARNING: greenspace parquet has no site_id column; park counts "
+            "will count access points, not sites (regenerate os_greenspace)"
+        )
+        return greenspace
+
+    keyed = greenspace.filter(pl.col("site_id").is_not_null())
+    unkeyed = greenspace.filter(pl.col("site_id").is_null())
+
+    representatives = keyed.unique(subset=["site_id"], keep="first")
+    if {"site_lat", "site_lng"}.issubset(greenspace.columns):
+        representatives = representatives.with_columns(
+            pl.coalesce([pl.col("site_lat"), pl.col("lat")]).alias("lat"),
+            pl.coalesce([pl.col("site_lng"), pl.col("lng")]).alias("lng"),
+        )
+
+    frames = [representatives.select(greenspace.columns)]
+    if len(unkeyed) > 0:
+        frames.append(unkeyed)
+    return pl.concat(frames)
+
+
 def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
    renames: dict[str, str] = {}
    for group_key, category in display_names.items():
@ -185,13 +243,16 @@ def main():

    # Park counts and distances from OS Open Greenspace. They use the dynamic
    # amenity metric names so filters read through the same side-table path as
-    # OSM-derived amenity metrics.
+    # OSM-derived amenity metrics. Distances use the access-point grain (the
+    # nearest park GATE is the right semantics); counts use one row per SITE so
+    # a park with many gates counts once.
    greenspace = pl.read_parquet(args.greenspace)
+    greenspace_sites = _greenspace_count_frame(greenspace)
    park_counts_2km = count_pois_per_postcode(
-        postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
+        postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
    )
    park_counts_5km = count_pois_per_postcode(
-        postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
+        postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
    )
    park_distances = min_distance_per_postcode(
        postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS
--- a/pipeline/transform/postcode_boundaries/main.py
+++ b/pipeline/transform/postcode_boundaries/main.py
@ -260,6 +260,12 @@ def main() -> None:
    )
    args = parser.parse_args()

+    if args.greenspace and not args.greenspace.exists():
+        # Fail loudly and EARLY (before the ~10h Phases 1-3): silently skipping
+        # the subtraction is exactly how parks/lakes shipped inside postcode
+        # boundaries unnoticed.
+        raise SystemExit(f"--greenspace file not found: {args.greenspace}")
+
    fragments_cache = args.output / "fragments_cache.parquet"
    # Phase 3 depends only on these inputs; greenspace is applied later (Phase 4),
    # so a greenspace change must not invalidate the fragment cache.
@ -294,7 +300,7 @@ def main() -> None:

    greenspace_tree = None
    greenspace_geoms = None
-    if args.greenspace and args.greenspace.exists():
+    if args.greenspace:
        from .greenspace import load_greenspace

        print(f"  Loading greenspace/water from {args.greenspace}...")
--- a/pipeline/transform/postcode_boundaries/greenspace.py
+++ b/pipeline/transform/postcode_boundaries/greenspace.py
@ -3,7 +3,7 @@
 from pathlib import Path

 import polars as pl
-from shapely import wkb
+from shapely import make_valid, wkb
 from shapely.geometry import MultiPolygon, Polygon
 from shapely.strtree import STRtree

@ -13,12 +13,23 @@ from .geometry import safe_difference, safe_union
 def load_greenspace(path: Path) -> tuple[STRtree, list]:
    """Load greenspace parquet and build an STRtree spatial index.

+    Geometries are repaired with ``make_valid`` on load: an invalid park/lake
+    polygon would make the per-postcode ``intersects`` predicate (and the exact
+    difference path) liable to raise mid-merge, hours into a build. Empty
+    geometries are dropped.
+
    Returns:
        (tree, geoms) where tree is a Shapely STRtree and geoms is
        the list of geometries indexed by the tree.
    """
    df = pl.read_parquet(path)
-    geoms = [wkb.loads(g) for g in df["geometry"].to_list()]
+    geoms = []
+    for raw in df["geometry"].to_list():
+        geom = wkb.loads(raw)
+        if not geom.is_valid:
+            geom = make_valid(geom)
+        if not geom.is_empty:
+            geoms.append(geom)
    tree = STRtree(geoms)
    return tree, geoms

--- a/pipeline/transform/postcode_boundaries/output.py
+++ b/pipeline/transform/postcode_boundaries/output.py
@ -101,6 +101,21 @@ def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
    return geojson_dict


+def _is_pointlike(geom_bng) -> bool:
+    """True if a BNG geometry carries no real extent (tower-block signature).
+
+    Near-zero area AND short perimeter together distinguish a collapsed point
+    from a genuine thin sliver, which still carries length.
+    """
+    try:
+        return (
+            geom_bng.area < _POINTLIKE_AREA_M2
+            and geom_bng.length < _POINTLIKE_PERIMETER_M
+        )
+    except GEOSException:
+        return False
+
+
 def _rescue_footprint(geom_bng) -> dict | None:
    """Fatten a degenerate BNG geometry into a representable footprint and snap.

@ -109,15 +124,9 @@ def _rescue_footprint(geom_bng) -> dict | None:
    gets a building-scale buffer so it is not reduced to an invisible sub-metre
    dot; thin slivers that still carry length keep the minimal buffer.
    """
-    buffer_m = _MIN_FOOTPRINT_BUFFER_M
-    try:
-        if (
-            geom_bng.area < _POINTLIKE_AREA_M2
-            and geom_bng.length < _POINTLIKE_PERIMETER_M
-        ):
-            buffer_m = _POINT_RESCUE_BUFFER_M
-    except GEOSException:
-        pass
+    buffer_m = (
+        _POINT_RESCUE_BUFFER_M if _is_pointlike(geom_bng) else _MIN_FOOTPRINT_BUFFER_M
+    )
    footprint = _largest_polygonal(geom_bng.buffer(buffer_m))
    if footprint is None:
        return None
@ -147,10 +156,16 @@ def to_wgs84_geojson(
        )
        if simplified is None:
            simplified = cleaned
-        # Normal path; if snapping erases a thin sliver, fatten its real shape.
-        result = _snap_to_wgs84_geojson(simplified)
-        if result is None:
+        if _is_pointlike(simplified):
+            # A POINTLIKE footprint is rescued to building scale even when it
+            # would survive snapping: a 0.1-1 m² polygon serializes fine but
+            # ships as an invisible dot covering a whole tower block.
            result = _rescue_footprint(simplified)
+        else:
+            # Normal path; if snapping erases a thin sliver, fatten its real shape.
+            result = _snap_to_wgs84_geojson(simplified)
+            if result is None:
+                result = _rescue_footprint(simplified)
        if result is not None:
            return result

@ -229,6 +244,10 @@ def merge_fragments(
        greenspace_tree: Optional STRtree of park/water polygons.
        greenspace_geoms: Optional list of park/water geometries (indexed by tree).
    """
+    subtract = greenspace_tree is not None and greenspace_geoms is not None
+    if subtract:
+        from .greenspace import subtract_greenspace
+
    by_postcode: dict[str, list] = defaultdict(list)
    for pc, geom in all_fragments:
        by_postcode[pc].append(geom)
@ -256,9 +275,7 @@ def merge_fragments(
        # Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
        combined = _fill_holes(combined)
        # Subtract parks/water if provided
-        if greenspace_tree is not None and greenspace_geoms is not None:
-            from .greenspace import subtract_greenspace
-
+        if subtract:
            pre_green = combined
            combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
            combined = _keep_polygon_parts(combined)
--- a/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
+++ b/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
@ -921,6 +921,49 @@ class TestToWgs84Geojson:
        area_m2 = transform_geometry(to_bng.transform, shape(result)).area
        assert area_m2 > 100, f"point footprint only {area_m2:.1f} m^2"

+    def test_snappable_pointlike_polygon_still_gets_building_scale_footprint(self):
+        """A collapsed-but-snappable footprint (e.g. EC2A 2FJ: 181 properties on
+        0.86 m²) must NOT ship as-is just because it survives precision snapping;
+        pointlike inputs are rescued to a ~201 m² disc unconditionally."""
+        import pyproj
+        from shapely.geometry import shape
+        from shapely.ops import transform as transform_geometry
+
+        to_bng = pyproj.Transformer.from_crs(
+            "EPSG:4326", "EPSG:27700", always_xy=True
+        )
+        # 0.9m x 0.9m square: area 0.81 m², perimeter 3.6 m — pointlike, yet
+        # large enough (~8 output-grid cells) to survive the 1e-6 deg snap.
+        tiny = box(530000, 180000, 530000.9, 180000.9)
+        from .output import _snap_to_wgs84_geojson
+
+        assert _snap_to_wgs84_geojson(tiny) is not None, (
+            "precondition: this polygon must be snappable, otherwise the test "
+            "exercises the old snap-fails path instead of the new one"
+        )
+        result = to_wgs84_geojson(tiny)
+        assert result is not None
+        area_m2 = transform_geometry(to_bng.transform, shape(result)).area
+        assert 150 < area_m2 < 300, (
+            f"pointlike snappable footprint shipped at {area_m2:.2f} m^2 "
+            "instead of a building-scale (~201 m^2) disc"
+        )
+
+    def test_normal_polygon_area_unchanged(self):
+        """A normal polygon must pass through without rescue inflation."""
+        import pyproj
+        from shapely.geometry import shape
+        from shapely.ops import transform as transform_geometry
+
+        to_bng = pyproj.Transformer.from_crs(
+            "EPSG:4326", "EPSG:27700", always_xy=True
+        )
+        poly = box(530000, 180000, 530100, 180100)  # 10,000 m²
+        result = to_wgs84_geojson(poly)
+        assert result is not None
+        area_m2 = transform_geometry(to_bng.transform, shape(result)).area
+        assert area_m2 == pytest.approx(10_000, rel=0.01)
+
    def test_thin_sliver_keeps_minimal_buffer(self):
        """A genuine elongated sliver still carries length, so it is NOT inflated
        to building scale — only truly pointlike inputs are."""
@ -1132,6 +1175,26 @@ class TestSubtractGreenspace:
        # 80% < 90% cap, so subtraction should happen
        assert result.area == pytest.approx(2000, rel=0.01)

+    def test_load_greenspace_repairs_invalid_and_drops_empty(self, tmp_path):
+        """An invalid (bow-tie) park polygon in the parquet must be repaired on
+        load: it would otherwise make the per-postcode intersects/difference
+        liable to raise hours into a merge."""
+        from .greenspace import load_greenspace
+
+        bowtie = Polygon([(0, 0), (10, 10), (10, 0), (0, 10)])  # self-intersects
+        assert not bowtie.is_valid
+        valid = box(20, 20, 30, 30)
+        path = tmp_path / "greenspace.parquet"
+        pl.DataFrame({"geometry": [bowtie.wkb, valid.wkb]}).write_parquet(path)
+
+        tree, geoms = load_greenspace(path)
+        assert len(geoms) == 2
+        assert all(g.is_valid and not g.is_empty for g in geoms)
+        # The repaired bow-tie must still subtract cleanly.
+        result = subtract_greenspace(box(0, 0, 100, 100), tree, geoms)
+        assert result.is_valid
+        assert result.area < 10_000
+

 class TestToWgs84GeojsonValidity:
    """to_wgs84_geojson must emit GeoJSON that round-trips to a valid geometry."""
--- a/pipeline/transform/price_estimation/index.py
+++ b/pipeline/transform/price_estimation/index.py
@ -26,6 +26,7 @@ from pipeline.transform.price_estimation.shrinkage import (
 from pipeline.transform.price_estimation.utils import (
    CURRENT_YEAR,
    LATEST_COMPLETE_YEAR,
+    SMOOTHNESS_SUPPORT_PAIRS,
    TEMPORAL_SMOOTHNESS_LAMBDA,
    TYPE_GROUPS,
    build_hedonic_features,
@ -37,6 +38,19 @@ from pipeline.transform.price_estimation.utils import (

 MIN_PAIRS = 5
 OUTLIER_THRESHOLD = 3.0  # hard pre-filter; Huber handles the rest
+# Gap-aware companion to OUTLIER_THRESHOLD: |log_ratio| must also stay within
+# this many log-units PER YEAR of holding period (short gaps are allowed a
+# full year's band). A flat +/-3.0 cap admits e.g. a 10k -> 196k "sale" six
+# months apart (log +2.95, and weight 1/sqrt(gap) gives it the leverage of
+# ~10 normal pairs); Huber does NOT recover, because once the thin year's
+# beta satisfies the garbage pair it is the many good long-gap pairs that
+# carry the residual and get down-weighted. Such pairs are data errors or
+# non-market transfers (right-to-buy, probate, flips), not house-price
+# signal -- standard repeat-sales practice (Case-Shiller) excludes extreme
+# annualised returns for the same reason. 0.7 log/yr (~2x in a year) keeps
+# any plausible genuine market move; long-gap pairs are still governed by
+# the +/-3.0 cap.
+ANNUALISED_OUTLIER_THRESHOLD = 0.7
 HUBER_K = 1.345
 IRLS_ITERATIONS = 5

@ -111,7 +125,16 @@ def extract_pairs(input_path: Path, max_year2: int | None = None) -> pl.DataFram
                / (pl.col("frac_year2") - pl.col("frac_year1")).cast(pl.Float64).sqrt()
            ).alias("weight"),
        )
-        .filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
+        .filter(
+            pl.col("log_ratio").abs()
+            <= pl.min_horizontal(
+                pl.lit(OUTLIER_THRESHOLD),
+                ANNUALISED_OUTLIER_THRESHOLD
+                * pl.max_horizontal(
+                    pl.col("frac_year2") - pl.col("frac_year1"), pl.lit(1.0)
+                ),
+            )
+        )
        .collect()
    )

@ -181,11 +204,27 @@ def solve_robust_index(
    # beta=0) has no column, so the penalty spans the non-baseline years only.
    # For cells with <3 betas there is no curvature to penalise and the solve is
    # unchanged.
+    #
+    # The penalty is SUPPORT-SCALED per row: a flat lambda is too weak for
+    # years identified by only 1-2 repeat-sale pairs (a cell can have hundreds
+    # of pairs overall yet single thin years, yielding 2-7x one-year spikes
+    # that cell-level shrinkage cannot catch). Each curvature row's lambda is
+    # lambda0 * (1 + SMOOTHNESS_SUPPORT_PAIRS / max(s, 1)), with s the minimum
+    # cross-year pair count among the row's three years, so thin years are
+    # pulled strongly toward the local trend while well-supported years keep
+    # the baseline penalty. Taking the min over the triple (not just the
+    # middle year) also covers thin FIRST/LAST years of the range, which only
+    # ever appear at a triple's edge -- the last solved year feeds the
+    # CURRENT_YEAR trend extrapolation, so spikes there are the costliest.
    n_pen = 0
    pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
    pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
    if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
-        sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
+        cross = years1 != years2
+        touched, counts = np.unique(
+            np.concatenate([years1[cross], years2[cross]]), return_counts=True
+        )
+        support = {int(y): int(c) for y, c in zip(touched, counts)}
        years_sorted = sorted(year_to_col)
        cols_by_year = [year_to_col[y] for y in years_sorted]
        n_pen = n_cols - 2
@ -202,6 +241,11 @@ def solve_robust_index(
            w0 = 2.0 / ((y1 - y0) * (y2 - y0))
            w1 = -2.0 / ((y1 - y0) * (y2 - y1))
            w2 = 2.0 / ((y2 - y1) * (y2 - y0))
+            s_k = min(support.get(y, 0) for y in (y0, y1, y2))
+            lam_k = TEMPORAL_SMOOTHNESS_LAMBDA * (
+                1.0 + SMOOTHNESS_SUPPORT_PAIRS / max(s_k, 1)
+            )
+            sqrt_lambda = float(np.sqrt(lam_k))
            pen_vals[3 * k : 3 * k + 3] = (
                sqrt_lambda * w0,
                sqrt_lambda * w1,
@ -347,10 +391,22 @@ def compute_hedonic_index(


 EXTRAPOLATION_YEARS = 3
+# Bound on the per-year slope used to trend-extrapolate beyond the last solved
+# year (the solve stops at LATEST_COMPLETE_YEAR; CURRENT_YEAR is filled here).
+# +/-0.10 log/yr (~+10.5%/-9.5% per yr) comfortably covers genuine UK sector-level
+# annual moves while preventing a residual spike in the recent betas from
+# compounding into an absurd extrapolated step (e.g. +49% in one year).
+MAX_EXTRAPOLATION_SLOPE = 0.10


 def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
-    """Forward-fill missing years, with linear extrapolation beyond last known year."""
+    """Forward-fill missing years, with trend extrapolation beyond last known year.
+
+    The extrapolation slope is the MEDIAN of the per-year slopes between
+    consecutive known points in the recent window (a single noisy year skews at
+    most two adjacent slopes, in opposite directions, so the median resists it
+    better than a least-squares fit), clamped to +/-MAX_EXTRAPOLATION_SLOPE.
+    """
    if not index:
        return {y: 0.0 for y in range(min_year, max_year + 1)}

@ -365,7 +421,7 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
            last = index[y]
        filled[y] = last

-    # Linear extrapolation beyond last known year
+    # Robust trend extrapolation beyond last known year
    if last_known_year < max_year:
        recent = [
            (y, index[y])
@ -373,9 +429,17 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
            if y >= last_known_year - EXTRAPOLATION_YEARS
        ]
        if len(recent) >= 2:
-            years_arr = np.array([r[0] for r in recent], dtype=np.float64)
-            vals_arr = np.array([r[1] for r in recent], dtype=np.float64)
-            slope = np.polyfit(years_arr, vals_arr, 1)[0]
+            slopes = [
+                (v_b - v_a) / (y_b - y_a)
+                for (y_a, v_a), (y_b, v_b) in zip(recent[:-1], recent[1:])
+            ]
+            slope = float(
+                np.clip(
+                    np.median(slopes),
+                    -MAX_EXTRAPOLATION_SLOPE,
+                    MAX_EXTRAPOLATION_SLOPE,
+                )
+            )
            for y in range(last_known_year + 1, max_year + 1):
                filled[y] = index[last_known_year] + slope * (y - last_known_year)
        else:
@ -389,12 +453,16 @@ def build_index(
    input_path: Path,
    max_pair_year: int | None = None,
    postcodes_path: Path | None = None,
+    sectors: list[str] | None = None,
 ) -> pl.DataFrame:
    """Build the full price index from raw data.

    If max_pair_year is set, only pairs before that year are used (backtesting holdout).
    The index is still forward-filled to CURRENT_YEAR.
    postcodes_path: if provided, lat/lon are read from this file instead of input_path.
+    sectors: if provided, restrict the build to these postcode sectors (for
+    debugging/verification runs; hierarchy levels are then computed only from
+    the scoped pairs, so scoped output is NOT identical to a full build).
    """
    # Solve the index only on COMPLETE calendar years: exclude the partial
    # current year, whose thin repeat-sale set yields wild betas. The index is
@ -405,6 +473,9 @@ def build_index(
        max_pair_year if max_pair_year is not None else LATEST_COMPLETE_YEAR + 1
    )
    pairs = extract_pairs(input_path, max_year2=estimation_cap)
+    if sectors is not None:
+        pairs = pairs.filter(pl.col("sector").is_in(sectors))
+        print(f"  Scoped to {len(sectors)} sectors: {len(pairs):,} pairs")
    centroids = extract_centroids(postcodes_path or input_path)

    min_year = int(pairs["year1"].min())
@ -534,9 +605,21 @@ def main():
        help="Path to postcode.parquet (for lat/lon centroids)",
    )
    parser.add_argument("--output", type=Path, required=True)
+    parser.add_argument(
+        "--sectors",
+        type=str,
+        default=None,
+        help="Comma-separated postcode sectors to scope the build to "
+        "(debug/verification only; hierarchy is computed from scoped pairs)",
+    )
    args = parser.parse_args()

-    result = build_index(args.input, postcodes_path=args.postcodes)
+    sectors = (
+        [s.strip() for s in args.sectors.split(",") if s.strip()]
+        if args.sectors
+        else None
+    )
+    result = build_index(args.input, postcodes_path=args.postcodes, sectors=sectors)

    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
--- a/pipeline/transform/price_estimation/test_index.py
+++ b/pipeline/transform/price_estimation/test_index.py
@ -3,7 +3,10 @@ import polars as pl

 from pipeline.transform.price_estimation import index as index_mod
 from pipeline.transform.price_estimation.index import (
+    MAX_EXTRAPOLATION_SLOPE,
    compute_indices_for_level,
+    extract_pairs,
+    forward_fill,
    solve_robust_index,
 )

@ -105,6 +108,139 @@ def test_gap_spanning_level_jump_is_not_smoothed_into_a_ramp():
    assert abs(idx[2015] - true[2015]) < 0.05


+def _ramp_pairs_with_thin_tail(tail_ratio: float, tail_n: int, ramp_reps: int):
+    """Build synthetic repeat-sale pairs: a smooth ramp plus a thin final year.
+
+    The ramp climbs 0.04 log points per year over 2010-2020 and contributes
+    `ramp_reps` identical copies of every adjacent-year pair. On top of that,
+    `tail_n` pair(s) spanning 2020->2021 each assert a `tail_ratio` log jump.
+
+    Returns ``(year1, year2, log_ratio, weight)`` as int32/int32/float64/
+    float64 arrays, ramp pairs first (in year order) then the tail pairs;
+    every weight is 1.0.
+    """
+    level = {year: 0.04 * (year - 2010) for year in range(2010, 2021)}
+    # One (year1, year2, log_ratio) row per pair, in the order the solver
+    # receives them.
+    ramp_rows = [
+        (start, start + 1, level[start + 1] - level[start])
+        for start in range(2010, 2020)
+        for _ in range(ramp_reps)
+    ]
+    tail_rows = [(2020, 2021, tail_ratio) for _ in range(tail_n)]
+    rows = ramp_rows + tail_rows
+    return (
+        np.array([row[0] for row in rows], dtype=np.int32),
+        np.array([row[1] for row in rows], dtype=np.int32),
+        np.array([row[2] for row in rows], dtype=np.float64),
+        np.ones(len(rows), dtype=np.float64),
+    )
+
+
+def test_support_scaled_penalty_suppresses_thin_year_spike(monkeypatch):
+    """A thin final year must not be allowed to spike the index.
+
+    The last year (2021) is identified by a SINGLE pair claiming a +1.5 log
+    jump. With support scaling switched off (flat baseline penalty) the jump
+    survives almost entirely; with scaling on it is pulled strongly toward
+    the local trend. The thin year is the LAST year of the range, so it only
+    ever sits at the edge of a penalty triple -- this pins that the
+    min-over-triple support rule covers range edges, which matters because
+    the last solved year feeds the CURRENT_YEAR trend extrapolation.
+    """
+    pairs = _ramp_pairs_with_thin_tail(tail_ratio=1.5, tail_n=1, ramp_reps=10)
+
+    # Baseline solve with support scaling disabled; the patch is reverted as
+    # soon as the context exits so the second solve sees the real constant.
+    with monkeypatch.context() as patched:
+        patched.setattr(index_mod, "SMOOTHNESS_SUPPORT_PAIRS", 0)
+        flat = solve_robust_index(*pairs)
+    scaled = solve_robust_index(*pairs)
+
+    final_step_flat = flat[2021] - flat[2020]
+    final_step_scaled = scaled[2021] - scaled[2020]
+    assert final_step_flat > 1.2  # flat lambda barely resists the spike
+    assert final_step_scaled < 0.65  # support-scaled lambda suppresses it
+    # The well-supported ramp stays close to truth: the strong penalty row
+    # spanning the thin year drags its immediate neighbour slightly (<0.1)
+    # toward collinearity -- the price of suppressing a x4.5 one-year spike.
+    for year in range(2010, 2021):
+        expected = 0.04 * (year - 2010)
+        assert abs(scaled[year] - expected) < 0.1
+
+
+def test_support_scaling_leaves_well_supported_years_unchanged(monkeypatch):
+    """Support scaling is a no-op where every year has plenty of pairs.
+
+    With 50-100 pairs touching each year, lambda_eff stays close to lambda0,
+    so the scaled solve must agree with the flat-penalty solve to <1e-3 on
+    every year.
+    """
+    pairs = _ramp_pairs_with_thin_tail(tail_ratio=0.04, tail_n=50, ramp_reps=50)
+
+    # Flat-penalty reference: scaling off only for the duration of this solve.
+    with monkeypatch.context() as patched:
+        patched.setattr(index_mod, "SMOOTHNESS_SUPPORT_PAIRS", 0)
+        flat = solve_robust_index(*pairs)
+    scaled = solve_robust_index(*pairs)
+
+    assert set(flat) == set(scaled)
+    worst_gap = max(abs(flat[year] - scaled[year]) for year in flat)
+    assert worst_gap < 1e-3
+
+
+def test_forward_fill_extrapolation_uses_robust_median_slope():
+    """One spiky recent year must not steer the extrapolated step.
+
+    The extrapolation uses the median of consecutive per-year slopes, which
+    ignores a residual spike confined to a single year (a least-squares fit
+    through the same points would extrapolate a large positive slope).
+    """
+    spiky = {2022: 1.00, 2023: 1.05, 2024: 1.60, 2025: 1.10}
+    extrapolated = forward_fill(spiky, 2022, 2026)[2026]
+    # slopes: [+0.05, +0.55, -0.50] -> median +0.05
+    expected = 1.10 + 0.05
+    assert abs(extrapolated - expected) < 1e-9
+
+
+def test_forward_fill_extrapolated_slope_is_clamped():
+    """A consistent but absurd recent trend is clamped to
+    MAX_EXTRAPOLATION_SLOPE, in both directions."""
+    # +0.4/yr every year: the extrapolated step is capped from above.
+    rising = {2022: 0.0, 2023: 0.4, 2024: 0.8, 2025: 1.2}
+    rising_next = forward_fill(rising, 2022, 2026)[2026]
+    assert abs(rising_next - (1.2 + MAX_EXTRAPOLATION_SLOPE)) < 1e-9
+
+    # -0.4/yr every year: the mirrored case is capped from below.
+    falling = {2022: 1.2, 2023: 0.8, 2024: 0.4, 2025: 0.0}
+    falling_next = forward_fill(falling, 2022, 2026)[2026]
+    assert abs(falling_next - (0.0 - MAX_EXTRAPOLATION_SLOPE)) < 1e-9
+
+
+def test_forward_fill_preserves_sane_trend_and_flat_fallback():
+    """Moderate trends still extrapolate; a lone point fills flat.
+
+    A genuine +0.05/yr trend carries forward (it stays a forward-FILL-with-
+    trend), while fewer than two recent points give a flat fill.
+    """
+    steady = {2022: 1.00, 2023: 1.05, 2024: 1.10, 2025: 1.15}
+    steady_next = forward_fill(steady, 2022, 2026)[2026]
+    assert abs(steady_next - 1.20) < 1e-9
+
+    lone_point = forward_fill({2025: 0.7}, 2024, 2026)
+    assert lone_point[2026] == 0.7
+
+
+def test_extract_pairs_drops_extreme_annualised_returns(tmp_path):
+    """Pairs with implausible annualised returns are dropped; slow large
+    gains are kept.
+
+    A +-3.0 log cap alone admits e.g. a 10x 'gain' in six months -- a data
+    error or non-market transfer with huge leverage (weight = 1/sqrt(gap)).
+    The annualised cap removes such pairs, while large ratios accumulated
+    over long holding periods (genuine appreciation) survive.
+    """
+
+    def sale(year, month, price):
+        return {"year": year, "month": month, "price": price}
+
+    # +2.30 log in 6 months -> dropped (cap 0.7 for gap <= 1yr)
+    flipped_in_six_months = [sale(2020, 1, 100_000), sale(2020, 7, 1_000_000)]
+    # +2.20 log over 24 years -> kept (flat 3.0 cap governs)
+    held_for_decades = [sale(2000, 1, 100_000), sale(2024, 1, 900_000)]
+    # +0.41 log in 1 year -> kept (within the 0.7/yr band)
+    brisk_single_year = [sale(2020, 1, 100_000), sale(2021, 1, 150_000)]
+
+    properties = pl.DataFrame(
+        {
+            "Postcode": ["AB1 2CD", "AB1 2CE", "AB1 2CF"],
+            "Property type": ["Detached", "Detached", "Detached"],
+            "historical_prices": [
+                flipped_in_six_months,
+                held_for_decades,
+                brisk_single_year,
+            ],
+        }
+    )
+    parquet_path = tmp_path / "props.parquet"
+    properties.write_parquet(parquet_path)
+
+    pairs = extract_pairs(parquet_path)
+
+    assert len(pairs) == 2
+    kept_ratios = sorted(round(r, 2) for r in pairs["log_ratio"].to_list())
+    assert kept_ratios == [0.41, 2.2]
+
+
 def test_n_pairs_counts_only_cross_year_pairs():
    """FIX #12: same-year pairs carry zero index information and must not inflate
    the shrinkage weight; n_pairs counts only cross-year (year2 != year1) pairs."""
--- a/pipeline/transform/price_estimation/utils.py
+++ b/pipeline/transform/price_estimation/utils.py
@ -36,6 +36,20 @@ SHRINKAGE_K = 50
 # noisy year) without flattening genuine multi-year trends.
 TEMPORAL_SMOOTHNESS_LAMBDA = 0.05

+# Per-year support scaling for the temporal smoothness penalty. A flat lambda
+# is too weak for years with very few repeat-sale pairs: a sector can have
+# hundreds of pairs overall (so cell-level n/(n+k) shrinkage barely moves it)
+# yet have individual years estimated from 1-2 pairs, producing 2-7x
+# single-year index spikes. Each curvature row is therefore scaled by the
+# local pair support of its year triple:
+#   lambda_eff = lambda0 * (1 + SMOOTHNESS_SUPPORT_PAIRS / s)
+# where s is the minimum cross-year pair count among the triple's years.
+# Well-supported years (s >> SMOOTHNESS_SUPPORT_PAIRS) keep lambda_eff ~
+# lambda0 (current behaviour); a year identified by a single pair gets
+# 41x lambda0 (= 1 + 40/1), pulling its beta strongly toward the local trend
+# through its neighbours. Same-year pairs cancel in the design and are not
+# counted.
+SMOOTHNESS_SUPPORT_PAIRS = 40
+

 def type_group_expr():
    """Polars expression: Property type -> type_group."""
--- a/pipeline/transform/school_catchments.py
+++ b/pipeline/transform/school_catchments.py
@ -0,0 +1,748 @@
+"""Model historical school catchment areas and count them per postcode.
+
+No national dataset of school catchment areas exists for England: catchments
+are set per admission authority, only a handful of councils publish polygons,
+and the pupil-residence data behind commercial "heatmap" catchments lives in
+the restricted National Pupil Database. This module therefore COMPILES one
+from open data, estimating each school's admission cutoff distance ("last
+distance offered") — the radius within which an applicant would plausibly be
+offered a place.
+
+Model: English state admissions are run as deferred acceptance with distance
+tie-breaks, which in a continuum economy is equivalent to finding
+market-clearing cutoff distances (Azevedo & Leshno 2016). Per phase
+(primary/secondary):
+
+1. Demand — Census 2021 children per LSOA (TS007A age bands, prorated to the
+   phase's cohort ages) split evenly across the LSOA's live postcodes.
+2. Supply — every open, non-selective state-funded school (GIAS), with a fill
+   target of max(capacity, headcount) prorated to the phase's cohorts
+   (sixth-form and nursery years carry reduced weight, since their class
+   sizes differ and they are not allocated by the same admissions round).
+3. Preferences — children prefer nearby schools, trading distance against
+   Ofsted grade: a school's effective distance is its real distance minus a
+   grade bonus (Outstanding > Good > ungraded > below-Good). Because real
+   first preferences are heterogeneous, each postcode's children split
+   across nearby feasible schools with logit weights over effective
+   distance rather than all picking the same one.
+4. Equilibrium — cutoffs start unbounded and tighten monotonically: each
+   round, children apply to their preferred feasible school(s), and
+   oversubscribed schools tighten their cutoff to the distance of their
+   marginal admitted child. Converges to the deferred-acceptance outcome.
+5. Schools that never fill have no binding cutoff — anyone who applies gets
+   in — so their feasibility radius is the distance within which the local
+   child population would cover their fill target, capped.
+
+The free parameters (preference bonuses, demand scale, choice temperature,
+residual calibration factors) are CALIBRATED against published "last
+distance offered" figures scraped from nine local authorities' allocation
+reports — see check_school_cutoffs.py and the constants below.
+
+A postcode is "inside the catchment" of every school whose cutoff radius
+covers it. The output counts those schools per postcode for the four
+good+/outstanding x primary/secondary categories (Ofsted-classified, same
+rules as the previous proximity metric). Selective (grammar) schools are
+excluded throughout: their intakes are test-based and region-wide, so a
+distance model would fabricate a catchment that does not exist.
+
+Known limitations: faith oversubscription criteria are not modelled (whether
+a faith school's catchment is open to a given family depends on the family),
+and Census 2021 child counts lag current rolls slightly. Cutoffs are
+straight-line distances, the modal LA tie-break criterion.
+"""
+
+import argparse
+from pathlib import Path
+
+import numpy as np
+import polars as pl
+from scipy.spatial import cKDTree
+
+from pipeline.utils.poi_counts import _project_lat_lng_km, valid_uk_coords_mask
+
+SCHOOL_GROUPS = {
+    "good_primary": ["good_primary", "outstanding_primary"],
+    "good_secondary": ["good_secondary", "outstanding_secondary"],
+    "outstanding_primary": ["outstanding_primary"],
+    "outstanding_secondary": ["outstanding_secondary"],
+}
+
+# Age thresholds for deciding which phase(s) a school serves. A school serves
+# PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age
+# children if its statutory highest age is >= 12. All-through (e.g. 3-18) and
+# middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in
+# both the primary and the secondary metrics — Ofsted's coarse "Ofsted phase"
+# labels such schools as just "Secondary", which previously hid them from every
+# postcode's primary-school count.
+PRIMARY_MAX_AGE = 10
+SECONDARY_MIN_AGE = 12
+
+# Cohort ages (inclusive) each phase competes for: Reception-Y6 and Y7-Y11.
+PRIMARY_AGES = (4, 10)
+SECONDARY_AGES = (11, 15)
+
+# Cohort weights for prorating a school's headcount/capacity across the ages
+# it teaches. Nursery classes are typically part-time and small; sixth forms
+# run at roughly 60% of a school's Y7-Y11 cohort size. A flat proration
+# undersupplied secondary places by ~8%.
+NURSERY_COHORT_WEIGHT = 0.5  # ages < 4
+SIXTH_FORM_COHORT_WEIGHT = 0.6  # ages >= 16
+
+# Only schools that admit (mostly) by geography take part in the assignment.
+# Independent, special and Welsh schools and post-16 colleges either don't
+# admit by distance or fall outside the England postcode universe; selective
+# (grammar) schools admit by test from a wide region.
+STATE_SCHOOL_TYPE_GROUPS = [
+    "Academies",
+    "Local authority maintained schools",
+    "Free Schools",
+]
+
+# Preference bonuses (km of extra travel a family accepts for a better
+# school), applied as a discount on effective distance when children choose.
+# Grade 3/4 schools repel by the same magnitudes.
+PREF_BONUS_OUTSTANDING_KM = 0.6
+PREF_BONUS_GOOD_KM = 0.3
+
+# Share of resident children who actually compete for state places. Census
+# 2021 counts overstate current entry cohorts (birth rates fell ~10% between
+# 2016 and 2021, which is exactly the gap between the census stock and the
+# children reaching Reception by mid-decade) and independent/home-educated
+# children (~7%) never enter the allocation at all. Without this, modelled
+# cutoffs run systematically tight and undersubscribed schools look full.
+DEMAND_SCALE = 0.8
+
+# Logit choice temperature (km). With deterministic choice every child at a
+# postcode ranks the same school first, so popular schools fill entirely from
+# their nearest band and the marginal admitted child sits unrealistically
+# close. Real first preferences are heterogeneous; a school draws only a
+# distance-decaying share of nearby families. Children therefore split across
+# nearby feasible schools with weights softmax(-effective_distance / tau):
+# higher tau = more smearing = wider cutoffs. tau -> 0 recovers the
+# deterministic model (used by the unit tests). Calibrated 2026-06 against
+# 240 published binding cutoffs from 9 LAs (check_school_cutoffs.py): 0.3 km
+# maximises rank correlation and within-2x share; beyond ~0.6 the smearing
+# erases school-to-school differentiation (Spearman 0.24 -> 0.01).
+CHOICE_TEMPERATURE_KM = 0.3
+
+# Residual calibration from the same ground truth: after the equilibrium
+# solve, modelled cutoffs still ran systematically tight (median log2 bias
+# -0.53 primary / -0.36 secondary at the settings above — published "last
+# distance offered" reflects offer-day frictions, waiting-list churn and
+# furthest-applicant noise that no clean equilibrium reproduces). Radii are
+# multiplied by 2^-bias so the modelled median matches the published median;
+# rank ordering is unaffected.
+CUTOFF_CALIBRATION_FACTOR = {"primary": 1.44, "secondary": 1.28}
+
+# Each demand postcode considers this many nearest schools; beyond ~16
+# candidates assignment shares are negligible.
+NEAREST_SCHOOL_CANDIDATES = 16
+
+# Radius guard rails: the floor absorbs postcode-centroid noise around tiny
+# urban catchments; the cap bounds feasibility radii for schools the model
+# never fills (mostly rural).
+MIN_RADIUS_KM = 0.3
+MAX_RADIUS_KM = 25.0
+
+EQUILIBRIUM_MAX_ITER = 100
+
+
+def classify_good_plus_schools(
+    ofsted: pl.DataFrame, open_urns: set[int] | None = None
+) -> pl.DataFrame:
+    """Label good+/outstanding primary & secondary schools for catchment counts.
+
+    Derives a grade ("1" = outstanding, "2" = good) and one or two
+    ``category`` rows per school, returning a ``(urn, category)`` frame.
+
+    Schools with a recent GRADED inspection carry a 1-4 grade in "Latest OEIF
+    overall effectiveness" (OEIF = the previous Ofsted Education Inspection
+    Framework). A large and growing share of schools were last inspected under an
+    UNGRADED (Section 8) inspection or the post-2024 report-card framework, so
+    that column is null/"Not judged" for them even when they are demonstrably
+    good — their status lives in "Ungraded inspection overall outcome" ("School
+    remains Good"/"School remains Outstanding"). Filtering on the graded column
+    alone dropped ~7,000 genuinely good/outstanding schools. We fall back to the
+    ungraded outcome, but ONLY when there is no usable graded result
+    (null/"Not judged"), so a genuine grade 3/4 is never overridden.
+
+    Outcomes flagged "(Concerns)" are NOT treated as good+: a "remains Good
+    (Concerns)" outcome signals inspectors found issues warranting an earlier
+    graded re-inspection, so marketing it as a good+ school is misleading.
+
+    Phase assignment uses the statutory age range when available (so all-through
+    and middle schools count toward BOTH primary and secondary), falling back to
+    the coarse "Ofsted phase" label when age columns are absent or a school's
+    age is null. When ``open_urns`` is given, schools whose URN is not in the
+    current GIAS open register are dropped so closed/merged schools are not
+    counted.
+
+    Args:
+        ofsted: Ofsted inspection frame; must carry "URN", "Ofsted phase" and
+            the two outcome columns read by ``_with_derived_grade``.
+        open_urns: Integer URNs of currently open schools, or None to skip
+            the open-register filter.
+
+    Returns:
+        Frame with columns ``urn`` (Int64) and ``category`` (one of
+        "good_primary", "outstanding_primary", "good_secondary",
+        "outstanding_secondary"); primary rows precede secondary rows.
+    """
+    graded = _with_derived_grade(ofsted).filter(
+        pl.col("Ofsted phase").is_in(["Primary", "Secondary"])
+        & pl.col("_ofsted_grade").is_in(["1", "2"])
+    )
+
+    # Drop schools no longer open (closed/merged) when the GIAS open register is
+    # provided, so stale Ofsted "latest inspection" rows are not counted.
+    if open_urns is not None and "URN" in graded.columns:
+        # open_urns holds ints, so compare on the same Int64 view of URN that
+        # the returned frame uses; the membership test then does not depend
+        # on the dtype the URN column happened to be read back as.
+        graded = graded.filter(
+            pl.col("URN").cast(pl.Int64, strict=False).is_in(list(open_urns))
+        )
+
+    # Decide which phase(s) each school serves.
+    if {"Statutory lowest age", "Statutory highest age"} <= set(graded.columns):
+        low = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
+        high = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
+        # A null (or unparseable) age falls back to the coarse phase label for
+        # that school only.
+        serves_primary = (
+            pl.when(low.is_not_null())
+            .then(low <= PRIMARY_MAX_AGE)
+            .otherwise(pl.col("Ofsted phase") == "Primary")
+        )
+        serves_secondary = (
+            pl.when(high.is_not_null())
+            .then(high >= SECONDARY_MIN_AGE)
+            .otherwise(pl.col("Ofsted phase") == "Secondary")
+        )
+    else:
+        serves_primary = pl.col("Ofsted phase") == "Primary"
+        serves_secondary = pl.col("Ofsted phase") == "Secondary"
+
+    graded = graded.with_columns(
+        serves_primary.alias("_serves_primary"),
+        serves_secondary.alias("_serves_secondary"),
+    )
+
+    # Good+ groups include both grade variants; outstanding groups count grade 1.
+    # A school can yield up to two rows (primary and secondary).
+    is_outstanding = pl.col("_ofsted_grade") == "1"
+    per_phase = [
+        graded.filter(pl.col(f"_serves_{phase}")).with_columns(
+            pl.when(is_outstanding)
+            .then(pl.lit(f"outstanding_{phase}"))
+            .otherwise(pl.lit(f"good_{phase}"))
+            .alias("category")
+        )
+        for phase in ("primary", "secondary")
+    ]
+    return pl.concat(per_phase).select(
+        pl.col("URN").cast(pl.Int64).alias("urn"),
+        "category",
+    )
+
+
+def _with_derived_grade(ofsted: pl.DataFrame) -> pl.DataFrame:
+    """Add an ``_ofsted_grade`` column holding "1"-"4" or null.
+
+    A graded OEIF result ("1".."4") always wins. Only when that column is
+    null or "Not judged" does an ungraded outcome count: "School remains
+    Outstanding..." maps to "1" and "School remains Good..." to "2", unless
+    the outcome text carries "(Concerns)". Everything else yields null.
+    """
+    # Cast to Utf8 so the string predicates below are well-defined even if a
+    # column happens to be entirely null (read back as a Null dtype).
+    graded_result = pl.col("Latest OEIF overall effectiveness").cast(
+        pl.Utf8, strict=False
+    )
+    ungraded_outcome = pl.col("Ungraded inspection overall outcome").cast(
+        pl.Utf8, strict=False
+    )
+
+    lacks_graded_result = graded_result.is_null() | (graded_result == "Not judged")
+    free_of_concerns = ~ungraded_outcome.str.contains(r"\(Concerns\)")
+    still_outstanding = (
+        ungraded_outcome.str.starts_with("School remains Outstanding")
+        & free_of_concerns
+    )
+    still_good = (
+        ungraded_outcome.str.starts_with("School remains Good") & free_of_concerns
+    )
+
+    derived_grade = (
+        pl.when(graded_result.is_in(["1", "2", "3", "4"]))
+        .then(graded_result)
+        .when(lacks_graded_result & still_outstanding)
+        .then(pl.lit("1"))
+        .when(lacks_graded_result & still_good)
+        .then(pl.lit("2"))
+        .otherwise(None)
+    )
+    return ofsted.with_columns(derived_grade.alias("_ofsted_grade"))
+
+
+def school_preference_bonuses(
+    ofsted: pl.DataFrame,
+    bonus_outstanding_km: float = PREF_BONUS_OUTSTANDING_KM,
+    bonus_good_km: float = PREF_BONUS_GOOD_KM,
+) -> pl.DataFrame:
+    """Translate each school's derived Ofsted grade into a bonus in km.
+
+    Grade 1 earns ``bonus_outstanding_km`` and grade 2 ``bonus_good_km``;
+    grades 3 and 4 receive the same magnitudes with the sign flipped, so weak
+    schools repel demand symmetrically. A school without a derived grade
+    (typically new) gets 0.0. Rows with a null URN are discarded and the
+    result holds one row per URN as ``(urn, bonus_km)``.
+    """
+    km_by_grade = {
+        "1": bonus_outstanding_km,
+        "2": bonus_good_km,
+        "3": -bonus_good_km,
+        "4": -bonus_outstanding_km,
+    }
+    urn = pl.col("URN").cast(pl.Int64).alias("urn")
+    # Grades outside the mapping (including a missing grade) fall back to 0.0.
+    bonus_km = (
+        pl.col("_ofsted_grade")
+        .replace_strict(km_by_grade, default=0.0, return_dtype=pl.Float64)
+        .alias("bonus_km")
+    )
+
+    with_urn = _with_derived_grade(ofsted).filter(pl.col("URN").is_not_null())
+    return with_urn.select(urn, bonus_km).unique(subset="urn", keep="first")
+
+
+def phase_intakes(gias: pl.DataFrame) -> pl.DataFrame:
+    """Split each school's fill target between the primary and secondary phase.
+
+    Keeps open, non-selective state-funded schools with coordinates and a
+    parseable age range, and returns one row per school:
+    ``(urn, lat, lng, primary_intake, secondary_intake)``.
+
+    The fill target is max(capacity, headcount): over-full schools keep their
+    demonstrated size, under-full schools can admit up to capacity. It is
+    spread over the cohort ages the school teaches (parsed from
+    ``age_range``, e.g. "3–11" = ages 3..10), with nursery and sixth-form
+    ages down-weighted, and each phase receives the share of total cohort
+    weight that falls inside its age band.
+    """
+    age_bounds = pl.col("age_range").str.extract_all(r"\d+")
+    entry_age = age_bounds.list.get(0, null_on_oob=True).cast(pl.Int64, strict=False)
+    # The leaving age is exclusive as a cohort: a "3-11" school teaches
+    # children aged 3 through 10.
+    last_cohort_age = (
+        age_bounds.list.get(1, null_on_oob=True).cast(pl.Int64, strict=False) - 1
+    )
+
+    is_state_funded = pl.col("type_group").is_in(STATE_SCHOOL_TYPE_GROUPS)
+    is_non_selective = pl.col("admissions_policy").is_null() | (
+        pl.col("admissions_policy") != "Selective"
+    )
+    has_coordinates = pl.col("lat").is_not_null() & pl.col("lng").is_not_null()
+    fill_target = pl.max_horizontal(
+        pl.col("pupils").fill_null(0), pl.col("capacity").fill_null(0)
+    ).cast(pl.Float64)
+
+    schools = (
+        gias.filter(is_state_funded & is_non_selective & has_coordinates)
+        .with_columns(entry_age.alias("_low"), last_cohort_age.alias("_high"))
+        # Drops schools whose age range is missing, has one bound, or is
+        # inverted once the leaving age is made exclusive.
+        .filter(pl.col("_low").is_not_null() & (pl.col("_high") >= pl.col("_low")))
+        .with_columns(fill_target.alias("_fill_target"))
+        .filter(pl.col("_fill_target") > 0)
+    )
+
+    def band_weight(lo: int, hi: int, weight: float = 1.0) -> pl.Expr:
+        """Cohort weight contributed by ages [lo, hi] within [_low, _high]."""
+        first_age = pl.max_horizontal(pl.col("_low"), lo)
+        last_age = pl.min_horizontal(pl.col("_high"), hi)
+        cohorts = (last_age - first_age + 1).clip(lower_bound=0)
+        return (weight * cohorts).cast(pl.Float64)
+
+    all_cohorts_weight = (
+        band_weight(0, 3, NURSERY_COHORT_WEIGHT)
+        + band_weight(4, 15)
+        + band_weight(16, 30, SIXTH_FORM_COHORT_WEIGHT)
+    )
+    primary_intake = (
+        pl.col("_fill_target") * band_weight(*PRIMARY_AGES) / all_cohorts_weight
+    )
+    secondary_intake = (
+        pl.col("_fill_target") * band_weight(*SECONDARY_AGES) / all_cohorts_weight
+    )
+    return schools.select(
+        pl.col("urn").cast(pl.Int64),
+        "lat",
+        "lng",
+        primary_intake.alias("primary_intake"),
+        secondary_intake.alias("secondary_intake"),
+    )
+
+
+def children_per_postcode(
+    postcodes: pl.DataFrame, lsoa_children: pl.DataFrame
+) -> pl.DataFrame:
+    """Spread each LSOA's phase-age children evenly over its postcodes.
+
+    Census five-year age bands straddle the school phases, so each phase
+    total takes one fifth of a band per single year of age it needs:
+    primary (4-10) = age 4 + ages 5-9 + age 10, secondary (11-15) =
+    ages 11-14 + age 15. Postcodes whose LSOA has no row in
+    ``lsoa_children`` drop out (inner join); every remaining postcode gets
+    an equal share of its LSOA's totals.
+
+    Returns ``(postcode, lat, lng, primary_children, secondary_children)``.
+    """
+    one_year_share = 0.2
+    lsoa_primary = (
+        one_year_share * pl.col("aged_0_4")
+        + pl.col("aged_5_9")
+        + one_year_share * pl.col("aged_10_14")
+    )
+    lsoa_secondary = 0.8 * pl.col("aged_10_14") + one_year_share * pl.col("aged_15_19")
+    lsoa_totals = lsoa_children.select(
+        "lsoa21",
+        lsoa_primary.alias("_lsoa_primary"),
+        lsoa_secondary.alias("_lsoa_secondary"),
+    )
+
+    matched = postcodes.join(
+        lsoa_totals, left_on="lsoa21cd", right_on="lsoa21", how="inner"
+    )
+    # Number of matched postcodes sharing each LSOA: the divisor for the split.
+    matched = matched.with_columns(pl.len().over("lsoa21cd").alias("_lsoa_postcodes"))
+    postcodes_in_lsoa = pl.col("_lsoa_postcodes")
+    return matched.select(
+        "postcode",
+        "lat",
+        "lng",
+        (pl.col("_lsoa_primary") / postcodes_in_lsoa).alias("primary_children"),
+        (pl.col("_lsoa_secondary") / postcodes_in_lsoa).alias("secondary_children"),
+    )
+
+
+def equilibrium_cutoffs(
+    school_xy: np.ndarray,
+    fill_target: np.ndarray,
+    bonus_km: np.ndarray,
+    pc_xy: np.ndarray,
+    pc_children: np.ndarray,
+    k: int = NEAREST_SCHOOL_CANDIDATES,
+    max_iter: int = EQUILIBRIUM_MAX_ITER,
+    tau_km: float = CHOICE_TEMPERATURE_KM,
+) -> np.ndarray:
+    """Market-clearing admission cutoff distance (km) per school.
+
+    Deferred acceptance with distance priority, solved as cutoff dynamics
+    (Azevedo & Leshno): cutoffs start unbounded; each round every child unit
+    applies to its preferred feasible school(s) — a logit split over
+    effective distance (distance - school bonus) among schools whose cutoff
+    covers it, collapsing to the single best school when ``tau_km`` is 0 —
+    and each oversubscribed school tightens its cutoff to its marginal
+    admitted child's distance. Cutoffs only ever tighten, so the iteration
+    converges.
+
+    Args:
+        school_xy: School coordinates, one row per school, in the same
+            planar units as ``pc_xy`` (distances are reported in those units).
+        fill_target: Places each school fills before it is oversubscribed.
+        bonus_km: Per-school discount subtracted from real distance when
+            children rank schools; it never affects feasibility.
+        pc_xy: Postcode coordinates, one row per postcode.
+        pc_children: Child mass at each postcode; postcodes with no children
+            are ignored.
+        k: Number of nearest schools each postcode considers (capped at the
+            number of schools).
+        max_iter: Upper bound on tightening rounds.
+        tau_km: Logit temperature; ``<= 0`` selects deterministic choice.
+
+    Returns:
+        Array of length ``len(school_xy)``: the binding cutoff per school, or
+        np.inf for schools that never fill (no binding cutoff). With no
+        schools or no demand every entry is np.inf.
+    """
+    n_schools = len(school_xy)
+    cutoff = np.full(n_schools, np.inf)
+    demand = np.flatnonzero(pc_children > 0)
+    # Nothing can be oversubscribed without both schools and children. Bail
+    # out before building the tree (k would be 0) and before the segment scan
+    # below, which indexes the first application unconditionally.
+    if n_schools == 0 or len(demand) == 0:
+        return cutoff
+
+    k = min(k, n_schools)
+    weights = pc_children[demand]
+    tree = cKDTree(school_xy)
+    dist, cand = tree.query(pc_xy[demand], k=k, workers=-1)
+    if k == 1:
+        # query() drops the neighbour axis for k == 1; restore it so the
+        # per-candidate indexing below is uniform.
+        dist = dist[:, None]
+        cand = cand[:, None]
+    eff = dist - bonus_km[cand]
+
+    rows = np.arange(len(demand))
+    for _ in range(max_iter):
+        # A school is feasible for a postcode when its current cutoff still
+        # covers the REAL distance; infeasible candidates rank last.
+        eff_feasible = np.where(dist <= cutoff[cand], eff, np.inf)
+        if tau_km <= 0:
+            choice = np.argmin(eff_feasible, axis=1)
+            valid = np.isfinite(eff_feasible[rows, choice])
+            chosen_school = cand[rows[valid], choice[valid]]
+            chosen_dist = dist[rows[valid], choice[valid]]
+            chosen_mass = weights[valid]
+        else:
+            # Numerically stable softmax over -eff/tau; rows with no feasible
+            # school have z_max = -inf, so shift by 0 there instead.
+            z = -eff_feasible / tau_km
+            z_max = z.max(axis=1, keepdims=True)
+            share = np.exp(z - np.where(np.isfinite(z_max), z_max, 0.0))
+            share[~np.isfinite(eff_feasible)] = 0.0
+            total = share.sum(axis=1, keepdims=True)
+            mass = weights[:, None] * share / np.where(total > 0, total, 1.0)
+            # Sub-thousandth-of-a-child applications only slow the sort down.
+            keep = mass > 1e-3
+            chosen_school = cand[keep]
+            chosen_dist = dist[keep]
+            chosen_mass = mass[keep]
+
+        # No applications this round (e.g. every logit share fell under the
+        # mass threshold): no school can tighten, so the cutoffs are final.
+        if len(chosen_school) == 0:
+            break
+
+        # Group applications by school, nearest applicant first, and take a
+        # running total of applicant mass across the whole sorted array.
+        order = np.lexsort((chosen_dist, chosen_school))
+        s_sorted = chosen_school[order]
+        d_sorted = chosen_dist[order]
+        m_cum = np.cumsum(chosen_mass[order])
+        boundaries = np.flatnonzero(np.diff(s_sorted)) + 1
+        starts = np.concatenate(([0], boundaries))
+        ends = np.concatenate((boundaries, [len(s_sorted)]))
+
+        changed = False
+        for start, end in zip(starts, ends):
+            school = s_sorted[start]
+            # Cumulative applicant mass within this school's segment.
+            seg_cum = m_cum[start:end] - (m_cum[start - 1] if start else 0.0)
+            if seg_cum[-1] <= fill_target[school]:
+                continue
+            # First applicant at which cumulative mass reaches the target.
+            marginal = d_sorted[start + np.searchsorted(seg_cum, fill_target[school])]
+            if marginal < cutoff[school]:
+                cutoff[school] = marginal
+                changed = True
+        if not changed:
+            break
+
+    return cutoff
+
+
+def capacity_fill_radii(
+    school_xy: np.ndarray,
+    fill_target: np.ndarray,
+    pc_xy: np.ndarray,
+    pc_children: np.ndarray,
+    max_radius_km: float = MAX_RADIUS_KM,
+) -> np.ndarray:
+    """Feasibility radius for schools without a binding cutoff.
+
+    An undersubscribed school admits anyone who applies, so its catchment is
+    bounded by plausibility rather than competition: the distance within
+    which the local child population would cover its fill target. Capped at
+    ``max_radius_km``.
+
+    Args:
+        school_xy: School coordinates, one row per school, in the same
+            planar units as ``pc_xy``.
+        fill_target: Child mass each school needs to be considered full.
+        pc_xy: Postcode coordinates, one row per postcode.
+        pc_children: Child mass at each postcode; postcodes with none are
+            ignored.
+        max_radius_km: Search limit and fallback radius.
+
+    Returns:
+        One radius per school. A school keeps ``max_radius_km`` when the
+        children found within that distance — at most the 4096 nearest
+        demand postcodes are examined — do not reach its fill target.
+    """
+    radii = np.full(len(school_xy), max_radius_km)
+    demand = np.flatnonzero(pc_children > 0)
+    # Without demand postcodes no target can be covered (and the neighbour
+    # query below would be asked for k == 0): every school keeps the cap.
+    if len(school_xy) == 0 or len(demand) == 0:
+        return radii
+
+    demand_children = pc_children[demand]
+    tree = cKDTree(pc_xy[demand])
+    k = min(4096, len(demand))
+    for i, xy in enumerate(school_xy):
+        dists, idx = tree.query(xy, k=k, distance_upper_bound=max_radius_km)
+        # k == 1 makes query() return scalars; normalise to 1-D arrays.
+        dists = np.atleast_1d(dists)
+        idx = np.atleast_1d(idx)
+        # Neighbours beyond the distance bound come back with infinite
+        # distance; drop them before accumulating children, nearest first.
+        found = np.isfinite(dists)
+        cum = np.cumsum(demand_children[idx[found]])
+        if len(cum) and cum[-1] >= fill_target[i]:
+            radii[i] = dists[found][np.searchsorted(cum, fill_target[i])]
+    return radii
+
+
+def count_covering_catchments(
+    pc_xy: np.ndarray,
+    pc_valid: np.ndarray,
+    school_xy: np.ndarray,
+    school_radii: np.ndarray,
+    n_postcodes: int,
+) -> np.ndarray:
+    """Return, for each postcode, the number of schools whose radius reaches it.
+
+    Only postcodes flagged in ``pc_valid`` are searched; every other postcode
+    keeps a count of 0. The result is an int32 array of length
+    ``n_postcodes``, all zeros when there are no schools.
+    """
+    counts = np.zeros(n_postcodes, dtype=np.int32)
+    if len(school_xy) == 0:
+        return counts
+
+    searchable = np.flatnonzero(pc_valid)
+    postcode_tree = cKDTree(pc_xy[searchable])
+    # One list of tree-local postcode indices per school, each school queried
+    # with its own radius.
+    hits_per_school = postcode_tree.query_ball_point(
+        school_xy, school_radii, workers=-1
+    )
+    hit_counts = np.zeros(len(searchable), dtype=np.int32)
+    for hit_rows in hits_per_school:
+        hit_counts[hit_rows] += 1
+
+    # Map tree-local positions back to positions in the full postcode array.
+    counts[searchable] = hit_counts
+    return counts
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description=(
+            "Model school admission cutoff radii and count good+/outstanding "
+            "primary/secondary catchments covering each postcode"
+        )
+    )
+    parser.add_argument(
+        "--ofsted", type=Path, required=True, help="Ofsted inspection parquet"
+    )
+    parser.add_argument(
+        "--gias", type=Path, required=True, help="GIAS open-school parquet"
+    )
+    parser.add_argument(
+        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
+    )
+    parser.add_argument(
+        "--lsoa-children",
+        type=Path,
+        required=True,
+        help="Census 2021 children by LSOA parquet",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=None,
+        help="Per-postcode counts parquet; omit for calibration runs that only "
+        "need --schools-output",
+    )
+    parser.add_argument(
+        "--schools-output",
+        type=Path,
+        default=None,
+        help="Optional per-school catchment radii parquet (for calibration/debugging)",
+    )
+    parser.add_argument(
+        "--bonus-outstanding-km",
+        type=float,
+        default=PREF_BONUS_OUTSTANDING_KM,
+        help="Preference bonus for Outstanding schools (calibration sweeps)",
+    )
+    parser.add_argument(
+        "--bonus-good-km",
+        type=float,
+        default=PREF_BONUS_GOOD_KM,
+        help="Preference bonus for Good schools (calibration sweeps)",
+    )
+    parser.add_argument(
+        "--demand-scale",
+        type=float,
+        default=DEMAND_SCALE,
+        help="Share of resident children competing for state places",
+    )
+    parser.add_argument(
+        "--choice-temperature-km",
+        type=float,
+        default=CHOICE_TEMPERATURE_KM,
+        help="Logit choice temperature over effective distance",
+    )
+    args = parser.parse_args()
+
+    gias = pl.read_parquet(args.gias)
+    open_urns = set(
+        gias.select(pl.col("urn").cast(pl.Int64, strict=False))
+        .to_series()
+        .drop_nulls()
+        .to_list()
+    )
+    print(f"GIAS open register: {len(open_urns):,} open school URNs")
+
+    ofsted = pl.read_parquet(args.ofsted)
+    rated = classify_good_plus_schools(ofsted, open_urns=open_urns)
+    if rated.is_empty():
+        raise ValueError("No good+ primary/secondary Ofsted schools found")
+    print(f"Good+ school/phase rows: {len(rated):,}")
+
+    supply = phase_intakes(gias).join(
+        school_preference_bonuses(
+            ofsted,
+            bonus_outstanding_km=args.bonus_outstanding_km,
+            bonus_good_km=args.bonus_good_km,
+        ),
+        on="urn",
+        how="left",
+    ).with_columns(pl.col("bonus_km").fill_null(0.0))
+    print(f"State schools in admissions model: {len(supply):,}")
+
+    arcgis = pl.read_parquet(args.arcgis).select(
+        pl.col("pcds").alias("postcode"),
+        "lat",
+        pl.col("long").alias("lng"),
+        "lsoa21cd",
+        "doterm",
+    )
+    live = arcgis.filter(
+        pl.col("doterm").is_null() & pl.col("lsoa21cd").str.starts_with("E")
+    )
+    demand = children_per_postcode(live, pl.read_parquet(args.lsoa_children))
+    print(
+        f"Demand postcodes: {len(demand):,} "
+        f"({demand['primary_children'].sum():,.0f} primary-age, "
+        f"{demand['secondary_children'].sum():,.0f} secondary-age children)"
+    )
+
+    # Shared local-km projection so assignment and coverage use one metric.
+    pc_lats = arcgis["lat"].to_numpy()
+    pc_lngs = arcgis["lng"].to_numpy()
+    pc_valid = valid_uk_coords_mask(pc_lats, pc_lngs)
+    origin_lat = float(np.mean(pc_lats[pc_valid]))
+    pc_xy = _project_lat_lng_km(pc_lats, pc_lngs, origin_lat)
+
+    demand_lats = demand["lat"].to_numpy()
+    demand_lngs = demand["lng"].to_numpy()
+    demand_valid = valid_uk_coords_mask(demand_lats, demand_lngs)
+    demand_xy = _project_lat_lng_km(demand_lats, demand_lngs, origin_lat)
+
+    school_xy = _project_lat_lng_km(
+        supply["lat"].to_numpy(), supply["lng"].to_numpy(), origin_lat
+    )
+
+    radii = {}
+    for phase in ("primary", "secondary"):
+        in_phase = supply[f"{phase}_intake"].to_numpy() > 0
+        targets = supply[f"{phase}_intake"].to_numpy()[in_phase]
+        xy = school_xy[in_phase]
+        children = np.where(
+            demand_valid,
+            demand[f"{phase}_children"].to_numpy() * args.demand_scale,
+            0.0,
+        )
+        print(f"Solving {phase} admissions for {in_phase.sum():,} schools...")
+        cutoffs = equilibrium_cutoffs(
+            xy,
+            targets,
+            supply["bonus_km"].to_numpy()[in_phase],
+            demand_xy,
+            children,
+            tau_km=args.choice_temperature_km,
+        )
+        filled = np.isfinite(cutoffs)
+        print(
+            f"  {filled.sum():,} schools have binding cutoffs "
+            f"(median {np.median(cutoffs[filled]):.2f} km); "
+            f"{(~filled).sum():,} undersubscribed"
+        )
+        fallback = capacity_fill_radii(
+            xy[~filled], targets[~filled], demand_xy, children
+        )
+        raw = cutoffs.copy()
+        raw[~filled] = fallback
+        radii[phase] = pl.DataFrame(
+            {
+                "urn": supply["urn"].to_numpy()[in_phase],
+                "phase": phase,
+                "cutoff_km": raw,
+                "filled": filled,
+                "radius_km": np.clip(
+                    raw * CUTOFF_CALIBRATION_FACTOR[phase],
+                    MIN_RADIUS_KM,
+                    MAX_RADIUS_KM,
+                ),
+            }
+        )
+        print(
+            f"  radius km: median {radii[phase]['radius_km'].median():.2f}, "
+            f"p90 {radii[phase]['radius_km'].quantile(0.9):.2f}"
+        )
+
+    # Attach each rated school's phase radius; rated schools outside the
+    # admissions model (special schools, selective schools, missing
+    # headcounts) cannot be given a defensible radius and are dropped.
+    rated = rated.with_columns(
+        pl.col("category").str.split("_").list.get(1).alias("phase")
+    )
+    rated_with_radius = rated.join(
+        pl.concat(list(radii.values())), on=["urn", "phase"], how="inner"
+    ).join(supply.select("urn", "lat", "lng"), on="urn", how="inner")
+    dropped = len(rated) - len(rated_with_radius)
+    print(
+        f"Rated school/phase rows with radii: {len(rated_with_radius):,} "
+        f"(dropped {dropped:,}, incl. selective schools)"
+    )
+
+    if args.output is None and args.schools_output is None:
+        raise SystemExit("Provide --output and/or --schools-output")
+
+    if args.output is not None:
+        category_counts = {}
+        for category in set(c for cats in SCHOOL_GROUPS.values() for c in cats):
+            cat = rated_with_radius.filter(pl.col("category") == category)
+            cat_xy = _project_lat_lng_km(
+                cat["lat"].to_numpy(), cat["lng"].to_numpy(), origin_lat
+            )
+            category_counts[category] = count_covering_catchments(
+                pc_xy, pc_valid, cat_xy, cat["radius_km"].to_numpy(), len(arcgis)
+            )
+            print(f"  {category}: {len(cat):,} schools")
+
+        result = pl.DataFrame(
+            {
+                "postcode": arcgis["postcode"],
+                **{
+                    f"{group}_catchments": sum(category_counts[c] for c in categories)
+                    for group, categories in SCHOOL_GROUPS.items()
+                },
+            }
+        )
+        for group in SCHOOL_GROUPS:
+            col = result[f"{group}_catchments"]
+            print(f"  {group}_catchments: mean {col.mean():.2f}, max {col.max()}")
+
+        args.output.parent.mkdir(parents=True, exist_ok=True)
+        result.write_parquet(args.output)
+        size_mb = args.output.stat().st_size / (1024 * 1024)
+        print(f"Wrote {args.output} ({size_mb:.1f} MB)")
+
+    if args.schools_output is not None:
+        schools_out = rated_with_radius.select(
+            "urn", "category", "phase", "cutoff_km", "filled", "radius_km", "lat", "lng"
+        )
+        args.schools_output.parent.mkdir(parents=True, exist_ok=True)
+        schools_out.write_parquet(args.schools_output)
+        print(f"Wrote {args.schools_output}")
+
+
+# Run the CLI only when executed as a script, not when the module is imported.
+if __name__ == "__main__":
+    main()
--- a/pipeline/transform/school_proximity.py
+++ b/pipeline/transform/school_proximity.py
@ -1,199 +0,0 @@
-"""Compute Ofsted-rated school proximity counts per postcode."""
-
-import argparse
-from pathlib import Path
-
-import polars as pl
-
-from pipeline.utils.poi_counts import count_pois_per_postcode
-
-SCHOOL_GROUPS = {
-    "good_primary": ["good_primary", "outstanding_primary"],
-    "good_secondary": ["good_secondary", "outstanding_secondary"],
-    "outstanding_primary": ["outstanding_primary"],
-    "outstanding_secondary": ["outstanding_secondary"],
-}
-
-
-# Age thresholds for deciding which phase(s) a school serves. A school serves
-# PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age
-# children if its statutory highest age is >= 12. All-through (e.g. 3-18) and
-# middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in
-# both the primary and the secondary proximity metrics — Ofsted's coarse "Ofsted
-# phase" labels such schools as just "Secondary", which previously hid them from
-# every postcode's primary-school count.
-PRIMARY_MAX_AGE = 10
-SECONDARY_MIN_AGE = 12
-
-
-def classify_good_plus_schools(
-    ofsted: pl.DataFrame, open_urns: set[int] | None = None
-) -> pl.DataFrame:
-    """Label good+/outstanding primary & secondary schools for proximity counts.
-
-    Derives a grade ("1" = outstanding, "2" = good) and one or two proximity
-    ``category`` rows per school, returning a ``(postcode, category)`` frame.
-
-    Schools with a recent GRADED inspection carry a 1-4 grade in "Latest OEIF
-    overall effectiveness" (OEIF = the previous Ofsted Education Inspection
-    Framework). A large and growing share of schools were last inspected under an
-    UNGRADED (Section 8) inspection or the post-2024 report-card framework, so
-    that column is null/"Not judged" for them even when they are demonstrably
-    good — their status lives in "Ungraded inspection overall outcome" ("School
-    remains Good"/"School remains Outstanding"). Filtering on the graded column
-    alone dropped ~7,000 genuinely good/outstanding schools. We fall back to the
-    ungraded outcome, but ONLY when there is no usable graded result
-    (null/"Not judged"), so a genuine grade 3/4 is never overridden.
-
-    Outcomes flagged "(Concerns)" are NOT treated as good+: a "remains Good
-    (Concerns)" outcome signals inspectors found issues warranting an earlier
-    graded re-inspection, so marketing it as a good+ school is misleading.
-
-    Phase assignment uses the statutory age range when available (so all-through
-    and middle schools count toward BOTH primary and secondary), falling back to
-    the coarse "Ofsted phase" label when age columns are absent. When
-    ``open_urns`` is given, schools whose URN is not in the current GIAS open
-    register are dropped so closed/merged schools are not counted.
-    """
-    # Cast to Utf8 so the string predicates below are well-defined even if a
-    # column happens to be entirely null (read back as a Null dtype).
-    oeif = pl.col("Latest OEIF overall effectiveness").cast(pl.Utf8, strict=False)
-    ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
-    no_usable_grade = oeif.is_null() | (oeif == "Not judged")
-    has_concern = ungraded.str.contains(r"\(Concerns\)")
-    remains_outstanding = (
-        ungraded.str.starts_with("School remains Outstanding") & ~has_concern
-    )
-    remains_good = ungraded.str.starts_with("School remains Good") & ~has_concern
-    graded = (
-        ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
-        .with_columns(
-            pl.when(oeif.is_in(["1", "2"]))
-            .then(oeif)
-            .when(no_usable_grade & remains_outstanding)
-            .then(pl.lit("1"))
-            .when(no_usable_grade & remains_good)
-            .then(pl.lit("2"))
-            .otherwise(None)
-            .alias("_ofsted_grade")
-        )
-        .filter(pl.col("_ofsted_grade").is_not_null())
-    )
-
-    # Drop schools no longer open (closed/merged) when the GIAS open register is
-    # provided, so stale Ofsted "latest inspection" rows are not counted.
-    if open_urns is not None and "URN" in graded.columns:
-        graded = graded.filter(pl.col("URN").is_in(list(open_urns)))
-
-    # Decide which phase(s) each school serves.
-    if {"Statutory lowest age", "Statutory highest age"} <= set(graded.columns):
-        low = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
-        high = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
-        serves_primary = (
-            pl.when(low.is_not_null())
-            .then(low <= PRIMARY_MAX_AGE)
-            .otherwise(pl.col("Ofsted phase") == "Primary")
-        )
-        serves_secondary = (
-            pl.when(high.is_not_null())
-            .then(high >= SECONDARY_MIN_AGE)
-            .otherwise(pl.col("Ofsted phase") == "Secondary")
-        )
-    else:
-        serves_primary = pl.col("Ofsted phase") == "Primary"
-        serves_secondary = pl.col("Ofsted phase") == "Secondary"
-
-    graded = graded.with_columns(
-        serves_primary.alias("_serves_primary"),
-        serves_secondary.alias("_serves_secondary"),
-    )
-
-    # Good+ groups include both grade variants; outstanding groups count grade 1.
-    # A school can yield up to two rows (primary and secondary).
-    primary = graded.filter(pl.col("_serves_primary")).with_columns(
-        pl.when(pl.col("_ofsted_grade") == "1")
-        .then(pl.lit("outstanding_primary"))
-        .otherwise(pl.lit("good_primary"))
-        .alias("category")
-    )
-    secondary = graded.filter(pl.col("_serves_secondary")).with_columns(
-        pl.when(pl.col("_ofsted_grade") == "1")
-        .then(pl.lit("outstanding_secondary"))
-        .otherwise(pl.lit("good_secondary"))
-        .alias("category")
-    )
-    return pl.concat([primary, secondary]).select(
-        pl.col("Postcode").alias("postcode"),
-        "category",
-    )
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Count good+ and outstanding primary/secondary schools near each postcode"
-    )
-    parser.add_argument(
-        "--ofsted", type=Path, required=True, help="Ofsted inspection parquet"
-    )
-    parser.add_argument(
-        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
-    )
-    parser.add_argument(
-        "--gias",
-        type=Path,
-        default=None,
-        help="GIAS open-school parquet; if given, only currently-open schools are counted",
-    )
-    parser.add_argument(
-        "--output", type=Path, required=True, help="Output parquet path"
-    )
-    args = parser.parse_args()
-
-    open_urns: set[int] | None = None
-    if args.gias is not None:
-        gias_urns = pl.read_parquet(args.gias).select("urn").to_series().drop_nulls()
-        open_urns = set(gias_urns.cast(pl.Int64, strict=False).to_list())
-        print(f"GIAS open register: {len(open_urns):,} open school URNs")
-
-    ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted), open_urns=open_urns)
-    if ofsted.is_empty():
-        raise ValueError("No good+ primary/secondary Ofsted schools found")
-
-    print(f"Good+ schools: {len(ofsted):,}")
-    print(
-        "Outstanding schools: "
-        f"{ofsted.filter(pl.col('category').str.starts_with('outstanding')).height:,}"
-    )
-
-    # Join with arcgis to get lat/lng for each school's postcode
-    arcgis = pl.read_parquet(args.arcgis).select(
-        pl.col("pcds").alias("postcode"),
-        "lat",
-        pl.col("long").alias("lng"),
-    )
-
-    schools = ofsted.join(arcgis, on="postcode", how="inner")
-    if schools.is_empty():
-        raise ValueError("No Ofsted schools matched ArcGIS postcode coordinates")
-    print(f"Schools with coordinates: {len(schools):,}")
-
-    # Load all postcodes for proximity counting
-    postcodes = arcgis.rename({"lng": "lon"})
-
-    counts_5km = count_pois_per_postcode(
-        postcodes, schools, radius_km=5, groups=SCHOOL_GROUPS
-    )
-    counts_2km = count_pois_per_postcode(
-        postcodes, schools, radius_km=2, groups=SCHOOL_GROUPS
-    )
-
-    result = counts_5km.join(counts_2km, on="postcode")
-
-    args.output.parent.mkdir(parents=True, exist_ok=True)
-    result.write_parquet(args.output)
-    size_mb = args.output.stat().st_size / (1024 * 1024)
-    print(f"Wrote {args.output} ({size_mb:.1f} MB)")
-
-
-if __name__ == "__main__":
-    main()
--- a/pipeline/transform/test_join_epc_pp.py
+++ b/pipeline/transform/test_join_epc_pp.py
@ -8,6 +8,7 @@ import polars as pl

 from pipeline.transform.join_epc_pp import (
    EPC_SOURCE_COLUMNS,
+    _join_address_parts,
    _run,
    _scan_epc_certificates,
 )
@ -111,6 +112,89 @@ def test_scan_epc_certificates_supports_domestic_zip(tmp_path: Path):
    assert df.schema["number_habitable_rooms"] == pl.Int16


+def test_join_address_parts_empty_string_components():
+    # Price-paid SAON/PAON/STREET are empty strings (not null) when absent;
+    # concat_str(ignore_nulls=True) alone leaked the separator into the
+    # display address (' 10 PALACE GREEN') and doubled it for empty middle
+    # components. Empty/whitespace-only parts must contribute nothing.
+    df = pl.DataFrame(
+        {
+            "saon": ["", "FLAT 1", "FLAT 1", "FLAT 21", "", None, "  ", " FLAT 2"],
+            "paon": ["10", "10", "", "82", "", None, "10", "11 "],
+            "street": [
+                "PALACE GREEN",
+                "HIGH STREET",
+                "HIGH STREET",
+                "",
+                "",
+                None,
+                "PALACE GREEN",
+                "STATION ROAD",
+            ],
+        }
+    )
+    out = df.select(
+        _join_address_parts("saon", "paon", "street").alias("address")
+    ).get_column("address")
+
+    assert out.to_list() == [
+        "10 PALACE GREEN",  # empty saon -> no leading space
+        "FLAT 1 10 HIGH STREET",  # normal three-part address is unchanged
+        "FLAT 1 HIGH STREET",  # empty middle component -> no double space
+        "FLAT 21 82",  # empty street -> no trailing space
+        None,  # all-empty -> null, not whitespace junk
+        None,  # all-null -> null
+        "10 PALACE GREEN",  # whitespace-only component treated as empty
+        "FLAT 2 11 STATION ROAD",  # per-component padding is stripped
+    ]
+    # Invariant: every produced address is trimmed and single-spaced.
+    produced = out.drop_nulls()
+    assert produced.str.starts_with(" ").sum() == 0
+    assert produced.str.ends_with(" ").sum() == 0
+    assert produced.str.contains("  ", literal=True).sum() == 0
+
+
+def test_run_builds_clean_pp_address_from_empty_string_saon(tmp_path: Path):
+    # Real price-paid rows carry saon == "" (not null) on ~88% of rows; the
+    # published pp_address must not inherit a leading separator from it.
+    zip_path = tmp_path / "domestic-csv.zip"
+    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
+        csv_buffer = io.StringIO()
+        writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
+        writer.writeheader()
+        writer.writerow(_row())
+        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())
+
+    price_paid_path = tmp_path / "price-paid.parquet"
+    pl.DataFrame(
+        {
+            "price": [250_000],
+            "date_of_transfer": [date(2024, 2, 3)],
+            "property_type": ["T"],
+            "postcode": ["AA1 1AA"],
+            "paon": ["1"],
+            "saon": [""],
+            "street": ["Example Street"],
+            "locality": [""],
+            "town_city": ["Exampletown"],
+            "duration": ["F"],
+            "old_new": ["N"],
+            "ppd_category": ["A"],
+        }
+    ).write_parquet(price_paid_path)
+
+    output_path = tmp_path / "epc-pp.parquet"
+    _run(zip_path, price_paid_path, output_path, tmp_path)
+
+    df = pl.read_parquet(output_path)
+
+    assert df.height == 1
+    # No leading space, and the clean address still matches its EPC record.
+    assert df.select("pp_address", "epc_address").to_dicts() == [
+        {"pp_address": "1 Example Street", "epc_address": "1 Example Street"}
+    ]
+
+
 def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@ -304,7 +304,7 @@ def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
        poi_counts=_by_postcode({}),
        noise=_by_postcode({}),
-        school_proximity=_by_postcode({}),
+        school_catchments=_by_postcode({}),
        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=pl.LazyFrame(
@ -362,7 +362,7 @@ def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
        poi_counts=_by_postcode({}),
        noise=_by_postcode({}),
-        school_proximity=_by_postcode({}),
+        school_catchments=_by_postcode({}),
        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=broadband,
@ -1057,7 +1057,7 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
        poi_counts=_by_postcode({}),
        noise=_by_postcode({}),
-        school_proximity=_by_postcode({}),
+        school_catchments=_by_postcode({}),
        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=pl.LazyFrame(
--- a/pipeline/transform/test_poi_proximity.py
+++ b/pipeline/transform/test_poi_proximity.py
@ -1,9 +1,11 @@
 import polars as pl

 from pipeline.transform.poi_proximity import (
+    GREENSPACE_PARK_FUNCTIONS,
    POI_GROUPS_2KM,
    _build_poi_category_groups,
    _dynamic_poi_metric_renames,
+    _greenspace_count_frame,
    _groceries_categories,
 )
 from pipeline.utils.poi_counts import count_pois_per_postcode
@ -88,3 +90,84 @@ def test_dynamic_poi_metric_renames_support_park_count_options() -> None:
        "parks_2km": "Number of amenities (Park) within 2km",
        "parks_5km": "Number of amenities (Park) within 5km",
    }
+
+
+def test_groceries_categories_exclude_speciality_food_retail() -> None:
+    """The static groceries metric must not count bakeries/butchers/delis/
+    off-licences (speciality retail, ~a third of the group), while keeping
+    Supermarket, Convenience Store, Greengrocer and GEOLYTIX brands."""
+    pois = pl.DataFrame(
+        {
+            "category": [
+                "Tesco",
+                "Supermarket",
+                "Convenience Store",
+                "Greengrocer",
+                "Bakery",
+                "Butcher & Fishmonger",
+                "Deli & Specialty",
+                "Off-Licence",
+                "Café",
+            ],
+            "group": ["Groceries"] * 8 + ["Leisure"],
+            "lat": [51.5] * 9,
+            "lng": [-0.1] * 9,
+        }
+    )
+
+    assert _groceries_categories(pois) == [
+        "Convenience Store",
+        "Greengrocer",
+        "Supermarket",
+        "Tesco",
+    ]
+
+
+def test_park_group_excludes_playgrounds_and_play_space() -> None:
+    # "Play Space" (playgrounds) must not count as a Park; Public Park Or
+    # Garden and Playing Field (open recreation grounds) are in scope.
+    assert GREENSPACE_PARK_FUNCTIONS == {
+        "parks": ["Public Park Or Garden", "Playing Field"]
+    }
+
+
+def test_greenspace_count_frame_collapses_to_one_row_per_site() -> None:
+    # Three gates of one park (with a site centroid), one gate of another park
+    # without a centroid, and one centroid-fallback row with a null site_id.
+    greenspace = pl.DataFrame(
+        {
+            "lat": [51.50, 51.51, 51.52, 53.0, 54.0],
+            "lng": [-0.10, -0.11, -0.12, -2.0, -3.0],
+            "category": ["Public Park Or Garden"] * 3
+            + ["Playing Field", "Public Park Or Garden"],
+            "site_id": ["site-a", "site-a", "site-a", "site-b", None],
+            "site_lat": [51.505, 51.505, 51.505, None, None],
+            "site_lng": [-0.105, -0.105, -0.105, None, None],
+        }
+    )
+
+    result = _greenspace_count_frame(greenspace).sort("lat")
+
+    # One row per site (site-a collapses 3 → 1), null-site rows preserved.
+    assert result.height == 3
+    site_a = result.filter(pl.col("site_id") == "site-a")
+    # The representative point is the site centroid…
+    assert site_a["lat"].to_list() == [51.505]
+    assert site_a["lng"].to_list() == [-0.105]
+    # …or the first access point when no centroid is available.
+    site_b = result.filter(pl.col("site_id") == "site-b")
+    assert site_b["lat"].to_list() == [53.0]
+
+
+def test_greenspace_count_frame_passes_legacy_parquet_through() -> None:
+    # The shipped parquet predates the site_id column; counting must not crash
+    # (it keeps the old access-point grain until regenerated).
+    legacy = pl.DataFrame(
+        {
+            "lat": [51.50, 51.51],
+            "lng": [-0.10, -0.11],
+            "category": ["Public Park Or Garden", "Play Space"],
+        }
+    )
+
+    assert _greenspace_count_frame(legacy).equals(legacy)
--- a/pipeline/transform/test_school_catchments.py
+++ b/pipeline/transform/test_school_catchments.py
@ -0,0 +1,354 @@
+import numpy as np
+import polars as pl
+
+from pipeline.transform.school_catchments import (
+    capacity_fill_radii,
+    children_per_postcode,
+    classify_good_plus_schools,
+    count_covering_catchments,
+    equilibrium_cutoffs,
+    phase_intakes,
+    school_preference_bonuses,
+)
+
+
+def _school(phase, oeif, ungraded, urn=100000):
+    return {
+        "URN": urn,
+        "Postcode": "AA1 1AA",
+        "Ofsted phase": phase,
+        "Latest OEIF overall effectiveness": oeif,
+        "Ungraded inspection overall outcome": ungraded,
+    }
+
+
+def _classify(rows):
+    result = classify_good_plus_schools(pl.DataFrame(rows))
+    return {(r["urn"], r["category"]) for r in result.to_dicts()}
+
+
+def test_legacy_oeif_grades_1_and_2_are_kept():
+    rows = [
+        _school("Primary", "1", None, 1),
+        _school("Primary", "2", None, 2),
+        _school("Secondary", "1", None, 3),
+        _school("Secondary", "2", None, 4),
+    ]
+    assert _classify(rows) == {
+        (1, "outstanding_primary"),
+        (2, "good_primary"),
+        (3, "outstanding_secondary"),
+        (4, "good_secondary"),
+    }
+
+
+def test_grades_3_and_4_are_excluded():
+    rows = [_school("Primary", "3", None), _school("Primary", "4", None)]
+    assert _classify(rows) == set()
+
+
+def test_ungraded_remains_good_is_recovered_when_no_graded_result():
+    # Null and "Not judged" OEIF fall back to the ungraded outcome.
+    rows = [
+        _school("Primary", None, "School remains Good", 1),
+        _school("Secondary", "Not judged", "School remains Outstanding", 2),
+        # "(Improving)" is still good+ ...
+        _school("Primary", None, "School remains Good (Improving) - S5 Next", 3),
+    ]
+    assert _classify(rows) == {
+        (1, "good_primary"),
+        (2, "outstanding_secondary"),
+        (3, "good_primary"),
+    }
+
+
+def test_ungraded_concerns_are_not_good_plus():
+    # "(Concerns)" outcomes signal issues warranting earlier re-inspection and
+    # must NOT be counted as good+ schools.
+    rows = [
+        _school("Primary", None, "School remains Good (Concerns) - S5 Next", 1),
+        _school(
+            "Secondary",
+            None,
+            "School remains Outstanding (Concerns) - S5 Next",
+            2,
+        ),
+    ]
+    assert _classify(rows) == set()
+
+
+def test_ungraded_non_good_outcomes_are_excluded():
+    rows = [
+        _school("Primary", None, "Some aspects not as strong"),
+        _school("Primary", None, "Standards maintained"),
+        _school("Primary", None, None),
+    ]
+    assert _classify(rows) == set()
+
+
+def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
+    # A real grade 3 must not be promoted by an ungraded "remains Good".
+    rows = [_school("Primary", "3", "School remains Good")]
+    assert _classify(rows) == set()
+
+
+def test_non_primary_secondary_phases_excluded():
+    rows = [
+        _school("Nursery", "1", None),
+        _school("Not applicable", "2", None),
+    ]
+    assert _classify(rows) == set()
+
+
+def _aged_school(phase, oeif, low, high, urn=100000):
+    return {
+        "URN": urn,
+        "Postcode": "AA1 1AA",
+        "Ofsted phase": phase,
+        "Latest OEIF overall effectiveness": oeif,
+        "Ungraded inspection overall outcome": None,
+        "Statutory lowest age": low,
+        "Statutory highest age": high,
+    }
+
+
+def test_all_through_school_counts_toward_both_primary_and_secondary():
+    # An all-through school (age 3-18) is labelled "Secondary" by Ofsted phase but
+    # serves primary-age children too, so it must count in BOTH metrics.
+    rows = [_aged_school("Secondary", "2", 3, 18, 1)]
+    assert _classify(rows) == {
+        (1, "good_primary"),
+        (1, "good_secondary"),
+    }
+
+
+def test_age_ranges_assign_single_phase_for_standard_schools():
+    rows = [
+        _aged_school("Primary", "1", 4, 11, 1),  # primary only
+        _aged_school("Secondary", "2", 11, 16, 2),  # secondary only
+        _aged_school("Secondary", "1", 9, 13, 3),  # middle -> both
+    ]
+    assert _classify(rows) == {
+        (1, "outstanding_primary"),
+        (2, "good_secondary"),
+        (3, "outstanding_primary"),
+        (3, "outstanding_secondary"),
+    }
+
+
+def test_closed_schools_excluded_when_open_register_given():
+    rows = [
+        _aged_school("Primary", "1", 4, 11, 111),
+        _aged_school("Secondary", "2", 11, 16, 222),
+    ]
+    result = classify_good_plus_schools(pl.DataFrame(rows), open_urns={111})
+    pairs = {(r["urn"], r["category"]) for r in result.to_dicts()}
+    # URN 222 is not in the open register, so it is dropped.
+    assert pairs == {(111, "outstanding_primary")}
+
+
+def _gias_row(
+    urn,
+    type_group="Academies",
+    age_range="4–11",
+    pupils=210,
+    capacity=None,
+    admissions_policy=None,
+):
+    return {
+        "urn": urn,
+        "name": f"School {urn}",
+        "lat": 51.5,
+        "lng": -0.1,
+        "type_group": type_group,
+        "age_range": age_range,
+        "pupils": pupils,
+        "capacity": capacity,
+        "admissions_policy": admissions_policy,
+    }
+
+
+def test_phase_intakes_prorates_fill_target_over_weighted_cohorts():
+    intakes = phase_intakes(
+        pl.DataFrame(
+            [
+                # 4-11 = cohorts 4..10, all 7 primary: full fill target.
+                _gias_row(1, age_range="4–11", pupils=210),
+                # 11-16 = cohorts 11..15, all 5 secondary.
+                _gias_row(2, age_range="11–16", pupils=500),
+                # 3-11 = cohorts 3..10; nursery year weighs 0.5, so primary
+                # gets 7 of 7.5 cohort weights.
+                _gias_row(3, age_range="3–11", pupils=240),
+                # All-through 4-16 = cohorts 4..15: 7/12 primary, 5/12 secondary.
+                _gias_row(4, age_range="4–16", pupils=1200),
+                # 11-18 = cohorts 11..17; sixth-form years weigh 0.6 each, so
+                # secondary gets 5 of 6.2 cohort weights.
+                _gias_row(5, age_range="11–18", pupils=1240),
+            ]
+        )
+    ).sort("urn")
+    assert intakes["primary_intake"].to_list() == [210.0, 0.0, 224.0, 700.0, 0.0]
+    assert intakes["secondary_intake"].to_list() == [0.0, 500.0, 0.0, 500.0, 1000.0]
+
+
+def test_phase_intakes_excludes_non_state_and_selective_schools():
+    intakes = phase_intakes(
+        pl.DataFrame(
+            [
+                _gias_row(1, type_group="Independent schools"),
+                _gias_row(2, type_group="Special schools"),
+                _gias_row(3, type_group="Welsh schools"),
+                # Grammar school intakes are test-based and region-wide; a
+                # distance catchment would be fabricated.
+                _gias_row(4, admissions_policy="Selective"),
+                _gias_row(5, pupils=None, capacity=300),
+                _gias_row(6, pupils=None, capacity=None),  # no usable headcount
+                _gias_row(7, age_range=None),  # no parsable cohorts
+                # Over-full school keeps its demonstrated size.
+                _gias_row(8, pupils=350, capacity=300),
+                _gias_row(9, admissions_policy="Non-selective"),
+            ]
+        )
+    ).sort("urn")
+    assert intakes["urn"].to_list() == [5, 8, 9]
+    assert intakes["primary_intake"].to_list() == [300.0, 350.0, 210.0]
+
+
+def test_school_preference_bonuses_follow_derived_grade():
+    rows = [
+        {**_school("Primary", "1", None, 1)},
+        {**_school("Primary", "2", None, 2)},
+        {**_school("Primary", "3", None, 3)},
+        {**_school("Primary", "4", None, 4)},
+        {**_school("Primary", None, "Some aspects not as strong", 5)},  # unrated
+        {**_school("Primary", "Not judged", "School remains Good", 6)},
+    ]
+    bonuses = dict(
+        school_preference_bonuses(
+            pl.DataFrame(rows), bonus_outstanding_km=1.0, bonus_good_km=0.5
+        ).iter_rows()
+    )
+    assert bonuses == {1: 1.0, 2: 0.5, 3: -0.5, 4: -1.0, 5: 0.0, 6: 0.5}
+
+
+def test_children_per_postcode_prorates_bands_and_splits_lsoa_evenly():
+    postcodes = pl.DataFrame(
+        {
+            "postcode": ["AA1 1AA", "AA1 1AB", "BB2 2BB"],
+            "lat": [51.5, 51.5, 52.0],
+            "lng": [-0.1, -0.1, -0.2],
+            "lsoa21cd": ["E01000001", "E01000001", "E01000002"],
+        }
+    )
+    lsoa_children = pl.DataFrame(
+        {
+            "lsoa21": ["E01000001", "E01000002"],
+            "aged_0_4": [100, 30],
+            "aged_5_9": [100, 10],
+            "aged_10_14": [100, 20],
+            "aged_15_19": [100, 40],
+        }
+    )
+    result = children_per_postcode(postcodes, lsoa_children).sort("postcode")
+    # Primary 4-10 = 0.2*aged_0_4 + aged_5_9 + 0.2*aged_10_14: 140 split across
+    # the LSOA's 2 postcodes; 20 for the single-postcode LSOA.
+    assert result["primary_children"].to_list() == [70.0, 70.0, 20.0]
+    # Secondary 11-15 = 0.8*aged_10_14 + 0.2*aged_15_19: 100 split across 2; 24.
+    assert result["secondary_children"].to_list() == [50.0, 50.0, 24.0]
+
+
+def test_equilibrium_cutoff_tightens_to_marginal_admitted_distance():
+    # One school with 10 places; postcodes at 1km, 2km and 3km with 5 children
+    # each. The two nearest postcodes exactly fill it, so the cutoff is the
+    # marginal admitted child's distance and the 3km postcode is shut out.
+    cutoffs = equilibrium_cutoffs(
+        np.array([[0.0, 0.0]]),
+        np.array([10.0]),
+        np.array([0.0]),
+        np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]]),
+        np.array([5.0, 5.0, 5.0]),
+        tau_km=0.0,
+    )
+    assert cutoffs.tolist() == [2.0]
+
+
+def test_equilibrium_rejected_demand_cascades_to_next_school():
+    # School A (5 places) at the origin, school B (5 places) at 10km.
+    # P1 (1km, 5 children) and P2 (1.5km, 5 children) both prefer A; A fills
+    # with P1 and tightens its cutoff to 1km, pushing P2 out to B. B never
+    # exceeds its target, so it keeps no binding cutoff.
+    cutoffs = equilibrium_cutoffs(
+        np.array([[0.0, 0.0], [10.0, 0.0]]),
+        np.array([5.0, 5.0]),
+        np.array([0.0, 0.0]),
+        np.array([[1.0, 0.0], [1.5, 0.0]]),
+        np.array([5.0, 5.0]),
+        tau_km=0.0,
+    )
+    assert cutoffs[0] == 1.0
+    assert np.isinf(cutoffs[1])
+
+
+def test_equilibrium_preference_bonus_steers_demand_to_better_school():
+    # Two schools equidistant (1km) from the only postcode; A's 0.5km bonus
+    # wins all 10 children, so A (5 places) binds at 1km and B gets nobody.
+    cutoffs = equilibrium_cutoffs(
+        np.array([[0.0, 0.0], [2.0, 0.0]]),
+        np.array([5.0, 5.0]),
+        np.array([0.5, 0.0]),
+        np.array([[1.0, 0.0]]),
+        np.array([10.0]),
+        tau_km=0.0,
+    )
+    assert cutoffs[0] == 1.0
+    assert np.isinf(cutoffs[1])
+
+
+def test_equilibrium_logit_choice_smears_demand_across_schools():
+    # With a positive temperature, demand is shared between schools by logit
+    # choice rather than sent wholesale to a single preferred school. Both
+    # schools here are 1km from the only postcode, so neither is "nearer":
+    # each receives applications and each ends up with a binding cutoff.
+    cutoffs = equilibrium_cutoffs(
+        np.array([[0.0, 0.0], [2.0, 0.0]]),
+        np.array([4.0, 4.0]),
+        np.array([0.0, 0.0]),
+        np.array([[1.0, 0.0]]),
+        np.array([10.0]),
+        tau_km=1.0,
+    )
+    # Each school gets half the 10 children (equidistant, equal utility),
+    # exceeding both fill targets: both cutoffs bind at the postcode.
+    assert cutoffs.tolist() == [1.0, 1.0]
+
+
+def test_capacity_fill_radii_covers_fill_target_population():
+    # Unfilled school needs 6 children: postcodes at 1km (5) and 2km (5)
+    # cumulate past the target at 2km. A school needing more children than
+    # exist within the cap keeps the cap.
+    radii = capacity_fill_radii(
+        np.array([[0.0, 0.0], [0.0, 0.0]]),
+        np.array([6.0, 1000.0]),
+        np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]]),
+        np.array([5.0, 5.0, 5.0]),
+        max_radius_km=25.0,
+    )
+    assert radii.tolist() == [2.0, 25.0]
+
+
+def test_count_covering_catchments_respects_radius_and_validity():
+    pc_xy = np.array([[0.0, 0.0], [3.0, 0.0], [10.0, 0.0], [0.5, 0.0]])
+    pc_valid = np.array([True, True, True, False])
+    school_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
+    radii = np.array([4.0, 1.5])
+    counts = count_covering_catchments(pc_xy, pc_valid, school_xy, radii, 4)
+    # pc0 is inside school 0 only (school 1 is 2km away > 1.5km radius);
+    # pc1 inside both; pc2 inside neither; pc3 invalid -> 0 despite proximity.
+    assert counts.tolist() == [1, 2, 0, 0]
+
+
+def test_count_covering_catchments_empty_schools():
+    counts = count_covering_catchments(
+        np.zeros((2, 2)), np.array([True, True]), np.empty((0, 2)), np.empty(0), 2
+    )
+    assert counts.tolist() == [0, 0]
--- a/pipeline/transform/test_school_proximity.py
+++ b/pipeline/transform/test_school_proximity.py
@ -1,139 +0,0 @@
-import polars as pl
-
-from pipeline.transform.school_proximity import classify_good_plus_schools
-
-
-def _school(phase, oeif, ungraded, postcode="AA1 1AA"):
-    return {
-        "Postcode": postcode,
-        "Ofsted phase": phase,
-        "Latest OEIF overall effectiveness": oeif,
-        "Ungraded inspection overall outcome": ungraded,
-    }
-
-
-def _classify(rows):
-    result = classify_good_plus_schools(pl.DataFrame(rows))
-    return {(r["postcode"], r["category"]) for r in result.to_dicts()}
-
-
-def test_legacy_oeif_grades_1_and_2_are_kept():
-    rows = [
-        _school("Primary", "1", None, "AA1 1AA"),
-        _school("Primary", "2", None, "AA1 1AB"),
-        _school("Secondary", "1", None, "AA1 1AC"),
-        _school("Secondary", "2", None, "AA1 1AD"),
-    ]
-    assert _classify(rows) == {
-        ("AA1 1AA", "outstanding_primary"),
-        ("AA1 1AB", "good_primary"),
-        ("AA1 1AC", "outstanding_secondary"),
-        ("AA1 1AD", "good_secondary"),
-    }
-
-
-def test_grades_3_and_4_are_excluded():
-    rows = [_school("Primary", "3", None), _school("Primary", "4", None)]
-    assert _classify(rows) == set()
-
-
-def test_ungraded_remains_good_is_recovered_when_no_graded_result():
-    # Null and "Not judged" OEIF fall back to the ungraded outcome.
-    rows = [
-        _school("Primary", None, "School remains Good", "AA1 1AA"),
-        _school("Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"),
-        # "(Improving)" is still good+ ...
-        _school("Primary", None, "School remains Good (Improving) - S5 Next", "AA1 1AE"),
-    ]
-    assert _classify(rows) == {
-        ("AA1 1AA", "good_primary"),
-        ("AA1 1AB", "outstanding_secondary"),
-        ("AA1 1AE", "good_primary"),
-    }
-
-
-def test_ungraded_concerns_are_not_good_plus():
-    # "(Concerns)" outcomes signal issues warranting earlier re-inspection and
-    # must NOT be counted as good+ schools.
-    rows = [
-        _school("Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"),
-        _school(
-            "Secondary",
-            None,
-            "School remains Outstanding (Concerns) - S5 Next",
-            "AA1 1AD",
-        ),
-    ]
-    assert _classify(rows) == set()
-
-
-def test_ungraded_non_good_outcomes_are_excluded():
-    rows = [
-        _school("Primary", None, "Some aspects not as strong"),
-        _school("Primary", None, "Standards maintained"),
-        _school("Primary", None, None),
-    ]
-    assert _classify(rows) == set()
-
-
-def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
-    # A real grade 3 must not be promoted by an ungraded "remains Good".
-    rows = [_school("Primary", "3", "School remains Good")]
-    assert _classify(rows) == set()
-
-
-def test_non_primary_secondary_phases_excluded():
-    rows = [
-        _school("Nursery", "1", None),
-        _school("Not applicable", "2", None),
-    ]
-    assert _classify(rows) == set()
-
-
-def _aged_school(phase, oeif, low, high, postcode="AA1 1AA"):
-    return {
-        "Postcode": postcode,
-        "Ofsted phase": phase,
-        "Latest OEIF overall effectiveness": oeif,
-        "Ungraded inspection overall outcome": None,
-        "URN": 100000,
-        "Statutory lowest age": low,
-        "Statutory highest age": high,
-    }
-
-
-def test_all_through_school_counts_toward_both_primary_and_secondary():
-    # An all-through school (age 3-18) is labelled "Secondary" by Ofsted phase but
-    # serves primary-age children too, so it must count in BOTH metrics.
-    rows = [_aged_school("Secondary", "2", 3, 18, "AA1 1AA")]
-    assert _classify(rows) == {
-        ("AA1 1AA", "good_primary"),
-        ("AA1 1AA", "good_secondary"),
-    }
-
-
-def test_age_ranges_assign_single_phase_for_standard_schools():
-    rows = [
-        _aged_school("Primary", "1", 4, 11, "AA1 1AA"),  # primary only
-        _aged_school("Secondary", "2", 11, 16, "AA1 1AB"),  # secondary only
-        _aged_school("Secondary", "1", 9, 13, "AA1 1AC"),  # middle -> both
-    ]
-    assert _classify(rows) == {
-        ("AA1 1AA", "outstanding_primary"),
-        ("AA1 1AB", "good_secondary"),
-        ("AA1 1AC", "outstanding_primary"),
-        ("AA1 1AC", "outstanding_secondary"),
-    }
-
-
-def test_closed_schools_excluded_when_open_register_given():
-    rows = [
-        _aged_school("Primary", "1", 4, 11, "AA1 1AA"),
-        _aged_school("Secondary", "2", 11, 16, "AA1 1AB"),
-    ]
-    rows[0]["URN"] = 111
-    rows[1]["URN"] = 222
-    result = classify_good_plus_schools(pl.DataFrame(rows), open_urns={111})
-    pairs = {(r["postcode"], r["category"]) for r in result.to_dicts()}
-    # URN 222 is not in the open register, so it is dropped.
-    assert pairs == {("AA1 1AA", "outstanding_primary")}
--- a/pipeline/transform/test_transform_poi.py
+++ b/pipeline/transform/test_transform_poi.py
@ -544,6 +544,142 @@ def test_transform_grocery_dedup_drops_only_grocery_aspect(tmp_path):
    assert n2_grocery.height == 1


+def test_transform_drops_miscategorised_tags(tmp_path):
+    # Audit 2026-06-10: these tags polluted Entertainment (cycle-hire docks,
+    # slipways, marinas), Gallery (public artwork), Pharmacy (herbalists,
+    # alternative medicine), Hospital & Clinic (untyped healthcare/yes),
+    # Tourist Attraction (fountains, courthouses) and Gym & Fitness (outdoor
+    # apparatus). They must be dropped entirely.
+    dropped = [
+        "amenity/bicycle_rental",
+        "amenity/boat_rental",
+        "leisure/marina",
+        "leisure/slipway",
+        "tourism/artwork",
+        "healthcare/yes",
+        "healthcare/alternative",
+        "shop/herbalist",
+        "shop/health",
+        "amenity/fountain",
+        "amenity/courthouse",
+        "leisure/fitness_station",
+    ]
+    raw = pl.DataFrame(
+        {
+            "id": [f"n{i}" for i in range(len(dropped))],
+            "name": [f"POI {i}" for i in range(len(dropped))],
+            "category": dropped,
+            "lat": [51.50] * len(dropped),
+            "lng": [-0.10] * len(dropped),
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+
+    out = transform(**inputs).collect()
+
+    assert out.filter(pl.col("id").is_in(raw["id"].to_list())).height == 0
+
+
+def test_transform_splits_hospital_and_clinic(tmp_path):
+    raw = pl.DataFrame(
+        {
+            "id": ["n1", "n2", "n3"],
+            "name": ["St Thomas'", "Vale Surgery Annexe", "Drop-in Centre"],
+            "category": [
+                "amenity/hospital",
+                "amenity/clinic",
+                "healthcare/clinic",
+            ],
+            "lat": [51.50, 51.51, 51.52],
+            "lng": [-0.10, -0.11, -0.12],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+
+    out = transform(**inputs).collect()
+
+    assert out.filter(pl.col("id") == "n1")["category"].to_list() == ["Hospital"]
+    assert out.filter(pl.col("id") == "n2")["category"].to_list() == ["Clinic"]
+    assert out.filter(pl.col("id") == "n3")["category"].to_list() == ["Clinic"]
+    assert "Hospital & Clinic" not in out["category"].to_list()
+
+
+def test_transform_maps_chalet_to_hotel(tmp_path):
+    # Holiday-let chalets are accommodation, not Tourist Attractions.
+    raw = pl.DataFrame(
+        {
+            "id": ["n1"],
+            "name": ["Seaview Chalet"],
+            "category": ["tourism/chalet"],
+            "lat": [51.50],
+            "lng": [-0.10],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+
+    out = transform(**inputs).collect()
+
+    assert out.filter(pl.col("id") == "n1")["category"].to_list() == ["Hotel"]
+
+
+def test_transform_name_gates_track_horse_riding_fishing(tmp_path):
+    # leisure/track, leisure/horse_riding and leisure/fishing are 83-84%
+    # unnamed (anonymous tracks/gallops/fishing spots); only named public
+    # facilities survive as a Sports Centre.
+    raw = pl.DataFrame(
+        {
+            "id": ["n1", "n2", "n3", "n4"],
+            "name": [None, "", "Herne Hill Velodrome", "Royal Mews Riding School"],
+            "category": [
+                "leisure/track",
+                "leisure/fishing",
+                "leisure/track",
+                "leisure/horse_riding",
+            ],
+            "lat": [51.50, 51.51, 51.52, 51.53],
+            "lng": [-0.10, -0.11, -0.12, -0.13],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+
+    out = transform(**inputs).collect()
+
+    assert out.filter(pl.col("id").is_in(["n1", "n2"])).height == 0
+    named = out.filter(pl.col("id").is_in(["n3", "n4"]))
+    assert named["category"].to_list() == ["Sports Centre", "Sports Centre"]
+
+
+def test_transform_passes_through_tram_metro_naptan_category(tmp_path):
+    # NaPTAN now emits "Tram & Metro stop" (non-LU TMU/MET networks); it must
+    # flow through with the Public Transport group and its own emoji.
+    raw = pl.DataFrame(
+        {
+            "id": ["n1"],
+            "name": ["A Cafe"],
+            "category": ["amenity/cafe"],
+            "lat": [51.50],
+            "lng": [-0.10],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+    pl.DataFrame(
+        {
+            "id": ["naptan-1", "naptan-2"],
+            "name": ["Test Rail Station", "Weaste"],
+            "category": ["Rail station", "Tram & Metro stop"],
+            "lat": [51.51, 51.52],
+            "lng": [-0.13, -0.14],
+        }
+    ).write_parquet(inputs["naptan_path"])
+
+    out = transform(**inputs).collect()
+
+    tram = out.filter(pl.col("category") == "Tram & Metro stop")
+    assert tram.height == 1
+    assert tram["group"].to_list() == ["Public Transport"]
+    assert tram["emoji"].to_list() == ["🚊"]
+
+
 def test_transform_output_unique_per_id_category(tmp_path):
    # Soundness: the full transform() output has at most one row per
    # (id, category) overall, across every source.
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -86,6 +86,28 @@ DROP_CATEGORIES = {
    "amenity/water_point",
    "amenity/watering_place",
    "amenity/weighbridge",
+    # Boating/cycle-hire infrastructure formerly miscategorised as
+    # "Entertainment" (46% of the bucket): cycle-hire dock stations, boat
+    # ramps and moorings are not entertainment venues.
+    "amenity/bicycle_rental",
+    "amenity/boat_rental",
+    "leisure/marina",
+    "leisure/slipway",
+    # Public art (statues, murals, village signs) formerly 93% of "Gallery".
+    "tourism/artwork",
+    # Outdoor exercise apparatus (pull-up bars, trim trails) formerly inflating
+    # "Gym & Fitness".
+    "leisure/fitness_station",
+    # Untyped healthcare rows and non-pharmacy health shops formerly bucketed
+    # under "Hospital & Clinic" / "Pharmacy".
+    "healthcare/yes",
+    "healthcare/alternative",
+    "shop/herbalist",
+    "shop/health",
+    # Street fountains and courthouses formerly bucketed as
+    # "Tourist Attraction".
+    "amenity/fountain",
+    "amenity/courthouse",
    # Niche amenities not useful for home buyers
    "amenity/animal_boarding",
    "amenity/animal_breeding",
@ -373,10 +395,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "leisure/tanning_salon",
            "shop/amusements",
            "tourism/theme_park",
-            "amenity/bicycle_rental",
-            "amenity/boat_rental",
-            "leisure/marina",
-            "leisure/slipway",
+            # bicycle_rental/boat_rental/marina/slipway used to live here and
+            # made up ~46% of the bucket (cycle-hire docks, boat ramps); they
+            # are infrastructure, not entertainment venues — see DROP_CATEGORIES.
            "leisure/hackerspace",
            "leisure/yes",
        ],
@ -699,7 +720,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        "🏋️",
        [
            "leisure/fitness_centre",
-            "leisure/fitness_station",
+            # leisure/fitness_station (outdoor pull-up bars / trim-trail
+            # apparatus, ~2.5k) is not a gym — see DROP_CATEGORIES.
            "amenity/dojo",
            "amenity/dancing_school",
        ],
@ -825,28 +847,37 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "amenity/pharmacy",
            "healthcare/pharmacy",
            "shop/chemist",
-            "shop/herbalist",
-            "shop/health",
-            "healthcare/alternative",
+            # healthcare/alternative, shop/herbalist and shop/health (homeopaths,
+            # herbalists, generic "health" shops) are not dispensing pharmacies
+            # — see DROP_CATEGORIES.
+        ],
+    ),
+    # "Hospital & Clinic" used to be one bucket; an actual hospital and a small
+    # clinic are very different amenities for a homebuyer, so they are split.
+    (
+        "Health",
+        "Hospital",
+        "🏥",
+        [
+            "amenity/hospital",
+            "healthcare/hospital",
        ],
    ),
    (
        "Health",
-        "Hospital & Clinic",
-        "🏥",
+        "Clinic",
+        "🩺",
        [
-            "amenity/hospital",
            "amenity/clinic",
            "amenity/health_centre",
            "healthcare/blood_donation",
-            "healthcare/hospital",
            "healthcare/centre",
            "healthcare/clinic",
            "office/healthcare",
            "healthcare/laboratory",
            "healthcare/rehabilitation",
            "healthcare/vaccination_centre",
-            "healthcare/yes",
+            # healthcare/yes (untyped junk rows) is dropped — see DROP_CATEGORIES.
        ],
    ),
    (
@ -917,7 +948,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        "🖼️",
        [
            "tourism/gallery",
-            "tourism/artwork",
+            # tourism/artwork (statues, murals, village signs) was 93% of this
+            # bucket and is not a visitable gallery — see DROP_CATEGORIES.
        ],
    ),
    (
@ -961,9 +993,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        [
            "tourism/attraction",
            "tourism/aquarium",
-            "amenity/fountain",
-            "amenity/courthouse",
-            "tourism/chalet",
+            # amenity/fountain (street furniture) and amenity/courthouse are
+            # dropped; tourism/chalet (holiday lets) moved to "Hotel".
        ],
    ),
    # Note: schools come from the GIAS register (see transform_gias_schools).
@ -982,6 +1013,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "leisure/resort",
            "tourism/holiday_park",
            "tourism/self_catering",
+            # Holiday-let chalets are accommodation, not tourist attractions
+            # (where they previously sat).
+            "tourism/chalet",
        ],
    ),
    (
@ -1162,6 +1196,11 @@ REQUIRE_NAME_CATEGORIES = {
    "leisure/practice_pitch",
    "leisure/swimming_pool",
    "leisure/paddling_pool",
+    # 83-84% unnamed: anonymous running tracks, private gallops/paddocks and
+    # fishing spots; only named public facilities count as a Sports Centre.
+    "leisure/track",
+    "leisure/horse_riding",
+    "leisure/fishing",
 }


@ -1181,6 +1220,7 @@ NAPTAN_EMOJIS: dict[str, str] = {
    "Bus station": "🚌",
    "Taxi rank": "🚕",
    "Tube station": "🚇",
+    "Tram & Metro stop": "🚊",
 }


@ -1438,9 +1478,9 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
    (null/"Not judged", e.g. schools last seen under the post-2024 ungraded
    report-card framework) we fall back to "Ungraded inspection overall outcome"
    so genuinely good/outstanding schools aren't dropped — mirroring
-    school_proximity.classify_good_plus_schools. Remaining nulls drop out."""
+    school_catchments.classify_good_plus_schools. Remaining nulls drop out."""
    grade_col = pl.col("Latest OEIF overall effectiveness")
-    # See school_proximity: the ungraded outcome carries "School remains Good"/
+    # See school_catchments: the ungraded outcome carries "School remains Good"/
    # "School remains Outstanding" (with optional "(Concerns)"/"(Improving)"
    # suffixes) when the graded column is null/"Not judged".
    ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)