Improve data

2026-06-10 07:54:25 +01:00 · 2026-06-10 07:54:25 +01:00 · 85da1941aa
commit 85da1941aa
parent b4d66a28c1
31 changed files with 901 additions and 319 deletions
--- a/pipeline/transform/crime_spatial.py
+++ b/pipeline/transform/crime_spatial.py
@@ -273,27 +273,24 @@ def _write_avg_yr(
    for type_idx, name in enumerate(ALL_CRIME_TYPES):
        data[f"{name} (avg/yr)"] = avg[:, type_idx]

-    # Serious/Minor rollup headlines, computed the SAME way as the by-year rollup
-    # bars (_write_by_year/_rollup_long): sum the rollup's types per year, then
-    # average over the years in which ANY of those types occurred. This keeps the
-    # headline equal to the mean of the "Serious/Minor crime (by year)" bars.
-    # Summing the per-type avg/yr values instead (as the merge previously did)
-    # divides each type by its OWN years-present and overstates the rollup when a
-    # postcode's serious/minor types occur in disjoint years.
+    # Serious/Minor rollup headlines = the exact SUM of their component (avg/yr)
+    # columns, so each rollup always equals the sum of the parts shown beside it
+    # and can never fall below one of its own components. (Previously the rollup
+    # re-derived a union-years-present mean: it divided the summed counts by the
+    # number of years in which ANY component type occurred, whereas each
+    # component divides by its OWN years-present. When a postcode's serious/minor
+    # types occurred in disjoint years the union denominator was larger, so the
+    # rollup came out smaller than the sum of its parts.) The by-year rollup
+    # series in _write_by_year is likewise the per-year sum of the component
+    # bars, so headline and chart both present the rollup as the sum of its parts.
    for rollup_name, rollup_types in (
        ("Serious crime", SERIOUS_CRIME_TYPES),
        ("Minor crime", MINOR_CRIME_TYPES),
    ):
        rollup_idx = [ALL_CRIME_TYPES.index(name) for name in rollup_types]
-        rollup_counts = counts[:, rollup_idx, :].sum(axis=1)  # (n_postcodes, n_years)
-        rollup_per_year = per_year[:, rollup_idx, :].sum(axis=1)
-        rollup_years_present = np.clip(
-            (rollup_counts > 0).sum(axis=1), 1, None
-        ).astype(np.float64)
-        rollup_avg = rollup_per_year.sum(axis=1) / rollup_years_present
-        data[f"{rollup_name} (avg/yr)"] = np.round(rollup_avg * norm, 1).astype(
-            np.float32
-        )
+        data[f"{rollup_name} (avg/yr)"] = np.round(
+            avg[:, rollup_idx].sum(axis=1), 1
+        ).astype(np.float32)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    pl.DataFrame(data).write_parquet(output_path, compression="zstd")
--- a/pipeline/transform/join_epc_pp.py
+++ b/pipeline/transform/join_epc_pp.py
@@ -36,6 +36,16 @@ MIN_PRICE = 10_000
 MIN_BUILD_YEAR = 1700
 MAX_BUILD_YEAR = 2030

+# Plausibility bounds for raw EPC dimensions. EPC lodgements contain data-entry
+# errors (0 m storey heights, 116 m "interior height", 9,210 m² floor areas, 99
+# habitable rooms) that otherwise propagate verbatim into the published per-
+# property columns. Values outside these bands are nulled (treated as unknown)
+# rather than shown. Bounds are deliberately wide so only clear errors are cut.
+MIN_FLOOR_HEIGHT_M = 1.5  # below this a storey is not habitable
+MAX_FLOOR_HEIGHT_M = 6.0  # above this is a data error, not a normal storey
+MAX_TOTAL_FLOOR_AREA_M2 = 2000.0  # ~21,500 sqft; larger is a bulk/garbage record
+MAX_HABITABLE_ROOMS = 20  # dwellings above this are data errors
+

 def epc_band_to_year(band: pl.Expr) -> pl.Expr:
    """Map an EPC construction age band to a single representative build year.
@@ -132,10 +142,28 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
        )
        .filter(pl.col("epc_address").is_not_null())
        .with_columns(
-            pl.when(pl.col("number_habitable_rooms") == 0)
-            .then(None)
-            .otherwise(pl.col("number_habitable_rooms"))
+            # Null implausible EPC dimensions so data-entry errors don't reach
+            # the published per-property columns (Interior height, Total floor
+            # area, Number of bedrooms & living rooms). Treated as unknown.
+            pl.when(
+                (pl.col("number_habitable_rooms") >= 1)
+                & (pl.col("number_habitable_rooms") <= MAX_HABITABLE_ROOMS)
+            )
+            .then(pl.col("number_habitable_rooms"))
+            .otherwise(None)
            .alias("number_habitable_rooms"),
+            pl.when(
+                pl.col("floor_height").is_between(
+                    MIN_FLOOR_HEIGHT_M, MAX_FLOOR_HEIGHT_M
+                )
+            )
+            .then(pl.col("floor_height"))
+            .otherwise(None)
+            .alias("floor_height"),
+            pl.when(pl.col("total_floor_area") <= MAX_TOTAL_FLOOR_AREA_M2)
+            .then(pl.col("total_floor_area"))
+            .otherwise(None)
+            .alias("total_floor_area"),
        )
    )

--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@@ -2,6 +2,7 @@ import argparse
 import re
 import tempfile
 from dataclasses import dataclass
+from datetime import date
 from typing import Literal

 import numpy as np
@@ -30,7 +31,10 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping

 MIN_FLOOR_AREA_M2 = 10
 CONSERVATION_AREA_FEATURE = "Within conservation area"
-TREE_DENSITY_FEATURE = "Street tree density percentile"
+# Named "Tree canopy" (not "Street tree") because the underlying density unions
+# Forest Research TOW lone-tree/group crowns AND NFI woodland canopy, so a
+# woodland-edge postcode's score reflects forest canopy, not only street trees.
+TREE_DENSITY_FEATURE = "Tree canopy density percentile"
 LISTED_BUILDING_FEATURE = "Listed building"
 LISTED_BUILDING_MATCH_RADIUS_M = 250.0
 LISTED_BUILDING_NEAREST_POSTCODES = 3
@@ -528,10 +532,22 @@ def _is_planning_conservation_area_record(dataset: object) -> bool:


 def _is_current_planning_record(end_date: object) -> bool:
+    """A planning record is current when it has no end-date OR its end-date is
+    still in the future. The planning.data.gov.uk `end-date` field marks when a
+    designation is RETIRED, so a future date (e.g. 2029-12-31) is a still-current
+    area and must NOT be dropped — the previous "any non-empty date = ended"
+    logic wrongly excluded those (e.g. 22 current Gateshead conservation areas)."""
    if end_date is None:
        return True
    if isinstance(end_date, str):
-        return end_date.strip() == ""
+        text = end_date.strip()
+        if text == "":
+            return True
+        try:
+            return date.fromisoformat(text[:10]) > date.today()
+        except ValueError:
+            # Unparseable end-date: keep the record rather than silently drop it.
+            return True
    return False


@@ -706,8 +722,32 @@ def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame
    )


+def _validate_lsoa_source_coverage(iod_path: Path, ethnicity_path: Path) -> None:
+    """Fail if ethnicity (now LSOA-keyed) misses any IoD LSOA.
+
+    Ethnicity is sourced from Census 2021 TS021 at LSOA, then joined on `lsoa21`
+    like median age and IoD. The IoD table defines the LSOA universe every
+    postcode resolves into, so a missing LSOA would silently null the ethnicity
+    columns for those postcodes; require full coverage instead.
+    """
+    iod_lsoas = pl.read_parquet(
+        iod_path, columns=["LSOA code (2021)"]
+    ).rename({"LSOA code (2021)": "lsoa21"})
+
+    ethnicity_lsoas = pl.read_parquet(ethnicity_path, columns=["lsoa21"])
+    missing_ethnicity = iod_lsoas.join(
+        ethnicity_lsoas, on="lsoa21", how="anti"
+    ).sort("lsoa21")
+    if missing_ethnicity.height > 0:
+        raise ValueError(
+            "Ethnicity data is missing LSOA coverage: "
+            f"{missing_ethnicity.height} LSOAs, e.g. "
+            f"{missing_ethnicity.head(10).to_dicts()}"
+        )
+
+
 def _validate_lad_source_coverage(
-    iod_path: Path, ethnicity_path: Path, rental_prices_path: Path
+    iod_path: Path, rental_prices_path: Path
 ) -> None:
    iod_lads = (
        pl.read_parquet(
@@ -726,16 +766,6 @@ def _validate_lad_source_coverage(
        .unique(["lad"])
    )

-    ethnicity_lads = pl.read_parquet(ethnicity_path, columns=["Geography_code"]).rename(
-        {"Geography_code": "lad"}
-    )
-    missing_ethnicity = iod_lads.join(ethnicity_lads, on="lad", how="anti").sort("lad")
-    if missing_ethnicity.height > 0:
-        raise ValueError(
-            "Ethnicity data is missing 2024 LAD coverage: "
-            f"{missing_ethnicity.to_dicts()}"
-        )
-
    rental_lads = pl.read_parquet(rental_prices_path, columns=["area_code"]).rename(
        {"area_code": "lad"}
    )
@@ -849,12 +879,10 @@ def _join_area_side_tables(
    broadband: pl.LazyFrame,
 ) -> pl.LazyFrame:
    base = base.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
-    base = base.join(
-        ethnicity,
-        left_on="Local Authority District code (2024)",
-        right_on="Geography_code",
-        how="left",
-    )
+    # Ethnicity is Census 2021 TS021 at LSOA (~33,755 areas), joined on the same
+    # `lsoa21` key as median age and IoD — a ~100x granularity gain over the old
+    # Local-Authority broadcast, with no change to the 6-bucket output schema.
+    base = base.join(ethnicity, on="lsoa21", how="left")

    # Crime is counted spatially per postcode (incidents within 50m of the
    # postcode boundary), so it joins on postcode rather than LSOA. crime_spatial
@@ -1966,7 +1994,8 @@ def _build(
    """
    if mode == "listings" and actual_listings_path is None:
        raise ValueError("listings mode requires actual_listings_path")
-    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path)
+    _validate_lsoa_source_coverage(iod_path, ethnicity_path)
+    _validate_lad_source_coverage(iod_path, rental_prices_path)

    wide = pl.scan_parquet(epc_pp_path).filter(
        pl.col("total_floor_area").is_null()
@@ -2225,7 +2254,7 @@ def main():
        "--ethnicity",
        type=Path,
        required=True,
-        help="Ethnicity by local authority parquet file (optional)",
+        help="Census 2021 ethnic group (TS021) by LSOA parquet file",
    )
    parser.add_argument(
        "--crime",
--- a/pipeline/transform/postcode_boundaries/output.py
+++ b/pipeline/transform/postcode_boundaries/output.py
@@ -53,6 +53,18 @@ _OUTPUT_PRECISION_DEG = 0.000001
 # tolerance), we fatten it just enough to survive snapping rather than drop it.
 _MIN_FOOTPRINT_BUFFER_M = 0.5

+# Building-scale buffer for POINTLIKE inputs that carry no real extent. Multi-
+# dwelling (tower-block) postcodes have every UPRN geocoded to a single shared
+# coordinate, so the boundary collapses to a point; a 0.5 m buffer then yields an
+# invisible ~0.8 m² dot covering hundreds of homes. Such inputs get a ~200 m²
+# building-scale footprint instead. (Genuine thin slivers, which still carry
+# length, keep the minimal buffer.) _resolve_overlaps runs afterwards, so any
+# overlap this introduces is trimmed; a postcode shaved back to sub-grid still
+# falls through to the tiny _grid_footprint, so this can only improve the result.
+_POINT_RESCUE_BUFFER_M = 8.0
+_POINTLIKE_AREA_M2 = 1.0
+_POINTLIKE_PERIMETER_M = 4.0
+

 def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
    """Transform a BNG polygon to WGS84, snap to output precision, validate.
@@ -90,8 +102,23 @@ def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:


 def _rescue_footprint(geom_bng) -> dict | None:
-    """Fatten a degenerate BNG geometry into a representable footprint and snap."""
-    footprint = _largest_polygonal(geom_bng.buffer(_MIN_FOOTPRINT_BUFFER_M))
+    """Fatten a degenerate BNG geometry into a representable footprint and snap.
+
+    A POINTLIKE input (a point, or a near-zero-area/short-perimeter polygon — the
+    signature of a tower-block postcode whose UPRNs all share one coordinate)
+    gets a building-scale buffer so it is not reduced to an invisible sub-metre
+    dot; thin slivers that still carry length keep the minimal buffer.
+    """
+    buffer_m = _MIN_FOOTPRINT_BUFFER_M
+    try:
+        if (
+            geom_bng.area < _POINTLIKE_AREA_M2
+            and geom_bng.length < _POINTLIKE_PERIMETER_M
+        ):
+            buffer_m = _POINT_RESCUE_BUFFER_M
+    except GEOSException:
+        pass
+    footprint = _largest_polygonal(geom_bng.buffer(buffer_m))
    if footprint is None:
        return None
    return _snap_to_wgs84_geojson(footprint)
--- a/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
+++ b/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
@@ -906,6 +906,37 @@ class TestToWgs84Geojson:
        assert result is not None
        assert result["type"] == "Polygon"

+    def test_pointlike_input_gets_building_scale_footprint(self):
+        """A tower-block postcode (all UPRNs at one point) must not collapse to a
+        sub-metre dot; it gets a building-scale footprint instead."""
+        import pyproj
+        from shapely.geometry import Point, shape
+        from shapely.ops import transform as transform_geometry
+
+        to_bng = pyproj.Transformer.from_crs(
+            "EPSG:4326", "EPSG:27700", always_xy=True
+        )
+        result = to_wgs84_geojson(Point(360000, 170000))
+        assert result is not None
+        area_m2 = transform_geometry(to_bng.transform, shape(result)).area
+        assert area_m2 > 100, f"point footprint only {area_m2:.1f} m^2"
+
+    def test_thin_sliver_keeps_minimal_buffer(self):
+        """A genuine elongated sliver still carries length, so it is NOT inflated
+        to building scale — only truly pointlike inputs are."""
+        import pyproj
+        from shapely.geometry import LineString, shape
+        from shapely.ops import transform as transform_geometry
+
+        to_bng = pyproj.Transformer.from_crs(
+            "EPSG:4326", "EPSG:27700", always_xy=True
+        )
+        sliver = LineString([(360000, 170000), (360040, 170000)]).buffer(0.05)
+        result = to_wgs84_geojson(sliver)
+        assert result is not None
+        area_m2 = transform_geometry(to_bng.transform, shape(result)).area
+        assert area_m2 < 100, f"sliver inflated to {area_m2:.1f} m^2"
+
    def test_coordinates_have_limited_precision(self):
        """GeoJSON coordinates should be rounded to 6 decimal places."""
        import json
--- a/pipeline/transform/price_estimation/estimate.py
+++ b/pipeline/transform/price_estimation/estimate.py
@@ -230,11 +230,28 @@ def main():
    ).height
    print(f"  kNN blended: {n_blended:,} of {n_estimated:,} estimates")

+    # Null the absolute "Estimated current price" itself when its implied
+    # per-sqm is implausible (outside [MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM])
+    # AND the floor area is known: these come from bulk/block transfers or
+    # garbage source prices (e.g. a £207.5M "sale" on a 93 m² terrace -> a £197M
+    # estimate) and are not meaningful single-dwelling values. Previously only
+    # the derived per-sqm was nulled, leaving the absurd headline price visible.
+    _raw_est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
+    df = df.with_columns(
+        pl.when(
+            pl.col("Estimated current price").is_not_null()
+            & pl.col("Total floor area (sqm)").is_not_null()
+            & (pl.col("Total floor area (sqm)") > 0)
+            & ((_raw_est_psm < MIN_COMPARABLE_PSM) | (_raw_est_psm > MAX_COMPARABLE_PSM))
+        )
+        .then(None)
+        .otherwise(pl.col("Estimated current price"))
+        .alias("Estimated current price"),
+    )
+
    # Derive estimated price per sqm where both estimated price and floor area
-    # exist. Null out values outside the plausibility band [MIN_COMPARABLE_PSM,
-    # MAX_COMPARABLE_PSM] (the same band the kNN pool uses): extreme values come
-    # from bulk/block transactions or floor-area errors and are not meaningful
-    # per-unit prices.
+    # exist. Implausible-psm estimates are already nulled above, so the band
+    # filter here mainly guards rows with a zero or negative floor area.
    _est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
    df = df.with_columns(
        pl.when(
--- a/pipeline/transform/price_estimation/index.py
+++ b/pipeline/transform/price_estimation/index.py
@@ -25,6 +25,7 @@ from pipeline.transform.price_estimation.shrinkage import (
 )
 from pipeline.transform.price_estimation.utils import (
    CURRENT_YEAR,
+    LATEST_COMPLETE_YEAR,
    TEMPORAL_SMOOTHNESS_LAMBDA,
    TYPE_GROUPS,
    build_hedonic_features,
@@ -395,14 +396,22 @@ def build_index(
    The index is still forward-filled to CURRENT_YEAR.
    postcodes_path: if provided, lat/lon are read from this file instead of input_path.
    """
-    pairs = extract_pairs(input_path, max_year2=max_pair_year)
+    # Solve the index only on COMPLETE calendar years: exclude the partial
+    # current year, whose thin repeat-sale set yields wild betas. The index is
+    # still forward-filled/trend-extrapolated to CURRENT_YEAR below, so that
+    # year follows the established trend rather than a partial-year spike.
+    # Backtest passes a stricter max_pair_year, which is honoured.
+    estimation_cap = (
+        max_pair_year if max_pair_year is not None else LATEST_COMPLETE_YEAR + 1
+    )
+    pairs = extract_pairs(input_path, max_year2=estimation_cap)
    centroids = extract_centroids(postcodes_path or input_path)

    min_year = int(pairs["year1"].min())
    max_year = CURRENT_YEAR

    hedonic_idx = compute_hedonic_index(
-        input_path, min_year, max_year, max_sale_year=max_pair_year
+        input_path, min_year, max_year, max_sale_year=estimation_cap
    )

    # Precompute hierarchy
--- a/pipeline/transform/price_estimation/utils.py
+++ b/pipeline/transform/price_estimation/utils.py
@@ -6,6 +6,13 @@ import numpy as np
 import polars as pl

 CURRENT_YEAR = 2026
+# Latest COMPLETE calendar year. The current year's transactions are only
+# partially reported (Land Registry lags ~2-3 months), so a sector's thin
+# partial-year repeat-sale set produces wild index betas (e.g. +334% in a
+# single sector). The index is SOLVED only on complete years (<= this) and
+# forward-filled/trend-extrapolated to CURRENT_YEAR, so current-value
+# projections follow the established trend instead of a partial-year spike.
+LATEST_COMPLETE_YEAR = CURRENT_YEAR - 1
 _today = date.today()
 CURRENT_FRAC_YEAR = _today.year + (_today.month - 1) / 12

--- a/pipeline/transform/school_proximity.py
+++ b/pipeline/transform/school_proximity.py
@@ -15,11 +15,24 @@ SCHOOL_GROUPS = {
 }


-def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
+# Age thresholds for deciding which phase(s) a school serves. A school serves
+# PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age
+# children if its statutory highest age is >= 12. All-through (e.g. 3-18) and
+# middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in
+# both the primary and the secondary proximity metrics — Ofsted's coarse "Ofsted
+# phase" labels such schools as just "Secondary", which previously hid them from
+# every postcode's primary-school count.
+PRIMARY_MAX_AGE = 10
+SECONDARY_MIN_AGE = 12
+
+
+def classify_good_plus_schools(
+    ofsted: pl.DataFrame, open_urns: set[int] | None = None
+) -> pl.DataFrame:
    """Label good+/outstanding primary & secondary schools for proximity counts.

-    Derives a grade ("1" = outstanding, "2" = good) and a proximity ``category``,
-    returning a ``(postcode, category)`` frame.
+    Derives a grade ("1" = outstanding, "2" = good) and one or two proximity
+    ``category`` rows per school, returning a ``(postcode, category)`` frame.

    Schools with a recent GRADED inspection carry a 1-4 grade in "Latest OEIF
    overall effectiveness" (OEIF = the previous Ofsted Education Inspection
@@ -27,49 +40,89 @@ def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
    UNGRADED (Section 8) inspection or the post-2024 report-card framework, so
    that column is null/"Not judged" for them even when they are demonstrably
    good — their status lives in "Ungraded inspection overall outcome" ("School
-    remains Good"/"School remains Outstanding", incl. "(Concerns)"/"(Improving)"
-    variants). Filtering on the graded column alone dropped ~7,000 genuinely
-    good/outstanding schools. We fall back to the ungraded outcome, but ONLY when
-    there is no usable graded result (null/"Not judged"), so a genuine grade 3/4
-    is never overridden.
+    remains Good"/"School remains Outstanding"). Filtering on the graded column
+    alone dropped ~7,000 genuinely good/outstanding schools. We fall back to the
+    ungraded outcome, but ONLY when there is no usable graded result
+    (null/"Not judged"), so a genuine grade 3/4 is never overridden.
+
+    Outcomes flagged "(Concerns)" are NOT treated as good+: a "remains Good
+    (Concerns)" outcome signals inspectors found issues warranting an earlier
+    graded re-inspection, so marketing it as a good+ school is misleading.
+
+    Phase assignment uses the statutory age range when available (so all-through
+    and middle schools count toward BOTH primary and secondary), falling back to
+    the coarse "Ofsted phase" label when an age is null or the columns are absent.
+    When ``open_urns`` is given, schools whose URN is not in the current GIAS
+    open register are dropped so closed/merged schools are not counted.
    """
    # Cast to Utf8 so the string predicates below are well-defined even if a
    # column happens to be entirely null (read back as a Null dtype).
    oeif = pl.col("Latest OEIF overall effectiveness").cast(pl.Utf8, strict=False)
    ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
    no_usable_grade = oeif.is_null() | (oeif == "Not judged")
+    has_concern = ungraded.str.contains(r"\(Concerns\)")
+    remains_outstanding = (
+        ungraded.str.starts_with("School remains Outstanding") & ~has_concern
+    )
+    remains_good = ungraded.str.starts_with("School remains Good") & ~has_concern
    graded = (
        ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
        .with_columns(
            pl.when(oeif.is_in(["1", "2"]))
            .then(oeif)
-            .when(
-                no_usable_grade
-                & ungraded.str.starts_with("School remains Outstanding")
-            )
+            .when(no_usable_grade & remains_outstanding)
            .then(pl.lit("1"))
-            .when(no_usable_grade & ungraded.str.starts_with("School remains Good"))
+            .when(no_usable_grade & remains_good)
            .then(pl.lit("2"))
            .otherwise(None)
            .alias("_ofsted_grade")
        )
        .filter(pl.col("_ofsted_grade").is_not_null())
    )
+
+    # Drop schools no longer open (closed/merged) when the GIAS open register is
+    # provided, so stale Ofsted "latest inspection" rows are not counted.
+    if open_urns is not None and "URN" in graded.columns:
+        graded = graded.filter(pl.col("URN").is_in(list(open_urns)))
+
+    # Decide which phase(s) each school serves.
+    if {"Statutory lowest age", "Statutory highest age"} <= set(graded.columns):
+        low = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
+        high = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
+        serves_primary = (
+            pl.when(low.is_not_null())
+            .then(low <= PRIMARY_MAX_AGE)
+            .otherwise(pl.col("Ofsted phase") == "Primary")
+        )
+        serves_secondary = (
+            pl.when(high.is_not_null())
+            .then(high >= SECONDARY_MIN_AGE)
+            .otherwise(pl.col("Ofsted phase") == "Secondary")
+        )
+    else:
+        serves_primary = pl.col("Ofsted phase") == "Primary"
+        serves_secondary = pl.col("Ofsted phase") == "Secondary"
+
+    graded = graded.with_columns(
+        serves_primary.alias("_serves_primary"),
+        serves_secondary.alias("_serves_secondary"),
+    )
+
    # Good+ groups include both grade variants; outstanding groups count grade 1.
-    return graded.with_columns(
-        pl.when(pl.col("Ofsted phase") == "Primary")
-        .then(
-            pl.when(pl.col("_ofsted_grade") == "1")
-            .then(pl.lit("outstanding_primary"))
-            .otherwise(pl.lit("good_primary"))
-        )
-        .otherwise(
-            pl.when(pl.col("_ofsted_grade") == "1")
-            .then(pl.lit("outstanding_secondary"))
-            .otherwise(pl.lit("good_secondary"))
-        )
+    # A school can yield up to two rows (primary and secondary).
+    primary = graded.filter(pl.col("_serves_primary")).with_columns(
+        pl.when(pl.col("_ofsted_grade") == "1")
+        .then(pl.lit("outstanding_primary"))
+        .otherwise(pl.lit("good_primary"))
        .alias("category")
-    ).select(
+    )
+    secondary = graded.filter(pl.col("_serves_secondary")).with_columns(
+        pl.when(pl.col("_ofsted_grade") == "1")
+        .then(pl.lit("outstanding_secondary"))
+        .otherwise(pl.lit("good_secondary"))
+        .alias("category")
+    )
+    return pl.concat([primary, secondary]).select(
        pl.col("Postcode").alias("postcode"),
        "category",
    )
@@ -85,12 +138,24 @@ def main():
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
    )
+    parser.add_argument(
+        "--gias",
+        type=Path,
+        default=None,
+        help="GIAS open-school parquet; if given, only currently-open schools are counted",
+    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet path"
    )
    args = parser.parse_args()

-    ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted))
+    open_urns: set[int] | None = None
+    if args.gias is not None:
+        gias_urns = pl.read_parquet(args.gias).select("urn").to_series().drop_nulls()
+        open_urns = set(gias_urns.cast(pl.Int64, strict=False).to_list())
+        print(f"GIAS open register: {len(open_urns):,} open school URNs")
+
+    ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted), open_urns=open_urns)
    if ofsted.is_empty():
        raise ValueError("No good+ primary/secondary Ofsted schools found")

--- a/pipeline/transform/test_crime_spatial.py
+++ b/pipeline/transform/test_crime_spatial.py
@@ -252,14 +252,15 @@ def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
    assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)}


-def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
+def test_serious_rollup_avg_yr_equals_sum_of_components(tmp_path):
    # Two SERIOUS types occur in DISJOINT years for one postcode: Burglary only in
    # 2014, Robbery only in 2024 (each a single full month -> 12/yr). The headline
-    # "Serious crime (avg/yr)" must equal the mean of the "Serious crime (by year)"
-    # bars (which span the UNION of years any serious type occurred), NOT the sum
-    # of the per-type means. Summing per-type means divides each type by its OWN
-    # years-present (1 each) -> 12 + 12 = 24; the consistent rollup divides the
-    # per-year serious total by the years any serious type occurred (2) -> 12.
+    # "Serious crime (avg/yr)" must equal the SUM of its component (avg/yr) columns
+    # (Burglary 12 + Robbery 12 = 24), so the rollup is always the sum of the parts
+    # shown beside it and can never fall below a single component. (The previous
+    # union-years-present mean would have divided the per-year serious total by the
+    # 2 years any serious type occurred, giving a misleading 12 that is only
+    # half the sum of the burglary and robbery components shown beside it.)
    units = tmp_path / "units"
    _write_boundaries(
        units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
@@ -274,13 +275,16 @@ def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    avg = pl.read_parquet(output).row(0, named=True)
-    # The precomputed rollup headline exists and equals the mean of the bars (12),
-    # not the sum of the per-type avg/yr values (Burglary 12 + Robbery 12 = 24).
    assert "Serious crime (avg/yr)" in avg
    assert avg["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
    assert avg["Robbery (avg/yr)"] == pytest.approx(12.0, abs=0.05)
-    assert avg["Serious crime (avg/yr)"] == pytest.approx(12.0, abs=0.05)
+    # Rollup == sum of its component (avg/yr) columns.
+    assert avg["Serious crime (avg/yr)"] == pytest.approx(24.0, abs=0.05)
+    assert avg["Serious crime (avg/yr)"] == pytest.approx(
+        avg["Burglary (avg/yr)"] + avg["Robbery (avg/yr)"], abs=0.05
+    )

+    # The by-year rollup series remains the per-year sum of the component bars.
    serious_bars = {
        p["year"]: p["count"]
        for p in pl.read_parquet(by_year).row(0, named=True)["Serious crime (by year)"]
@@ -289,8 +293,6 @@ def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
        2014: pytest.approx(12.0, abs=0.05),
        2024: pytest.approx(12.0, abs=0.05),
    }
-    mean_of_bars = sum(serious_bars.values()) / len(serious_bars)
-    assert avg["Serious crime (avg/yr)"] == pytest.approx(mean_of_bars, abs=0.05)


 def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@@ -34,6 +34,7 @@ from pipeline.transform.merge import (
    _split_normal_outputs,
    _tree_density_by_postcode,
    _validate_lad_source_coverage,
+    _validate_lsoa_source_coverage,
    _validate_postcode_feature_output,
    _validate_property_postcodes,
 )
@@ -297,7 +298,7 @@ def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
    joined = _join_area_side_tables(
        base,
        iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
-        ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
+        ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
        crime=crime,
        median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
@@ -355,7 +356,7 @@ def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
    joined = _join_area_side_tables(
        base,
        iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
-        ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
+        ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
        crime=crime,
        median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
@@ -531,7 +532,6 @@ def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
    tmp_path,
 ) -> None:
    iod_path = tmp_path / "iod.parquet"
-    ethnicity_path = tmp_path / "ethnicity.parquet"
    rental_path = tmp_path / "rental.parquet"
    pl.DataFrame(
        {
@@ -547,19 +547,15 @@ def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
            ],
        }
    ).write_parquet(iod_path)
-    pl.DataFrame(
-        {"Geography_code": ["E08000016", "E06000053", "E09000001"]}
-    ).write_parquet(ethnicity_path)
    pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]}).write_parquet(
        rental_path
    )

-    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
+    _validate_lad_source_coverage(iod_path, rental_path)


 def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
    iod_path = tmp_path / "iod.parquet"
-    ethnicity_path = tmp_path / "ethnicity.parquet"
    rental_path = tmp_path / "rental.parquet"
    pl.DataFrame(
        {
@ -567,13 +563,41 @@ def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) ->
            "Local Authority District name (2024)": ["Barnsley"],
        }
    ).write_parquet(iod_path)
-    pl.DataFrame({"Geography_code": ["E08000016"]}).write_parquet(ethnicity_path)
    pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]}).write_parquet(
        rental_path
    )

    with pytest.raises(ValueError, match="Rental data is missing"):
-        _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
+        _validate_lad_source_coverage(iod_path, rental_path)
+
+
+def test_validate_lsoa_source_coverage_allows_full_ethnicity_coverage(
+    tmp_path,
+) -> None:
+    """Ethnicity covering every IoD LSOA (plus extras) validates cleanly."""
+    iod_path = tmp_path / "iod.parquet"
+    ethnicity_path = tmp_path / "ethnicity.parquet"
+    required_lsoas = pl.DataFrame({"LSOA code (2021)": ["E01000001", "E01000002"]})
+    required_lsoas.write_parquet(iod_path)
+    # A superset is acceptable: ethnicity can include LSOAs the IoD file lacks
+    # (e.g. property-less ones) as long as no IoD LSOA is missing from it.
+    available_lsoas = pl.DataFrame(
+        {"lsoa21": ["E01000001", "E01000002", "E01000003"]}
+    )
+    available_lsoas.write_parquet(ethnicity_path)
+
+    # The test passes when the validator returns without raising.
+    _validate_lsoa_source_coverage(iod_path, ethnicity_path)
+
+
+def test_validate_lsoa_source_coverage_rejects_missing_lsoa(tmp_path) -> None:
+    """An IoD LSOA absent from the ethnicity file must raise ValueError."""
+    iod_path = tmp_path / "iod.parquet"
+    ethnicity_path = tmp_path / "ethnicity.parquet"
+    required_lsoas = pl.DataFrame({"LSOA code (2021)": ["E01000001", "E01000002"]})
+    required_lsoas.write_parquet(iod_path)
+    # E01000002 is deliberately left out of the ethnicity source.
+    pl.DataFrame({"lsoa21": ["E01000001"]}).write_parquet(ethnicity_path)
+
+    expected_error = "Ethnicity data is missing LSOA coverage"
+    with pytest.raises(ValueError, match=expected_error):
+        _validate_lsoa_source_coverage(iod_path, ethnicity_path)


 def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
@ -1027,7 +1051,7 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
    joined = _join_area_side_tables(
        base,
        iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
-        ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
+        ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
        crime=crime,
        median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
@ -1427,7 +1451,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
            "Property type": ["Terraced", None],
            "Leasehold/Freehold": ["Leasehold", None],
            "Last known price": [500_000, None],
-            "Street tree density percentile": [42.0, 42.0],
+            "Tree canopy density percentile": [42.0, 42.0],
            # Overlay columns: row 0 is a matched listing, row 1 is unmatched, row none.
            "_actual_listing_url": ["url0", "url1"],
            "_actual_asking_price": [600_000, 700_000],
@ -1458,7 +1482,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
            "Property type": pl.Utf8,
            "Leasehold/Freehold": pl.Utf8,
            "Last known price": pl.Int64,
-            "Street tree density percentile": pl.Float32,
+            "Tree canopy density percentile": pl.Float32,
            "_actual_listing_url": pl.Utf8,
            "_actual_asking_price": pl.Int64,
            "_actual_asking_price_per_sqm": pl.Int32,
@ -1496,7 +1520,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
    assert finalized["Property type"].to_list() == ["Terraced", "Flats/Maisonettes"]
    assert finalized["Leasehold/Freehold"].to_list() == ["Freehold", "Leasehold"]
    # Postcode-level feature carried through to both matched and unmatched rows.
-    assert finalized["Street tree density percentile"].to_list() == [42.0, 42.0]
+    assert finalized["Tree canopy density percentile"].to_list() == [42.0, 42.0]
    # Match status reflects historical context availability.
    assert finalized["Historical property match status"].to_list() == [
        "matched",
@ -1524,7 +1548,7 @@ def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
            "Property type": ["Terraced", "Terraced"],
            "Leasehold/Freehold": ["Leasehold", "Leasehold"],
            "Last known price": [500_000, 480_000],
-            "Street tree density percentile": [42.0, 42.0],
+            "Tree canopy density percentile": [42.0, 42.0],
            # Same listing URL on both collapsed rows — the fan-out to fix.
            "_actual_listing_url": ["url0", "url0"],
            "_actual_asking_price": [600_000, 600_000],
@ -1555,7 +1579,7 @@ def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
            "Property type": pl.Utf8,
            "Leasehold/Freehold": pl.Utf8,
            "Last known price": pl.Int64,
-            "Street tree density percentile": pl.Float32,
+            "Tree canopy density percentile": pl.Float32,
            "_actual_listing_url": pl.Utf8,
            "_actual_asking_price": pl.Int64,
            "_actual_asking_price_per_sqm": pl.Int32,
--- a/pipeline/transform/test_school_proximity.py
+++ b/pipeline/transform/test_school_proximity.py
@ -42,7 +42,20 @@ def test_ungraded_remains_good_is_recovered_when_no_graded_result():
    rows = [
        _school("Primary", None, "School remains Good", "AA1 1AA"),
        _school("Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"),
-        # "(Concerns)"/"(Improving)" variants are still good+.
+        # "(Improving)" is still good+ ...
+        _school("Primary", None, "School remains Good (Improving) - S5 Next", "AA1 1AE"),
+    ]
+    assert _classify(rows) == {
+        ("AA1 1AA", "good_primary"),
+        ("AA1 1AB", "outstanding_secondary"),
+        ("AA1 1AE", "good_primary"),
+    }
+
+
+def test_ungraded_concerns_are_not_good_plus():
+    # "(Concerns)" outcomes signal issues warranting earlier re-inspection and
+    # must NOT be counted as good+ schools.
+    rows = [
        _school("Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"),
        _school(
            "Secondary",
@ -51,12 +64,7 @@ def test_ungraded_remains_good_is_recovered_when_no_graded_result():
            "AA1 1AD",
        ),
    ]
-    assert _classify(rows) == {
-        ("AA1 1AA", "good_primary"),
-        ("AA1 1AB", "outstanding_secondary"),
-        ("AA1 1AC", "good_primary"),
-        ("AA1 1AD", "outstanding_secondary"),
-    }
+    assert _classify(rows) == set()


 def test_ungraded_non_good_outcomes_are_excluded():
@ -80,3 +88,52 @@ def test_non_primary_secondary_phases_excluded():
        _school("Not applicable", "2", None),
    ]
    assert _classify(rows) == set()
+
+
+def _aged_school(phase, oeif, low, high, postcode="AA1 1AA"):
+    """Build a school row that carries a statutory age range.
+
+    The ungraded outcome is always None and the URN is a fixed placeholder
+    (100000). Keys are inserted in the same order on every call.
+    """
+    row = {"Postcode": postcode, "Ofsted phase": phase}
+    row["Latest OEIF overall effectiveness"] = oeif
+    row["Ungraded inspection overall outcome"] = None
+    row["URN"] = 100000
+    row["Statutory lowest age"] = low
+    row["Statutory highest age"] = high
+    return row
+
+
+def test_all_through_school_counts_toward_both_primary_and_secondary():
+    # Ofsted files an all-through school (ages 3-18) under "Secondary", yet it
+    # also teaches primary-age pupils, so BOTH metrics must include it.
+    all_through = _aged_school("Secondary", "2", 3, 18, "AA1 1AA")
+    expected = {
+        ("AA1 1AA", category) for category in ("good_primary", "good_secondary")
+    }
+    assert _classify([all_through]) == expected
+
+
+def test_age_ranges_assign_single_phase_for_standard_schools():
+    primary_only = _aged_school("Primary", "1", 4, 11, "AA1 1AA")
+    secondary_only = _aged_school("Secondary", "2", 11, 16, "AA1 1AB")
+    # A 9-13 middle school straddles both phases, so it is expected in both.
+    middle = _aged_school("Secondary", "1", 9, 13, "AA1 1AC")
+
+    classified = _classify([primary_only, secondary_only, middle])
+
+    assert classified == {
+        ("AA1 1AA", "outstanding_primary"),
+        ("AA1 1AB", "good_secondary"),
+        ("AA1 1AC", "outstanding_primary"),
+        ("AA1 1AC", "outstanding_secondary"),
+    }
+
+
+def test_closed_schools_excluded_when_open_register_given():
+    open_school = _aged_school("Primary", "1", 4, 11, "AA1 1AA")
+    open_school["URN"] = 111
+    closed_school = _aged_school("Secondary", "2", 11, 16, "AA1 1AB")
+    closed_school["URN"] = 222
+
+    classified = classify_good_plus_schools(
+        pl.DataFrame([open_school, closed_school]), open_urns={111}
+    )
+
+    # Only URN 111 is in the open register, so URN 222 must be filtered out.
+    found = set()
+    for record in classified.to_dicts():
+        found.add((record["postcode"], record["category"]))
+    assert found == {("AA1 1AA", "outstanding_primary")}
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -33,6 +33,14 @@ DROP_CATEGORIES = {
    "emergency/water_tank",
    "leisure/bleachers",
    "leisure/schoolyard",
+    # Park "furniture" / incidental features — not parks; they massively
+    # inflated the Park count (picnic_table ~15k, outdoor_seating ~5.8k).
+    "leisure/bandstand",
+    "leisure/bird_hide",
+    "leisure/firepit",
+    "leisure/outdoor_seating",
+    "leisure/picnic_table",
+    "leisure/wildlife_hide",
    "public_transport/pay_scale_area",
    "shop/taxi",
    "amenity/feeding_place",
@ -182,9 +190,13 @@ DROP_CATEGORIES = {
    "tourism/village_sign",
    "tourism/wilderness_hut",
    "tourism/yes",
-    # Public transport (from NaPTAN instead)
+    # Public transport (from NaPTAN instead). public_transport/platform is the
+    # EXCEPTION: it is mapped to "Bus stop" (see _CATEGORIES) to fill NaPTAN's
+    # authority-level bus-stop gaps (e.g. West Cumbria, North Norfolk, where
+    # NaPTAN has zero stops), then deduped against NaPTAN so covered areas keep
+    # a single stop. stop_position is left dropped to avoid double-counting the
+    # same stop (platform + stop_position).
    "public_transport/entrance",
-    "public_transport/platform",
    "public_transport/station",
    "public_transport/stop_position",
    # Education amenities — schools come from GIAS instead. OSM coverage for
@ -301,16 +313,13 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        "🌳",
        [
            "leisure/park",
+            # leisure/garden is dominated by private residential gardens (98%+
+            # unnamed); it is name-gated in transform() via REQUIRE_NAME_CATEGORIES
+            # so only named (public/notable) gardens count as a Park.
            "leisure/garden",
            "leisure/common",
            "leisure/nature_reserve",
            "leisure/dog_park",
-            "leisure/bandstand",
-            "leisure/bird_hide",
-            "leisure/firepit",
-            "leisure/outdoor_seating",
-            "leisure/picnic_table",
-            "leisure/wildlife_hide",
        ],
    ),
    (
@ -329,6 +338,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        [
            "leisure/sports_centre",
            "leisure/sports_hall",
+            # leisure/pitch (73% of the old bucket) and leisure/swimming_pool
+            # (98% unnamed = private/garden pools) are name-gated in transform()
+            # via REQUIRE_NAME_CATEGORIES so only named public facilities count.
            "leisure/pitch",
            "leisure/track",
            "leisure/golf_course",
@ -1123,8 +1135,36 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "amenity/townhall",
        ],
    ),
+    # ── Public transport (OSM supplement to NaPTAN) ──────────
+    # OSM bus platforms fill NaPTAN's authority-level coverage gaps. Same group
+    # / friendly name / emoji as the NaPTAN "Bus stop" rows so they merge into
+    # one metric; OSM platforms that duplicate a NaPTAN stop are deduped in
+    # transform() (osm_stops_near_naptan).
+    (
+        "Public Transport",
+        "Bus stop",
+        "🚏",
+        [
+            "public_transport/platform",
+        ],
+    ),
 ]

+# Raw OSM tags whose UNNAMED instances are dropped before category mapping.
+# These tags are overwhelmingly private/incidental when unnamed: a nameless
+# `leisure/garden` is a private residential garden (not a public park), and a
+# nameless `leisure/pitch`/`swimming_pool` is a school cage or back-garden pool.
+# Keeping only named instances stops them inflating Park / Sports Centre counts
+# while preserving genuinely public, notable facilities (which carry a name).
+# "Unnamed" means a null name or one that is empty after stripping whitespace;
+# membership is tested against the raw OSM key held in `category` by the
+# name-gating filter in transform().
+REQUIRE_NAME_CATEGORIES = {
+    "leisure/garden",
+    "leisure/pitch",
+    "leisure/practice_pitch",
+    "leisure/swimming_pool",
+    "leisure/paddling_pool",
+}
+
+
 # Build flat lookup: OSM category → (group, friendly_name, emoji)
 CATEGORY_MAP: dict[str, tuple[str, str, str]] = {
    osm_key: (group, name, emoji)
@ -1431,18 +1471,25 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
    )


-def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
+def transform_gias_schools(
+    gias_path: Path, ofsted_path: Path, boundary_path: Path
+) -> pl.LazyFrame:
    """Convert the GIAS register parquet into POI rows with school metadata.
    Ofsted ratings are joined by URN so each school carries its latest OEIF
    overall effectiveness grade (Outstanding/Good/Requires improvement/
-    Inadequate/Not judged), surfaced in the map popup."""
+    Inadequate/Not judged), surfaced in the map popup.
+
+    Clipped to England (like NaPTAN/GEOLYTIX) because the GIAS register is
+    GB-wide, so ~1,400 Welsh/Scottish/IoM schools would otherwise leak into the
+    England-only Education layer (and depress apparent Ofsted coverage, since
+    Wales is inspected by Estyn, not Ofsted)."""
    icon_category_expr = _school_icon_category_expr()
    emoji_expr = icon_category_expr.replace_strict(SCHOOL_ICON_CATEGORIES)
    ofsted = _load_ofsted_ratings(ofsted_path)
    # category mirrors icon_category so the dashboard renders one toggle per
    # school type (Nursery / Primary / Secondary / Sixth form / University /…)
    # instead of bundling every GIAS row under a single "School" pill.
-    return (
+    schools = (
        pl.scan_parquet(gias_path)
        .join(ofsted, on="urn", how="left")
        .select(
@ -1477,7 +1524,14 @@ def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
            pl.col("head_name").alias("school_head_name"),
            pl.col("ofsted_rating").alias("school_ofsted_rating"),
        )
+        .collect()
    )
+    mask = in_england_mask(
+        boundary_path,
+        schools["lat"].to_numpy(),
+        schools["lng"].to_numpy(),
+    )
+    return schools.filter(pl.Series(mask)).lazy()


 # OSM convenience-format stores that GEOLYTIX also covers (Tesco Express,
@ -1511,6 +1565,45 @@ def _significant_tokens(name: str | None) -> set[str]:
    return tokens


+# OSM bus platforms are added to "Bus stop" to fill NaPTAN's authority-level
+# gaps. Where NaPTAN already has a stop within this radius the area is covered,
+# so the colocated OSM platform is dropped to avoid double-counting; OSM
+# platforms with no nearby NaPTAN stop (the gaps) are kept.
+BUS_STOP_DEDUP_RADIUS_M = 50.0
+
+
+def osm_stops_near_naptan(
+    osm_stops: pl.DataFrame,
+    naptan_stops: pl.DataFrame,
+    radius_m: float = BUS_STOP_DEDUP_RADIUS_M,
+) -> list[str]:
+    """Return OSM bus-stop ids within ``radius_m`` of any NaPTAN bus stop.
+
+    Purely spatial (no name match): in NaPTAN-covered areas the OSM platform is
+    a duplicate and is dropped; only OSM platforms that fill a NaPTAN gap (no
+    NaPTAN stop within the radius) survive.
+
+    ``osm_stops`` needs ``id``/``lat``/``lng``; ``naptan_stops`` only needs
+    ``lat``/``lng``. The radius is inclusive and ids come back in ``osm_stops``
+    row order. An empty list is returned when either frame is empty.
+    """
+    if osm_stops.is_empty() or naptan_stops.is_empty():
+        return []
+
+    from scipy.spatial import cKDTree
+
+    n_lat = naptan_stops["lat"].to_numpy().astype(float)
+    n_lng = naptan_stops["lng"].to_numpy().astype(float)
+    o_lat = osm_stops["lat"].to_numpy().astype(float)
+    o_lng = osm_stops["lng"].to_numpy().astype(float)
+    o_ids = osm_stops["id"].to_list()
+
+    # Local equirectangular projection to metres. A single cos(latitude)
+    # factor, taken at the mean latitude of all points, scales longitude so
+    # both point sets share one planar frame for the KD-tree distance.
+    mean_lat = float(np.mean(np.concatenate([n_lat, o_lat])))
+    cos_lat = float(np.cos(np.radians(mean_lat)))
+
+    def to_metres(lat: np.ndarray, lng: np.ndarray) -> np.ndarray:
+        # Columns are (x, y): metres east and metres north.
+        return np.column_stack([lng * cos_lat * 111_320.0, lat * 110_540.0])
+
+    tree = cKDTree(to_metres(n_lat, n_lng))
+    # k=1: distance from each OSM stop to its single nearest NaPTAN stop.
+    nearest_m, _ = tree.query(to_metres(o_lat, o_lng), k=1)
+    return [osm_id for osm_id, d in zip(o_ids, nearest_m) if d <= radius_m]
+
+
 def osm_groceries_colocated_with_geolytix(
    osm_groceries: pl.DataFrame,
    geolytix: pl.DataFrame,
@ -1601,6 +1694,19 @@ def transform(
    # Drop unwanted categories
    lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))

+    # Drop UNNAMED instances of private-dominated tags (gardens, pitches,
+    # pools) so they don't inflate Park / Sports Centre proximity counts. Done
+    # while `category` still holds the raw OSM key, before the friendly mapping.
+    lf = lf.filter(
+        ~(
+            pl.col("category").is_in(list(REQUIRE_NAME_CATEGORIES))
+            & (
+                pl.col("name").is_null()
+                | (pl.col("name").cast(pl.String).str.strip_chars() == "")
+            )
+        )
+    )
+
    # Build lookup expressions from the 3-tuple mapping
    group_mapping = {k: v[0] for k, v in CATEGORY_MAP.items()}
    name_mapping = {k: v[1] for k, v in CATEGORY_MAP.items()}
@ -1665,11 +1771,37 @@ def transform(
            ~((pl.col("group") == "Groceries") & pl.col("id").is_in(duplicate_ids))
        )

+    # Drop OSM bus platforms that duplicate a NaPTAN bus stop, so the OSM
+    # supplement only adds stops in NaPTAN's coverage gaps (no double-count in
+    # covered areas). OSM bus stops carry id-prefix "n"/"a" so they never clash
+    # with NaPTAN ATCO ids.
+    osm_bus_stops = (
+        lf.filter((pl.col("group") == "Public Transport") & (pl.col("category") == "Bus stop"))
+        .select("id", "lat", "lng")
+        .collect(engine="streaming")
+    )
+    naptan_bus_stops = naptan_df.filter(pl.col("category") == "Bus stop")
+    covered_bus_ids = osm_stops_near_naptan(osm_bus_stops, naptan_bus_stops)
+    kept_osm = osm_bus_stops.height - len(covered_bus_ids)
+    print(
+        f"OSM bus platforms: {osm_bus_stops.height:,} total, dropping "
+        f"{len(covered_bus_ids):,} that duplicate a NaPTAN stop, keeping "
+        f"{kept_osm:,} to fill NaPTAN gaps"
+    )
+    if covered_bus_ids:
+        lf = lf.filter(
+            ~(
+                (pl.col("group") == "Public Transport")
+                & (pl.col("category") == "Bus stop")
+                & pl.col("id").is_in(covered_bus_ids)
+            )
+        )
+
    frames = [
        lf,
        naptan,
        grocery_pois.lazy(),
-        transform_gias_schools(gias_path, ofsted_path),
+        transform_gias_schools(gias_path, ofsted_path, boundary_path),
    ]

    return pl.concat(frames, how="diagonal_relaxed")