Improve data

2026-06-10 07:54:25 +01:00 · 2026-06-10 07:54:25 +01:00 · 85da1941aa
commit 85da1941aa
parent b4d66a28c1
31 changed files with 901 additions and 319 deletions
--- a/pipeline/download/ethnicity.py
+++ b/pipeline/download/ethnicity.py
@ -1,4 +1,22 @@
+"""Download Census 2021 ethnic group (TS021) by LSOA.
+
+Downloads the 20-category ethnic-group breakdown (TS021, classification
+C2021_ETH_20) from the NOMIS API at LSOA 2021 granularity, folds the 19 detailed
+leaf categories into our 6 output buckets, and emits one row per LSOA with the
+percentage in each bucket.
+
+Sourcing at LSOA (~33,755 England areas) rather than Local Authority (~319) is a
+~100x granularity gain with no change to the 6-bucket output schema: two very
+different neighbourhoods in one borough no longer share an identical ethnicity
+profile. The join key downstream (merge.py) is `lsoa21`, the same key already
+used for median age and IoD.
+
+Source: NOMIS (ONS Census 2021 — TS021 dataset, NM_2041_1)
+License: Open Government Licence v3.0
+"""
+
 import argparse
+from io import BytesIO
 from pathlib import Path

 import httpx
@ -6,143 +24,168 @@ import polars as pl

 pl.Config.set_tbl_cols(-1)

+# NOMIS API: Census 2021 TS021 (ethnic group, 20 categories) by LSOA 2021
+# (TYPE151). c2021_eth_20=1..19 selects the 19 detailed leaf categories
+# (excluding the 5 broad aggregates 1001-1005 and the 0 = Total, which we
+# re-derive ourselves). measures=20100 selects the absolute count.
+BASE_URL = (
+    "https://www.nomisweb.co.uk/api/v01/dataset/NM_2041_1.data.csv"
+    "?geography=TYPE151"
+    "&c2021_eth_20=1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19"
+    "&measures=20100"
+    "&select=GEOGRAPHY_CODE,C2021_ETH_20_NAME,OBS_VALUE"
+)
+PAGE_SIZE = 25000

-URL = "https://www.ethnicity-facts-figures.service.gov.uk/uk-population-by-ethnicity/national-and-regional-populations/regional-ethnic-diversity/latest/downloads/population-by-ethnicity-and-local-authority-2021.csv"
-
-GEOGRAPHY_CODE_REPLACEMENTS = {
-    # 2023 Cumberland unitary authority
-    "E07000026": "E06000063",  # Allerdale
-    "E07000028": "E06000063",  # Carlisle
-    "E07000029": "E06000063",  # Copeland
-    # 2023 Westmorland and Furness unitary authority
-    "E07000027": "E06000064",  # Barrow-in-Furness
-    "E07000030": "E06000064",  # Eden
-    "E07000031": "E06000064",  # South Lakeland
-    # 2023 North Yorkshire unitary authority
-    "E07000163": "E06000065",  # Craven
-    "E07000164": "E06000065",  # Hambleton
-    "E07000165": "E06000065",  # Harrogate
-    "E07000166": "E06000065",  # Richmondshire
-    "E07000167": "E06000065",  # Ryedale
-    "E07000168": "E06000065",  # Scarborough
-    "E07000169": "E06000065",  # Selby
-    # 2023 Somerset unitary authority
-    "E07000187": "E06000066",  # Mendip
-    "E07000188": "E06000066",  # Sedgemoor
-    "E07000189": "E06000066",  # South Somerset
-    "E07000246": "E06000066",  # Somerset West and Taunton
+# Map the 19 detailed NOMIS C2021_ETH_20 leaf categories to our 6 output groups.
+# The split mirrors the previous Local-Authority source exactly:
+#   * "Other Asian" routes to East/SE Asian (not South Asian). The ONS "Other
+#     Asian" bucket is predominantly East/Southeast Asian (Filipino, Vietnamese,
+#     Thai, Japanese, Korean, ...) rather than South Asian, so routing it here
+#     avoids inflating "% South Asian". The split is approximate (the bucket also
+#     holds some South Asian groups such as Sri Lankan/Nepalese).
+GROUP_MAP = {
+    # White
+    "White: English, Welsh, Scottish, Northern Irish or British": "White",
+    "White: Irish": "White",
+    "White: Gypsy or Irish Traveller": "White",
+    "White: Roma": "White",
+    "White: Other White": "White",
+    # South Asian
+    "Asian, Asian British or Asian Welsh: Indian": "South Asian",
+    "Asian, Asian British or Asian Welsh: Pakistani": "South Asian",
+    "Asian, Asian British or Asian Welsh: Bangladeshi": "South Asian",
+    # East / Southeast Asian: one bucket; label must match OUTPUT_GROUPS exactly
+    "Asian, Asian British or Asian Welsh: Chinese": "East/SE Asian",
+    "Asian, Asian British or Asian Welsh: Other Asian": "East/SE Asian",
+    # Black
+    "Black, Black British, Black Welsh, Caribbean or African: African": "Black",
+    "Black, Black British, Black Welsh, Caribbean or African: Caribbean": "Black",
+    "Black, Black British, Black Welsh, Caribbean or African: Other Black": "Black",
+    # Mixed
+    "Mixed or Multiple ethnic groups: White and Asian": "Mixed",
+    "Mixed or Multiple ethnic groups: White and Black African": "Mixed",
+    "Mixed or Multiple ethnic groups: White and Black Caribbean": "Mixed",
+    "Mixed or Multiple ethnic groups: Other Mixed or Multiple ethnic groups": "Mixed",
+    # Other
+    "Other ethnic group: Arab": "Other",
+    "Other ethnic group: Any other ethnic group": "Other",
 }

+# The 6 output groups, in a fixed order so the largest-remainder rounding below
+# is deterministic regardless of pivot column ordering.
+OUTPUT_GROUPS = ["White", "South Asian", "East/SE Asian", "Black", "Mixed", "Other"]
+assert set(GROUP_MAP.values()) == set(OUTPUT_GROUPS), (
+    "GROUP_MAP values must be exactly the OUTPUT_GROUPS"
+)
+

 def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame:
-    # Use the detailed 19+1 breakdown to get sub-categories for Asian ethnicity,
-    # then aggregate back to the broad groups plus a South Asian / East/SE Asian
-    # split (Indian/Pakistani/Bangladeshi vs Chinese + other East/SE Asian).
-    detailed = df.filter(
-        (pl.col("Ethnicity_type") == "ONS 2021 19+1") & (pl.col("Ethnicity") != "All")
+    """Fold the 19 NOMIS leaf categories into 6-bucket percentages per LSOA.
+
+    `df` is the long-format NOMIS download with columns GEOGRAPHY_CODE,
+    C2021_ETH_20_NAME (the detailed leaf label) and OBS_VALUE (a count). A
+    missing/extra/relabelled leaf category would silently drop people from the
+    denominator, so we validate the category set against GROUP_MAP first and
+    fail loudly otherwise.
+    """
+    found = set(df["C2021_ETH_20_NAME"].unique().to_list())
+    expected = set(GROUP_MAP)
+    if found != expected:
+        missing = sorted(expected - found)
+        unexpected = sorted(found - expected)
+        raise ValueError(
+            "Census ethnic-group categories do not match the expected NOMIS "
+            "TS021 C2021_ETH_20 leaf set.\n"
+            f"  expected {len(expected)} categories, found {len(found)}\n"
+            f"  missing:    {missing}\n"
+            f"  unexpected: {unexpected}\n"
+            "Refusing to compute percentages against an unrecognised breakdown."
+        )
+
+    # Map each leaf to its output group and sum counts per (LSOA, group). Summing
+    # counts (not rounded percentages) keeps the denominator exact.
+    grouped = (
+        df.with_columns(
+            pl.col("C2021_ETH_20_NAME").replace_strict(GROUP_MAP).alias("group"),
+            pl.col("OBS_VALUE").cast(pl.Float64, strict=False).alias("_count"),
+        )
+        .group_by("GEOGRAPHY_CODE", "group")
+        .agg(pl.col("_count").sum())
+    )
+    wide = grouped.pivot(on="group", index="GEOGRAPHY_CODE", values="_count").rename(
+        {"GEOGRAPHY_CODE": "lsoa21"}
    )

-    # Map detailed categories to our output groups
-    group_map = {
-        # White
-        "White British": "White",
-        "White Irish": "White",
-        "Gypsy Or Irish Traveller": "White",
-        "Roma": "White",
-        "Any Other White Background": "White",
-        # South Asian
-        "Indian": "South Asian",
-        "Pakistani": "South Asian",
-        "Bangladeshi": "South Asian",
-        # East / Southeast Asian. The ONS "Any Other Asian Background" bucket is
-        # predominantly East/Southeast Asian (Filipino, Vietnamese, Thai,
-        # Japanese, Korean, ...) rather than South Asian, so route it here rather
-        # than inflating "% South Asian". The split is approximate (the ONS
-        # bucket also holds some South Asian groups such as Sri Lankan/Nepalese).
-        "Chinese": "East/SE Asian",
-        "Any Other Asian Background": "East/SE Asian",
-        # Black
-        "Black African": "Black",
-        "Black Caribbean": "Black",
-        "Any Other Black Background": "Black",
-        # Mixed
-        "Mixed White And Asian": "Mixed",
-        "Mixed White And Black African": "Mixed",
-        "Mixed White And Black Caribbean": "Mixed",
-        "Any Other Mixed/Multiple Ethnic Background": "Mixed",
-        # Other
-        "Arab": "Other",
-        "Any Other Ethnic Background": "Other",
-    }
+    # A group with no people in an LSOA is absent from the long rows, so the pivot
+    # leaves a null; treat it as 0 before normalising.
+    wide = wide.with_columns(pl.col(OUTPUT_GROUPS).fill_null(0.0))

-    detailed = detailed.with_columns(
-        pl.col("Ethnicity").replace_strict(group_map).alias("group"),
-        pl.col("Geography_code")
-        .replace(GEOGRAPHY_CODE_REPLACEMENTS)
-        .alias("output_geography_code"),
-        pl.col("Ethnic Population").cast(pl.Float64, strict=False).alias("_population"),
-    )
-
-    # Sum counts, not rounded percentages, so old districts can be safely
-    # recombined into their current unitary authorities.
-    grouped = detailed.group_by("output_geography_code", "group").agg(
-        pl.col("_population").sum()
-    )
-    wide = grouped.pivot(
-        on="group", index="output_geography_code", values="_population"
-    ).rename({"output_geography_code": "Geography_code"})
-
-    # Normalize so each row sums to exactly 100%, then round using largest-remainder
-    # method to preserve the sum. Independent rounding of 6 values can drift ±0.3.
-    group_cols = [c for c in wide.columns if c != "Geography_code"]
-    row_total = sum(pl.col(c) for c in group_cols)
-    # Scale each group so they sum to exactly 100
+    # Normalize so each row sums to exactly 100%, then round with the
+    # largest-remainder method to preserve the sum. Independent rounding of 6
+    # values can drift +/-0.3.
+    row_total = sum(pl.col(c) for c in OUTPUT_GROUPS)
    wide = wide.with_columns(
-        [(pl.col(c) / row_total * 100.0).alias(c) for c in group_cols]
+        [(pl.col(c) / row_total * 100.0).alias(c) for c in OUTPUT_GROUPS]
    )
-    # Round to 1 decimal, then adjust the largest group to absorb residual
-    rounded_cols = [pl.col(c).round(1).alias(c) for c in group_cols]
-    wide = wide.with_columns(rounded_cols)
-    rounded_sum = sum(pl.col(c) for c in group_cols)
+    # Round to 1 decimal, then adjust the largest group to absorb the residual.
+    wide = wide.with_columns([pl.col(c).round(1).alias(c) for c in OUTPUT_GROUPS])
+    rounded_sum = sum(pl.col(c) for c in OUTPUT_GROUPS)
    residual = (100.0 - rounded_sum).round(1)
-    # Find which group is largest per row and add the residual there
-    largest_col = pl.concat_list(group_cols).list.arg_max()
+    largest_col = pl.concat_list(OUTPUT_GROUPS).list.arg_max()
    wide = wide.with_columns(
        [
            pl.when(largest_col == i)
            .then(pl.col(c) + residual)
            .otherwise(pl.col(c))
            .alias(c)
-            for i, c in enumerate(group_cols)
+            for i, c in enumerate(OUTPUT_GROUPS)
        ]
    )

-    # Rename columns to be descriptive
-    rename_map = {col: f"% {col}" for col in wide.columns if col != "Geography_code"}
-    wide = wide.rename(rename_map)
-    return wide
+    rename_map = {col: f"% {col}" for col in OUTPUT_GROUPS}
+    return wide.rename(rename_map)


 def download_and_convert(output_path: Path) -> None:
-    print("Downloading ethnicity data...")
-    response = httpx.get(URL, follow_redirects=True, timeout=60)
-    response.raise_for_status()
+    print("Downloading Census 2021 ethnic group (TS021) by LSOA from NOMIS...")
+    frames = []
+    offset = 0
+    while True:
+        url = f"{BASE_URL}&recordoffset={offset}"
+        response = httpx.get(url, follow_redirects=True, timeout=120)
+        response.raise_for_status()
+        if len(response.content) == 0:
+            break
+        chunk = pl.read_csv(BytesIO(response.content))
+        if chunk.height == 0:
+            break
+        frames.append(chunk)
+        print(f"  Fetched {chunk.height} rows (offset={offset})")
+        if chunk.height < PAGE_SIZE:
+            break
+        offset += PAGE_SIZE

-    df = pl.read_csv(response.content)
-    print(f"Raw shape: {df.head(100)}")
+    df = pl.concat(frames)
+    print(f"Total rows: {df.height}")
+
+    # Filter to England only (E-prefixed LSOA codes); the merge joins on the
+    # English postcode universe and the IoD coverage check is England-wide.
+    df = df.filter(pl.col("GEOGRAPHY_CODE").str.starts_with("E"))

    wide = _ethnicity_percentages(df)

-    print(f"Output shape: {wide.shape}")
+    print(f"England LSOAs: {wide.height}")
    print(f"Columns: {wide.columns}")

+    output_path.parent.mkdir(parents=True, exist_ok=True)
    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")


 def main() -> None:
    parser = argparse.ArgumentParser(
-        description="Download and convert ethnicity by local authority data"
+        description="Download Census 2021 ethnic group (TS021) by LSOA"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
--- a/pipeline/download/gias.py
+++ b/pipeline/download/gias.py
@ -192,6 +192,10 @@ def _read_csv_from_zip(zip_bytes: bytes) -> pl.DataFrame:
        infer_schema_length=20000,
        null_values=_NULL_VALUES,
        truncate_ragged_lines=True,
+        # Force the phone number to stay a string: schema inference reads it as
+        # an integer and strips the leading 0 (e.g. 020 8427 7222 -> 2084277222),
+        # making nearly every school phone number un-diallable.
+        schema_overrides={"TelephoneNum": pl.String},
    )


--- a/pipeline/download/test_ethnicity.py
+++ b/pipeline/download/test_ethnicity.py
@ -1,65 +1,118 @@
 import polars as pl
+import pytest

-from pipeline.download.ethnicity import _ethnicity_percentages
+from pipeline.download.ethnicity import GROUP_MAP, OUTPUT_GROUPS, _ethnicity_percentages


-def test_ethnicity_percentages_recombines_predecessor_lads_by_population():
-    rows = []
-    for code, white, indian in [
-        ("E07000026", 80, 20),
-        ("E07000028", 10, 90),
-    ]:
-        total = white + indian
-        rows.extend(
-            [
-                {
-                    "Geography_code": code,
-                    "Ethnicity_type": "ONS 2021 19+1",
-                    "Ethnicity": "White British",
-                    "Ethnic Population": white,
-                    "Value1": white / total * 100,
-                },
-                {
-                    "Geography_code": code,
-                    "Ethnicity_type": "ONS 2021 19+1",
-                    "Ethnicity": "Indian",
-                    "Ethnic Population": indian,
-                    "Value1": indian / total * 100,
-                },
-            ]
-        )
+def _long_rows(geo: str, counts: dict[str, int]) -> list[dict]:
+    """Build NOMIS-shaped long rows for one LSOA from {leaf_label: count}.

-    result = _ethnicity_percentages(pl.DataFrame(rows))
-
-    cumberland = result.filter(pl.col("Geography_code") == "E06000063")
-    assert cumberland.select("% White", "% South Asian").to_dicts() == [
-        {"% White": 45.0, "% South Asian": 55.0}
-    ]
-
-
-def test_ethnicity_routes_any_other_asian_to_east_se_asian():
-    """'Any Other Asian Background' and 'Chinese' both fold into '% East/SE Asian'
-    (not '% South Asian'), fixing the East/SE Asian undercount."""
-    rows = [
+    Every one of the 19 leaf categories must be present in the download (NOMIS
+    emits a 0-count row when an LSOA has none), so categories not given default
+    to 0 to mirror that.
+    """
+    return [
        {
-            "Geography_code": "E06000001",
-            "Ethnicity_type": "ONS 2021 19+1",
-            "Ethnicity": ethnicity,
-            "Ethnic Population": pop,
-            "Value1": 0.0,
+            "GEOGRAPHY_CODE": geo,
+            "C2021_ETH_20_NAME": label,
+            "OBS_VALUE": counts.get(label, 0),
        }
-        for ethnicity, pop in [
-            ("Chinese", 30),
-            ("Any Other Asian Background", 20),
-            ("Indian", 50),
-        ]
+        for label in GROUP_MAP
    ]

-    result = _ethnicity_percentages(pl.DataFrame(rows))
-    area = result.filter(pl.col("Geography_code") == "E06000001")
+
+def test_ethnicity_percentages_keyed_by_lsoa_with_six_buckets():
+    df = pl.DataFrame(
+        _long_rows(
+            "E01000001",
+            {
+                "White: English, Welsh, Scottish, Northern Irish or British": 60,
+                "White: Other White": 10,
+                "Asian, Asian British or Asian Welsh: Indian": 20,
+                "Black, Black British, Black Welsh, Caribbean or African: African": 10,
+            },
+        )
+    )
+
+    result = _ethnicity_percentages(df)
+
+    assert result.columns[0] == "lsoa21"
+    assert set(result.columns) == {"lsoa21", *(f"% {g}" for g in OUTPUT_GROUPS)}
+    row = result.filter(pl.col("lsoa21") == "E01000001").to_dicts()[0]
+    assert row["% White"] == 70.0
+    assert row["% South Asian"] == 20.0
+    assert row["% Black"] == 10.0
+    # Percentages always sum to exactly 100 (largest-remainder rounding).
+    assert round(sum(row[f"% {g}"] for g in OUTPUT_GROUPS), 1) == 100.0
+
+
+def test_ethnicity_routes_other_asian_to_east_se_asian():
+    """'Other Asian' and 'Chinese' both fold into '% East/SE Asian' (not
+    '% South Asian'), preserving the East/SE Asian split from the LAD source."""
+    df = pl.DataFrame(
+        _long_rows(
+            "E01000002",
+            {
+                "Asian, Asian British or Asian Welsh: Chinese": 30,
+                "Asian, Asian British or Asian Welsh: Other Asian": 20,
+                "Asian, Asian British or Asian Welsh: Indian": 50,
+            },
+        )
+    )
+
+    result = _ethnicity_percentages(df)
+    area = result.filter(pl.col("lsoa21") == "E01000002")

    assert "% East/SE Asian" in result.columns
    assert "% East Asian" not in result.columns
    assert area.select("% East/SE Asian", "% South Asian").to_dicts() == [
        {"% East/SE Asian": 50.0, "% South Asian": 50.0}
    ]
+
+
+def test_ethnicity_percentages_independent_per_lsoa():
+    """Two LSOAs get independent profiles — the LSOA granularity is the point."""
+    df = pl.concat(
+        [
+            pl.DataFrame(
+                _long_rows(
+                    "E01000010",
+                    {"White: Other White": 100},
+                )
+            ),
+            pl.DataFrame(
+                _long_rows(
+                    "E01000011",
+                    {"Asian, Asian British or Asian Welsh: Pakistani": 100},
+                )
+            ),
+        ]
+    )
+
+    result = _ethnicity_percentages(df).sort("lsoa21")
+
+    assert result["% White"].to_list() == [100.0, 0.0]
+    assert result["% South Asian"].to_list() == [0.0, 100.0]
+
+
+def test_ethnicity_percentages_rejects_unexpected_category():
+    rows = _long_rows("E01000003", {"White: Other White": 10})
+    rows.append(
+        {
+            "GEOGRAPHY_CODE": "E01000003",
+            "C2021_ETH_20_NAME": "White: A Brand New Census Category",
+            "OBS_VALUE": 5,
+        }
+    )
+
+    with pytest.raises(ValueError, match="do not match the expected"):
+        _ethnicity_percentages(pl.DataFrame(rows))
+
+
+def test_ethnicity_percentages_rejects_missing_category():
+    # Drop one leaf entirely: its people would vanish from the denominator.
+    rows = [r for r in _long_rows("E01000004", {"White: Other White": 10}) if
+            r["C2021_ETH_20_NAME"] != "Other ethnic group: Arab"]
+
+    with pytest.raises(ValueError, match="missing"):
+        _ethnicity_percentages(pl.DataFrame(rows))
--- a/pipeline/download/transit_network.py
+++ b/pipeline/download/transit_network.py
@ -1011,11 +1011,6 @@ def main() -> None:
        action="store_true",
        help="Skip TfL TransXChange download and conversion",
    )
-    parser.add_argument(
-        "--skip-national-rail",
-        action="store_true",
-        help="Skip National Rail CIF download and conversion",
-    )
    args = parser.parse_args()

    output_dir: Path = args.output
@ -1039,13 +1034,20 @@ def main() -> None:
        download_tfl_transxchange(raw_dir)
        convert_tfl_to_gtfs(raw_dir, output_dir)

-    # 3. National Rail CIF → GTFS
-    if args.skip_national_rail:
-        print("Skipping National Rail (--skip-national-rail)")
-    else:
-        cif = download_national_rail_cif(raw_dir)
-        if cif is not None:
-            convert_national_rail_to_gtfs(raw_dir, output_dir)
+    # 3. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
+    # reach the ~2,725 railway-station destinations, so a bus/TfL-only network
+    # silently overstates every train commute. Missing credentials are a HARD
+    # error, so a rail-less network can never ship.
+    cif = download_national_rail_cif(raw_dir)
+    if cif is None:
+        raise RuntimeError(
+            "National Rail timetable was not downloaded — set "
+            "NATIONAL_RAIL_EMAIL / NATIONAL_RAIL_PASSWORD (register free at "
+            "https://opendata.nationalrail.co.uk/). National Rail heavy rail is "
+            "required; without it the transit network models every train journey "
+            "as bus-only and overstates commute times."
+        )
+    convert_national_rail_to_gtfs(raw_dir, output_dir)

    # Summary
    print()
--- a/pipeline/transform/crime_spatial.py
+++ b/pipeline/transform/crime_spatial.py
@ -273,27 +273,24 @@ def _write_avg_yr(
    for type_idx, name in enumerate(ALL_CRIME_TYPES):
        data[f"{name} (avg/yr)"] = avg[:, type_idx]

-    # Serious/Minor rollup headlines, computed the SAME way as the by-year rollup
-    # bars (_write_by_year/_rollup_long): sum the rollup's types per year, then
-    # average over the years in which ANY of those types occurred. This keeps the
-    # headline equal to the mean of the "Serious/Minor crime (by year)" bars.
-    # Summing the per-type avg/yr values instead (as the merge previously did)
-    # divides each type by its OWN years-present and overstates the rollup when a
-    # postcode's serious/minor types occur in disjoint years.
+    # Serious/Minor rollup headlines = the exact SUM of their component (avg/yr)
+    # columns, so each rollup always equals the sum of the parts shown beside it
+    # and can never fall below one of its own components. (Previously the rollup
+    # re-derived a union-years-present mean: it divided the summed counts by the
+    # number of years in which ANY component type occurred, whereas each
+    # component divides by its OWN years-present. When a postcode's serious/minor
+    # types occurred in disjoint years the union denominator was larger, so the
+    # rollup came out smaller than the sum of its parts.) The by-year rollup
+    # series in _write_by_year is likewise the per-year sum of the component
+    # bars, so headline and chart both present the rollup as the sum of its parts.
    for rollup_name, rollup_types in (
        ("Serious crime", SERIOUS_CRIME_TYPES),
        ("Minor crime", MINOR_CRIME_TYPES),
    ):
        rollup_idx = [ALL_CRIME_TYPES.index(name) for name in rollup_types]
-        rollup_counts = counts[:, rollup_idx, :].sum(axis=1)  # (n_postcodes, n_years)
-        rollup_per_year = per_year[:, rollup_idx, :].sum(axis=1)
-        rollup_years_present = np.clip(
-            (rollup_counts > 0).sum(axis=1), 1, None
-        ).astype(np.float64)
-        rollup_avg = rollup_per_year.sum(axis=1) / rollup_years_present
-        data[f"{rollup_name} (avg/yr)"] = np.round(rollup_avg * norm, 1).astype(
-            np.float32
-        )
+        data[f"{rollup_name} (avg/yr)"] = np.round(
+            avg[:, rollup_idx].sum(axis=1), 1
+        ).astype(np.float32)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    pl.DataFrame(data).write_parquet(output_path, compression="zstd")
--- a/pipeline/transform/join_epc_pp.py
+++ b/pipeline/transform/join_epc_pp.py
@ -36,6 +36,16 @@ MIN_PRICE = 10_000
 MIN_BUILD_YEAR = 1700
 MAX_BUILD_YEAR = 2030

+# Plausibility bounds for raw EPC dimensions. EPC lodgements contain data-entry
+# errors (0 m storey heights, 116 m "interior height", 9,210 m² floor areas, 99
+# habitable rooms) that otherwise propagate verbatim into the published per-
+# property columns. Values outside these bands are nulled (treated as unknown)
+# rather than shown. Bounds are deliberately wide so only clear errors are cut.
+MIN_FLOOR_HEIGHT_M = 1.5  # below this a storey is not habitable
+MAX_FLOOR_HEIGHT_M = 6.0  # above this is a data error, not a normal storey
+MAX_TOTAL_FLOOR_AREA_M2 = 2000.0  # ~21,500 sqft; larger is a bulk/garbage record
+MAX_HABITABLE_ROOMS = 20  # dwellings above this are data errors
+

 def epc_band_to_year(band: pl.Expr) -> pl.Expr:
    """Map an EPC construction age band to a single representative build year.
@ -132,10 +142,28 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
        )
        .filter(pl.col("epc_address").is_not_null())
        .with_columns(
-            pl.when(pl.col("number_habitable_rooms") == 0)
-            .then(None)
-            .otherwise(pl.col("number_habitable_rooms"))
+            # Null implausible EPC dimensions so data-entry errors don't reach
+            # the published per-property columns (Interior height, Total floor
+            # area, Number of bedrooms & living rooms). Treated as unknown.
+            pl.when(
+                (pl.col("number_habitable_rooms") >= 1)
+                & (pl.col("number_habitable_rooms") <= MAX_HABITABLE_ROOMS)
+            )
+            .then(pl.col("number_habitable_rooms"))
+            .otherwise(None)
            .alias("number_habitable_rooms"),
+            pl.when(
+                pl.col("floor_height").is_between(
+                    MIN_FLOOR_HEIGHT_M, MAX_FLOOR_HEIGHT_M
+                )
+            )
+            .then(pl.col("floor_height"))
+            .otherwise(None)
+            .alias("floor_height"),
+            pl.when(pl.col("total_floor_area") <= MAX_TOTAL_FLOOR_AREA_M2)
+            .then(pl.col("total_floor_area"))
+            .otherwise(None)
+            .alias("total_floor_area"),
        )
    )

--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -2,6 +2,7 @@ import argparse
 import re
 import tempfile
 from dataclasses import dataclass
+from datetime import date
 from typing import Literal

 import numpy as np
@ -30,7 +31,10 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping

 MIN_FLOOR_AREA_M2 = 10
 CONSERVATION_AREA_FEATURE = "Within conservation area"
-TREE_DENSITY_FEATURE = "Street tree density percentile"
+# Named "Tree canopy" (not "Street tree") because the underlying density unions
+# Forest Research TOW lone-tree/group crowns AND NFI woodland canopy, so a
+# woodland-edge postcode's score reflects forest canopy, not only street trees.
+TREE_DENSITY_FEATURE = "Tree canopy density percentile"
 LISTED_BUILDING_FEATURE = "Listed building"
 LISTED_BUILDING_MATCH_RADIUS_M = 250.0
 LISTED_BUILDING_NEAREST_POSTCODES = 3
@ -528,10 +532,22 @@ def _is_planning_conservation_area_record(dataset: object) -> bool:


 def _is_current_planning_record(end_date: object) -> bool:
+    """A planning record is current when it has no end-date OR its end-date is
+    still in the future. The planning.data.gov.uk `end-date` field marks when a
+    designation is RETIRED, so a future date (e.g. 2029-12-31) is a still-current
+    area and must NOT be dropped — the previous "any non-empty date = ended"
+    logic wrongly excluded those (e.g. 22 current Gateshead conservation areas)."""
    if end_date is None:
        return True
    if isinstance(end_date, str):
-        return end_date.strip() == ""
+        text = end_date.strip()
+        if text == "":
+            return True
+        try:
+            return date.fromisoformat(text[:10]) > date.today()
+        except ValueError:
+            # Unparseable end-date: keep the record rather than silently drop it.
+            return True
    return False


@ -706,8 +722,32 @@ def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame
    )


+def _validate_lsoa_source_coverage(iod_path: Path, ethnicity_path: Path) -> None:
+    """Fail if ethnicity (now LSOA-keyed) misses any IoD LSOA.
+
+    Ethnicity is sourced from Census 2021 TS021 at LSOA, then joined on `lsoa21`
+    like median age and IoD. The IoD table defines the LSOA universe every
+    postcode resolves into, so a missing LSOA would silently null the ethnicity
+    columns for those postcodes; require full coverage instead.
+    """
+    iod_lsoas = pl.read_parquet(
+        iod_path, columns=["LSOA code (2021)"]
+    ).rename({"LSOA code (2021)": "lsoa21"})
+
+    ethnicity_lsoas = pl.read_parquet(ethnicity_path, columns=["lsoa21"])
+    missing_ethnicity = iod_lsoas.join(
+        ethnicity_lsoas, on="lsoa21", how="anti"
+    ).sort("lsoa21")
+    if missing_ethnicity.height > 0:
+        raise ValueError(
+            "Ethnicity data is missing LSOA coverage: "
+            f"{missing_ethnicity.height} LSOAs, e.g. "
+            f"{missing_ethnicity.head(10).to_dicts()}"
+        )
+
+
 def _validate_lad_source_coverage(
-    iod_path: Path, ethnicity_path: Path, rental_prices_path: Path
+    iod_path: Path, rental_prices_path: Path
 ) -> None:
    iod_lads = (
        pl.read_parquet(
@ -726,16 +766,6 @@ def _validate_lad_source_coverage(
        .unique(["lad"])
    )

-    ethnicity_lads = pl.read_parquet(ethnicity_path, columns=["Geography_code"]).rename(
-        {"Geography_code": "lad"}
-    )
-    missing_ethnicity = iod_lads.join(ethnicity_lads, on="lad", how="anti").sort("lad")
-    if missing_ethnicity.height > 0:
-        raise ValueError(
-            "Ethnicity data is missing 2024 LAD coverage: "
-            f"{missing_ethnicity.to_dicts()}"
-        )
-
    rental_lads = pl.read_parquet(rental_prices_path, columns=["area_code"]).rename(
        {"area_code": "lad"}
    )
@ -849,12 +879,10 @@ def _join_area_side_tables(
    broadband: pl.LazyFrame,
 ) -> pl.LazyFrame:
    base = base.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
-    base = base.join(
-        ethnicity,
-        left_on="Local Authority District code (2024)",
-        right_on="Geography_code",
-        how="left",
-    )
+    # Ethnicity is Census 2021 TS021 at LSOA (~33,755 areas), joined on the same
+    # `lsoa21` key as median age and IoD — a ~100x granularity gain over the old
+    # Local-Authority broadcast, with no change to the 6-bucket output schema.
+    base = base.join(ethnicity, on="lsoa21", how="left")

    # Crime is counted spatially per postcode (incidents within 50m of the
    # postcode boundary), so it joins on postcode rather than LSOA. crime_spatial
@ -1966,7 +1994,8 @@ def _build(
    """
    if mode == "listings" and actual_listings_path is None:
        raise ValueError("listings mode requires actual_listings_path")
-    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path)
+    _validate_lsoa_source_coverage(iod_path, ethnicity_path)
+    _validate_lad_source_coverage(iod_path, rental_prices_path)

    wide = pl.scan_parquet(epc_pp_path).filter(
        pl.col("total_floor_area").is_null()
@ -2225,7 +2254,7 @@ def main():
        "--ethnicity",
        type=Path,
        required=True,
-        help="Ethnicity by local authority parquet file (optional)",
+        help="Census 2021 ethnic group (TS021) by LSOA parquet file",
    )
    parser.add_argument(
        "--crime",
--- a/pipeline/transform/postcode_boundaries/output.py
+++ b/pipeline/transform/postcode_boundaries/output.py
@ -53,6 +53,18 @@ _OUTPUT_PRECISION_DEG = 0.000001
 # tolerance), we fatten it just enough to survive snapping rather than drop it.
 _MIN_FOOTPRINT_BUFFER_M = 0.5

+# Building-scale buffer for POINTLIKE inputs that carry no real extent. Multi-
+# dwelling (tower-block) postcodes have every UPRN geocoded to a single shared
+# coordinate, so the boundary collapses to a point; a 0.5 m buffer then yields an
+# invisible ~0.8 m² dot covering hundreds of homes. Such inputs get a ~200 m²
+# building-scale footprint instead. (Genuine thin slivers, which still carry
+# length, keep the minimal buffer.) _resolve_overlaps runs afterwards, so any
+# overlap this introduces is trimmed; a postcode shaved back to sub-grid still
+# falls through to the tiny _grid_footprint, so this can only improve the result.
+_POINT_RESCUE_BUFFER_M = 8.0
+_POINTLIKE_AREA_M2 = 1.0
+_POINTLIKE_PERIMETER_M = 4.0
+

 def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
    """Transform a BNG polygon to WGS84, snap to output precision, validate.
@ -90,8 +102,23 @@ def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:


 def _rescue_footprint(geom_bng) -> dict | None:
-    """Fatten a degenerate BNG geometry into a representable footprint and snap."""
-    footprint = _largest_polygonal(geom_bng.buffer(_MIN_FOOTPRINT_BUFFER_M))
+    """Fatten a degenerate BNG geometry into a representable footprint and snap.
+
+    A POINTLIKE input (a point, or a polygon with BOTH near-zero area and a short
+    perimeter — the signature of a tower-block postcode whose UPRNs all share one
+    coordinate) gets a building-scale buffer so it is not reduced to an invisible
+    sub-metre dot; thin slivers that still carry length keep the minimal buffer.
+    """
+    buffer_m = _MIN_FOOTPRINT_BUFFER_M
+    try:
+        if (
+            geom_bng.area < _POINTLIKE_AREA_M2
+            and geom_bng.length < _POINTLIKE_PERIMETER_M
+        ):
+            buffer_m = _POINT_RESCUE_BUFFER_M
+    except GEOSException:
+        pass
+    footprint = _largest_polygonal(geom_bng.buffer(buffer_m))
    if footprint is None:
        return None
    return _snap_to_wgs84_geojson(footprint)
--- a/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
+++ b/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
@ -906,6 +906,37 @@ class TestToWgs84Geojson:
        assert result is not None
        assert result["type"] == "Polygon"

+    def test_pointlike_input_gets_building_scale_footprint(self):
+        """A tower-block postcode (all UPRNs at one point) must not collapse to a
+        sub-metre dot; it gets a building-scale footprint instead."""
+        import pyproj
+        from shapely.geometry import Point, shape
+        from shapely.ops import transform as transform_geometry
+
+        to_bng = pyproj.Transformer.from_crs(
+            "EPSG:4326", "EPSG:27700", always_xy=True
+        )
+        result = to_wgs84_geojson(Point(360000, 170000))
+        assert result is not None
+        area_m2 = transform_geometry(to_bng.transform, shape(result)).area
+        assert area_m2 > 100, f"point footprint only {area_m2:.1f} m^2"
+
+    def test_thin_sliver_keeps_minimal_buffer(self):
+        """A genuine elongated sliver still carries length, so it is NOT inflated
+        to building scale — only truly pointlike inputs are."""
+        import pyproj
+        from shapely.geometry import LineString, shape
+        from shapely.ops import transform as transform_geometry
+
+        to_bng = pyproj.Transformer.from_crs(
+            "EPSG:4326", "EPSG:27700", always_xy=True
+        )
+        sliver = LineString([(360000, 170000), (360040, 170000)]).buffer(0.05)
+        result = to_wgs84_geojson(sliver)
+        assert result is not None
+        area_m2 = transform_geometry(to_bng.transform, shape(result)).area
+        assert area_m2 < 100, f"sliver inflated to {area_m2:.1f} m^2"
+
    def test_coordinates_have_limited_precision(self):
        """GeoJSON coordinates should be rounded to 6 decimal places."""
        import json
--- a/pipeline/transform/price_estimation/estimate.py
+++ b/pipeline/transform/price_estimation/estimate.py
@ -230,11 +230,28 @@ def main():
    ).height
    print(f"  kNN blended: {n_blended:,} of {n_estimated:,} estimates")

+    # Null the absolute "Estimated current price" itself when its implied
+    # per-sqm is implausible (outside [MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM])
+    # AND the floor area is known: these come from bulk/block transfers or
+    # garbage source prices (e.g. a £207.5M "sale" on a 93 m² terrace -> a £197M
+    # estimate) and are not meaningful single-dwelling values. Previously only
+    # the derived per-sqm was nulled, leaving the absurd headline price visible.
+    _raw_est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
+    df = df.with_columns(
+        pl.when(
+            pl.col("Estimated current price").is_not_null()
+            & pl.col("Total floor area (sqm)").is_not_null()
+            & (pl.col("Total floor area (sqm)") > 0)
+            & ((_raw_est_psm < MIN_COMPARABLE_PSM) | (_raw_est_psm > MAX_COMPARABLE_PSM))
+        )
+        .then(None)
+        .otherwise(pl.col("Estimated current price"))
+        .alias("Estimated current price"),
+    )
+
    # Derive estimated price per sqm where both estimated price and floor area
-    # exist. Null out values outside the plausibility band [MIN_COMPARABLE_PSM,
-    # MAX_COMPARABLE_PSM] (the same band the kNN pool uses): extreme values come
-    # from bulk/block transactions or floor-area errors and are not meaningful
-    # per-unit prices.
+    # exist. Implausible-psm estimates are already nulled above, so the band
+    # filter here is mostly redundant and mainly guards the floor-area > 0 case.
    _est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
    df = df.with_columns(
        pl.when(
--- a/pipeline/transform/price_estimation/index.py
+++ b/pipeline/transform/price_estimation/index.py
@ -25,6 +25,7 @@ from pipeline.transform.price_estimation.shrinkage import (
 )
 from pipeline.transform.price_estimation.utils import (
    CURRENT_YEAR,
+    LATEST_COMPLETE_YEAR,
    TEMPORAL_SMOOTHNESS_LAMBDA,
    TYPE_GROUPS,
    build_hedonic_features,
@ -395,14 +396,22 @@ def build_index(
    The index is still forward-filled to CURRENT_YEAR.
    postcodes_path: if provided, lat/lon are read from this file instead of input_path.
    """
-    pairs = extract_pairs(input_path, max_year2=max_pair_year)
+    # Solve the index only on COMPLETE calendar years: exclude the partial
+    # current year, whose thin repeat-sale set yields wild betas. The index is
+    # still forward-filled/trend-extrapolated to CURRENT_YEAR below, so the
+    # current year follows the established trend rather than a partial-year
+    # spike. Backtest passes a stricter max_pair_year, which is honoured.
+    estimation_cap = (
+        max_pair_year if max_pair_year is not None else LATEST_COMPLETE_YEAR + 1
+    )
+    pairs = extract_pairs(input_path, max_year2=estimation_cap)
    centroids = extract_centroids(postcodes_path or input_path)

    min_year = int(pairs["year1"].min())
    max_year = CURRENT_YEAR

    hedonic_idx = compute_hedonic_index(
-        input_path, min_year, max_year, max_sale_year=max_pair_year
+        input_path, min_year, max_year, max_sale_year=estimation_cap
    )

    # Precompute hierarchy
--- a/pipeline/transform/price_estimation/utils.py
+++ b/pipeline/transform/price_estimation/utils.py
@ -6,6 +6,13 @@ import numpy as np
 import polars as pl

 CURRENT_YEAR = 2026
+# Latest COMPLETE calendar year. The current year's transactions are only
+# partially reported (Land Registry lags ~2-3 months), so a sector's thin
+# partial-year repeat-sale set produces wild index betas (e.g. +334% in a
+# single sector). The index is SOLVED only on complete years (<= this) and
+# forward-filled/trend-extrapolated to CURRENT_YEAR, so current-value
+# projections follow the established trend instead of a partial-year spike.
+LATEST_COMPLETE_YEAR = CURRENT_YEAR - 1
 _today = date.today()
 CURRENT_FRAC_YEAR = _today.year + (_today.month - 1) / 12

--- a/pipeline/transform/school_proximity.py
+++ b/pipeline/transform/school_proximity.py
@ -15,11 +15,24 @@ SCHOOL_GROUPS = {
 }


-def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
+# Age thresholds for deciding which phase(s) a school serves. A school serves
+# PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age
+# children if its statutory highest age is >= 12. All-through (e.g. 3-18) and
+# middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in
+# both the primary and the secondary proximity metrics — Ofsted's coarse "Ofsted
+# phase" labels such schools as just "Secondary", which previously hid them from
+# every postcode's primary-school count.
+PRIMARY_MAX_AGE = 10
+SECONDARY_MIN_AGE = 12
+
+
+def classify_good_plus_schools(
+    ofsted: pl.DataFrame, open_urns: set[int] | None = None
+) -> pl.DataFrame:
    """Label good+/outstanding primary & secondary schools for proximity counts.

-    Derives a grade ("1" = outstanding, "2" = good) and a proximity ``category``,
-    returning a ``(postcode, category)`` frame.
+    Derives a grade ("1" = outstanding, "2" = good) and one or two proximity
+    ``category`` rows per school, returning a ``(postcode, category)`` frame.

    Schools with a recent GRADED inspection carry a 1-4 grade in "Latest OEIF
    overall effectiveness" (OEIF = the previous Ofsted Education Inspection
@ -27,49 +40,89 @@ def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
    UNGRADED (Section 8) inspection or the post-2024 report-card framework, so
    that column is null/"Not judged" for them even when they are demonstrably
    good — their status lives in "Ungraded inspection overall outcome" ("School
-    remains Good"/"School remains Outstanding", incl. "(Concerns)"/"(Improving)"
-    variants). Filtering on the graded column alone dropped ~7,000 genuinely
-    good/outstanding schools. We fall back to the ungraded outcome, but ONLY when
-    there is no usable graded result (null/"Not judged"), so a genuine grade 3/4
-    is never overridden.
+    remains Good"/"School remains Outstanding"). Filtering on the graded column
+    alone dropped ~7,000 genuinely good/outstanding schools. We fall back to the
+    ungraded outcome, but ONLY when there is no usable graded result
+    (null/"Not judged"), so a genuine grade 3/4 is never overridden.
+
+    Outcomes flagged "(Concerns)" are NOT treated as good+: a "remains Good
+    (Concerns)" outcome signals inspectors found issues warranting an earlier
+    graded re-inspection, so marketing it as a good+ school is misleading.
+
+    Phase assignment uses the statutory age range when available (so all-through
+    and middle schools count toward BOTH primary and secondary), falling back to
+    the coarse "Ofsted phase" label when age columns are absent or a value is null.
+    When ``open_urns`` is given and the frame has a "URN" column, schools whose URN
+    is not in the current GIAS open register are dropped (closed/merged schools).
    """
    # Cast to Utf8 so the string predicates below are well-defined even if a
    # column happens to be entirely null (read back as a Null dtype).
    oeif = pl.col("Latest OEIF overall effectiveness").cast(pl.Utf8, strict=False)
    ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
    no_usable_grade = oeif.is_null() | (oeif == "Not judged")
+    has_concern = ungraded.str.contains(r"\(Concerns\)")
+    remains_outstanding = (
+        ungraded.str.starts_with("School remains Outstanding") & ~has_concern
+    )
+    remains_good = ungraded.str.starts_with("School remains Good") & ~has_concern
    graded = (
        ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
        .with_columns(
            pl.when(oeif.is_in(["1", "2"]))
            .then(oeif)
-            .when(
-                no_usable_grade
-                & ungraded.str.starts_with("School remains Outstanding")
-            )
+            .when(no_usable_grade & remains_outstanding)
            .then(pl.lit("1"))
-            .when(no_usable_grade & ungraded.str.starts_with("School remains Good"))
+            .when(no_usable_grade & remains_good)
            .then(pl.lit("2"))
            .otherwise(None)
            .alias("_ofsted_grade")
        )
        .filter(pl.col("_ofsted_grade").is_not_null())
    )
+
+    # Drop schools no longer open (closed/merged) when the GIAS open register is
+    # provided, so stale Ofsted "latest inspection" rows are not counted.
+    if open_urns is not None and "URN" in graded.columns:
+        graded = graded.filter(pl.col("URN").is_in(list(open_urns)))
+
+    # Decide which phase(s) each school serves.
+    if {"Statutory lowest age", "Statutory highest age"} <= set(graded.columns):
+        low = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
+        high = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
+        serves_primary = (
+            pl.when(low.is_not_null())
+            .then(low <= PRIMARY_MAX_AGE)
+            .otherwise(pl.col("Ofsted phase") == "Primary")
+        )
+        serves_secondary = (
+            pl.when(high.is_not_null())
+            .then(high >= SECONDARY_MIN_AGE)
+            .otherwise(pl.col("Ofsted phase") == "Secondary")
+        )
+    else:
+        serves_primary = pl.col("Ofsted phase") == "Primary"
+        serves_secondary = pl.col("Ofsted phase") == "Secondary"
+
+    graded = graded.with_columns(
+        serves_primary.alias("_serves_primary"),
+        serves_secondary.alias("_serves_secondary"),
+    )
+
    # Good+ groups include both grade variants; outstanding groups count grade 1.
-    return graded.with_columns(
-        pl.when(pl.col("Ofsted phase") == "Primary")
-        .then(
-            pl.when(pl.col("_ofsted_grade") == "1")
-            .then(pl.lit("outstanding_primary"))
-            .otherwise(pl.lit("good_primary"))
-        )
-        .otherwise(
-            pl.when(pl.col("_ofsted_grade") == "1")
-            .then(pl.lit("outstanding_secondary"))
-            .otherwise(pl.lit("good_secondary"))
-        )
+    # A school can yield up to two rows (primary and secondary).
+    primary = graded.filter(pl.col("_serves_primary")).with_columns(
+        pl.when(pl.col("_ofsted_grade") == "1")
+        .then(pl.lit("outstanding_primary"))
+        .otherwise(pl.lit("good_primary"))
        .alias("category")
-    ).select(
+    )
+    secondary = graded.filter(pl.col("_serves_secondary")).with_columns(
+        pl.when(pl.col("_ofsted_grade") == "1")
+        .then(pl.lit("outstanding_secondary"))
+        .otherwise(pl.lit("good_secondary"))
+        .alias("category")
+    )
+    return pl.concat([primary, secondary]).select(
        pl.col("Postcode").alias("postcode"),
        "category",
    )
@ -85,12 +138,24 @@ def main():
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
    )
+    parser.add_argument(
+        "--gias",
+        type=Path,
+        default=None,
+        help="GIAS open-school parquet; if given, only currently-open schools are counted",
+    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet path"
    )
    args = parser.parse_args()

-    ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted))
+    open_urns: set[int] | None = None
+    if args.gias is not None:
+        gias_urns = pl.read_parquet(args.gias).select("urn").to_series().drop_nulls()
+        open_urns = set(gias_urns.cast(pl.Int64, strict=False).to_list())
+        print(f"GIAS open register: {len(open_urns):,} open school URNs")
+
+    ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted), open_urns=open_urns)
    if ofsted.is_empty():
        raise ValueError("No good+ primary/secondary Ofsted schools found")

--- a/pipeline/transform/test_crime_spatial.py
+++ b/pipeline/transform/test_crime_spatial.py
@ -252,14 +252,15 @@ def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
    assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)}


-def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
+def test_serious_rollup_avg_yr_equals_sum_of_components(tmp_path):
    # Two SERIOUS types occur in DISJOINT years for one postcode: Burglary only in
    # 2014, Robbery only in 2024 (each a single full month -> 12/yr). The headline
-    # "Serious crime (avg/yr)" must equal the mean of the "Serious crime (by year)"
-    # bars (which span the UNION of years any serious type occurred), NOT the sum
-    # of the per-type means. Summing per-type means divides each type by its OWN
-    # years-present (1 each) -> 12 + 12 = 24; the consistent rollup divides the
-    # per-year serious total by the years any serious type occurred (2) -> 12.
+    # "Serious crime (avg/yr)" must equal the SUM of its component (avg/yr) columns
+    # (Burglary 12 + Robbery 12 = 24), so the rollup is always the sum of the parts
+    # shown beside it and can never fall below a single component. (The previous
+    # union-years-present mean would have divided the per-year serious total by the
+    # 2 years any serious type occurred, giving a misleading 12: no larger than
+    # burglary or robbery alone, and only half the sum of the two components.)
    units = tmp_path / "units"
    _write_boundaries(
        units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
@ -274,13 +275,16 @@ def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    avg = pl.read_parquet(output).row(0, named=True)
-    # The precomputed rollup headline exists and equals the mean of the bars (12),
-    # not the sum of the per-type avg/yr values (Burglary 12 + Robbery 12 = 24).
    assert "Serious crime (avg/yr)" in avg
    assert avg["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
    assert avg["Robbery (avg/yr)"] == pytest.approx(12.0, abs=0.05)
-    assert avg["Serious crime (avg/yr)"] == pytest.approx(12.0, abs=0.05)
+    # Rollup == sum of its component (avg/yr) columns.
+    assert avg["Serious crime (avg/yr)"] == pytest.approx(24.0, abs=0.05)
+    assert avg["Serious crime (avg/yr)"] == pytest.approx(
+        avg["Burglary (avg/yr)"] + avg["Robbery (avg/yr)"], abs=0.05
+    )

+    # The by-year rollup series remains the per-year sum of the component bars.
    serious_bars = {
        p["year"]: p["count"]
        for p in pl.read_parquet(by_year).row(0, named=True)["Serious crime (by year)"]
@ -289,8 +293,6 @@ def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
        2014: pytest.approx(12.0, abs=0.05),
        2024: pytest.approx(12.0, abs=0.05),
    }
-    mean_of_bars = sum(serious_bars.values()) / len(serious_bars)
-    assert avg["Serious crime (avg/yr)"] == pytest.approx(mean_of_bars, abs=0.05)


 def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@ -34,6 +34,7 @@ from pipeline.transform.merge import (
    _split_normal_outputs,
    _tree_density_by_postcode,
    _validate_lad_source_coverage,
+    _validate_lsoa_source_coverage,
    _validate_postcode_feature_output,
    _validate_property_postcodes,
 )
@ -297,7 +298,7 @@ def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
    joined = _join_area_side_tables(
        base,
        iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
-        ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
+        ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
        crime=crime,
        median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
@ -355,7 +356,7 @@ def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
    joined = _join_area_side_tables(
        base,
        iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
-        ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
+        ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
        crime=crime,
        median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
@ -531,7 +532,6 @@ def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
    tmp_path,
 ) -> None:
    iod_path = tmp_path / "iod.parquet"
-    ethnicity_path = tmp_path / "ethnicity.parquet"
    rental_path = tmp_path / "rental.parquet"
    pl.DataFrame(
        {
@ -547,19 +547,15 @@ def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
            ],
        }
    ).write_parquet(iod_path)
-    pl.DataFrame(
-        {"Geography_code": ["E08000016", "E06000053", "E09000001"]}
-    ).write_parquet(ethnicity_path)
    pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]}).write_parquet(
        rental_path
    )

-    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
+    _validate_lad_source_coverage(iod_path, rental_path)


 def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
    iod_path = tmp_path / "iod.parquet"
-    ethnicity_path = tmp_path / "ethnicity.parquet"
    rental_path = tmp_path / "rental.parquet"
    pl.DataFrame(
        {
@ -567,13 +563,41 @@ def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) ->
            "Local Authority District name (2024)": ["Barnsley"],
        }
    ).write_parquet(iod_path)
-    pl.DataFrame({"Geography_code": ["E08000016"]}).write_parquet(ethnicity_path)
    pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]}).write_parquet(
        rental_path
    )

    with pytest.raises(ValueError, match="Rental data is missing"):
-        _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
+        _validate_lad_source_coverage(iod_path, rental_path)
+
+
+def test_validate_lsoa_source_coverage_allows_full_ethnicity_coverage(
+    tmp_path,
+) -> None:
+    iod_path = tmp_path / "iod.parquet"
+    ethnicity_path = tmp_path / "ethnicity.parquet"
+    pl.DataFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}).write_parquet(
+        iod_path
+    )
+    # Ethnicity may carry extra LSOAs (e.g. property-less ones); only the IoD
+    # LSOAs are required to all be present.
+    pl.DataFrame(
+        {"lsoa21": ["E01000001", "E01000002", "E01000003"]}
+    ).write_parquet(ethnicity_path)
+
+    _validate_lsoa_source_coverage(iod_path, ethnicity_path)
+
+
+def test_validate_lsoa_source_coverage_rejects_missing_lsoa(tmp_path) -> None:
+    iod_path = tmp_path / "iod.parquet"
+    ethnicity_path = tmp_path / "ethnicity.parquet"
+    pl.DataFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}).write_parquet(
+        iod_path
+    )
+    pl.DataFrame({"lsoa21": ["E01000001"]}).write_parquet(ethnicity_path)
+
+    with pytest.raises(ValueError, match="Ethnicity data is missing LSOA coverage"):
+        _validate_lsoa_source_coverage(iod_path, ethnicity_path)


 def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
@ -1027,7 +1051,7 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
    joined = _join_area_side_tables(
        base,
        iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
-        ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
+        ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
        crime=crime,
        median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
@ -1427,7 +1451,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
            "Property type": ["Terraced", None],
            "Leasehold/Freehold": ["Leasehold", None],
            "Last known price": [500_000, None],
-            "Street tree density percentile": [42.0, 42.0],
+            "Tree canopy density percentile": [42.0, 42.0],
            # Overlay columns: row 0 is a matched listing, row 1 is unmatched, row none.
            "_actual_listing_url": ["url0", "url1"],
            "_actual_asking_price": [600_000, 700_000],
@ -1458,7 +1482,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
            "Property type": pl.Utf8,
            "Leasehold/Freehold": pl.Utf8,
            "Last known price": pl.Int64,
-            "Street tree density percentile": pl.Float32,
+            "Tree canopy density percentile": pl.Float32,
            "_actual_listing_url": pl.Utf8,
            "_actual_asking_price": pl.Int64,
            "_actual_asking_price_per_sqm": pl.Int32,
@ -1496,7 +1520,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
    assert finalized["Property type"].to_list() == ["Terraced", "Flats/Maisonettes"]
    assert finalized["Leasehold/Freehold"].to_list() == ["Freehold", "Leasehold"]
    # Postcode-level feature carried through to both matched and unmatched rows.
-    assert finalized["Street tree density percentile"].to_list() == [42.0, 42.0]
+    assert finalized["Tree canopy density percentile"].to_list() == [42.0, 42.0]
    # Match status reflects historical context availability.
    assert finalized["Historical property match status"].to_list() == [
        "matched",
@ -1524,7 +1548,7 @@ def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
            "Property type": ["Terraced", "Terraced"],
            "Leasehold/Freehold": ["Leasehold", "Leasehold"],
            "Last known price": [500_000, 480_000],
-            "Street tree density percentile": [42.0, 42.0],
+            "Tree canopy density percentile": [42.0, 42.0],
            # Same listing URL on both collapsed rows — the fan-out to fix.
            "_actual_listing_url": ["url0", "url0"],
            "_actual_asking_price": [600_000, 600_000],
@ -1555,7 +1579,7 @@ def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
            "Property type": pl.Utf8,
            "Leasehold/Freehold": pl.Utf8,
            "Last known price": pl.Int64,
-            "Street tree density percentile": pl.Float32,
+            "Tree canopy density percentile": pl.Float32,
            "_actual_listing_url": pl.Utf8,
            "_actual_asking_price": pl.Int64,
            "_actual_asking_price_per_sqm": pl.Int32,
--- a/pipeline/transform/test_school_proximity.py
+++ b/pipeline/transform/test_school_proximity.py
@ -42,7 +42,20 @@ def test_ungraded_remains_good_is_recovered_when_no_graded_result():
    rows = [
        _school("Primary", None, "School remains Good", "AA1 1AA"),
        _school("Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"),
-        # "(Concerns)"/"(Improving)" variants are still good+.
+        # "(Improving)" is still good+ ...
+        _school("Primary", None, "School remains Good (Improving) - S5 Next", "AA1 1AE"),
+    ]
+    assert _classify(rows) == {
+        ("AA1 1AA", "good_primary"),
+        ("AA1 1AB", "outstanding_secondary"),
+        ("AA1 1AE", "good_primary"),
+    }
+
+
+def test_ungraded_concerns_are_not_good_plus():
+    # "(Concerns)" outcomes signal issues warranting earlier re-inspection and
+    # must NOT be counted as good+ schools.
+    rows = [
        _school("Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"),
        _school(
            "Secondary",
@ -51,12 +64,7 @@ def test_ungraded_remains_good_is_recovered_when_no_graded_result():
            "AA1 1AD",
        ),
    ]
-    assert _classify(rows) == {
-        ("AA1 1AA", "good_primary"),
-        ("AA1 1AB", "outstanding_secondary"),
-        ("AA1 1AC", "good_primary"),
-        ("AA1 1AD", "outstanding_secondary"),
-    }
+    assert _classify(rows) == set()


 def test_ungraded_non_good_outcomes_are_excluded():
@ -80,3 +88,52 @@ def test_non_primary_secondary_phases_excluded():
        _school("Not applicable", "2", None),
    ]
    assert _classify(rows) == set()
+
+
+def _aged_school(phase, oeif, low, high, postcode="AA1 1AA"):
+    return {
+        "Postcode": postcode,
+        "Ofsted phase": phase,
+        "Latest OEIF overall effectiveness": oeif,
+        "Ungraded inspection overall outcome": None,
+        "URN": 100000,
+        "Statutory lowest age": low,
+        "Statutory highest age": high,
+    }
+
+
+def test_all_through_school_counts_toward_both_primary_and_secondary():
+    # An all-through school (age 3-18) is labelled "Secondary" by Ofsted phase but
+    # serves primary-age children too, so it must count in BOTH metrics.
+    rows = [_aged_school("Secondary", "2", 3, 18, "AA1 1AA")]
+    assert _classify(rows) == {
+        ("AA1 1AA", "good_primary"),
+        ("AA1 1AA", "good_secondary"),
+    }
+
+
+def test_age_ranges_assign_single_phase_for_standard_schools():
+    rows = [
+        _aged_school("Primary", "1", 4, 11, "AA1 1AA"),  # primary only
+        _aged_school("Secondary", "2", 11, 16, "AA1 1AB"),  # secondary only
+        _aged_school("Secondary", "1", 9, 13, "AA1 1AC"),  # middle -> both
+    ]
+    assert _classify(rows) == {
+        ("AA1 1AA", "outstanding_primary"),
+        ("AA1 1AB", "good_secondary"),
+        ("AA1 1AC", "outstanding_primary"),
+        ("AA1 1AC", "outstanding_secondary"),
+    }
+
+
+def test_closed_schools_excluded_when_open_register_given():
+    rows = [
+        _aged_school("Primary", "1", 4, 11, "AA1 1AA"),
+        _aged_school("Secondary", "2", 11, 16, "AA1 1AB"),
+    ]
+    rows[0]["URN"] = 111
+    rows[1]["URN"] = 222
+    result = classify_good_plus_schools(pl.DataFrame(rows), open_urns={111})
+    pairs = {(r["postcode"], r["category"]) for r in result.to_dicts()}
+    # URN 222 is not in the open register, so it is dropped.
+    assert pairs == {("AA1 1AA", "outstanding_primary")}
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -33,6 +33,14 @@ DROP_CATEGORIES = {
    "emergency/water_tank",
    "leisure/bleachers",
    "leisure/schoolyard",
+    # Park "furniture" / incidental features — not parks; they massively
+    # inflated the Park count (picnic_table ~15k, outdoor_seating ~5.8k).
+    "leisure/bandstand",
+    "leisure/bird_hide",
+    "leisure/firepit",
+    "leisure/outdoor_seating",
+    "leisure/picnic_table",
+    "leisure/wildlife_hide",
    "public_transport/pay_scale_area",
    "shop/taxi",
    "amenity/feeding_place",
@ -182,9 +190,13 @@ DROP_CATEGORIES = {
    "tourism/village_sign",
    "tourism/wilderness_hut",
    "tourism/yes",
-    # Public transport (from NaPTAN instead)
+    # Public transport (from NaPTAN instead). public_transport/platform is the
+    # EXCEPTION: it is mapped to "Bus stop" (see _CATEGORIES) to fill NaPTAN's
+    # authority-level bus-stop gaps (e.g. West Cumbria, North Norfolk, where
+    # NaPTAN has zero stops), then deduped against NaPTAN so covered areas keep
+    # a single stop. stop_position is left dropped to avoid double-counting the
+    # same stop (platform + stop_position).
    "public_transport/entrance",
-    "public_transport/platform",
    "public_transport/station",
    "public_transport/stop_position",
    # Education amenities — schools come from GIAS instead. OSM coverage for
@ -301,16 +313,13 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        "🌳",
        [
            "leisure/park",
+            # leisure/garden is dominated by private residential gardens (98%+
+            # unnamed); it is name-gated in transform() via REQUIRE_NAME_CATEGORIES
+            # so only named (public/notable) gardens count as a Park.
            "leisure/garden",
            "leisure/common",
            "leisure/nature_reserve",
            "leisure/dog_park",
-            "leisure/bandstand",
-            "leisure/bird_hide",
-            "leisure/firepit",
-            "leisure/outdoor_seating",
-            "leisure/picnic_table",
-            "leisure/wildlife_hide",
        ],
    ),
    (
@ -329,6 +338,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        [
            "leisure/sports_centre",
            "leisure/sports_hall",
+            # leisure/pitch (73% of the old bucket) and leisure/swimming_pool
+            # (98% unnamed = private/garden pools) are name-gated in transform()
+            # via REQUIRE_NAME_CATEGORIES so only named public facilities count.
            "leisure/pitch",
            "leisure/track",
            "leisure/golf_course",
@ -1123,8 +1135,36 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "amenity/townhall",
        ],
    ),
+    # ── Public transport (OSM supplement to NaPTAN) ──────────
+    # OSM bus platforms fill NaPTAN's authority-level coverage gaps. Same group
+    # / friendly name / emoji as the NaPTAN "Bus stop" rows so they merge into
+    # one metric; OSM platforms that duplicate a NaPTAN stop are deduped in
+    # transform() (osm_stops_near_naptan).
+    (
+        "Public Transport",
+        "Bus stop",
+        "🚏",
+        [
+            "public_transport/platform",
+        ],
+    ),
 ]

+# Raw OSM tags whose UNNAMED instances are dropped before category mapping.
+# These tags are overwhelmingly private/incidental when unnamed: a nameless
+# `leisure/garden` is a private residential garden (not a public park), and a
+# nameless `leisure/pitch`/`swimming_pool` is a school cage or back-garden pool.
+# Keeping only named instances stops them inflating Park / Sports Centre counts
+# while preserving genuinely public, notable facilities (which carry a name).
+REQUIRE_NAME_CATEGORIES = {
+    "leisure/garden",
+    "leisure/pitch",
+    "leisure/practice_pitch",
+    "leisure/swimming_pool",
+    "leisure/paddling_pool",
+}
+
+
 # Build flat lookup: OSM category → (group, friendly_name, emoji)
 CATEGORY_MAP: dict[str, tuple[str, str, str]] = {
    osm_key: (group, name, emoji)
@ -1431,18 +1471,25 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
    )


-def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
+def transform_gias_schools(
+    gias_path: Path, ofsted_path: Path, boundary_path: Path
+) -> pl.LazyFrame:
    """Convert the GIAS register parquet into POI rows with school metadata.
    Ofsted ratings are joined by URN so each school carries its latest OEIF
    overall effectiveness grade (Outstanding/Good/Requires improvement/
-    Inadequate/Not judged), surfaced in the map popup."""
+    Inadequate/Not judged), surfaced in the map popup.
+
+    Clipped to England (like NaPTAN/GEOLYTIX) because the GIAS register is
+    GB-wide, so ~1,400 Welsh/Scottish/IoM schools would otherwise leak into the
+    England-only Education layer (and depress apparent Ofsted coverage, since
+    Wales is inspected by Estyn, not Ofsted)."""
    icon_category_expr = _school_icon_category_expr()
    emoji_expr = icon_category_expr.replace_strict(SCHOOL_ICON_CATEGORIES)
    ofsted = _load_ofsted_ratings(ofsted_path)
    # category mirrors icon_category so the dashboard renders one toggle per
    # school type (Nursery / Primary / Secondary / Sixth form / University /…)
    # instead of bundling every GIAS row under a single "School" pill.
-    return (
+    schools = (
        pl.scan_parquet(gias_path)
        .join(ofsted, on="urn", how="left")
        .select(
@ -1477,7 +1524,14 @@ def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
            pl.col("head_name").alias("school_head_name"),
            pl.col("ofsted_rating").alias("school_ofsted_rating"),
        )
+        .collect()
    )
+    mask = in_england_mask(
+        boundary_path,
+        schools["lat"].to_numpy(),
+        schools["lng"].to_numpy(),
+    )
+    return schools.filter(pl.Series(mask)).lazy()


 # OSM convenience-format stores that GEOLYTIX also covers (Tesco Express,
@ -1511,6 +1565,45 @@ def _significant_tokens(name: str | None) -> set[str]:
    return tokens


+# OSM bus platforms are added to "Bus stop" to fill NaPTAN's authority-level
+# gaps. Where NaPTAN already has a stop within this radius the area is covered,
+# so the colocated OSM platform is dropped to avoid double-counting; OSM
+# platforms with no nearby NaPTAN stop (the gaps) are kept.
+BUS_STOP_DEDUP_RADIUS_M = 50.0
+
+
+def osm_stops_near_naptan(
+    osm_stops: pl.DataFrame,
+    naptan_stops: pl.DataFrame,
+    radius_m: float = BUS_STOP_DEDUP_RADIUS_M,
+) -> list[str]:
+    """Return OSM bus-stop ids within ``radius_m`` of any NaPTAN bus stop.
+
+    Purely spatial (no name match): in NaPTAN-covered areas the OSM platform is
+    a duplicate and is dropped; only OSM platforms that fill a NaPTAN gap (no
+    NaPTAN stop within the radius) survive.
+
+    Rows with a null or non-finite coordinate are ignored on both sides: such a
+    NaPTAN stop cannot cover anything, and such an OSM stop is never reported
+    as a duplicate. Without this filter a single bad coordinate would make
+    cKDTree raise ValueError and abort the whole dedup step.
+
+    Args:
+        osm_stops: OSM bus stops; needs ``id``/``lat``/``lng`` columns.
+        naptan_stops: NaPTAN bus stops; needs ``lat``/``lng`` columns.
+        radius_m: Match radius in (approximate) metres, inclusive.
+
+    Returns:
+        Ids of the OSM stops whose nearest NaPTAN stop is at most ``radius_m``
+        away, in the row order of ``osm_stops``. Empty if either frame has no
+        usable rows.
+    """
+    if osm_stops.is_empty() or naptan_stops.is_empty():
+        return []
+
+    from scipy.spatial import cKDTree
+
+    n_lat = naptan_stops["lat"].to_numpy().astype(float)
+    n_lng = naptan_stops["lng"].to_numpy().astype(float)
+    o_lat = osm_stops["lat"].to_numpy().astype(float)
+    o_lng = osm_stops["lng"].to_numpy().astype(float)
+    o_ids = osm_stops["id"].to_list()
+
+    # cKDTree rejects NaN/inf in both the indexed data and the query points,
+    # so keep only rows whose coordinates are finite.
+    n_ok = np.isfinite(n_lat) & np.isfinite(n_lng)
+    o_ok = np.isfinite(o_lat) & np.isfinite(o_lng)
+    if not n_ok.any() or not o_ok.any():
+        return []
+    n_lat, n_lng = n_lat[n_ok], n_lng[n_ok]
+    o_rows = np.flatnonzero(o_ok)  # positions of usable OSM rows in o_ids
+    o_lat, o_lng = o_lat[o_ok], o_lng[o_ok]
+
+    # Project degrees onto a local flat plane in approximate metres
+    # (longitude scaled by the cosine of the mean latitude) so Euclidean
+    # KD-tree distances can be compared directly with radius_m.
+    mean_lat = float(np.mean(np.concatenate([n_lat, o_lat])))
+    cos_lat = float(np.cos(np.radians(mean_lat)))
+    n_xy = np.column_stack([n_lng * cos_lat * 111_320.0, n_lat * 110_540.0])
+    o_xy = np.column_stack([o_lng * cos_lat * 111_320.0, o_lat * 110_540.0])
+
+    tree = cKDTree(n_xy)
+    dist, _ = tree.query(o_xy, k=1)
+    return [o_ids[i] for i in o_rows[dist <= radius_m].tolist()]
+
+
 def osm_groceries_colocated_with_geolytix(
    osm_groceries: pl.DataFrame,
    geolytix: pl.DataFrame,
@ -1601,6 +1694,19 @@ def transform(
    # Drop unwanted categories
    lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))

+    # Drop UNNAMED instances of private-dominated tags (gardens, pitches,
+    # pools) so they don't inflate Park / Sports Centre proximity counts. Done
+    # while `category` still holds the raw OSM key, before the friendly mapping.
+    lf = lf.filter(
+        ~(
+            pl.col("category").is_in(list(REQUIRE_NAME_CATEGORIES))
+            & (
+                pl.col("name").is_null()
+                | (pl.col("name").cast(pl.String).str.strip_chars() == "")
+            )
+        )
+    )
+
    # Build lookup expressions from the 3-tuple mapping
    group_mapping = {k: v[0] for k, v in CATEGORY_MAP.items()}
    name_mapping = {k: v[1] for k, v in CATEGORY_MAP.items()}
@ -1665,11 +1771,37 @@ def transform(
            ~((pl.col("group") == "Groceries") & pl.col("id").is_in(duplicate_ids))
        )

+    # Drop OSM bus platforms that duplicate a NaPTAN bus stop, so the OSM
+    # supplement only adds stops in NaPTAN's coverage gaps (no double-count in
+    # covered areas). OSM bus stops carry id-prefix "n"/"a" so they never clash
+    # with NaPTAN ATCO ids.
+    osm_bus_stops = (
+        lf.filter((pl.col("group") == "Public Transport") & (pl.col("category") == "Bus stop"))
+        .select("id", "lat", "lng")
+        .collect(engine="streaming")
+    )
+    naptan_bus_stops = naptan_df.filter(pl.col("category") == "Bus stop")
+    covered_bus_ids = osm_stops_near_naptan(osm_bus_stops, naptan_bus_stops)
+    kept_osm = osm_bus_stops.height - len(covered_bus_ids)
+    print(
+        f"OSM bus platforms: {osm_bus_stops.height:,} total, dropping "
+        f"{len(covered_bus_ids):,} that duplicate a NaPTAN stop, keeping "
+        f"{kept_osm:,} to fill NaPTAN gaps"
+    )
+    if covered_bus_ids:
+        lf = lf.filter(
+            ~(
+                (pl.col("group") == "Public Transport")
+                & (pl.col("category") == "Bus stop")
+                & pl.col("id").is_in(covered_bus_ids)
+            )
+        )
+
    frames = [
        lf,
        naptan,
        grocery_pois.lazy(),
-        transform_gias_schools(gias_path, ofsted_path),
+        transform_gias_schools(gias_path, ofsted_path, boundary_path),
    ]

    return pl.concat(frames, how="diagonal_relaxed")
--- a/pipeline/utils/poi_counts.py
+++ b/pipeline/utils/poi_counts.py
@ -10,6 +10,26 @@ EARTH_RADIUS_KM = 6371.0088
 KM_PER_DEGREE_LAT = 111.32
 DEFAULT_GRID_SIZE_DEGREES = 0.02

+# Generous GB/UK bounding box. The ArcGIS postcode source stores grid-less
+# postcodes with a placeholder coordinate (lat=99.999999, lon=0.0); these are
+# finite, so an isfinite() check alone lets them through and produces absurd
+# ~5,000 km "nearest amenity" distances. Reject anything outside this box so
+# such postcodes get NaN distance / zero counts instead of a fabricated value.
+UK_LAT_MIN, UK_LAT_MAX = 49.0, 61.5
+UK_LON_MIN, UK_LON_MAX = -9.0, 2.5
+
+
+def valid_uk_coords_mask(lats: np.ndarray, lons: np.ndarray) -> np.ndarray:
+    """Return a boolean mask: True where a coordinate is finite and inside the UK bbox.
+
+    The bbox bounds are inclusive and come from the module-level
+    ``UK_LAT_MIN``/``UK_LAT_MAX``/``UK_LON_MIN``/``UK_LON_MAX`` constants.
+    """
+    is_finite = np.isfinite(lats) & np.isfinite(lons)
+    lat_in_box = (lats >= UK_LAT_MIN) & (lats <= UK_LAT_MAX)
+    lon_in_box = (lons >= UK_LON_MIN) & (lons <= UK_LON_MAX)
+    return is_finite & lat_in_box & lon_in_box
+

 def _build_poi_grid(
    pois: pl.DataFrame, grid_size: float = 0.05
@ -43,7 +63,12 @@ def _get_nearby_indices(
    grid_size: float = DEFAULT_GRID_SIZE_DEGREES,
 ) -> np.ndarray | None:
    """Get POI indices from all grid cells intersecting the radius bounding box."""
-    if not np.isfinite(pc_lat) or not np.isfinite(pc_lon):
+    if (
+        not np.isfinite(pc_lat)
+        or not np.isfinite(pc_lon)
+        or not (UK_LAT_MIN <= pc_lat <= UK_LAT_MAX)
+        or not (UK_LON_MIN <= pc_lon <= UK_LON_MAX)
+    ):
        return None

    lat_delta = radius_km / KM_PER_DEGREE_LAT
@ -182,7 +207,7 @@ def min_distance_per_postcode(
    pc_lats = postcodes_df["lat"].to_numpy()
    pc_lons = postcodes_df["lon"].to_numpy()
    pc_codes = postcodes_df["postcode"].to_list()
-    valid_pc_mask = np.isfinite(pc_lats) & np.isfinite(pc_lons)
+    valid_pc_mask = valid_uk_coords_mask(pc_lats, pc_lons)
    valid_pc_indices = np.flatnonzero(valid_pc_mask)

    result_min_dist = {