idk

2026-06-02 13:46:18 +01:00 · 2026-06-02 13:46:18 +01:00 · d43da9708c
commit d43da9708c
parent a04ac2d857
47 changed files with 4120 additions and 573 deletions
--- a/pipeline/transform/crime.py
+++ b/pipeline/transform/crime.py
@ -95,11 +95,14 @@ def transform_crime(
        f"({valid_months[0]} to {valid_months[-1]})"
    )

-    # Count monthly incidents, then annualise over every valid month in the dataset.
-    # `_weight` (≤1) comes from the LSOA 2011→2021 lookup: 2011 LSOAs that split
-    # into N 2021 LSOAs contribute 1/N of their count to each child, since we
-    # don't know which child a given incident actually belonged to.
-    yearly_counts = (
+    # Annualise each year separately (count_in_year * 12 / months_in_year), then
+    # take the simple mean of those per-year rates over the years each type is
+    # present. This makes the headline equal the average of the by-year chart bars
+    # (_write_crime_by_year) instead of a month-weighted pooled rate, mirroring
+    # crime_spatial._write_avg_yr. `_weight` (≤1) comes from the LSOA 2011→2021
+    # lookup: 2011 LSOAs that split into N 2021 LSOAs contribute 1/N of their count
+    # to each child, since we don't know which child an incident actually belonged to.
+    filtered = (
        df.filter(
            valid_month_expr
            & pl.col("LSOA code").is_not_null()
@ -107,15 +110,31 @@ def transform_crime(
            & pl.col("Crime type").is_not_null()
            & (pl.col("Crime type") != "")
        )
-        .with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
-        .group_by("LSOA code", "Month", "Crime type")
-        .agg((pl.col("_weight").first() * pl.len()).alias("count"))
-        .group_by("LSOA code", "Crime type")
-        .agg(
-            (pl.col("count").sum() / pl.lit(valid_month_count) * 12)
-            .round(1)
-            .alias("yearly_avg")
+        .with_columns(
+            pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"),
+            pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES),
        )
+    )
+
+    # Months observed *anywhere* in the dataset for each year (annualisation
+    # denominator), matching the by-year output's per-year scaling.
+    months_per_year = filtered.group_by("year").agg(
+        pl.col("Month").n_unique().alias("months_in_year")
+    )
+
+    yearly_counts = (
+        filtered.group_by("LSOA code", "year", "Crime type", "Month")
+        .agg((pl.col("_weight").first() * pl.len()).alias("count"))
+        .group_by("LSOA code", "year", "Crime type")
+        .agg(pl.col("count").sum().alias("count"))
+        .join(months_per_year, on="year")
+        .with_columns(
+            (pl.col("count") * 12.0 / pl.col("months_in_year")).alias("per_year")
+        )
+        # Mean of the per-year annualised rates over the years the type is present
+        # (only years with rows are grouped here, so this is the correct x-span).
+        .group_by("LSOA code", "Crime type")
+        .agg(pl.col("per_year").mean().round(1).alias("yearly_avg"))
        .collect(engine="streaming")
    )
    if yearly_counts.is_empty():
--- a/pipeline/transform/crime_spatial.py
+++ b/pipeline/transform/crime_spatial.py
@ -259,11 +259,14 @@ def _write_avg_yr(
    """
    months = np.array([months_in_year[year] for year in years], dtype=np.float64)
    per_year = counts.astype(np.float64) * 12.0 / months[None, None, :]
-    # Average over the years each type is actually observed anywhere -- the same
-    # per-type x-span the by-year chart plots (server-rs/.../crime_by_year.rs).
-    type_year_present = counts.sum(axis=0) > 0  # (n_types, n_years)
-    years_per_type = np.clip(type_year_present.sum(axis=1), 1, None).astype(np.float64)
-    avg = per_year.sum(axis=2) / years_per_type[None, :]  # (n_postcodes, n_types)
+    # Average over the years *this postcode* actually has incidents of *this
+    # type* -- the same per-(postcode, type) x-span the by-year chart plots
+    # (server-rs/.../crime_by_year.rs), so the headline equals the mean of the
+    # by-year bars. Dividing by a global years-present count (years a type
+    # appeared anywhere in England) would deflate postcodes whose incidents
+    # cluster in only a few years of the ~13-year window.
+    years_present = np.clip((counts > 0).sum(axis=2), 1, None).astype(np.float64)
+    avg = per_year.sum(axis=2) / years_present  # (n_postcodes, n_types)
    avg = np.round(avg * norm[:, None], 1).astype(np.float32)

    data: dict[str, np.ndarray] = {"postcode": postcodes}
--- a/pipeline/transform/join_epc_pp.py
+++ b/pipeline/transform/join_epc_pp.py
@ -365,6 +365,16 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
    }
    duration_map = {"F": "Freehold", "L": "Leasehold"}

+    # price >= MIN_PRICE and ppd_category == "A" (standard open-market sale) are
+    # VALUE-QUALITY filters: they gate the price aggregations only. Category B
+    # entries (repossessions, bulk/portfolio, power-of-sale transfers) and sub-MIN
+    # sales must not pollute latest_price / historical_prices (and the downstream
+    # price-per-sqm feature), but they MUST still count for first_transfer_date /
+    # old_new so a new-build's genuine earliest transfer year is preserved.
+    price_ok = pl.col("price") >= MIN_PRICE
+    category_ok = pl.col("ppd_category") == "A"
+    quality_ok = price_ok & category_ok
+
    price_paid = (
        pl.scan_parquet(price_paid_path)
        .select(
@ -381,9 +391,9 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
            "town_city",
            pl.col("duration").replace(duration_map),
            "old_new",
+            "ppd_category",
        )
        .filter(pl.col("pp_property_type") != "Other")
-        .filter(pl.col("price") >= MIN_PRICE)
        .with_columns(
            pl.concat_str(
                [pl.col("saon"), pl.col("paon"), pl.col("street")],
@ -408,18 +418,26 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
            pl.col("postcode").last(),
            pl.col("_pp_match_address").last(),
            pl.col("_pp_match_postcode").last(),
+            # Price aggregations are restricted to quality-passing sales.
            pl.struct(
                pl.col("date_of_transfer").dt.year().alias("year"),
                pl.col("date_of_transfer").dt.month().cast(pl.UInt8).alias("month"),
                "price",
-            ).alias("historical_prices"),
+            )
+            .filter(quality_ok)
+            .alias("historical_prices"),
            pl.col("pp_property_type").last(),
            pl.col("duration").last(),
-            pl.col("price").last().alias("latest_price"),
-            pl.col("date_of_transfer").last(),
+            pl.col("price").filter(quality_ok).last().alias("latest_price"),
+            pl.col("date_of_transfer").filter(quality_ok).last(),
+            # first_transfer_date / old_new reflect the genuine earliest transfer
+            # over the full per-group transaction stream (not value-filtered).
            pl.col("date_of_transfer").first().alias("first_transfer_date"),
            pl.col("old_new").first(),
        )
+        # Keep only groups with >=1 quality-passing sale. NOTE: stricter than the
+        # old gate (>=1 sale >=MIN_PRICE): properties with no Category-A sale drop.
+        .filter(pl.col("latest_price").is_not_null())
    )

    print("Price paid dataset")
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -839,25 +839,36 @@ def _join_area_side_tables(
    # Crime is counted spatially per postcode (incidents within 50m of the
    # postcode boundary), so it joins on postcode rather than LSOA.
    base = base.join(crime, on="postcode", how="left")
+    serious_crime_cols = [
+        "Violence and sexual offences (avg/yr)",
+        "Robbery (avg/yr)",
+        "Burglary (avg/yr)",
+        "Possession of weapons (avg/yr)",
+    ]
+    minor_crime_cols = [
+        "Anti-social behaviour (avg/yr)",
+        "Criminal damage and arson (avg/yr)",
+        "Shoplifting (avg/yr)",
+        "Bicycle theft (avg/yr)",
+        "Theft from the person (avg/yr)",
+        "Other theft (avg/yr)",
+        "Vehicle crime (avg/yr)",
+        "Public order (avg/yr)",
+        "Drugs (avg/yr)",
+        "Other crime (avg/yr)",
+    ]
+    # The LEFT join leaves every per-type column null for postcodes absent from
+    # the crime table; sum_horizontal alone would fabricate a "zero crime"
+    # rollup there, so keep the rollup null when ALL components are null.
    base = base.with_columns(
-        pl.sum_horizontal(
-            "Violence and sexual offences (avg/yr)",
-            "Robbery (avg/yr)",
-            "Burglary (avg/yr)",
-            "Possession of weapons (avg/yr)",
-        ).alias("serious_crime_avg_yr"),
-        pl.sum_horizontal(
-            "Anti-social behaviour (avg/yr)",
-            "Criminal damage and arson (avg/yr)",
-            "Shoplifting (avg/yr)",
-            "Bicycle theft (avg/yr)",
-            "Theft from the person (avg/yr)",
-            "Other theft (avg/yr)",
-            "Vehicle crime (avg/yr)",
-            "Public order (avg/yr)",
-            "Drugs (avg/yr)",
-            "Other crime (avg/yr)",
-        ).alias("minor_crime_avg_yr"),
+        pl.when(pl.all_horizontal([pl.col(c).is_null() for c in serious_crime_cols]))
+        .then(None)
+        .otherwise(pl.sum_horizontal(serious_crime_cols))
+        .alias("serious_crime_avg_yr"),
+        pl.when(pl.all_horizontal([pl.col(c).is_null() for c in minor_crime_cols]))
+        .then(None)
+        .otherwise(pl.sum_horizontal(minor_crime_cols))
+        .alias("minor_crime_avg_yr"),
    )

    base = base.join(median_age, on="lsoa21", how="left")
@ -1179,7 +1190,22 @@ def _load_listings_for_merge(listings_path: Path, arcgis_path: Path) -> pl.DataF
    # pages); tolerate its absence so older parquets and test fixtures still
    # load. Digits-only so it compares equal to the EPC register's UPRN.
    if "UPRN" in raw.collect_schema().names():
-        uprn_digits = pl.col("UPRN").cast(pl.Utf8).str.replace_all(r"\D", "")
+        # Mirror `_normalize_uprn` exactly so the listing key compares equal to
+        # the candidate-side key for every dtype. For a Float UPRN we must
+        # stringify via its integer form (100023336956.0 -> "100023336956"),
+        # otherwise stripping non-digits from "100023336956.0" yields a bogus
+        # trailing-zero key ("1000233369560") that never collides; and a
+        # non-integral float (e.g. 1.5) must be rejected rather than mangled.
+        uprn_col = pl.col("UPRN")
+        if raw.collect_schema()["UPRN"].is_float():
+            integral = uprn_col.cast(pl.Int64, strict=False)
+            uprn_digits = (
+                pl.when(integral == uprn_col)
+                .then(integral.cast(pl.Utf8).str.replace_all(r"\D", ""))
+                .otherwise(None)
+            )
+        else:
+            uprn_digits = uprn_col.cast(pl.Utf8).str.replace_all(r"\D", "")
        listing_uprn_expr = (
            pl.when(uprn_digits.str.len_chars() > 0)
            .then(uprn_digits)
@ -1615,9 +1641,23 @@ def _enrich_listings_with_direct_epc(


 def _coalesce_direct_epc_columns(wide: pl.LazyFrame) -> pl.LazyFrame:
+    """Fill each raw EPC column from its directly-matched counterpart.
+
+    Every ``(raw, direct)`` pair in ``_DIRECT_EPC_RAW_COLUMN_MAP`` is rewritten
+    under the raw column's name: the raw value wins when it is non-null,
+    otherwise the direct value is used. ``was_council_house`` is the one
+    exception and becomes "Yes" whenever either side is "Yes".
+    """
+
+    def _coalesced(raw_column: str, direct_column: str) -> pl.Expr:
+        # Default rule: first non-null of (raw, direct).
+        coalesce = pl.coalesce(pl.col(raw_column), pl.col(direct_column))
+        # The raw property-level value is fill_null("No") upstream, so a plain
+        # coalesce lets a non-null "No" override a directly-matched listing
+        # "Yes". "Former council house" should fire if EITHER side says so.
+        if raw_column == "was_council_house":
+            return (
+                pl.when((pl.col(raw_column) == "Yes") | (pl.col(direct_column) == "Yes"))
+                .then(pl.lit("Yes"))
+                # Neither side is "Yes": fall back to the default rule.
+                .otherwise(coalesce)
+                .alias(raw_column)
+            )
+        return coalesce.alias(raw_column)
+
     return wide.with_columns(
         [
-            pl.coalesce(pl.col(raw_column), pl.col(direct_column)).alias(raw_column)
+            _coalesced(raw_column, direct_column)
             for raw_column, direct_column in _DIRECT_EPC_RAW_COLUMN_MAP.items()
         ]
     )
--- a/pipeline/transform/poi_proximity.py
+++ b/pipeline/transform/poi_proximity.py
@ -12,11 +12,19 @@ from pipeline.utils.poi_counts import count_pois_per_postcode, min_distance_per_

 # POI category groups for proximity counting (2km radius).
 # Names must match the friendly names produced by transform_poi.py / naptan.py.
+# "groceries" is filled in dynamically by _groceries_categories() because the
+# GEOLYTIX dataset stores the brand (e.g. "Tesco", "Aldi") in `category` rather
+# than the literal "Supermarket"; counting only the OSM strings here severely
+# understates the metric. See _groceries_categories below.
 POI_GROUPS_2KM = {
    "restaurants": ["Restaurant", "Fast Food"],
-    "groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
 }

+# POI group whose members are counted for the static "groceries" 2km metric.
+# Covers both the OSM grocery categories (Supermarket, Convenience Store,
+# Greengrocer, ...) and the GEOLYTIX brand categories (Tesco, Aldi, ...).
+GROCERIES_GROUP = "Groceries"
+
 # OS Open Greenspace function types used for park counts and distance calculation.
 # Uses the authoritative OS dataset instead of OSM point POIs for better coverage
 # of green spaces that are only mapped as polygons in OSM.
@ -41,6 +49,26 @@ def _poi_category_slug(category: str) -> str:
    return slug or "poi"


+def _groceries_categories(pois: pl.DataFrame) -> list[str]:
+    """Return the distinct non-null `category` values for the Groceries group.
+
+    `count_pois_per_postcode` matches POIs on `category`, but the authoritative
+    GEOLYTIX grocery dataset stores the brand name there (e.g. "Tesco", "Aldi")
+    with group "Groceries"; it never emits the literal "Supermarket". Collecting
+    every Groceries category captures both the OSM strings and the brand names.
+
+    Rows whose `category` is null are skipped so the result only ever holds
+    real category names (the declared ``list[str]``); a ``None`` entry would
+    otherwise be handed to the category matcher as if it were a name.
+
+    Returns:
+        The category names, de-duplicated and sorted ascending.
+
+    Raises:
+        ValueError: if `pois` has no `group` column.
+    """
+    if "group" not in pois.columns:
+        raise ValueError("POI dataframe must include a 'group' column")
+    return (
+        pois.filter(pl.col("group") == GROCERIES_GROUP)
+        .get_column("category")
+        .drop_nulls()
+        .unique()
+        .sort()
+        .to_list()
+    )
+
+
 def _build_poi_category_groups(
    pois: pl.DataFrame,
 ) -> tuple[dict[str, list[str]], dict[str, str]]:
@ -122,9 +150,15 @@ def main():
    pois = pl.read_parquet(args.pois)
    poi_category_groups, poi_display_names = _build_poi_category_groups(pois)

-    # Count static amenity groups within 2km.
+    # Count static amenity groups within 2km. "groceries" is matched against
+    # every Groceries category (OSM strings + GEOLYTIX brand names) so that
+    # postcodes ringed by GEOLYTIX-only chains (Tesco, Aldi, ...) are counted.
+    groups_2km = {
+        **POI_GROUPS_2KM,
+        "groceries": _groceries_categories(pois),
+    }
    counts_2km = count_pois_per_postcode(
-        postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2
+        postcodes, pois, groups=groups_2km, radius_km=2
    )

    # Dynamic amenity filters: nearest distance plus counts within 2km and 5km for
--- a/pipeline/transform/postcode_boundaries/README.md
+++ b/pipeline/transform/postcode_boundaries/README.md
@ -77,9 +77,9 @@ The output of `process_oa` is `list[(postcode, polygon)]` — the per-OA fragmen

 ### Phase 4: Merging and writing

-**Fragment merging** (`output.py:merge_fragments`): Groups all fragments by postcode, unions them. If the result is a MultiPolygon (meaning the postcode has disconnected pieces — either from spanning OAs with a gap, or algorithm artifacts), applies a 5m buffer-then-unbuffer to close tiny gaps from floating-point mismatches at OA boundary edges. If still a MultiPolygon after that, keeps only the largest polygon — postcodes are contiguous delivery routes, so detached fragments are artifacts.
+**Fragment merging** (`output.py:merge_fragments`): Groups all fragments by postcode, unions them. If the result is a MultiPolygon (meaning the postcode has disconnected pieces — either from spanning OAs with a gap, or algorithm artifacts), applies a 5m buffer-then-unbuffer to close tiny gaps from floating-point mismatches at OA boundary edges. If still a MultiPolygon after that, keeps the largest part **plus any other part ≥ `_MIN_DETACHED_PART_AREA` (100 m²)** (`_keep_polygon_parts`); only sub-100 m² noise slivers are dropped. Keeping substantial detached parts matters because a postcode genuinely split across an OA seam (by a railway, river, or main road wider than the 5m buffer) would otherwise lose a chunk — measured at ~1.8% of merged area left as uncovered gaps (often 3000–5000 m² building blocks) before this change.

-**GeoJSON output** (`output.py:write_district_geojson`): Groups postcodes by district (the outward code, e.g. `SW1A` from `SW1A 1AA`). For each district, converts every postcode polygon from BNG to WGS84 using pyproj, simplifies with 1m tolerance (Douglas-Peucker), rounds coordinates to 6 decimal places (~0.1m precision), and writes a single `{district}.geojson` FeatureCollection. Each Feature has `postcodes` (formatted like `"SW1A 1AA"`) and `mapit_code` (no space: `"SW1A1AA"`) in its properties.
+**GeoJSON output** (`output.py:write_district_geojson`): Two passes. Pass 1 converts every postcode from BNG to WGS84 (pyproj), simplifies with 1m tolerance (Douglas-Peucker), and snaps to 6 decimal places (~0.1m precision); multi-part postcodes become `MultiPolygon` (`to_wgs84_geojson_multi`, each part handled independently), single-part stay `Polygon`. The whole set is then made a **partition** (`_resolve_overlaps`): each postcode is trimmed by the union of its higher-priority overlapping neighbours, where **priority = ascending area** (smaller postcodes win contested ground). That single rule handles both seam overlap *and* containment — an enclosed postcode is always smaller than its container, so it keeps its area while the container gets a hole (the query uses both the `overlaps` and `contains` predicates, since `overlaps` alone excludes containment). This runs last, so nothing re-introduces overlap; a postcode that would be emptied keeps its original geometry, so no active postcode is dropped. Pass 2 groups postcodes by district (the outward code, e.g. `SW1A` from `SW1A 1AA`), rounds coordinates to 6dp, and writes a `{district}.geojson` FeatureCollection. Each Feature has `postcodes` (formatted like `"SW1A 1AA"`) and `mapit_code` (no space: `"SW1A1AA"`) in its properties.

 ## Memory architecture

@ -103,10 +103,10 @@ Key design choices:

 ## Key invariants

-1. **Every square meter of every OA is assigned to exactly one postcode** — the combination of INSPIRE claiming + Voronoi fills the entire OA, and overlap resolution ensures no double-counting
+1. **No two postcodes cover the same ground in the output** — within an OA the INSPIRE claiming + Voronoi tile it with no overlap, and a final `_resolve_overlaps` partition pass removes the thin overlap strips that the merge buffer + per-postcode simplification introduce across OA seams (measured residual overlap ~0.01% of area)
 2. **Every postcode that exists in the UPRN data gets a polygon** — unless all its UPRNs share coordinates with another postcode's UPRNs (handled by jitter) or it has zero UPRNs
 3. **Postcode polygons never extend outside their OA(s)** — all geometry is clipped to OA boundaries
-4. **Output is always single Polygon, never MultiPolygon** — the largest-polygon extraction in both `merge_fragments` and `to_wgs84_geojson` ensures this
+4. **A postcode split across an OA seam keeps all its substantial parts** — `merge_fragments` keeps every part ≥ 100 m² and the output is emitted as a `MultiPolygon` (the Rust server `postcodes.rs` and `loader.py` both parse MultiPolygon); only sub-100 m² noise slivers are dropped

 ## Module structure

--- a/pipeline/transform/postcode_boundaries/main.py
+++ b/pipeline/transform/postcode_boundaries/main.py
@ -1,12 +1,21 @@
 import argparse
+import multiprocessing as mp
+import os
 from pathlib import Path

+import numpy as np
+import shapely
 from shapely.geometry import MultiPolygon, Polygon
 from tqdm import tqdm

+from .fragments_cache import (
+    fragments_cache_is_fresh,
+    load_fragments,
+    save_fragments,
+)
 from .inspire import (
+    build_inspire_index,
    cache_inspire,
-    get_inspire_candidates,
    inspire_cache_exists,
    load_inspire,
 )
@ -14,7 +23,206 @@ from .memory import release_memory
 from .oa_boundaries import load_oa_boundaries
 from .output import merge_fragments, write_district_geojson
 from .process_oa import process_oa
-from .uprn import get_oa_uprns, load_uprns
+from .uprn import extract_uprn_arrays, get_oa_uprns_arrays, load_uprns
+
+Fragment = tuple[str, Polygon | MultiPolygon]
+
+
+def _oa_fragments(
+    oa_code, oa_geoms, east, north, postcodes_arr, offsets, index
+) -> tuple[list[Fragment], bool]:
+    """Build the ``(postcode, geometry)`` fragments for a single OA.
+
+    An OA whose UPRNs all carry one postcode takes the fast path: the whole OA
+    geometry is that postcode's only fragment. Otherwise INSPIRE candidates are
+    looked up by the OA's bounds and the OA is handed to ``process_oa``.
+
+    Returns ``(fragments, is_single)`` where ``is_single`` is True on the fast
+    path. Both the sequential and the parallel driver call this helper. Any
+    failure is re-raised as a ``RuntimeError`` that names ``oa_code`` and is
+    chained to the original exception, so a bad OA can be identified.
+    """
+    try:
+        boundary = oa_geoms[oa_code]
+        uprn_points, uprn_postcodes = get_oa_uprns_arrays(
+            east, north, postcodes_arr, offsets, oa_code
+        )
+        is_single = len(set(uprn_postcodes)) == 1
+        if is_single:
+            fragments = [(uprn_postcodes[0], boundary)]
+        else:
+            candidates = index.candidates(boundary.bounds)
+            fragments = process_oa(boundary, uprn_points, uprn_postcodes, candidates)
+        return fragments, is_single
+    except Exception as err:
+        raise RuntimeError(f"Failed processing OA {oa_code}: {err!r}") from err
+
+
+# Worker-shared state. Populated in the parent before the pool forks; children
+# inherit it copy-on-write (the numpy/Arrow buffers + coords mmap stay shared,
+# never duplicated per worker). Read-only in workers.
+_WORKER_STATE: dict = {}
+
+
+def _process_oa_chunk(oa_codes: list[str]):
+    """Worker entry point: process a chunk of OA codes into WKB fragments.
+
+    Reads its inputs from the module-level ``_WORKER_STATE`` and returns
+    ``(postcodes, wkb, single_count, n_oas)``. Geometries travel back as WKB
+    (compact and lossless) instead of pickled Shapely objects so the IPC
+    payload stays small; ``wkb`` is an empty object array when the chunk
+    produced no fragments.
+    """
+    shared = _WORKER_STATE
+    state_keys = ("oa_geoms", "east", "north", "postcodes", "offsets", "index")
+    collected: list[Fragment] = []
+    fast_path_total = 0
+    for code in oa_codes:
+        # Look the shared inputs up per OA, in the helper's argument order.
+        produced, took_fast_path = _oa_fragments(
+            code, *(shared[key] for key in state_keys)
+        )
+        collected.extend(produced)
+        fast_path_total += took_fast_path
+
+    if not collected:
+        return [], np.empty(0, dtype=object), fast_path_total, len(oa_codes)
+
+    names = [name for name, _ in collected]
+    geometries = np.array([geom for _, geom in collected], dtype=object)
+    return names, shapely.to_wkb(geometries), fast_path_total, len(oa_codes)
+
+
+def _resolve_workers(requested: int) -> int:
+    """Return `requested` when it is positive; otherwise every usable CPU."""
+    if requested and requested > 0:
+        return requested
+    affinity = getattr(os, "sched_getaffinity", None)
+    if affinity is None:
+        return max(1, os.cpu_count() or 1)
+    return max(1, len(affinity(0)))
+
+
+def _process_oas(
+    oa_codes, oa_geoms, east, north, postcodes_arr, offsets, index, workers
+) -> tuple[list[Fragment], int]:
+    """Drive Phase 3 over every OA, fanning out across `workers` processes.
+
+    OAs are independent, so the loop parallelises cleanly. ``fork`` lets workers
+    share the big read-only inputs (INSPIRE arrays + coords mmap, UPRN arrays, OA
+    geometries) copy-on-write instead of duplicating ~2GB each. Fragment order
+    does not affect the result (``merge_fragments`` unions per postcode), so
+    chunks are collected as they finish. Returns ``(fragments, single_count)``.
+
+    Falls back to a sequential in-process loop when ``workers <= 1`` or the
+    platform offers no ``fork`` start method. ``single_count`` is the number of
+    OAs that took the single-postcode fast path.
+    """
+    all_fragments: list[Fragment] = []
+    single_count = 0
+
+    # Sequential path: same per-OA helper, no pool and no WKB round-trip.
+    if workers <= 1 or "fork" not in mp.get_all_start_methods():
+        for oa_code in tqdm(
+            oa_codes, desc="Processing OAs", unit="OA", smoothing=0.01, miniters=100
+        ):
+            oa_frags, is_single = _oa_fragments(
+                oa_code, oa_geoms, east, north, postcodes_arr, offsets, index
+            )
+            all_fragments.extend(oa_frags)
+            single_count += is_single
+        return all_fragments, single_count
+
+    # Publish the inputs in the module-level dict *before* the pool is created,
+    # so the forked children find them already populated.
+    _WORKER_STATE.update(
+        oa_geoms=oa_geoms,
+        east=east,
+        north=north,
+        postcodes=postcodes_arr,
+        offsets=offsets,
+        index=index,
+    )
+    # Many small contiguous chunks → dynamic load balancing across workers (rural
+    # OAs cost far more than urban ones) while preserving mmap read locality.
+    # Targets roughly 16 chunks per worker, never fewer than one OA per chunk.
+    chunk_size = max(1, len(oa_codes) // (workers * 16))
+    chunks = [oa_codes[i : i + chunk_size] for i in range(0, len(oa_codes), chunk_size)]
+    print(f"  Parallel: {workers} workers, {len(chunks)} chunks of ~{chunk_size} OAs")
+
+    ctx = mp.get_context("fork")
+    try:
+        with ctx.Pool(processes=workers) as pool:
+            with tqdm(
+                total=len(oa_codes), desc="Processing OAs", unit="OA", smoothing=0.01
+            ) as bar:
+                # Results arrive in completion order, not submission order.
+                for pcs, wkb, single, n_oas in pool.imap_unordered(
+                    _process_oa_chunk, chunks
+                ):
+                    # Workers send geometries as WKB; decode them and pair each
+                    # with its postcode. A chunk with no fragments sends an
+                    # empty array, so there is nothing to decode.
+                    if len(wkb):
+                        all_fragments.extend(zip(pcs, shapely.from_wkb(wkb)))
+                    single_count += single
+                    bar.update(n_oas)
+    finally:
+        # Drop references so Phase 4 doesn't keep the big inputs alive.
+        _WORKER_STATE.clear()
+    return all_fragments, single_count
+
+
+def build_fragments(args: argparse.Namespace) -> list[Fragment]:
+    """Execute Phases 1-3 and return every ``(postcode, geometry)`` fragment.
+
+    Phase 1 loads OA boundaries and UPRNs, Phase 2 loads the INSPIRE cache
+    (building it first when absent) and indexes it, and Phase 3 processes each
+    OA that has both a boundary and UPRNs. The large intermediate structures
+    live only in this function's locals, so they can be freed as soon as it
+    returns -- before the fragments are cached or merged.
+    """
+    rule = "=" * 60
+
+    # Phase 1: Load all data
+    print(rule)
+    print("Phase 1: Loading data")
+    print(rule)
+
+    oa_geoms = load_oa_boundaries(args.oa_boundaries)
+    uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
+    # Workers read these fork-shareable numpy/Arrow arrays and never call polars
+    # (avoids the fork-after-threads hazard of its rayon pool).
+    uprn_east, uprn_north, uprn_postcodes = extract_uprn_arrays(uprn_df)
+
+    # Phase 2: Parse/load INSPIRE
+    print()
+    print(rule)
+    print("Phase 2: INSPIRE data")
+    print(rule)
+
+    cache_dir = args.output / "inspire_cache"
+    if not inspire_cache_exists(cache_dir):
+        cache_inspire(args.inspire, cache_dir)
+    bboxes, parcel_offsets, coords = load_inspire(cache_dir)
+    inspire_index = build_inspire_index(bboxes, parcel_offsets, coords)
+
+    # Phase 3: Process OAs
+    print()
+    print(rule)
+    print("Phase 3: Processing OAs")
+    print(rule)
+
+    # Work list: OAs present in both the boundary set and the UPRN offsets.
+    # The skip tallies are taken before any --limit truncation.
+    work = sorted(set(oa_geoms.keys()) & set(uprn_offsets.keys()))
+    skipped_no_uprn = len(oa_geoms) - len(work)
+    skipped_no_boundary = len(uprn_offsets) - len(work)
+
+    limited = args.limit > 0
+    if limited:
+        work = work[: args.limit]
+
+    print(f"  OAs with UPRNs + boundaries: {len(work)}")
+    print(f"  Skipped (no UPRNs): {skipped_no_uprn}")
+    print(f"  Skipped (no boundary): {skipped_no_boundary}")
+
+    # --limit is a debug mode → force deterministic single-process.
+    if limited:
+        workers = 1
+    else:
+        workers = _resolve_workers(args.workers)
+    fragments, single_count = _process_oas(
+        work,
+        oa_geoms,
+        uprn_east,
+        uprn_north,
+        uprn_postcodes,
+        uprn_offsets,
+        inspire_index,
+        workers,
+    )
+    multi_count = len(work) - single_count
+
+    print(f"\n  Single-postcode OAs (fast path): {single_count}")
+    print(f"  Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
+    print(f"  Total fragments: {len(fragments)}")
+
+    return fragments


 def main() -> None:
@ -38,6 +246,12 @@ def main() -> None:
    parser.add_argument(
        "--limit", type=int, default=0, help="Process only first N OAs (0=all)"
    )
+    parser.add_argument(
+        "--workers",
+        type=int,
+        default=0,
+        help="Parallel worker processes for OA processing (0=all CPUs, 1=sequential)",
+    )
    parser.add_argument(
        "--greenspace",
        type=Path,
@ -46,79 +260,30 @@ def main() -> None:
    )
    args = parser.parse_args()

-    # Phase 1: Load all data
-    print("=" * 60)
-    print("Phase 1: Loading data")
-    print("=" * 60)
+    fragments_cache = args.output / "fragments_cache.parquet"
+    # Phase 3 depends only on these inputs; greenspace is applied later (Phase 4),
+    # so a greenspace change must not invalidate the fragment cache.
+    fragment_inputs = [args.uprn, args.arcgis, args.oa_boundaries, args.inspire]
+    # --limit yields a partial fragment set; never read or write the shared cache.
+    use_cache = args.limit == 0

-    oa_geoms = load_oa_boundaries(args.oa_boundaries)
-    uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
-
-    # Phase 2: Parse/load INSPIRE
-    print()
-    print("=" * 60)
-    print("Phase 2: INSPIRE data")
-    print("=" * 60)
-
-    inspire_cache_dir = args.output / "inspire_cache"
-    if not inspire_cache_exists(inspire_cache_dir):
-        cache_inspire(args.inspire, inspire_cache_dir)
-    inspire_bboxes, inspire_offsets, inspire_coords = load_inspire(inspire_cache_dir)
-
-    # Phase 3: Process OAs
-    print()
-    print("=" * 60)
-    print("Phase 3: Processing OAs")
-    print("=" * 60)
-
-    # Build work list — precompute which OAs are single vs multi-postcode
-    oa_codes_with_data = sorted(set(oa_geoms.keys()) & set(uprn_offsets.keys()))
-    skipped_no_uprn = len(oa_geoms) - len(oa_codes_with_data)
-    skipped_no_boundary = len(uprn_offsets) - len(oa_codes_with_data)
-
-    if args.limit > 0:
-        oa_codes_with_data = oa_codes_with_data[: args.limit]
-
-    print(f"  OAs with UPRNs + boundaries: {len(oa_codes_with_data)}")
-    print(f"  Skipped (no UPRNs): {skipped_no_uprn}")
-    print(f"  Skipped (no boundary): {skipped_no_boundary}")
-
-    all_fragments: list[tuple[str, Polygon | MultiPolygon]] = []
-    single_count = 0
-    multi_count = 0
-
-    for oa_code in tqdm(
-        oa_codes_with_data,
-        desc="Processing OAs",
-        unit="OA",
-        smoothing=0.01,
-        miniters=100,
-    ):
-        oa_geom = oa_geoms[oa_code]
-        points, postcodes = get_oa_uprns(uprn_df, uprn_offsets, oa_code)
-
-        if len(set(postcodes)) == 1:
-            # Fast path: entire OA = one postcode
-            all_fragments.append((postcodes[0], oa_geom))
-            single_count += 1
-            continue
-
-        # Get INSPIRE candidates via bbox pre-filter
-        candidates = get_inspire_candidates(
-            oa_geom.bounds, inspire_bboxes, inspire_offsets, inspire_coords
+    if use_cache and fragments_cache_is_fresh(fragments_cache, fragment_inputs):
+        print("=" * 60)
+        print("Phase 3 cache hit — loading fragments (skipping Phases 1-3)")
+        print("=" * 60)
+        all_fragments = load_fragments(fragments_cache)
+        print(
+            f"  Loaded {len(all_fragments):,} cached fragments from {fragments_cache}"
        )
+    else:
+        all_fragments = build_fragments(args)
+        if use_cache:
+            # Persist the expensive Phase-3 output before the cheap-but-fragile
+            # merge/write so any failure there resumes in seconds, not ~10 hours.
+            save_fragments(fragments_cache, all_fragments)
+            print(f"  Cached {len(all_fragments):,} fragments to {fragments_cache}")

-        fragments = process_oa(oa_geom, points, postcodes, candidates)
-        all_fragments.extend(fragments)
-        multi_count += 1
-
-    print(f"\n  Single-postcode OAs (fast path): {single_count}")
-    print(f"  Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
-    print(f"  Total fragments: {len(all_fragments)}")
-
-    # Free data no longer needed
-    del oa_geoms, uprn_df, uprn_offsets
-    del inspire_bboxes, inspire_offsets, inspire_coords
+    # Free Phase-1-3 intermediates (build_fragments' locals) back to the OS.
    release_memory()

    # Phase 4: Merge and write
@ -145,6 +310,12 @@ def main() -> None:

    file_count = write_district_geojson(merged, args.output)
    print(f"\n  Wrote {file_count} district GeoJSON files to {args.output / 'units'}")
+
+    # The cache exists only to survive a crash between Phase 3 and a clean write.
+    # Now that the output is complete, drop it so a later input change can never
+    # be served from a stale cache.
+    if use_cache:
+        fragments_cache.unlink(missing_ok=True)
    print("Done!")


--- a/pipeline/transform/postcode_boundaries/inspire.py
+++ b/pipeline/transform/postcode_boundaries/inspire.py
@ -112,44 +112,130 @@ def load_inspire(
    return bboxes, offsets, coords_mmap


-def get_inspire_candidates(
-    oa_bounds: tuple[float, float, float, float],
+# Grid cell size (m) for the parcel spatial index. The median parcel is ~25 m
+# and the 99th percentile ~540 m, so almost every parcel fits inside a single
+# 1 km cell; the ~0.4% larger than a cell go to an overflow list tested on every
+# query.
+_GRID_CELL_SIZE = 1000.0
+
+
+class InspireIndex:
+    """Uniform-grid spatial index over INSPIRE parcel bounding boxes.
+
+    The per-OA candidate lookup used to linear-scan all ~24M bboxes (O(N) per
+    OA, ~4 h total over the country). This indexes parcels by grid cell so each
+    lookup is O(cells_spanned + candidates). Parcels no larger than one cell are
+    bucketed by their bbox min-corner cell in a CSR layout (parcel indices sorted
+    by cell id, located with ``searchsorted``); the few parcels larger than a
+    cell are kept in an overflow array tested directly on every query. An exact
+    bbox test then runs on the gathered subset and the result is sorted. The
+    queried cell range is clamped to the grid, so every parcel is gathered at
+    most once and the candidate set -- and its order -- matches the old scan.
+
+    An empty parcel array is accepted: every query then returns no candidates.
+    """
+
+    def __init__(
+        self,
+        bboxes: np.ndarray,
+        offsets: np.ndarray,
+        coords_mmap: np.memmap,
+        cell_size: float = _GRID_CELL_SIZE,
+    ) -> None:
+        """Bucket every parcel bbox into the grid.
+
+        Args:
+            bboxes: ``(N, 4)`` array of ``min_x, min_y, max_x, max_y`` per parcel.
+            offsets: per-parcel ``(byte_offset, n_points)`` rows into ``coords_mmap``.
+            coords_mmap: flat float64 coordinate buffer the offsets point into.
+            cell_size: grid cell edge length, in the bbox coordinate units.
+        """
+        self._bboxes = bboxes
+        self._offsets = offsets
+        self._coords = coords_mmap
+        self._cell_size = cell_size
+        if len(bboxes):
+            self._origin_x = float(bboxes[:, 0].min())
+            self._origin_y = float(bboxes[:, 1].min())
+            # Flattened cell id is ``cx * _ny + cy``; +2 leaves a spare row above
+            # the highest occupied one so ids of adjacent columns never collide.
+            self._ny = int((bboxes[:, 1].max() - self._origin_y) // cell_size) + 2
+        else:
+            # min()/max() raise on a zero-size array; an empty index just needs
+            # a well-defined (arbitrary) origin and a non-zero row count.
+            self._origin_x = 0.0
+            self._origin_y = 0.0
+            self._ny = 2
+
+        width = bboxes[:, 2] - bboxes[:, 0]
+        height = bboxes[:, 3] - bboxes[:, 1]
+        small = np.where((width <= cell_size) & (height <= cell_size))[0]
+        self._oversized = np.where((width > cell_size) | (height > cell_size))[0]
+        self._oversized_bb = bboxes[self._oversized]
+
+        cx = ((bboxes[small, 0] - self._origin_x) // cell_size).astype(np.int64)
+        cy = ((bboxes[small, 1] - self._origin_y) // cell_size).astype(np.int64)
+        # Highest occupied column; -1 when no small parcel exists, which makes
+        # the column loop in candidate_indices() empty.
+        self._max_cx = int(cx.max()) if len(cx) else -1
+        cell_id = cx * self._ny + cy
+        order = np.argsort(cell_id, kind="stable")
+        self._sorted_cells = cell_id[order]
+        self._cell_parcels = small[order]
+
+    def candidate_indices(self, oa_bounds: tuple[float, float, float, float]) -> np.ndarray:
+        """Parcel indices whose bbox overlaps ``oa_bounds`` (ascending, unique)."""
+        min_e, min_n, max_e, max_n = oa_bounds
+        cs = self._cell_size
+        # A small parcel (<= one cell) overlapping the OA has its min-corner no
+        # more than one cell below/left of the OA bbox, so widen the low edges by
+        # a cell. This keeps the lookup free of false negatives.
+        #
+        # The ranges are then clamped to the occupied grid. Without the clamp a
+        # query reaching past the first/last row turns ``base + gy`` into an id
+        # that belongs to a neighbouring column, so the same parcel is gathered
+        # once per column and ends up duplicated in the result.
+        gx0 = max(int((min_e - cs - self._origin_x) // cs), 0)
+        gx1 = min(int((max_e - self._origin_x) // cs), self._max_cx)
+        gy_lo = max(int((min_n - cs - self._origin_y) // cs), 0)
+        gy_hi = min(int((max_n - self._origin_y) // cs), self._ny - 1)
+
+        parts = []
+        ob = self._oversized_bb
+        if len(ob):
+            mo = (
+                (ob[:, 2] >= min_e)
+                & (ob[:, 0] <= max_e)
+                & (ob[:, 3] >= min_n)
+                & (ob[:, 1] <= max_n)
+            )
+            if mo.any():
+                parts.append(self._oversized[mo])
+
+        if gy_lo <= gy_hi:
+            for gx in range(gx0, gx1 + 1):
+                base = gx * self._ny
+                lo = np.searchsorted(self._sorted_cells, base + gy_lo, "left")
+                hi = np.searchsorted(self._sorted_cells, base + gy_hi, "right")
+                if hi > lo:
+                    parts.append(self._cell_parcels[lo:hi])
+
+        if not parts:
+            return np.empty(0, dtype=np.int64)
+        cand = np.concatenate(parts)
+        cb = self._bboxes[cand]
+        mask = (
+            (cb[:, 2] >= min_e)
+            & (cb[:, 0] <= max_e)
+            & (cb[:, 3] >= min_n)
+            & (cb[:, 1] <= max_n)
+        )
+        # Sort so the candidate order matches the old full np.where scan exactly.
+        return np.sort(cand[mask])
+
+    def candidates(
+        self, oa_bounds: tuple[float, float, float, float]
+    ) -> list[Polygon]:
+        """INSPIRE polygons overlapping an OA, built from the mmap on demand.
+
+        Builds Shapely objects only for matches (typically 10-500 per OA).
+        Invalid rings are repaired with ``make_valid``; a repair that yields a
+        MultiPolygon keeps its largest part, any other non-Polygon result is
+        dropped, as are empty polygons.
+        """
+        candidates = []
+        for i in self.candidate_indices(oa_bounds):
+            byte_offset = self._offsets[i, 0]
+            n_pts = self._offsets[i, 1]
+            float_offset = byte_offset // 8  # float64 = 8 bytes
+            coords = self._coords[float_offset : float_offset + n_pts * 2].reshape(-1, 2)
+            poly = Polygon(coords)
+            if not poly.is_valid:
+                poly = make_valid(poly)
+                if poly.geom_type == "MultiPolygon":
+                    poly = max(poly.geoms, key=lambda g: g.area)
+                elif poly.geom_type != "Polygon":
+                    continue
+            if not poly.is_empty:
+                candidates.append(poly)
+        return candidates
+
+
+def build_inspire_index(
    bboxes: np.ndarray,
    offsets: np.ndarray,
    coords_mmap: np.memmap,
-) -> list[Polygon]:
-    """Get INSPIRE polygons overlapping an OA via bbox pre-filter.
-
-    Builds Shapely objects only for matches (typically 10-500 per OA).
-    Reads coordinate data on-demand from memory-mapped file.
-    """
-    min_e, min_n, max_e, max_n = oa_bounds
-
-    # Vectorized bbox overlap test
-    mask = (
-        (bboxes[:, 2] >= min_e)
-        & (bboxes[:, 0] <= max_e)
-        & (bboxes[:, 3] >= min_n)
-        & (bboxes[:, 1] <= max_n)
-    )
-    idxs = np.where(mask)[0]
-    if len(idxs) == 0:
-        return []
-
-    # Build Shapely polygons only for candidates (coords from mmap)
-    candidates = []
-    for i in idxs:
-        byte_offset = offsets[i, 0]
-        n_pts = offsets[i, 1]
-        float_offset = byte_offset // 8  # float64 = 8 bytes
-        coords = coords_mmap[float_offset : float_offset + n_pts * 2].reshape(-1, 2)
-        poly = Polygon(coords)
-        if not poly.is_valid:
-            poly = make_valid(poly)
-            if poly.geom_type == "MultiPolygon":
-                poly = max(poly.geoms, key=lambda g: g.area)
-            elif poly.geom_type != "Polygon":
-                continue
-        if not poly.is_empty:
-            candidates.append(poly)
-    return candidates
+    cell_size: float = _GRID_CELL_SIZE,
+) -> InspireIndex:
+    """Build the grid spatial index used for per-OA candidate retrieval."""
+    return InspireIndex(bboxes, offsets, coords_mmap, cell_size)
--- a/pipeline/transform/postcode_boundaries/output.py
+++ b/pipeline/transform/postcode_boundaries/output.py
@ -3,8 +3,9 @@ import shutil
 from collections import defaultdict
 from pathlib import Path

+import numpy as np
 from pyproj import Transformer
-from shapely import make_valid, set_precision
+from shapely import STRtree, make_valid, set_precision
 from shapely.errors import GEOSException
 from shapely.geometry import MultiPolygon, Polygon, mapping, shape
 from shapely.ops import transform as transform_geometry
@ -41,30 +42,30 @@ def _largest_polygonal(geom) -> Polygon | None:
    return None


-def to_wgs84_geojson(
-    geom: Polygon | MultiPolygon, tolerance: float = 1.0
-) -> dict | None:
-    """Simplify geometry in BNG, convert to WGS84, return a valid GeoJSON dict.
+# Output coordinate grid (~0.11 m at UK latitudes). Polygons whose extent is
+# below this in any direction snap to empty during serialization.
+_OUTPUT_PRECISION_DEG = 0.000001
+# Minimal BNG buffer used to rescue sub-grid slivers into a representable
+# footprint. A near-zero-area Voronoi/INSPIRE spike (e.g. three almost-collinear
+# vertices) would otherwise vanish at output precision; since every *active*
+# postcode must keep a boundary (validate_outputs enforces this with zero
+# tolerance), we fatten it just enough to survive snapping rather than drop it.
+_MIN_FOOTPRINT_BUFFER_M = 0.5
+
+
+def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
+    """Transform a BNG polygon to WGS84, snap to output precision, validate.

    Validates the *serialized* GeoJSON dict (via a ``shape()`` round-trip), not
    just the intermediate Shapely object: coordinate snapping during
    serialization can otherwise leave a self-intersecting ring that only shows up
-    once the feature is read back from disk. Any such geometry is repaired with
-    ``make_valid`` before returning so written features are always valid.
+    once the feature is read back from disk. Returns ``None`` if the geometry
+    collapses to empty (a sub-grid sliver).
    """
-    geom = _largest_polygonal(geom)
-    if geom is None:
-        return None
-
-    simplified = geom.simplify(tolerance, preserve_topology=True)
-    simplified = _largest_polygonal(simplified)
-    if simplified is None:
-        return None
-
    transformer = _get_to_wgs84()
-    wgs84 = transform_geometry(transformer.transform, simplified)
+    wgs84 = transform_geometry(transformer.transform, geom_bng)
    try:
-        wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
+        wgs84 = set_precision(wgs84, _OUTPUT_PRECISION_DEG, mode="valid_output")
    except GEOSException:
        # Precision snapping can fail on pathological geometries; fall back to a
        # plain validity repair without coordinate snapping.
@ -87,20 +88,105 @@ def to_wgs84_geojson(
    return geojson_dict


+def _rescue_footprint(geom_bng) -> dict | None:
+    """Buffer a degenerate BNG geometry by the minimum footprint, then snap.
+
+    Returns the snapped WGS84 GeoJSON dict, or ``None`` when the buffered
+    geometry has no polygonal part (or snapping itself yields nothing).
+    """
+    fattened = geom_bng.buffer(_MIN_FOOTPRINT_BUFFER_M)
+    footprint = _largest_polygonal(fattened)
+    return None if footprint is None else _snap_to_wgs84_geojson(footprint)
+
+
+def to_wgs84_geojson(
+    geom: Polygon | MultiPolygon, tolerance: float = 1.0
+) -> dict | None:
+    """Simplify a BNG geometry, convert it to WGS84 and return a GeoJSON dict.
+
+    Some postcodes reduce to a sub-grid sliver that snaps to empty at output
+    precision. Rather than dropping them (which would leave an active postcode
+    without a boundary), the sliver is fattened into a minimal footprint in
+    place; if that also fails, a small disc around ``representative_point()``
+    is used instead. ``None`` is returned for a ``None`` or empty input, or if
+    every rescue attempt yields nothing.
+
+    Args:
+        geom: geometry in British National Grid coordinates.
+        tolerance: simplification tolerance passed to ``simplify`` (BNG units).
+    """
+    if geom is None or geom.is_empty:
+        return None
+
+    polygon = _largest_polygonal(geom)
+    if polygon is None:
+        # Nothing polygonal to clean: go straight to the universal fallback.
+        return _rescue_footprint(geom.representative_point())
+
+    reduced = _largest_polygonal(
+        polygon.simplify(tolerance, preserve_topology=True)
+    )
+    target = polygon if reduced is None else reduced
+
+    # Normal path first; if snapping erases a thin sliver, fatten its real shape.
+    snapped = _snap_to_wgs84_geojson(target)
+    if snapped is None:
+        snapped = _rescue_footprint(target)
+    if snapped is None:
+        # Too degenerate to fatten in place: disc around an interior point.
+        snapped = _rescue_footprint(geom.representative_point())
+    return snapped
+
+
+def to_wgs84_geojson_multi(
+    geom: Polygon | MultiPolygon, tolerance: float = 1.0
+) -> dict | None:
+    """Convert a (possibly multi-part) postcode geometry to a GeoJSON dict,
+    preserving every part.
+
+    Each part is simplified/snapped/rescued independently via
+    :func:`to_wgs84_geojson`; the result is that part's own dict when exactly one
+    part survives, or a ``MultiPolygon`` dict when several do.
+
+    Args:
+        geom: BNG geometry; ``None`` and empty geometries are accepted.
+        tolerance: simplification tolerance forwarded to ``to_wgs84_geojson``.
+
+    Returns:
+        A GeoJSON geometry dict, or ``None`` if the input is ``None``/empty or
+        every part is degenerate.
+    """
+    # Mirror to_wgs84_geojson: a missing geometry is "no boundary", not a crash.
+    if geom is None or geom.is_empty:
+        return None
+
+    parts = list(geom.geoms) if geom.geom_type == "MultiPolygon" else [geom]
+    part_dicts = [d for part in parts if (d := to_wgs84_geojson(part, tolerance))]
+    if not part_dicts:
+        return None
+    if len(part_dicts) == 1:
+        return part_dicts[0]
+
+    # MultiPolygon coordinates are a list of Polygon coordinate arrays.
+    # Defensive: should a part ever serialise as a MultiPolygon itself, splice
+    # its polygons in rather than nesting them one level too deep.
+    polygons = []
+    for part_dict in part_dicts:
+        if part_dict.get("type") == "MultiPolygon":
+            polygons.extend(part_dict["coordinates"])
+        else:
+            polygons.append(part_dict["coordinates"])
+    return {"type": "MultiPolygon", "coordinates": polygons}
+
+
+# Interior holes from the INSPIRE+Voronoi+make_valid chain are small artifacts and
+# get filled. A hole at least this large is likely a genuinely enclosed postcode
+# (kept, so we never solidify over a neighbour); the de-overlap pass is the real
+# guarantee, this is defence-in-depth.
+_MAX_ARTIFACT_HOLE_AREA = 1000.0
+
+
+def _fill_small_holes(poly: Polygon) -> Polygon:
+    """Rebuild ``poly`` keeping only holes of at least _MAX_ARTIFACT_HOLE_AREA."""
+    surviving_holes = []
+    for ring in poly.interiors:
+        if Polygon(ring).area >= _MAX_ARTIFACT_HOLE_AREA:
+            surviving_holes.append(ring)
+    return Polygon(poly.exterior, surviving_holes)
+
+
 def _fill_holes(geom):
-    """Remove all interior rings (holes) from a polygon or multipolygon."""
+    """Fill small artifact interior rings; keep large (real-enclosed) holes."""
    if geom.geom_type == "Polygon":
-        return Polygon(geom.exterior)
+        return _fill_small_holes(geom)
    elif geom.geom_type == "MultiPolygon":
-        return MultiPolygon([Polygon(p.exterior) for p in geom.geoms])
+        return MultiPolygon([_fill_small_holes(p) for p in geom.geoms])
    return geom


-def _largest_polygon(geom):
-    """Extract the largest polygon from a MultiPolygon."""
-    if geom.geom_type == "MultiPolygon":
-        return max(geom.geoms, key=lambda g: g.area)
-    return geom
+# A postcode genuinely split across an OA seam (by a railway, river, or main road
+# wider than the merge buffer) arrives here as a MultiPolygon. Keeping only the
+# largest part used to discard the rest, leaving ~1.8% of merged area as uncovered
+# gaps (often 3000-5000 m² building blocks). Keep every part at least this big;
+# smaller detached bits are Voronoi/clipping noise and are still dropped.
+_MIN_DETACHED_PART_AREA = 100.0
+
+
+def _keep_polygon_parts(geom):
+    """Drop MultiPolygon parts smaller than _MIN_DETACHED_PART_AREA.
+
+    Non-MultiPolygon input is returned untouched. If no part reaches the
+    threshold, the single largest part is kept. A lone survivor is returned as
+    a plain polygon, several as a MultiPolygon.
+    """
+    if geom.geom_type != "MultiPolygon":
+        return geom
+    substantial = [p for p in geom.geoms if p.area >= _MIN_DETACHED_PART_AREA]
+    if len(substantial) > 1:
+        return MultiPolygon(substantial)
+    if substantial:
+        return substantial[0]
+    return max(geom.geoms, key=lambda g: g.area)


 def merge_fragments(
@ -126,14 +212,19 @@ def merge_fragments(
            continue
        if not combined.is_valid:
            combined = make_valid(combined)
-        # Close tiny gaps between adjacent OA boundary edges (float mismatches)
+        # Close tiny gaps between adjacent OA boundary edges (float mismatches).
+        # The closing can erode a tiny MultiPolygon (e.g. a postcode with only a
+        # sliver fragment) to nothing, which would leave the postcode with no
+        # geometry at all — keep the un-closed shape if that happens.
        if combined.geom_type == "MultiPolygon":
-            combined = combined.buffer(5.0).buffer(-5.0)
-            if not combined.is_valid:
-                combined = make_valid(combined)
-        # Postcodes are contiguous delivery routes — keep only the largest
-        # polygon; small detached fragments are algorithm artifacts
-        combined = _largest_polygon(combined)
+            closed = combined.buffer(5.0).buffer(-5.0)
+            if not closed.is_valid:
+                closed = make_valid(closed)
+            if not closed.is_empty:
+                combined = closed
+        # Keep the postcode whole: the largest part plus any other substantial
+        # part (a genuine railway/river split), dropping only tiny noise slivers.
+        combined = _keep_polygon_parts(combined)
        # Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
        combined = _fill_holes(combined)
        # Subtract parks/water if provided
@ -142,7 +233,7 @@ def merge_fragments(

            pre_green = combined
            combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
-            combined = _largest_polygon(combined)
+            combined = _keep_polygon_parts(combined)
            # Do NOT _fill_holes here: interior holes carved by the greenspace
            # subtraction (lakes, enclosed parks) are intentional, not artifacts.
            # Filling them would re-add the removed area and negate the
@ -155,10 +246,114 @@ def merge_fragments(
    return merged


+def _polygonal(geom):
+    """Reduce a geometry to its polygonal content.
+
+    Polygons and MultiPolygons pass through unchanged. A GeometryCollection is
+    replaced by the union of its non-empty polygonal members. ``None`` is
+    returned for ``None``/empty input, for any other geometry type, and when a
+    collection has no polygonal member or their union is empty.
+    """
+    if geom is None or geom.is_empty:
+        return None
+
+    areal_types = ("Polygon", "MultiPolygon")
+    kind = geom.geom_type
+    if kind in areal_types:
+        return geom
+    if kind != "GeometryCollection":
+        return None
+
+    areal_members = []
+    for member in geom.geoms:
+        if member.geom_type in areal_types and not member.is_empty:
+            areal_members.append(member)
+    if not areal_members:
+        return None
+
+    merged = unary_union(areal_members)
+    return None if merged.is_empty else merged
+
+
+def _resolve_overlaps(
+    items: list[tuple[str, Polygon | MultiPolygon]],
+) -> list[tuple[str, Polygon | MultiPolygon]]:
+    """Make the postcode polygons a partition: no two cover the same ground.
+
+    Overlap appears at OA seams (the 5m merge buffer expands each postcode
+    independently), from simplifying each postcode on its own, and as genuine
+    containment (a postcode fully enclosed by another). Each postcode is trimmed
+    by the union of its higher-priority overlapping neighbours, where **priority =
+    ascending area**: a smaller postcode wins contested ground. That single rule
+    handles both cases correctly — an enclosed postcode is always smaller than its
+    container, so it keeps its area while the container gets a hole (an `overlaps`
+    query alone would miss containment entirely). Run last, on the final output
+    geometries, so nothing re-introduces overlap afterwards. A postcode that would
+    be emptied keeps its untrimmed geometry, so an active postcode is never dropped.
+
+    Args:
+        items: ``(postcode, geometry)`` pairs in final output coordinates.
+
+    Returns:
+        A list of the same length and order as ``items`` with trimmed geometries.
+        With fewer than two items the input list is returned unchanged.
+    """
+    if len(items) < 2:
+        return items
+
+    # Repair invalid inputs before any predicate or overlay runs. Multi-part
+    # geometries are assembled from parts that were simplified and rescued
+    # independently, so parts may overlap and the MultiPolygon be invalid; an
+    # invalid operand can make `.difference()` raise and abort the whole write.
+    # If the repair leaves nothing polygonal, the original geometry is kept.
+    geoms = []
+    for _, geom in items:
+        if not geom.is_valid:
+            repaired = _polygonal(make_valid(geom))
+            if repaired is not None:
+                geom = repaired
+        geoms.append(geom)
+    n = len(geoms)
+
+    # rank[i]: 0 = highest priority (smallest area). Postcode string breaks ties
+    # for determinism.
+    rank = {
+        idx: r
+        for r, idx in enumerate(
+            sorted(range(n), key=lambda i: (geoms[i].area, items[i][0]))
+        )
+    }
+
+    tree = STRtree(geoms)
+    arr = np.array(geoms, dtype=object)
+    pairs: set[tuple[int, int]] = set()
+    # "overlaps" gives partial overlaps; "contains" gives containment (which
+    # "overlaps" excludes) — together they cover every 2-D overlap without the
+    # edge-touch explosion a plain "intersects" query would add.
+    for predicate in ("overlaps", "contains"):
+        qsrc, qtgt = tree.query(arr, predicate=predicate)
+        for s, t in zip(qsrc.tolist(), qtgt.tolist()):
+            if s != t:
+                # Store each unordered pair once, smaller index first.
+                pairs.add((s, t) if s < t else (t, s))
+
+    # For each loser (lower priority) the higher-priority neighbours to subtract.
+    higher: dict[int, list[int]] = defaultdict(list)
+    for a, b in pairs:
+        winner, loser = (a, b) if rank[a] < rank[b] else (b, a)
+        higher[loser].append(winner)
+
+    out = list(geoms)
+    # Process losers from highest priority down, so every subtracted neighbour is
+    # already finalised.
+    for i in sorted(higher, key=lambda idx: rank[idx]):
+        cut = unary_union([out[j] for j in higher[i]])
+        trimmed = out[i].difference(cut)
+        if not trimmed.is_valid:
+            trimmed = make_valid(trimmed)
+        # Keep all polygonal parts: these geometries are in WGS84 degrees, so an
+        # area threshold here would wrongly drop everything but the largest part
+        # and re-open the very gaps the seam fix closed.
+        trimmed = _polygonal(trimmed)
+        if trimmed is not None and not trimmed.is_empty:
+            out[i] = trimmed
+    return [(pc, out[i]) for i, (pc, _) in enumerate(items)]
+
+
+def _round_coords(coords, ndigits=6):
+    """Recursively round a nested GeoJSON coordinate structure.
+
+    A position (a sequence whose first element is a number) becomes a two-item
+    ``[x, y]`` list rounded to ``ndigits``; any further ordinates are dropped.
+    Every other level is rebuilt as a list of its rounded children.
+    """
+    is_position = bool(coords) and isinstance(coords[0], (int, float))
+    if not is_position:
+        return [_round_coords(child, ndigits) for child in coords]
+    x, y = coords[0], coords[1]
+    return [round(x, ndigits), round(y, ndigits)]
+
+
+def _geojson_geometry(geom) -> dict | None:
+    """Serialize a WGS84 polygon/multipolygon to a 6dp GeoJSON dict, or None.
+
+    Invalid input is passed through ``make_valid`` first and only its polygonal
+    content is kept; ``None`` means nothing polygonal was left to write.
+    """
+    repaired = geom if geom.is_valid else make_valid(geom)
+    areal = _polygonal(repaired)
+    if areal is None or areal.is_empty:
+        return None
+    serialised = mapping(areal)
+    # NOTE(review): coordinates are rounded AFTER the validity check above, and
+    # the rounded dict is not re-validated — confirm rounding cannot produce an
+    # invalid ring here, or round-trip the result through shape() to check.
+    return {
+        "type": serialised["type"],
+        "coordinates": _round_coords(serialised["coordinates"]),
+    }
+
+
 def write_district_geojson(
    postcodes: dict[str, Polygon | MultiPolygon], output_dir: Path
 ) -> int:
-    """Group postcodes by district, write GeoJSON files. Returns file count."""
+    """Group postcodes by district, write GeoJSON files. Returns file count.
+
+    Before writing, the postcode polygons are converted to their final WGS84 form
+    and made a partition (overlaps removed) so the output never has two postcodes
+    covering the same ground.
+    """
    units_dir = output_dir / "units"
    tmp_units_dir = output_dir / "units.tmp"
    output_dir.mkdir(parents=True, exist_ok=True)
@ -166,38 +361,46 @@ def write_district_geojson(
        shutil.rmtree(tmp_units_dir)
    tmp_units_dir.mkdir(parents=True)

+    skipped: list[str] = []
+
+    # Pass 1: convert every postcode to its final WGS84 geometry (simplify, snap,
+    # sliver-rescue, multi-part preserved). Sorted → deterministic de-overlap
+    # priority. to_wgs84_geojson_multi returns None only for a genuinely empty
+    # input, which is skipped and reported rather than aborting a multi-hour run.
+    converted: list[tuple[str, Polygon | MultiPolygon]] = []
+    for pc in sorted(postcodes):
+        gj = to_wgs84_geojson_multi(postcodes[pc])
+        if gj is None:
+            skipped.append(pc)
+            continue
+        converted.append((pc, shape(gj)))
+
+    # Remove overlap strips so the output is a clean partition.
+    converted = _resolve_overlaps(converted)
+
    by_district: dict[str, list[tuple[str, Polygon | MultiPolygon]]] = defaultdict(list)
-    for pc, geom in postcodes.items():
+    for pc, geom in converted:
        parts = pc.split()
        district = parts[0] if parts else pc[:4]
        by_district[district].append((pc, geom))

    file_count = 0
-    seen_postcodes: set[str] = set()
    for district, entries in tqdm(
        sorted(by_district.items()), desc="Writing GeoJSON", unit="file"
    ):
        features = []
        for pc, geom in sorted(entries, key=lambda x: x[0]):
-            if pc in seen_postcodes:
-                raise ValueError(f"Duplicate postcode boundary feature: {pc}")
-            seen_postcodes.add(pc)
-            geojson_geom = to_wgs84_geojson(geom)
+            geojson_geom = _geojson_geometry(geom)
            if geojson_geom is None:
-                raise ValueError(f"Postcode boundary collapsed to empty geometry: {pc}")
-            written_geom = shape(geojson_geom)
-            if written_geom.is_empty or not written_geom.is_valid:
-                raise ValueError(
-                    f"Invalid postcode boundary geometry after output: {pc}"
-                )
-            mapit_code = pc.replace(" ", "")
+                skipped.append(pc)
+                continue
            features.append(
                {
                    "type": "Feature",
                    "geometry": geojson_geom,
                    "properties": {
                        "postcodes": pc,
-                        "mapit_code": mapit_code,
+                        "mapit_code": pc.replace(" ", ""),
                    },
                }
            )
@ -211,6 +414,14 @@ def write_district_geojson(
            json.dump(collection, f, separators=(",", ":"))
        file_count += 1

+    if skipped:
+        preview = ", ".join(skipped[:10])
+        suffix = " …" if len(skipped) > 10 else ""
+        print(
+            f"  Skipped {len(skipped)} postcode(s) with degenerate (sub-grid) "
+            f"geometry: {preview}{suffix}"
+        )
+
    if units_dir.exists():
        shutil.rmtree(units_dir)
    tmp_units_dir.replace(units_dir)
--- a/pipeline/transform/postcode_boundaries/process_oa.py
+++ b/pipeline/transform/postcode_boundaries/process_oa.py
@ -85,19 +85,42 @@ def _claim_inspire_parcels(
    uprn_pts = shp_points(points)
    pt_idx, cand_idx = cand_tree.query(uprn_pts, predicate="within")

-    # First priority: parcels that physically contain UPRNs. Majority vote
-    # resolves blocks of flats or overlapping parcel data.
+    # First priority: parcels that physically contain UPRNs. A parcel holding
+    # UPRNs from a single postcode goes wholly to that postcode. A parcel shared
+    # by several postcodes (a block of flats spanning postcodes, or overlapping
+    # parcel data) is split between them via a sub-Voronoi over their own UPRNs
+    # clipped to the parcel — so EVERY contained postcode keeps part of the
+    # parcel. A bare majority vote would hand the whole parcel to one winner and
+    # leave the losers' UPRNs trapped inside claimed land, dropping them from
+    # both this claim and the `remaining` polygon handed to Voronoi downstream.
    cand_postcodes: dict[int, list[str]] = defaultdict(list)
+    cand_point_idx: dict[int, list[int]] = defaultdict(list)
    for pi, ci in zip(pt_idx, cand_idx):
        cand_postcodes[ci].append(postcodes[pi])
+        cand_point_idx[ci].append(pi)

+    points_f64 = points.astype(np.float64, copy=False)
    contained_parts: dict[str, list] = defaultdict(list)
    contained_scores: Counter[str] = Counter()
    for ci, pc_list in cand_postcodes.items():
        pc_counts = Counter(pc_list)
-        winner, votes = pc_counts.most_common(1)[0]
-        contained_parts[winner].append(parcels[ci])
-        contained_scores[winner] += votes
+        if len(pc_counts) == 1:
+            winner = next(iter(pc_counts))
+            contained_parts[winner].append(parcels[ci])
+            contained_scores[winner] += pc_counts[winner]
+            continue
+        # Shared parcel: sub-Voronoi over the contained UPRNs so each postcode
+        # present keeps a fragment instead of being absorbed by the winner.
+        sub_idx = cand_point_idx[ci]
+        sub_points = points_f64[sub_idx]
+        sub_postcodes = [postcodes[pi] for pi in sub_idx]
+        for pc, geom in compute_voronoi_regions(
+            sub_points, sub_postcodes, parcels[ci]
+        ).items():
+            cleaned = _clean_polygonal(geom)
+            if cleaned is not None:
+                contained_parts[pc].append(cleaned)
+                contained_scores[pc] += pc_counts[pc]

    contained_claimed = _merge_parts_by_postcode(contained_parts)
    contained_claims = sorted(
@ -109,7 +132,6 @@ def _claim_inspire_parcels(
    # each to the nearest UPRN/postcode so parcel boundaries carry more of the
    # visible postcode shape; Voronoi is then limited to roads, parks, water, and
    # any other non-parcel gaps.
-    points_f64 = points.astype(np.float64, copy=False)
    contained_union = _union_claims(contained_claims)
    nearest_tree = cKDTree(points_f64)
    nearest_parts: dict[str, list] = defaultdict(list)
@ -235,11 +257,11 @@ def _extract_polygonal(geom) -> Polygon | MultiPolygon | None:
            return None
        if len(polys) == 1:
            return polys[0]
-        return MultiPolygon(
-            [
-                p
-                for g in polys
-                for p in (g.geoms if g.geom_type == "MultiPolygon" else [g])
-            ]
-        )
+        # Union (not bare MultiPolygon construction): make_valid can emit
+        # overlapping polygonal parts, and a MultiPolygon of overlapping parts is
+        # invalid — it double-counts area and makes the next `.difference()` raise
+        # a TopologyException that aborts the OA (and, in parallel mode, the
+        # worker). unary_union merges them into a valid geometry.
+        merged = unary_union(polys)
+        return merged if not merged.is_empty else None
    return None
--- a/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
+++ b/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
@ -11,12 +11,20 @@ import pytest
 from shapely.geometry import MultiPolygon, Polygon, box
 from shapely.ops import unary_union

+from .fragments_cache import (
+    fragments_cache_is_fresh,
+    load_fragments,
+    save_fragments,
+)
+from .__main__ import _oa_fragments, _process_oas
+from .inspire import build_inspire_index
 from .oa_boundaries import parse_gpkg_geometry
 from .greenspace import subtract_greenspace
 from .output import (
    _fill_holes,
    merge_fragments,
    to_wgs84_geojson,
+    to_wgs84_geojson_multi,
    write_district_geojson,
 )
 from .process_oa import _extract_polygonal, process_oa
@ -173,6 +181,52 @@ class TestWhitespacePostcodes:

        assert loaded_df["PCDS"].to_list() == ["AA1 1AB"]

+    def test_remapped_terminated_postcode_adopts_successor_oa(self, tmp_path):
+        """When a terminated postcode is remapped to its active successor, the
+        remapped seed point must carry the SUCCESSOR's OA (and coords), not the
+        terminated postcode's original OA. Pre-fix the row kept OA21CD of the
+        terminated postcode, seeding the successor into an OA it doesn't belong
+        to and splitting its boundary across OAs."""
+        # Terminated AA1 1AA sits in OA E00000001. The only active postcode,
+        # AA1 1AB, is about 22 m away but belongs to a DIFFERENT OA (E00000002).
+        uprns = pl.DataFrame(
+            {
+                "GRIDGB1E": [500010],
+                "GRIDGB1N": [180010],
+                "PCDS": ["AA1 1AA"],
+                "OA21CD": ["E00000001"],
+            }
+        )
+        uprn_path = tmp_path / "uprn.parquet"
+        uprns.write_parquet(uprn_path)
+        arcgis = pl.DataFrame(
+            {
+                "pcds": ["AA1 1AA", "AA1 1AB"],
+                "east1m": [500010, 500030],
+                "north1m": [180010, 180020],
+                # AA1 1AA terminated → only AA1 1AB is an active successor, and
+                # it belongs to a different OA than the terminated postcode.
+                "oa21cd": ["E00000001", "E00000002"],
+                "doterm": ["2020-01-01", None],
+                "ctry25cd": ["E92000001", "E92000001"],
+            }
+        )
+        arcgis_path = tmp_path / "arcgis.parquet"
+        arcgis.write_parquet(arcgis_path)
+
+        loaded_df, offsets = load_uprns(uprn_path, arcgis_path)
+
+        # The remapped point must be grouped under the successor's OA, not the
+        # terminated postcode's OA.
+        assert "E00000002" in offsets, "Successor OA missing — remap kept old OA"
+        assert "E00000001" not in offsets, (
+            "Remapped point still lives in the terminated postcode's OA"
+        )
+        points, postcodes = get_oa_uprns(loaded_df, offsets, "E00000002")
+        assert postcodes == ["AA1 1AB"]
+        # It should also adopt the successor's authoritative coordinates.
+        assert points.tolist() == [[500030.0, 180020.0]]
+
    def test_arcgis_filters_to_active_english_postcodes(self, tmp_path):
        uprns = pl.DataFrame(
            {
@ -617,6 +671,32 @@ class TestProcessOAInspireParcelAssignment:
        for _, geom in fragments:
            assert geom.difference(oa_geom).area < 0.01

+    def test_shared_parcel_keeps_every_contained_postcode(self):
+        """One parcel holding UPRNs for [A, A, B] must produce a fragment for each.
+
+        Regression guard: previously the majority postcode (A) claimed the whole
+        parcel, which removed it from `remaining`; B's UPRNs then sat inside
+        already-claimed land and B ended up with no fragment at all.
+        """
+        whole_oa = box(0, 0, 100, 100)
+        shared_parcel = box(0, 0, 100, 100)  # single parcel spanning the entire OA
+        uprn_xy = np.array(
+            [
+                [20, 50],  # A
+                [30, 50],  # A, which gives A the majority
+                [80, 50],  # B, the minority that used to vanish
+            ]
+        )
+        uprn_postcodes = ["A", "A", "B"]
+
+        by_postcode = dict(
+            process_oa(
+                whole_oa, uprn_xy, uprn_postcodes, inspire_candidates=[shared_parcel]
+            )
+        )
+
+        assert "A" in by_postcode, "Majority postcode A must keep a fragment"
+        assert "B" in by_postcode, "Minority postcode B must not be dropped"
+        frag_a, frag_b = by_postcode["A"], by_postcode["B"]
+        assert frag_a.area > 0
+        assert frag_b.area > 0
+        # The two fragments share the parcel between them and must not overlap.
+        assert frag_a.intersection(frag_b).area < 0.01
+

 # ---------------------------------------------------------------------------
 # _extract_polygonal helper
@ -656,6 +736,21 @@ class TestExtractPolygonal:

        assert _extract_polygonal(LineString([(0, 0), (1, 1)])) is None

+    def test_overlapping_collection_unioned_to_valid(self):
+        """Overlapping polygons in a GeometryCollection come back as one valid union.
+
+        A raw MultiPolygon of overlapping parts would be invalid and crash the
+        next .difference(); the overlap must also be counted only once in the area.
+        """
+        from shapely.geometry import GeometryCollection
+
+        first = box(0, 0, 100, 100)
+        second = box(50, 50, 150, 150)  # shares a 50x50 corner with `first`
+
+        merged = _extract_polygonal(GeometryCollection([first, second]))
+
+        assert merged is not None
+        assert merged.is_valid
+        expected_area = unary_union([first, second]).area
+        assert merged.area == pytest.approx(expected_area)
+        # The operation that used to crash on the invalid geometry now succeeds.
+        probe = box(0, 0, 10, 10)
+        assert merged.difference(probe).is_valid
+

 # ---------------------------------------------------------------------------
 # Edge case: merge_fragments handles single-OA postcodes
@ -763,12 +858,12 @@ class TestParseGpkgGeometry:


 class TestFillHoles:
-    """_fill_holes must remove all interior holes from polygons."""
+    """_fill_holes fills small artifact holes but keeps large (real-enclosed) ones."""

-    def test_polygon_with_hole(self):
-        """A polygon with an interior ring should become a solid polygon."""
+    def test_small_artifact_hole_filled(self):
+        """A small (<1000 m²) interior ring is an artifact and gets filled."""
        outer = [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)]
-        hole = [(30, 30), (70, 30), (70, 70), (30, 70), (30, 30)]
+        hole = [(40, 40), (60, 40), (60, 60), (40, 60), (40, 40)]  # 20x20 = 400 m²
        poly_with_hole = Polygon(outer, [hole])
        assert len(list(poly_with_hole.interiors)) == 1
        result = _fill_holes(poly_with_hole)
@ -776,6 +871,15 @@ class TestFillHoles:
        assert len(list(result.interiors)) == 0
        assert result.area == pytest.approx(Polygon(outer).area)

+    def test_large_hole_kept(self):
+        """A hole of 1000 m² or more is treated as a real enclosed postcode and kept."""
+        shell = [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)]
+        # 60 m x 60 m interior ring = 3600 m².
+        big_hole = [(20, 20), (80, 20), (80, 80), (20, 80), (20, 20)]
+        filled = _fill_holes(Polygon(shell, [big_hole]))
+        assert len(list(filled.interiors)) == 1
+        assert filled.area == pytest.approx(10000 - 3600)
+
    def test_multipolygon_with_holes(self):
        """A MultiPolygon where each part has holes should have all holes removed."""
        outer1 = [(0, 0), (50, 0), (50, 50), (0, 50), (0, 0)]
@ -944,3 +1048,356 @@ class TestGreenspaceHolePreserved:
        merged = result["TEST1"]
        assert len(list(merged.interiors)) == 1
        assert merged.area == pytest.approx(10000 - 1600, rel=0.05)
+
+
+# ---------------------------------------------------------------------------
+# merge_fragments keeps substantial detached parts (no OA-seam coverage gaps)
+# ---------------------------------------------------------------------------
+
+
+class TestKeepDetachedParts:
+    """merge_fragments keeps every substantial part of a postcode that an OA seam
+    (railway/river) splits, rather than only the largest one, which used to leave
+    ~1.8% of the ground uncovered."""
+
+    def test_far_apart_parts_both_kept(self):
+        # 50x50m blocks separated by 30m, i.e. beyond the 10m merge buffer.
+        west_block = box(0, 0, 50, 50)  # 2500 m²
+        east_block = box(80, 0, 130, 50)  # 2500 m², 30m away
+        merged = merge_fragments([("AA1 1AA", west_block), ("AA1 1AA", east_block)])
+        result = merged["AA1 1AA"]
+        assert result.geom_type == "MultiPolygon"
+        assert len(result.geoms) == 2
+        assert result.area == pytest.approx(5000, rel=0.01)
+
+    def test_tiny_noise_part_dropped(self):
+        body = box(0, 0, 100, 100)  # 10000 m²
+        speck = box(200, 200, 205, 205)  # 25 m², below the 100 m² threshold
+        merged = merge_fragments([("AA1 1AA", body), ("AA1 1AA", speck)])
+        result = merged["AA1 1AA"]
+        assert result.geom_type == "Polygon"
+        assert result.area == pytest.approx(10000, rel=0.01)
+
+
+class TestMultiPolygonOutput:
+    """Split postcodes must come out of to_wgs84_geojson_multi and the district
+    writer as MultiPolygon (the Rust server + loader already parse MultiPolygon)."""
+
+    @staticmethod
+    def _two_part_geometry():
+        """Two separate 100x100 boxes whose origins are 1000 units apart in x."""
+        return MultiPolygon(
+            [
+                box(530000, 180000, 530100, 180100),
+                box(531000, 180000, 531100, 180100),
+            ]
+        )
+
+    def test_multipolygon_preserves_all_parts(self):
+        from shapely.geometry import shape
+
+        geojson = to_wgs84_geojson_multi(self._two_part_geometry())
+        assert geojson["type"] == "MultiPolygon"
+        assert len(geojson["coordinates"]) == 2
+        # Round-trip through shapely to confirm the emitted GeoJSON is usable.
+        round_tripped = shape(geojson)
+        assert round_tripped.is_valid and not round_tripped.is_empty
+        assert len(round_tripped.geoms) == 2
+
+    def test_single_part_stays_polygon(self):
+        single = box(530000, 180000, 530100, 180100)
+        assert to_wgs84_geojson_multi(single)["type"] == "Polygon"
+
+    def test_writer_emits_multipolygon_feature(self, tmp_path):
+        files_written = write_district_geojson(
+            {"AA1 1AA": self._two_part_geometry()}, tmp_path
+        )
+        assert files_written == 1
+        district_file = tmp_path / "units" / "AA1.geojson"
+        collection = json.loads(district_file.read_text())
+        first_feature = collection["features"][0]
+        assert first_feature["geometry"]["type"] == "MultiPolygon"
+
+
+class TestOutputPartition:
+    """The writer's output must be a partition: postcodes that overlap on input are
+    made disjoint (no shared ground) and no active postcode is dropped."""
+
+    @staticmethod
+    def _written_geoms(out_dir):
+        """Parse units/AA1.geojson under `out_dir` into {postcode: shapely geometry}."""
+        from shapely.geometry import shape
+
+        collection = json.loads((out_dir / "units" / "AA1.geojson").read_text())
+        return {
+            feature["properties"]["postcodes"]: shape(feature["geometry"])
+            for feature in collection["features"]
+        }
+
+    def test_overlapping_postcodes_made_disjoint(self, tmp_path):
+        left = box(530000, 180000, 530100, 180100)
+        right = box(530090, 180000, 530200, 180100)  # shares a 10m strip with `left`
+        assert left.intersection(right).area > 0  # precondition: the inputs overlap
+
+        write_district_geojson({"AA1 1AA": left, "AA1 1AB": right}, tmp_path)
+        written = self._written_geoms(tmp_path)
+
+        assert set(written) == {"AA1 1AA", "AA1 1AB"}  # neither dropped
+        # Interiors are disjoint; at most an edge is shared.
+        shared = written["AA1 1AA"].intersection(written["AA1 1AB"])
+        assert shared.area == pytest.approx(0.0, abs=1e-12)
+        assert all(geom.area > 0 for geom in written.values())
+
+    def test_enclosed_postcode_makes_container_a_donut(self, tmp_path):
+        """A postcode lying fully INSIDE another must still end up disjoint from it:
+        the inner (smaller) one keeps its area and the container gains a hole. A
+        plain `overlaps` query misses containment, so this guards that fix."""
+        container = box(530000, 180000, 530300, 180300)  # 90,000 m²
+        enclosed = box(530100, 180100, 530200, 180200)  # 10,000 m², inside container
+        assert container.contains(enclosed)  # precondition
+
+        write_district_geojson({"AA1 1AA": container, "AA1 1AB": enclosed}, tmp_path)
+        written = self._written_geoms(tmp_path)
+
+        assert set(written) == {"AA1 1AA", "AA1 1AB"}  # neither dropped
+        donut, island = written["AA1 1AA"], written["AA1 1AB"]
+        assert donut.intersection(island).area == pytest.approx(0.0, abs=1e-12)
+        # The container now wraps around the enclosed postcode as a single ring.
+        assert donut.geom_type == "Polygon"
+        assert len(list(donut.interiors)) == 1
+        assert island.area > 0
+
+
+# ---------------------------------------------------------------------------
+# InspireIndex must return the same candidates as a brute-force bbox scan
+# ---------------------------------------------------------------------------
+
+
+class TestInspireIndex:
+    """The grid index replaces a per-OA linear scan of all parcel bboxes; it must
+    return an identical candidate set (and order) so Phase 3 output is unchanged."""
+
+    @staticmethod
+    def _brute(bboxes, box):
+        """Reference scan: row indices of `bboxes` whose rectangle meets `box`.
+
+        Both are laid out as (min_e, min_n, max_e, max_n); touching edges count
+        as a hit because every comparison is inclusive.
+        """
+        e0, n0, e1, n1 = box
+        mask = (
+            (bboxes[:, 2] >= e0)
+            & (bboxes[:, 0] <= e1)
+            & (bboxes[:, 3] >= n0)
+            & (bboxes[:, 1] <= n1)
+        )
+        return np.where(mask)[0]
+
+    def test_matches_brute_force_over_random_queries(self):
+        # Fixed seed: the parcel layout and every query below are reproducible.
+        rng = np.random.default_rng(0)
+        x = rng.uniform(0, 10000, 5000)
+        y = rng.uniform(0, 10000, 5000)
+        w = rng.uniform(1, 60, 5000)  # all <= 500m cell → CSR path
+        h = rng.uniform(1, 60, 5000)
+        bboxes = np.column_stack([x, y, x + w, y + h]).astype(np.float64)
+        idx = build_inspire_index(bboxes, None, None, cell_size=500.0)
+
+        # 400 square queries of mixed size, both smaller and larger than a cell.
+        for _ in range(400):
+            cx, cy = rng.uniform(0, 10000), rng.uniform(0, 10000)
+            sz = float(rng.choice([30.0, 200.0, 1000.0, 3000.0]))
+            box = (cx, cy, cx + sz, cy + sz)
+            got = idx.candidate_indices(box)
+            # The index result is compared against the sorted brute-force hits,
+            # so both membership and order must agree.
+            expected = np.sort(self._brute(bboxes, box))
+            assert np.array_equal(got, expected)
+
+    def test_oversized_parcel_is_found(self):
+        # A parcel larger than a cell goes to the overflow list, not the grid;
+        # a query deep inside it (away from the small parcels) must still find it.
+        bboxes = np.array(
+            [
+                [0.0, 0.0, 5000.0, 5000.0],  # 5km parcel >> 500m cell
+                [100.0, 100.0, 120.0, 120.0],
+                [4000.0, 4000.0, 4020.0, 4020.0],
+            ]
+        )
+        idx = build_inspire_index(bboxes, None, None, cell_size=500.0)
+        box = (2000.0, 2000.0, 2050.0, 2050.0)
+        got = idx.candidate_indices(box)
+        assert 0 in got
+        assert np.array_equal(got, np.sort(self._brute(bboxes, box)))
+
+    def test_no_overlap_returns_empty(self):
+        # The query rectangle lies clear of both parcels, so nothing may match.
+        bboxes = np.array([[0.0, 0.0, 10.0, 10.0], [20.0, 20.0, 30.0, 30.0]])
+        idx = build_inspire_index(bboxes, None, None, cell_size=500.0)
+        assert len(idx.candidate_indices((100.0, 100.0, 110.0, 110.0))) == 0
+
+
+# ---------------------------------------------------------------------------
+# Parallel OA processing must match the sequential result exactly
+# ---------------------------------------------------------------------------
+
+
+class TestParallelProcessing:
+    """_process_oas with several workers must yield the same fragments as workers=1.
+    Every OA holds a single postcode (the fast path), so this covers the chunking,
+    the WKB round-trip and the fork machinery without requiring INSPIRE data."""
+
+    @staticmethod
+    def _inputs(n_oas=60):
+        import pyarrow as pa
+
+        oa_geoms = {}
+        for i in range(n_oas):
+            oa_geoms[f"E{i:08d}"] = box(i * 100.0, 0.0, i * 100.0 + 50.0, 50.0)
+        codes = sorted(oa_geoms)
+
+        east, north, pcs, offsets = [], [], [], {}
+        for i, code in enumerate(codes):
+            first_row = 2 * i  # each OA contributes exactly two UPRN rows
+            east.extend((i * 100.0 + 10.0, i * 100.0 + 20.0))
+            north.extend((10.0, 20.0))
+            # Both rows carry the same postcode, so the OA takes the fast path.
+            pcs.extend([f"AA{i % 5} {i % 9}AA"] * 2)
+            offsets[code] = (first_row, first_row + 2)
+
+        postcode_array = pa.array(pcs, type=pa.large_string())
+        return codes, oa_geoms, np.array(east), np.array(north), postcode_array, offsets
+
+    @staticmethod
+    def _norm(frags):
+        """Order-independent form of a fragment list: sorted (postcode, WKB hex)."""
+        pairs = [(pc, geom.wkb_hex) for pc, geom in frags]
+        pairs.sort()
+        return pairs
+
+    def test_parallel_matches_sequential(self):
+        codes, oa_geoms, east, north, postcodes, offsets = self._inputs()
+        shared_args = (codes, oa_geoms, east, north, postcodes, offsets, None)
+        seq_frags, seq_n = _process_oas(*shared_args, workers=1)
+        par_frags, par_n = _process_oas(*shared_args, workers=3)
+        assert len(seq_frags) == len(codes)  # one fragment per single-postcode OA
+        assert seq_n == par_n == len(codes)
+        assert self._norm(seq_frags) == self._norm(par_frags)
+
+    def test_oa_failure_is_tagged_with_oa_code(self):
+        """An error raised while processing one OA must be re-raised carrying that
+        OA's code, so a bad OA is attributable rather than an anonymous worker abort."""
+        missing_code = "E00099999"
+        # The geoms dict is empty → KeyError, which gets wrapped with the OA code.
+        with pytest.raises(RuntimeError, match=missing_code):
+            _oa_fragments(missing_code, {}, None, None, None, {}, None)
+
+
+class TestDegenerateGeometryHandling:
+    """validate_outputs is strict, so every active postcode needs a boundary: a
+    sub-grid sliver is fattened instead of dropped. A genuinely empty geometry is
+    skipped without aborting the whole write (the 10h regression)."""
+
+    # Three near-collinear vertices in BNG: bbox ~28m x 7m but area ~0.04 m²,
+    # i.e. AL10 0TU. Without the rescue it snaps to empty at output precision.
+    SLIVER = Polygon(
+        [(523045.34, 209625.56), (523040.47, 209624.33), (523017.0, 209618.42)]
+    )
+
+    @staticmethod
+    def _written_postcodes(out_dir):
+        """Set of postcodes present in units/AA1.geojson under `out_dir`."""
+        district_file = out_dir / "units" / "AA1.geojson"
+        features = json.loads(district_file.read_text())["features"]
+        return {feature["properties"]["postcodes"] for feature in features}
+
+    def test_sliver_is_rescued_to_valid_geometry(self):
+        from shapely.geometry import shape
+
+        geojson = to_wgs84_geojson(self.SLIVER)
+        assert geojson is not None, "sliver must be rescued, not dropped"
+        rescued = shape(geojson)
+        assert not rescued.is_empty
+        assert rescued.is_valid
+
+    def test_collinear_zero_area_input_is_rescued(self):
+        """A collinear, zero-area 'polygon' cannot be cleaned into a polygon, yet it
+        must still be rescued through the representative-point fallback."""
+        from shapely.geometry import shape
+
+        flat_ring = [(523000, 209600), (523010, 209600), (523020, 209600), (523000, 209600)]
+        degenerate = Polygon(flat_ring)
+        assert degenerate.area == 0.0
+        geojson = to_wgs84_geojson(degenerate)
+        assert geojson is not None, "degenerate input must be rescued, not dropped"
+        rescued = shape(geojson)
+        assert not rescued.is_empty
+        assert rescued.is_valid
+
+    def test_sliver_postcode_present_in_output(self, tmp_path):
+        inputs = {
+            "AA1 1AA": box(530000, 180000, 530100, 180100),
+            "AA1 1AB": self.SLIVER,  # must survive
+        }
+        assert write_district_geojson(inputs, tmp_path) == 1
+        assert self._written_postcodes(tmp_path) == {"AA1 1AA", "AA1 1AB"}
+
+    def test_empty_geometry_skipped_not_raised(self, tmp_path):
+        # Last-resort safety net: a geometry that cannot be rescued (empty) is
+        # skipped, so one bad postcode can never abort a multi-hour run.
+        inputs = {
+            "AA1 1AA": box(530000, 180000, 530100, 180100),
+            "AA1 1AB": Polygon(),  # genuinely empty
+        }
+        assert write_district_geojson(inputs, tmp_path) == 1
+        assert self._written_postcodes(tmp_path) == {"AA1 1AA"}
+
+
+# ---------------------------------------------------------------------------
+# fragments_cache round-trips Phase 3 output and validates freshness
+# ---------------------------------------------------------------------------
+
+
+class TestFragmentsCache:
+    """Phase 3 output is persisted so a crashed run can resume without repeating
+    the ~10h OA loop."""
+
+    def test_round_trip_preserves_postcodes_and_geometry(self, tmp_path):
+        saved = [
+            ("AA1 1AA", box(0, 0, 100, 100)),
+            ("AA1 1AB", box(200, 200, 250, 260)),
+            # Repeated entry: a postcode that spans more than one OA.
+            ("AA1 1AA", box(100, 0, 150, 100)),
+            ("AA1 1AC", MultiPolygon([box(0, 0, 10, 10), box(20, 20, 30, 30)])),
+        ]
+        cache_path = tmp_path / "fragments_cache.parquet"
+        save_fragments(cache_path, saved)
+        restored = load_fragments(cache_path)
+
+        # Same postcodes in the same order, duplicates included.
+        restored_codes = [pc for pc, _ in restored]
+        saved_codes = [pc for pc, _ in saved]
+        assert restored_codes == saved_codes
+        for (_, geom_before), (_, geom_after) in zip(saved, restored):
+            assert geom_after.equals(geom_before)
+
+    def test_save_is_atomic_no_tmp_left_behind(self, tmp_path):
+        cache_path = tmp_path / "fragments_cache.parquet"
+        save_fragments(cache_path, [("AA1 1AA", box(0, 0, 1, 1))])
+        assert cache_path.exists()
+        leftover = tmp_path / "fragments_cache.parquet.tmp"
+        assert not leftover.exists()
+
+    def test_missing_cache_is_not_fresh(self, tmp_path):
+        source = tmp_path / "uprn.parquet"
+        source.write_text("x")
+        absent_cache = tmp_path / "fragments_cache.parquet"
+        assert fragments_cache_is_fresh(absent_cache, [source]) is False
+
+    def test_cache_newer_than_inputs_is_fresh(self, tmp_path):
+        import os
+
+        source = tmp_path / "uprn.parquet"
+        source.write_text("x")
+        cache_path = tmp_path / "fragments_cache.parquet"
+        cache_path.write_text("c")
+        # Force the timestamps so the cache is strictly newer than the input.
+        os.utime(source, (1_000, 1_000))
+        os.utime(cache_path, (2_000, 2_000))
+        assert fragments_cache_is_fresh(cache_path, [source, None]) is True
+
+    def test_cache_older_than_any_input_is_stale(self, tmp_path):
+        import os
+
+        source = tmp_path / "oa.gpkg"
+        source.write_text("x")
+        cache_path = tmp_path / "fragments_cache.parquet"
+        cache_path.write_text("c")
+        os.utime(cache_path, (1_000, 1_000))
+        os.utime(source, (2_000, 2_000))  # input modified after the cache was written
+        assert fragments_cache_is_fresh(cache_path, [source]) is False
+
+    def test_missing_input_is_ignored(self, tmp_path):
+        cache_path = tmp_path / "fragments_cache.parquet"
+        cache_path.write_text("c")
+        # arcgis is optional/absent, so it cannot have invalidated the cache.
+        absent_input = tmp_path / "absent.parquet"
+        assert fragments_cache_is_fresh(cache_path, [absent_input]) is True
--- a/pipeline/transform/postcode_boundaries/uprn.py
+++ b/pipeline/transform/postcode_boundaries/uprn.py
@ -79,13 +79,42 @@ def load_uprns(
    )

    if mapping is not None and mapping.height > 0:
-        uprns = (
-            uprns.join(
-                mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
+        # Remap terminated postcodes to their nearest active successor. The
+        # successor generally lives in a DIFFERENT OA (and at different grid
+        # coordinates), so the remapped point must adopt the successor's
+        # authoritative OA/coords — keeping the terminated postcode's original
+        # OA would seed the successor into an OA it doesn't belong to, splitting
+        # its boundary across OAs. Genuine (non-remapped) UPRN rows keep their
+        # own OA, since a live postcode can legitimately span several OAs.
+        uprns = uprns.join(
+            mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
+        ).with_columns(pl.col("new_postcode").is_not_null().alias("_remapped"))
+        if active_postcode_points is not None:
+            successor_oa = active_postcode_points.rename(
+                {
+                    "PCDS": "new_postcode",
+                    "GRIDGB1E": "_succ_e",
+                    "GRIDGB1N": "_succ_n",
+                    "OA21CD": "_succ_oa",
+                }
            )
-            .with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
-            .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
-        )
+            uprns = uprns.join(successor_oa, on="new_postcode", how="left").with_columns(
+                pl.when("_remapped")
+                .then(pl.col("_succ_e"))
+                .otherwise(pl.col("GRIDGB1E"))
+                .alias("GRIDGB1E"),
+                pl.when("_remapped")
+                .then(pl.col("_succ_n"))
+                .otherwise(pl.col("GRIDGB1N"))
+                .alias("GRIDGB1N"),
+                pl.when("_remapped")
+                .then(pl.col("_succ_oa"))
+                .otherwise(pl.col("OA21CD"))
+                .alias("OA21CD"),
+            )
+        uprns = uprns.with_columns(
+            pl.coalesce("new_postcode", "PCDS").alias("PCDS")
+        ).select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")

    if active_postcode_points is not None:
        active_postcodes = active_postcode_points.select("PCDS").unique()
@ -149,3 +178,37 @@ def get_oa_uprns(
    )
    postcodes = sub["PCDS"].to_list()
    return points, postcodes
+
+
+def extract_uprn_arrays(df: pl.DataFrame):
+    """Split the UPRN DataFrame into plain numpy/Arrow arrays for worker processes.
+
+    Returns a 3-tuple ``(east, north, postcodes)``: the ``GRIDGB1E`` and
+    ``GRIDGB1N`` columns as contiguous float64 ndarrays, and the ``PCDS`` column
+    as a single (non-chunked) pyarrow Array.
+
+    Workers slice these per OA through :func:`get_oa_uprns_arrays` and never
+    call into polars, which sidesteps the fork-after-threads deadlock hazard of
+    polars' rayon pool. Because they are flat numpy/Arrow buffers rather than
+    millions of Python objects, ``fork`` shares them copy-on-write instead of
+    duplicating ~1GB in every worker.
+    """
+    import pyarrow as pa
+
+    def _as_float64(column: str) -> np.ndarray:
+        return np.ascontiguousarray(df[column].to_numpy(), dtype=np.float64)
+
+    east = _as_float64("GRIDGB1E")
+    north = _as_float64("GRIDGB1N")
+    pcds = df["PCDS"].to_arrow()
+    # Flatten a ChunkedArray so callers always receive one contiguous Array.
+    postcodes = pcds.combine_chunks() if isinstance(pcds, pa.ChunkedArray) else pcds
+    return east, north, postcodes
+
+
+def get_oa_uprns_arrays(
+    east: np.ndarray,
+    north: np.ndarray,
+    postcodes,
+    offsets: dict[str, tuple[int, int]],
+    oa_code: str,
+) -> tuple[np.ndarray, list[str]]:
+    """Polars-free counterpart of :func:`get_oa_uprns`, safe to call in workers.
+
+    Looks up the half-open row range ``offsets[oa_code]`` and returns the
+    matching easting/northing pairs stacked as a two-column array, together
+    with the postcodes for those rows, all taken from the arrays produced by
+    :func:`extract_uprn_arrays`.
+    """
+    start, stop = offsets[oa_code]
+    rows = slice(start, stop)
+    xy = np.column_stack((east[rows], north[rows]))
+    # Arrow slices take (offset, length) rather than (start, stop).
+    oa_postcodes = postcodes.slice(start, stop - start).to_pylist()
+    return xy, oa_postcodes
--- a/pipeline/transform/price_estimation/backtest.py
+++ b/pipeline/transform/price_estimation/backtest.py
@ -11,9 +11,9 @@ from pathlib import Path
 import numpy as np
 import polars as pl

+from pipeline.transform.price_estimation.estimate import guarded_blend_estimates
 from pipeline.transform.price_estimation.index import build_index
 from pipeline.transform.price_estimation.knn import (
-    KNN_BLEND_WEIGHT,
    build_knn_pool,
    knn_median_psm,
 )
@ -115,7 +115,10 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
            .clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
            .exp()
        )
-        .fill_null(pl.col("input_price").cast(pl.Float64))
+        # Keep null when the index can't be interpolated, matching production
+        # (estimate.py ships null there). compute_metrics filters to finite
+        # positive predictions, so these rows correctly drop from the Index n
+        # rather than silently degrading to the Naive prediction.
        .alias("predicted"),
    )
    return test
@ -265,13 +268,12 @@ def main():
        f"  kNN estimates: {n_knn:,} of {len(test):,} ({n_knn / len(test) * 100:.1f}%)"
    )

-    # Blend: (1-w)*index + w*kNN where both available
+    # Blend with the exact shipped estimator (stability gate + last-price cap +
+    # null-when-no-index) so the "Blended" stage reflects production accuracy.
+    # input_price is the backtest equivalent of production's "Last known price".
    index_est = test["predicted"].to_numpy().astype(np.float64)
-    knn_valid = np.isfinite(knn_est) & (knn_est > 0)
-    blended = np.where(
-        knn_valid & np.isfinite(index_est),
-        (1 - KNN_BLEND_WEIGHT) * index_est + KNN_BLEND_WEIGHT * knn_est,
-        np.where(np.isfinite(index_est), index_est, knn_est),
+    blended = guarded_blend_estimates(
+        index_est, knn_est, test["input_price"].cast(pl.Float64).to_numpy()
    )

    actual = test["actual_price"].to_numpy().astype(np.float64)
--- a/pipeline/transform/price_estimation/index.py
+++ b/pipeline/transform/price_estimation/index.py
@ -19,6 +19,8 @@ from tqdm import tqdm
 from pipeline.transform.price_estimation.shrinkage import (
    blend_dicts,
    hierarchical_shrinkage,
+    reanchor_dict,
+    reanchor_dicts,
    shrink_dicts,
    spatial_smooth,
 )
@ -431,6 +433,17 @@ def build_index(
            f"  {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
        )

+        # Re-anchor every repeat-sales dict to the global base year before any
+        # shrinkage/smoothing/blending. solve_robust_index anchors each cell to
+        # log-index 0 at its OWN earliest year, so cells with shorter histories
+        # are measured from a later origin; combining them key-by-key would
+        # otherwise average level-incompatible numbers. The hedonic fallback is
+        # already anchored at min_year, so we align everything to min_year.
+        national_idx = reanchor_dict(national_idx, min_year)
+        area_idx = reanchor_dicts(area_idx, min_year)
+        district_idx = reanchor_dicts(district_idx, min_year)
+        sector_idx = reanchor_dicts(sector_idx, min_year)
+
        # Shrinkage: national -> hedonic first, then hierarchical
        print("  Applying shrinkage...")
        national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
--- a/pipeline/transform/price_estimation/shrinkage.py
+++ b/pipeline/transform/price_estimation/shrinkage.py
@ -13,6 +13,51 @@ SPATIAL_NEIGHBORS = 5
 SPATIAL_BLEND_K = 30


+def _base_value(index: dict[int, float], base_year: int) -> float:
+    """Look up an index dict's level at `base_year`, filling across gaps.
+
+    Every repeat-sales dict is anchored to 0 at its OWN earliest year, so its
+    values are log-levels measured from that origin. Expressing it on a shared
+    origin requires its level at the common `base_year`:
+      - year present: that value is returned as-is;
+      - `base_year` precedes the dict's history: back-fill with the earliest
+        known value (0.0 by construction). The level move between the global
+        base and a later-starting cell is unobservable, so none is assumed,
+        matching forward_fill's back-fill convention;
+      - `base_year` falls in a gap or after the history: forward-fill from the
+        latest earlier year.
+    An empty dict yields 0.0.
+    """
+    if base_year in index:
+        return index[base_year]
+    if not index:
+        return 0.0
+    # base_year itself is absent here, so strictly-earlier years are the only
+    # forward-fill candidates.
+    earlier_years = [year for year in index if year < base_year]
+    anchor_year = max(earlier_years) if earlier_years else min(index)
+    return index[anchor_year]
+
+
+def reanchor_dict(index: dict[int, float], base_year: int) -> dict[int, float]:
+    """Shift an index dict by a constant so its level at `base_year` becomes 0.
+
+    The same constant is subtracted from every year, so all within-dict
+    year-to-year differences survive and estimate.py's (current - sale)
+    semantics are unchanged; only the cross-dict level mismatch is removed
+    ahead of blending. An empty dict, or one whose level at `base_year` is
+    already 0, is returned as the same object rather than a copy.
+    """
+    shift = _base_value(index, base_year) if index else 0.0
+    if shift == 0.0:
+        return index
+    return {year: level - shift for year, level in index.items()}
+
+
+def reanchor_dicts(
+    indices: dict[str, dict[int, float]], base_year: int
+) -> dict[str, dict[int, float]]:
+    """Apply :func:`reanchor_dict` to each value of a mapping, keeping its keys."""
+    rebased: dict[str, dict[int, float]] = {}
+    for key, idx in indices.items():
+        rebased[key] = reanchor_dict(idx, base_year)
+    return rebased
+
+
 def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
    """Shrink dict values toward parent using n/(n+k) weighting.

--- a/pipeline/transform/test_crime.py
+++ b/pipeline/transform/test_crime.py
@ -158,6 +158,53 @@ def test_transform_crime_writes_by_year_output(tmp_path):
    assert serious[2024] == 12.0


+def test_transform_crime_headline_is_mean_of_per_year_bars(tmp_path):
+    """The avg/yr headline must be the simple mean of each year's annualised count
+    (the by-year chart bars), NOT a month-weighted pooled rate. The two differ
+    whenever years have uneven partial-month coverage."""
+    crime_dir = tmp_path / "crime"
+    header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
+    # Crime IDs per month:
+    #   2023: 6 burglaries in 1 month        -> 6 * 12 / 1 = 72/yr
+    #   2024: 2 burglaries across 2 months   -> 2 * 12 / 2 = 12/yr
+    crime_ids_by_month = {
+        "2023-01": range(1, 7),
+        "2024-01": [7],
+        "2024-02": [8],
+    }
+    for month, crime_ids in crime_ids_by_month.items():
+        month_dir = crime_dir / month
+        month_dir.mkdir(parents=True)
+        rows = [
+            f"{crime_id},{month},F,F,-0.1,51.5,X,E01000001,L,Burglary,U,"
+            for crime_id in crime_ids
+        ]
+        csv_path = month_dir / f"{month}-test-force-street.csv"
+        csv_path.write_text("\n".join([header] + rows) + "\n")
+
+    output = tmp_path / "crime.parquet"
+    by_year_output = tmp_path / "crime_by_year.parquet"
+    transform_crime(crime_dir, output, by_year_output)
+
+    # Mean of the per-year bars = (72 + 12) / 2 = 42.0.
+    # The old pooled rate (8 incidents / 3 months * 12 = 32.0) would be wrong.
+    headline = pl.read_parquet(output).to_dicts()[0]
+    assert headline["Burglary (avg/yr)"] == 42.0
+
+    by_year_row = pl.read_parquet(by_year_output).row(0, named=True)
+    bars = {point["year"]: point["count"] for point in by_year_row["Burglary (by year)"]}
+    assert bars == {2023: 72.0, 2024: 12.0}
+    # The headline equals the mean of the bars it summarises.
+    assert headline["Burglary (avg/yr)"] == sum(bars.values()) / len(bars)
+
+
 def test_transform_crime_fails_without_valid_months(tmp_path):
    crime_dir = tmp_path / "crime"
    month_dir = crime_dir / "2024-01"
--- a/pipeline/transform/test_crime_spatial.py
+++ b/pipeline/transform/test_crime_spatial.py
@ -252,6 +252,63 @@ def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
    assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)}


+def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):
+    # P (AB1 1AA) has burglaries only in its single most-recent year (2024); Q
+    # (AB1 1AB), far away, has a burglary in 2014. The type therefore spans TWO
+    # distinct years across all postcodes, but only ONE year for P. The headline
+    # must divide by P's own years-present (1), equalling its single by-year bar
+    # (24/yr) -- not by the global span (2), which would deflate it to 12/yr.
+    # The two squares are equal-area, so area normalisation leaves counts as-is.
+    units = tmp_path / "units"
+    _write_boundaries(
+        units,
+        {
+            "AB1": [
+                _square_feature("AB1 1AA", 1000, 1000, 1010, 1010),
+                _square_feature("AB1 1AB", 5000, 5000, 5010, 5010),
+            ]
+        },
+    )
+
+    crime = tmp_path / "crime"
+    # P: 2 burglaries in a single 2024 month -> 24/yr bar, present in 1 year.
+    _write_month(
+        crime,
+        "2024-01",
+        [
+            _crime_row("2024-01", 1005, 1005, "Burglary"),
+            _crime_row("2024-01", 1005, 1005, "Burglary"),
+        ],
+    )
+    # Q: 1 burglary in a far-back 2014 month -> widens the type's global span to
+    # two years without adding any incident to P.
+    _write_month(crime, "2014-01", [_crime_row("2014-01", 5005, 5005, "Burglary")])
+
+    output = tmp_path / "crime_by_postcode.parquet"
+    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
+    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
+
+    rows = {r["postcode"]: r for r in pl.read_parquet(output).to_dicts()}
+    by_year_rows = {
+        r["postcode"]: r for r in pl.read_parquet(by_year).to_dicts()
+    }
+
+    # P's headline equals the simple mean of its own bars (just the 2024 bar).
+    p_bars = {p["year"]: p["count"] for p in by_year_rows["AB1 1AA"]["Burglary (by year)"]}
+    assert p_bars == {2024: pytest.approx(24.0, abs=0.05)}
+    # Per-postcode denominator (1) -> 24.0. The old global denominator (2 years
+    # across all postcodes) would have deflated this to 12.0.
+    assert rows["AB1 1AA"]["Burglary (avg/yr)"] == pytest.approx(24.0, abs=0.05)
+    assert rows["AB1 1AA"]["Burglary (avg/yr)"] == pytest.approx(
+        sum(p_bars.values()) / len(p_bars), abs=0.05
+    )
+
+    # Q likewise: its sole 2014 bar -> 12/yr, divided by its own 1 year = 12.0.
+    q_bars = {p["year"]: p["count"] for p in by_year_rows["AB1 1AB"]["Burglary (by year)"]}
+    assert q_bars == {2014: pytest.approx(12.0, abs=0.05)}
+    assert rows["AB1 1AB"]["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
+
+
 def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
    units = tmp_path / "units"
    _write_boundaries(
--- a/pipeline/transform/test_join_epc_pp.py
+++ b/pipeline/transform/test_join_epc_pp.py
@ -149,6 +149,7 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
            "town_city": ["Exampletown", "Exampletown"],
            "duration": ["F", "F"],
            "old_new": ["N", "N"],
+            "ppd_category": ["A", "A"],
        }
    ).write_parquet(price_paid_path)

@ -201,6 +202,7 @@ def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
            "town_city": ["Exampletown", "Exampletown"],
            "duration": ["F", "F"],
            "old_new": ["N", "N"],
+            "ppd_category": ["A", "A"],
        }
    ).write_parquet(price_paid_path)

@ -235,6 +237,7 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
            "town_city": ["Exampletown"],
            "duration": ["F"],
            "old_new": ["N"],
+            "ppd_category": ["A"],
        }
    ).write_parquet(price_paid_path)

@ -259,6 +262,93 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
    ]


+def test_run_excludes_category_b_sales_from_price_aggregations(tmp_path: Path):
+    # Category B entries (repossessions, bulk/portfolio, power-of-sale) must not
+    # pollute latest_price / historical_prices, but the property still survives
+    # via its standard Category A sales.
+    zip_path = tmp_path / "domestic-csv.zip"
+    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
+        csv_buffer = io.StringIO()
+        writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
+        writer.writeheader()
+        writer.writerow(_row())
+        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())
+
+    price_paid_path = tmp_path / "price-paid.parquet"
+    pl.DataFrame(
+        {
+            "price": [200_000, 250_000, 5_000_000],
+            "date_of_transfer": [date(2020, 2, 3), date(2022, 2, 3), date(2024, 2, 3)],
+            "property_type": ["T", "T", "T"],
+            "postcode": ["AA1 1AA", "AA1 1AA", "AA1 1AA"],
+            "paon": ["1", "1", "1"],
+            "saon": [None, None, None],
+            "street": ["Example Street", "Example Street", "Example Street"],
+            "locality": [None, None, None],
+            "town_city": ["Exampletown", "Exampletown", "Exampletown"],
+            "duration": ["F", "F", "F"],
+            "old_new": ["N", "N", "N"],
+            # The latest (5M) sale is a Category B bulk/portfolio transfer.
+            "ppd_category": ["A", "A", "B"],
+        }
+    ).write_parquet(price_paid_path)
+
+    output_path = tmp_path / "epc-pp.parquet"
+    _run(zip_path, price_paid_path, output_path, tmp_path)
+
+    df = pl.read_parquet(output_path)
+
+    assert df.height == 1
+    # Only the two Category A sales survive; the 5M Category B transfer is dropped.
+    assert df.get_column("latest_price").to_list() == [250_000]
+    assert df.get_column("historical_prices").list.len().to_list() == [2]
+
+
+def test_run_new_build_keeps_early_first_transfer_when_sub_min_price(tmp_path: Path):
+    # A new-build whose earliest sale is below MIN_PRICE must still take that early
+    # year as its EXACT construction date, while latest_price uses only the
+    # quality-passing (>=MIN_PRICE) sale.
+    zip_path = tmp_path / "domestic-csv.zip"
+    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
+        csv_buffer = io.StringIO()
+        writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
+        writer.writeheader()
+        writer.writerow(_row())
+        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())
+
+    price_paid_path = tmp_path / "price-paid.parquet"
+    pl.DataFrame(
+        {
+            "price": [30_000, 300_000],
+            "date_of_transfer": [date(2015, 2, 3), date(2022, 2, 3)],
+            "property_type": ["T", "T"],
+            "postcode": ["AA1 1AA", "AA1 1AA"],
+            "paon": ["1", "1"],
+            "saon": [None, None],
+            "street": ["Example Street", "Example Street"],
+            "locality": [None, None],
+            "town_city": ["Exampletown", "Exampletown"],
+            "duration": ["F", "F"],
+            "old_new": ["Y", "Y"],
+            "ppd_category": ["A", "A"],
+        }
+    ).write_parquet(price_paid_path)
+
+    output_path = tmp_path / "epc-pp.parquet"
+    _run(zip_path, price_paid_path, output_path, tmp_path)
+
+    df = pl.read_parquet(output_path)
+
+    assert df.height == 1
+    # Construction year is the genuine earliest transfer (2015), flagged EXACT,
+    # even though that sale is below MIN_PRICE.
+    assert df.get_column("construction_age_band").to_list() == [2015]
+    assert df.get_column("is_construction_date_approximate").to_list() == [0]
+    # latest_price uses only the >=MIN_PRICE sale; the sub-MIN sale is excluded.
+    assert df.get_column("latest_price").to_list() == [300_000]
+    assert df.get_column("historical_prices").list.len().to_list() == [1]
+
+
 def test_epc_band_to_year_uses_midpoint_and_clamps():
    import polars as pl

--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@ -13,7 +13,9 @@ from pipeline.transform.merge import (
    _active_english_postcode_area,
    _build_unmatched_listing_seed_rows,
    _canonical_postcode_expr,
+    _coalesce_direct_epc_columns,
    _filter_to_active_english_postcodes,
+    _join_area_side_tables,
    _finalize_listings,
    _integrate_listings,
    _match_direct_epc,
@ -506,6 +508,25 @@ def test_load_listings_for_merge_canonicalises_and_exposes_overlay_columns(
    assert loaded["_actual_lat"].to_list() == [51.5]


+def test_load_listings_for_merge_uprn_key_matches_normalize_uprn(tmp_path) -> None:
+    # A Float UPRN (e.g. read from a NaN-bearing parquet column) must produce
+    # the same digits-only key as `_normalize_uprn` on the candidate side, so
+    # the exact UPRN match is not lost. Naively stringifying "100023336956.0"
+    # and stripping non-digits would yield "1000233369560" (a bogus trailing
+    # zero) which never collides with the candidate key "100023336956".
+    listings_path = tmp_path / "listings.parquet"
+    arcgis_path = tmp_path / "arcgis.parquet"
+    _sample_listings_frame().with_columns(
+        pl.lit(100023336956.0, dtype=pl.Float64).alias("UPRN")
+    ).write_parquet(listings_path)
+    _stub_arcgis(arcgis_path)
+
+    loaded = _load_listings_for_merge(listings_path, arcgis_path)
+
+    assert loaded["_listing_uprn"].to_list() == [_normalize_uprn(100023336956.0)]
+    assert loaded["_listing_uprn"].to_list() == ["100023336956"]
+
+
 def test_build_unmatched_listing_seed_rows_fills_property_shape_fields(
    tmp_path,
 ) -> None:
@ -697,6 +718,110 @@ def test_normalize_uprn_handles_types_and_floats() -> None:
    assert _normalize_uprn(float("nan")) is None


+def test_coalesce_direct_epc_was_council_house_prefers_yes() -> None:
+    # The raw property value is fill_null("No") upstream, so a plain coalesce
+    # would let a non-null "No" override a directly-matched listing "Yes".
+    # "Former council house" should fire if EITHER side says "Yes".
+    none_col = [None] * 5
+    wide = pl.LazyFrame(
+        {
+            "was_council_house": ["No", "Yes", "No", None, None],
+            "_direct_was_council_house": ["Yes", "No", None, "Yes", None],
+            # An unrelated direct-EPC column keeps the plain-coalesce behaviour.
+            "current_energy_rating": [None, "C", "D", None, None],
+            "_direct_current_energy_rating": ["B", "A", None, "E", None],
+            # _coalesce_direct_epc_columns coalesces every pair in
+            # _DIRECT_EPC_RAW_COLUMN_MAP, so the rest must be present too.
+            "epc_address": none_col,
+            "_direct_epc_address": none_col,
+            "potential_energy_rating": none_col,
+            "_direct_potential_energy_rating": none_col,
+            "total_floor_area": none_col,
+            "_direct_total_floor_area": none_col,
+            "number_habitable_rooms": none_col,
+            "_direct_number_habitable_rooms": none_col,
+            "floor_height": none_col,
+            "_direct_floor_height": none_col,
+            "construction_age_band": none_col,
+            "_direct_construction_age_band": none_col,
+            "is_construction_date_approximate": none_col,
+            "_direct_is_construction_date_approximate": none_col,
+        }
+    )
+
+    result = _coalesce_direct_epc_columns(wide).collect()
+
+    assert result["was_council_house"].to_list() == ["Yes", "Yes", "No", "Yes", None]
+    # Plain coalesce (raw wins when non-null) is untouched for other columns.
+    assert result["current_energy_rating"].to_list() == ["B", "C", "D", "E", None]
+
+
+def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
+    # The crime table is LEFT-joined per postcode; a postcode absent from it
+    # must NOT be fabricated as "zero crime" (the safest value). When every
+    # per-type column is null the Serious/Minor rollups must stay null.
+    base = pl.LazyFrame(
+        {
+            "postcode": ["AA1 1AA", "BB2 2BB"],
+            "lsoa21": ["E01000001", "E01000002"],
+            "Local Authority District code (2024)": ["E09000001", "E09000002"],
+            "pcon": ["E14000001", "E14000002"],
+        }
+    )
+
+    def _by_postcode(extra: dict) -> pl.LazyFrame:
+        return pl.LazyFrame({"postcode": ["AA1 1AA", "BB2 2BB"], **extra})
+
+    # Crime is present only for AA1 1AA; BB2 2BB is absent from the table.
+    crime = pl.LazyFrame(
+        {
+            "postcode": ["AA1 1AA"],
+            "Violence and sexual offences (avg/yr)": [1.0],
+            "Robbery (avg/yr)": [2.0],
+            "Burglary (avg/yr)": [3.0],
+            "Possession of weapons (avg/yr)": [4.0],
+            "Anti-social behaviour (avg/yr)": [1.0],
+            "Criminal damage and arson (avg/yr)": [1.0],
+            "Shoplifting (avg/yr)": [1.0],
+            "Bicycle theft (avg/yr)": [1.0],
+            "Theft from the person (avg/yr)": [1.0],
+            "Other theft (avg/yr)": [1.0],
+            "Vehicle crime (avg/yr)": [1.0],
+            "Public order (avg/yr)": [1.0],
+            "Drugs (avg/yr)": [1.0],
+            "Other crime (avg/yr)": [1.0],
+        }
+    )
+
+    joined = _join_area_side_tables(
+        base,
+        iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
+        ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
+        crime=crime,
+        median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
+        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
+        poi_counts=_by_postcode({}),
+        noise=_by_postcode({}),
+        school_proximity=_by_postcode({}),
+        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
+        tree_density=None,
+        broadband=pl.LazyFrame({"bb_postcode": ["AA1 1AA", "BB2 2BB"]}),
+    ).collect()
+
+    by_postcode = {
+        row["postcode"]: row
+        for row in joined.select(
+            "postcode", "serious_crime_avg_yr", "minor_crime_avg_yr"
+        ).iter_rows(named=True)
+    }
+    # Present postcode: rollups are the component sums (1+2+3+4, 10×1).
+    assert by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 10.0
+    assert by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 10.0
+    # Missing postcode: rollups stay null rather than fabricating 0.0.
+    assert by_postcode["BB2 2BB"]["serious_crime_avg_yr"] is None
+    assert by_postcode["BB2 2BB"]["minor_crime_avg_yr"] is None
+
+
 def _property_candidates(rows: list[dict]) -> pl.DataFrame:
    base = {
        "postcode": "AA1 1AA",
--- a/pipeline/transform/test_poi_proximity.py
+++ b/pipeline/transform/test_poi_proximity.py
@ -1,9 +1,44 @@
 import polars as pl

 from pipeline.transform.poi_proximity import (
+    POI_GROUPS_2KM,
    _build_poi_category_groups,
    _dynamic_poi_metric_renames,
+    _groceries_categories,
 )
+from pipeline.utils.poi_counts import count_pois_per_postcode
+
+
+def test_groceries_2km_counts_geolytix_brand_categories() -> None:
+    """The static groceries 2km count must include GEOLYTIX brand POIs.
+
+    GEOLYTIX stores the brand (e.g. "Tesco") in `category` with group
+    "Groceries" and never emits the literal "Supermarket"; matching only the
+    OSM strings counts the supermarket but drops the brand store.
+    """
+    postcodes = pl.DataFrame(
+        {
+            "postcode": ["SW1A 1AA"],
+            "lat": [51.5010],
+            "lon": [-0.1416],
+        }
+    )
+    pois = pl.DataFrame(
+        {
+            "category": ["Tesco", "Supermarket"],
+            "group": ["Groceries", "Groceries"],
+            "lat": [51.5011, 51.5012],
+            "lng": [-0.1417, -0.1418],
+        }
+    )
+
+    groups_2km = {**POI_GROUPS_2KM, "groceries": _groceries_categories(pois)}
+    result = count_pois_per_postcode(postcodes, pois, groups=groups_2km, radius_km=2)
+
+    # Both the GEOLYTIX brand ("Tesco") and the OSM "Supermarket" must count.
+    # Pre-fix the static list was ["Greengrocer", "Supermarket", "Convenience
+    # Store"], so "Tesco" was dropped and this was 1.
+    assert result["groceries_2km"][0] == 2


 def test_dynamic_poi_groups_include_requested_categories_only() -> None:
--- a/pipeline/transform/test_transform_poi.py
+++ b/pipeline/transform/test_transform_poi.py
@ -1,6 +1,10 @@
 import polars as pl

-from pipeline.transform.transform_poi import transform_grocery_retail_points
+from pipeline.transform.transform_poi import (
+    _load_ofsted_ratings,
+    _school_icon_category_expr,
+    transform_grocery_retail_points,
+)


 def test_transform_grocery_retail_points_outputs_chain_categories():
@ -112,6 +116,33 @@ def test_transform_grocery_retail_points_merges_cooperative_societies():
    ]


+def test_transform_grocery_retail_points_pools_small_coop_societies_before_cutoff():
+    # Each Co-op society has <5 in-England stores; only after normalising to the
+    # shared "Co-op" brand do they clear MIN_GROCERY_CHAIN_LOCATIONS together.
+    societies = [
+        "Central England Co-operative",
+        "Lincolnshire Co-operative",
+        "The Southern Co-operative",
+        "Midcounties Co-operative",
+        "Heart of England Co-operative",
+    ]
+    raw = pl.DataFrame(
+        {
+            "id": list(range(1, len(societies) + 1)),
+            "retailer": societies,
+            "fascia": ["The Co-operative Food"] * len(societies),
+            "store_name": [f"Co-op Test {i}" for i in range(1, len(societies) + 1)],
+            "long_wgs": [-0.141] * len(societies),
+            "lat_wgs": [51.515] * len(societies),
+        }
+    )
+
+    pois = transform_grocery_retail_points(raw)
+
+    assert pois.height == len(societies)
+    assert pois["category"].unique().to_list() == ["Co-op"]
+
+
 def test_transform_grocery_retail_points_accepts_base_fascias():
    raw = pl.DataFrame(
        {
@ -177,3 +208,87 @@ def test_transform_grocery_retail_points_includes_unmapped_chains_with_five_loca
        {"id": "glx-4", "category": "Tian Tian", "icon_category": "Tian Tian"},
        {"id": "glx-5", "category": "Tian Tian", "icon_category": "Tian Tian"},
    ]
+
+
+def test_load_ofsted_ratings_falls_back_to_ungraded_outcome(tmp_path):
+    # URNs 1-4: graded results map straight through. URNs 5-6: no usable graded
+    # result (null/"Not judged") but a good/outstanding ungraded outcome, one of
+    # them carrying a "(Concerns)" suffix. URN 7: genuinely "Not judged".
+    # URN 8: a real grade 3 must NOT be overridden by an ungraded outcome.
+    ofsted_path = tmp_path / "ofsted.parquet"
+    pl.DataFrame(
+        {
+            "URN": [1, 2, 3, 4, 5, 6, 7, 8],
+            "Latest OEIF overall effectiveness": [
+                "1",
+                "2",
+                "3",
+                "4",
+                None,
+                "Not judged",
+                "Not judged",
+                "3",
+            ],
+            "Ungraded inspection overall outcome": [
+                None,
+                None,
+                None,
+                None,
+                "School remains Outstanding",
+                "School remains Good (Concerns)",
+                None,
+                "School remains Outstanding",
+            ],
+        }
+    ).write_parquet(ofsted_path)
+
+    ratings = (
+        _load_ofsted_ratings(ofsted_path)
+        .collect()
+        .sort("urn")
+        .to_dicts()
+    )
+
+    assert ratings == [
+        {"urn": 1, "ofsted_rating": "Outstanding"},
+        {"urn": 2, "ofsted_rating": "Good"},
+        {"urn": 3, "ofsted_rating": "Requires improvement"},
+        {"urn": 4, "ofsted_rating": "Inadequate"},
+        {"urn": 5, "ofsted_rating": "Outstanding"},
+        {"urn": 6, "ofsted_rating": "Good"},
+        {"urn": 7, "ofsted_rating": "Not judged"},
+        {"urn": 8, "ofsted_rating": "Requires improvement"},
+    ]
+
+
+def test_school_icon_category_handles_one_sided_age_ranges():
+    # gias._format_age_range emits "up to {high}", "{low}+" and "{low}–{high}".
+    # All three (plus null) must classify, not fall through to "School".
+    df = pl.DataFrame(
+        {
+            "phase": [None, None, None, None, None],
+            "type_group": [None, None, None, None, None],
+            # "up to 5" -> nursery; "16+" -> sixth form; "3–18" -> all-through;
+            # "4–11" -> primary; null age_range with null phase -> "School".
+            "age_range": ["up to 5", "16+", "3–18", "4–11", None],
+        },
+        # Production reads these from a scanned parquet as String; an all-null
+        # Python list would otherwise infer the Null dtype and break .str ops.
+        schema_overrides={
+            "phase": pl.String,
+            "type_group": pl.String,
+            "age_range": pl.String,
+        },
+    )
+
+    categories = df.select(
+        _school_icon_category_expr().alias("category")
+    )["category"].to_list()
+
+    assert categories == [
+        "Nursery school",
+        "Sixth form",
+        "All-through school",
+        "Primary school",
+        "School",
+    ]
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -1289,22 +1289,27 @@ def transform_grocery_retail_points(
        )
        df = df.filter(pl.Series(mask))

-    eligible_retailers = (
-        df.group_by("retailer")
+    # Normalise to the display brand FIRST so the ~16 Co-op society retailer
+    # names pool into one "Co-op" before the chain-eligibility cutoff; otherwise
+    # small societies (<MIN_GROCERY_CHAIN_LOCATIONS stores each) get dropped.
+    df = df.with_columns(
+        pl.col("retailer")
+        .map_elements(normalize_grocery_retailer, return_dtype=pl.String)
+        .alias("category")
+    )
+    eligible_categories = (
+        df.group_by("category")
        .len()
        .filter(pl.col("len") >= min_chain_locations)
-        .select("retailer")
+        .select("category")
    )
-    df = df.join(eligible_retailers, on="retailer", how="semi")
+    df = df.join(eligible_categories, on="category", how="semi")

    return df.with_columns(
        pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
        pl.coalesce(["store_name", "fascia", "retailer"])
        .str.replace_all("''", "'")
        .alias("name"),
-        pl.col("retailer")
-        .map_elements(normalize_grocery_retailer, return_dtype=pl.String)
-        .alias("category"),
        pl.struct(["fascia", "retailer"])
        .map_elements(
            lambda row: normalize_grocery_icon_category(row["fascia"], row["retailer"]),
@ -1338,10 +1343,16 @@ def _school_icon_category_expr() -> pl.Expr:
    # GIAS phase mixes casing ("Middle deemed Primary" vs "Middle deemed
    # primary") so we normalise before matching.
    phase = pl.col("phase").str.to_lowercase()
-    # age_range is "<min>–<max>" using an em-dash; both ends may be missing.
-    age_parts = pl.col("age_range").str.split_exact("–", 1)
-    min_age = age_parts.struct.field("field_0").cast(pl.Int32, strict=False)
-    max_age = age_parts.struct.field("field_1").cast(pl.Int32, strict=False)
+    # gias._format_age_range emits three shapes: "<low>–<high>" (en-dash),
+    # "up to <high>" (high-only) and "<low>+" (low-only). Extract the leading
+    # integer as low and the trailing integer as high, then suppress the wrong
+    # end for the one-sided shapes so they don't collapse to a single bound.
+    age = pl.col("age_range")
+    leading = age.str.extract(r"^\s*(\d+)", 1).cast(pl.Int32, strict=False)
+    trailing = age.str.extract(r"(\d+)\s*$", 1).cast(pl.Int32, strict=False)
+    # "up to N": no low bound; "N+": no high bound.
+    min_age = pl.when(age.str.starts_with("up to")).then(None).otherwise(leading)
+    max_age = pl.when(age.str.ends_with("+")).then(None).otherwise(trailing)
    return (
        pl.when(pl.col("type_group") == "Universities")
        .then(pl.lit("University"))
@ -1386,9 +1397,16 @@ OFSTED_OEIF_LABELS = {
 def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
    """Project the latest OEIF effectiveness grade to a human-readable label,
    keyed by URN so it can be joined onto the GIAS register. Grades 1-4 map to
-    the conventional Ofsted labels; "Not judged" (post-2025 reform schools that
-    only have a report card) is preserved verbatim; null grades drop out."""
+    the conventional Ofsted labels; when there is no usable graded result
+    (null/"Not judged", e.g. schools last seen under the post-2024 ungraded
+    report-card framework) we fall back to "Ungraded inspection overall outcome"
+    so genuinely good/outstanding schools aren't dropped — mirroring
+    school_proximity.classify_good_plus_schools. Remaining nulls drop out."""
    grade_col = pl.col("Latest OEIF overall effectiveness")
+    # See school_proximity: the ungraded outcome carries "School remains Good"/
+    # "School remains Outstanding" (with optional "(Concerns)"/"(Improving)"
+    # suffixes) when the graded column is null/"Not judged".
+    ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
    label = (
        pl.when(grade_col == "1")
        .then(pl.lit(OFSTED_OEIF_LABELS["1"]))
@ -1398,6 +1416,10 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
        .then(pl.lit(OFSTED_OEIF_LABELS["3"]))
        .when(grade_col == "4")
        .then(pl.lit(OFSTED_OEIF_LABELS["4"]))
+        .when(ungraded.str.starts_with("School remains Outstanding"))
+        .then(pl.lit(OFSTED_OEIF_LABELS["1"]))
+        .when(ungraded.str.starts_with("School remains Good"))
+        .then(pl.lit(OFSTED_OEIF_LABELS["2"]))
        .when(grade_col == "Not judged")
        .then(pl.lit("Not judged"))
        .otherwise(None)