Add postcode mapping

2026-02-07 19:28:57 +00:00 · 2026-02-07 19:28:57 +00:00 · 4506263e5b
commit 4506263e5b
parent e7f2d1ffc3
5 changed files with 966 additions and 0 deletions
--- a/pipeline/transform/postcode_boundaries.py
+++ b/pipeline/transform/postcode_boundaries.py
@ -0,0 +1,715 @@
+"""Generate postcode boundary polygons from OA boundaries, INSPIRE parcels, and UPRN data.
+
+Produces per-district GeoJSON files compatible with the Rust server's postcode loader.
+Each postcode gets a polygon (or MultiPolygon) guaranteed to be contained within its
+Output Area(s), with 100% OA coverage and no overlaps between postcodes within an OA.
+
+Algorithm per OA:
+  1. Single-postcode OA → entire OA polygon assigned to that postcode
+  2. Multi-postcode OA:
+     a. Assign INSPIRE parcels to postcodes via UPRN point-in-polygon majority vote
+     b. Union INSPIRE parcels per postcode, clip to OA → "claimed" area
+     c. Distribute remaining (unclaimed) OA area via Voronoi of UPRN points
+     d. Final polygon = claimed + Voronoi share
+
+Memory-efficient design (<12GB total):
+  - INSPIRE polygons stored as raw coordinate bytes in parquet; Shapely objects built
+    lazily per-OA via numpy bbox pre-filter (~100-500 candidates at a time)
+  - UPRNs kept as sorted polars DataFrame with offset dict (Arrow storage, ~1.2GB)
+  - OA processing runs sequentially (no multiprocess INSPIRE duplication)
+
+Output format: {output}/units/{DISTRICT}.geojson with properties.postcodes and
+properties.mapit_code fields matching server-rs/src/data/postcodes.rs expectations.
+"""
+
+import argparse
+import ctypes
+import gc
+import json
+import sqlite3
+import zipfile
+from collections import Counter, defaultdict
+from pathlib import Path
+from xml.etree.ElementTree import iterparse
+
+import numpy as np
+import polars as pl
+from pyproj import Transformer
+from scipy.spatial import Voronoi
+from shapely import STRtree, make_valid, wkb
+from shapely.geometry import MultiPolygon, Polygon
+from shapely.ops import unary_union
+from tqdm import tqdm
+
+# ---------------------------------------------------------------------------
+# GeoPackage helpers
+# ---------------------------------------------------------------------------
+
+
+def _release_memory() -> None:
+    """Force Python + glibc to release freed memory back to the OS."""
+    gc.collect()
+    try:
+        ctypes.CDLL("libc.so.6").malloc_trim(0)
+    except OSError:
+        pass
+
+
+_ENVELOPE_SIZES = {0: 0, 1: 32, 2: 48, 3: 48, 4: 64}
+
+
+def parse_gpkg_geometry(blob: bytes):
+    """Extract a Shapely geometry from a GeoPackage binary blob."""
+    flags = blob[3]
+    envelope_type = (flags >> 1) & 0x07
+    header_size = 8 + _ENVELOPE_SIZES[envelope_type]
+    return wkb.loads(blob[header_size:])
+
+
+def load_oa_boundaries(gpkg_path: Path) -> dict[str, Polygon | MultiPolygon]:
+    """Load OA boundary polygons from a GeoPackage. Geometry is already in BNG."""
+    print("Loading OA boundaries...")
+
+    conn = sqlite3.connect(str(gpkg_path))
+    cur = conn.cursor()
+    cur.execute("SELECT OA21CD, SHAPE FROM OA_2021_EW_BGC_V2")
+
+    oa_geoms: dict[str, Polygon | MultiPolygon] = {}
+    for oa_code, blob in cur:
+        geom = parse_gpkg_geometry(bytes(blob))
+        if geom.geom_type == "MultiPolygon" and len(geom.geoms) == 1:
+            geom = geom.geoms[0]
+        oa_geoms[oa_code] = geom
+
+    conn.close()
+    print(f"  Loaded {len(oa_geoms)} OA boundaries")
+    return oa_geoms
+
+
+# ---------------------------------------------------------------------------
+# UPRN loading (memory-efficient: sorted polars DataFrame + offset dict)
+# ---------------------------------------------------------------------------
+
+
+def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
+    """Load UPRNs as a sorted polars DataFrame with OA offset lookup.
+
+    Returns (df, offsets) where offsets[oa_code] = (start_row, end_row).
+    Peak ~5GB during sort, steady state ~1.5GB (Arrow columnar with compact strings).
+    """
+    import tempfile
+
+    print("Loading UPRN lookup...")
+
+    # Sort via streaming sink to avoid polars doubling memory during in-memory sort
+    with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
+        tmp_path = Path(tmp.name)
+    (
+        pl.scan_parquet(uprn_path)
+        .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
+        .filter(~pl.col("OA21CD").str.starts_with("S"))
+        .filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null())
+        .filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
+        .with_columns(pl.col("PCDS").str.strip_chars())
+        .sort("OA21CD")
+        .sink_parquet(tmp_path)
+    )
+    _release_memory()
+
+    # Read the sorted data — only one copy in memory (~2GB)
+    df = pl.read_parquet(tmp_path)
+    tmp_path.unlink()
+    n = len(df)
+    print(f"  Loaded {n:,} UPRNs (England & Wales)")
+
+    # Compute OA group offsets using polars (avoids 37M Python string creation)
+    boundary_df = (
+        df.lazy()
+        .with_row_index("_i")
+        .filter(pl.col("OA21CD") != pl.col("OA21CD").shift(1))
+        .select("_i", "OA21CD")
+        .collect()
+    )
+    starts_list = boundary_df["_i"].to_list()
+    oa_list = boundary_df["OA21CD"].to_list()
+    del boundary_df
+    offsets: dict[str, tuple[int, int]] = {}
+    for j in range(len(starts_list)):
+        end = starts_list[j + 1] if j + 1 < len(starts_list) else n
+        offsets[oa_list[j]] = (starts_list[j], end)
+    del starts_list, oa_list
+
+    # Drop OA column (no longer needed) to save ~400MB
+    df = df.select("GRIDGB1E", "GRIDGB1N", "PCDS")
+    _release_memory()
+
+    print(f"  Grouped into {len(offsets)} OAs")
+    return df, offsets
+
+
+def get_oa_uprns(
+    df: pl.DataFrame, offsets: dict[str, tuple[int, int]], oa_code: str
+) -> tuple[np.ndarray, list[str]]:
+    """Get UPRN coordinates and postcodes for a single OA.
+
+    Returns (points_nx2, postcodes_list).
+    """
+    s, e = offsets[oa_code]
+    sub = df[s:e]
+    points = np.column_stack(
+        [
+            sub["GRIDGB1E"].to_numpy(),
+            sub["GRIDGB1N"].to_numpy(),
+        ]
+    )
+    postcodes = sub["PCDS"].to_list()
+    return points, postcodes
+
+
+# ---------------------------------------------------------------------------
+# INSPIRE GML parsing and caching
+# ---------------------------------------------------------------------------
+
+_GML_NS = "{http://www.opengis.net/gml/3.2}"
+_LR_NS = "{www.landregistry.gov.uk}"
+
+
+def parse_inspire_zip(zip_path: Path) -> list[np.ndarray]:
+    """Parse a single INSPIRE ZIP → list of Nx2 coordinate arrays (easting, northing)."""
+    results = []
+    with zipfile.ZipFile(zip_path) as zf:
+        gml_names = [n for n in zf.namelist() if n.endswith(".gml")]
+        if not gml_names:
+            return results
+        with zf.open(gml_names[0]) as f:
+            for event, elem in iterparse(f, events=("end",)):
+                if elem.tag != f"{_LR_NS}PREDEFINED":
+                    continue
+                pos_list = elem.find(f".//{_GML_NS}posList")
+                if pos_list is not None and pos_list.text:
+                    vals = pos_list.text.split()
+                    n = len(vals) // 2
+                    if n >= 3:
+                        coords = np.array(vals, dtype=np.float64).reshape(n, 2)
+                        results.append(coords)
+                elem.clear()
+    return results
+
+
+def cache_inspire(inspire_dir: Path, cache_dir: Path) -> None:
+    """Parse all INSPIRE ZIPs and cache as memory-mappable binary files.
+
+    Processes ZIPs sequentially to keep memory under control (~2GB peak).
+    Each ZIP's polygons are streamed to disk immediately after parsing.
+
+    Writes three files:
+      - inspire_bboxes.npy: float64 array (N, 4) of [min_e, min_n, max_e, max_n]
+      - inspire_offsets.npy: int64 array (N, 2) of [byte_offset, n_points]
+      - inspire_coords.bin: flat binary of all float64 coordinate pairs
+    """
+    zip_files = sorted(inspire_dir.glob("*.zip"))
+    print(
+        f"Parsing {len(zip_files)} INSPIRE ZIP files (sequential, streaming to disk)..."
+    )
+    cache_dir.mkdir(parents=True, exist_ok=True)
+
+    # Pre-allocate arrays for bboxes and offsets (grow if needed)
+    capacity = 25_000_000
+    bboxes = np.empty((capacity, 4), dtype=np.float64)
+    offsets = np.empty((capacity, 2), dtype=np.int64)
+    count = 0
+    byte_offset = 0
+
+    coords_path = cache_dir / "inspire_coords.bin"
+    with open(coords_path, "wb") as cf:
+        for zip_path in tqdm(zip_files, desc="INSPIRE ZIPs", unit="file"):
+            for coords in parse_inspire_zip(zip_path):
+                if count >= capacity:
+                    capacity = int(capacity * 1.5)
+                    bboxes.resize((capacity, 4), refcheck=False)
+                    offsets.resize((capacity, 2), refcheck=False)
+                bboxes[count, 0] = coords[:, 0].min()
+                bboxes[count, 1] = coords[:, 1].min()
+                bboxes[count, 2] = coords[:, 0].max()
+                bboxes[count, 3] = coords[:, 1].max()
+                offsets[count, 0] = byte_offset
+                offsets[count, 1] = len(coords)
+                raw = coords.astype(np.float64).tobytes()
+                cf.write(raw)
+                byte_offset += len(raw)
+                count += 1
+
+    # Trim to actual size and save
+    bboxes = bboxes[:count]
+    offsets = offsets[:count]
+    np.save(cache_dir / "inspire_bboxes.npy", bboxes)
+    np.save(cache_dir / "inspire_offsets.npy", offsets)
+    size_mb = byte_offset / (1024 * 1024)
+    print(f"  Cached {count:,} INSPIRE polygons (coords: {size_mb:.0f} MB)")
+
+
+def _inspire_cache_exists(cache_dir: Path) -> bool:
+    return all(
+        (cache_dir / f).exists()
+        for f in ("inspire_bboxes.npy", "inspire_offsets.npy", "inspire_coords.bin")
+    )
+
+
+def load_inspire(
+    cache_dir: Path,
+) -> tuple[np.ndarray, np.ndarray, np.memmap]:
+    """Load INSPIRE cache → (bboxes, offsets, coords_mmap).
+
+    Memory usage: ~1.1GB (bboxes ~777MB + offsets ~290MB, coords memory-mapped).
+    """
+    print(f"Loading INSPIRE cache from {cache_dir}...")
+    bboxes = np.load(cache_dir / "inspire_bboxes.npy")
+    offsets = np.load(cache_dir / "inspire_offsets.npy")
+    coords_mmap = np.memmap(
+        cache_dir / "inspire_coords.bin", dtype=np.float64, mode="r"
+    )
+    print(
+        f"  Loaded {len(bboxes):,} INSPIRE polygon bboxes (~{bboxes.nbytes // (1024 * 1024)} MB)"
+    )
+    print(f"  Offsets: ~{offsets.nbytes // (1024 * 1024)} MB, coords: memory-mapped")
+    return bboxes, offsets, coords_mmap
+
+
+def get_inspire_candidates(
+    oa_bounds: tuple[float, float, float, float],
+    bboxes: np.ndarray,
+    offsets: np.ndarray,
+    coords_mmap: np.memmap,
+) -> list[Polygon]:
+    """Get INSPIRE polygons overlapping an OA via bbox pre-filter.
+
+    Builds Shapely objects only for matches (typically 10-500 per OA).
+    Reads coordinate data on-demand from memory-mapped file.
+    """
+    min_e, min_n, max_e, max_n = oa_bounds
+
+    # Vectorized bbox overlap test
+    mask = (
+        (bboxes[:, 2] >= min_e)
+        & (bboxes[:, 0] <= max_e)
+        & (bboxes[:, 3] >= min_n)
+        & (bboxes[:, 1] <= max_n)
+    )
+    idxs = np.where(mask)[0]
+    if len(idxs) == 0:
+        return []
+
+    # Build Shapely polygons only for candidates (coords from mmap)
+    candidates = []
+    for i in idxs:
+        byte_offset = offsets[i, 0]
+        n_pts = offsets[i, 1]
+        float_offset = byte_offset // 8  # float64 = 8 bytes
+        coords = coords_mmap[float_offset : float_offset + n_pts * 2].reshape(-1, 2)
+        poly = Polygon(coords)
+        if not poly.is_valid:
+            poly = make_valid(poly)
+            if poly.geom_type == "MultiPolygon":
+                poly = max(poly.geoms, key=lambda g: g.area)
+            elif poly.geom_type != "Polygon":
+                continue
+        if not poly.is_empty:
+            candidates.append(poly)
+    return candidates
+
+
+# ---------------------------------------------------------------------------
+# Voronoi computation
+# ---------------------------------------------------------------------------
+
+
+def compute_voronoi_regions(
+    points: np.ndarray, postcodes: list[str], boundary: Polygon | MultiPolygon
+) -> dict[str, Polygon | MultiPolygon]:
+    """Compute Voronoi regions for points, clipped to boundary, grouped by postcode."""
+    if len(points) == 0:
+        return {}
+    if len(points) == 1:
+        return {postcodes[0]: boundary}
+
+    # Deduplicate points per postcode (flats at same coords)
+    seen: dict[tuple[float, float], str] = {}
+    unique_pts = []
+    unique_pcs = []
+    for i in range(len(points)):
+        key = (points[i, 0], points[i, 1])
+        if key not in seen:
+            seen[key] = postcodes[i]
+            unique_pts.append(points[i])
+            unique_pcs.append(postcodes[i])
+
+    if len(unique_pts) == 1:
+        return {unique_pcs[0]: boundary}
+
+    pts = np.array(unique_pts)
+    min_e, min_n = pts.min(axis=0)
+    max_e, max_n = pts.max(axis=0)
+    span = max(max_e - min_e, max_n - min_n, 100)
+
+    dummy = np.array(
+        [
+            [min_e - span * 10, min_n - span * 10],
+            [max_e + span * 10, min_n - span * 10],
+            [min_e - span * 10, max_n + span * 10],
+            [max_e + span * 10, max_n + span * 10],
+        ]
+    )
+    all_points = np.vstack([pts, dummy])
+
+    try:
+        vor = Voronoi(all_points)
+    except Exception:
+        return {unique_pcs[0]: boundary}
+
+    n_real = len(pts)
+    pc_polys: dict[str, list[Polygon]] = defaultdict(list)
+
+    for i in range(n_real):
+        region_idx = vor.point_region[i]
+        region = vor.regions[region_idx]
+        if -1 in region or len(region) < 3:
+            continue
+        vertices = vor.vertices[region]
+        poly = Polygon(vertices)
+        if not poly.is_valid:
+            poly = make_valid(poly)
+        clipped = poly.intersection(boundary)
+        if not clipped.is_empty:
+            pc_polys[unique_pcs[i]].append(clipped)
+
+    result = {}
+    for pc, parts in pc_polys.items():
+        merged = unary_union(parts)
+        if not merged.is_empty:
+            result[pc] = merged
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Per-OA processing
+# ---------------------------------------------------------------------------
+
+
+def process_oa(
+    oa_geom: Polygon | MultiPolygon,
+    points: np.ndarray,
+    postcodes: list[str],
+    inspire_candidates: list[Polygon],
+) -> list[tuple[str, Polygon | MultiPolygon]]:
+    """Process a single OA → list of (postcode, geometry) fragments."""
+    unique_pcs = set(postcodes)
+    if len(unique_pcs) == 1:
+        return [(next(iter(unique_pcs)), oa_geom)]
+
+    # Try INSPIRE-based assignment
+    claimed: dict[str, Polygon | MultiPolygon] = {}
+
+    if inspire_candidates:
+        cand_tree = STRtree(inspire_candidates)
+
+        from shapely import points as shp_points
+
+        uprn_pts = shp_points(points)
+        pt_idx, cand_idx = cand_tree.query(uprn_pts, predicate="intersects")
+
+        # Majority vote per candidate polygon
+        cand_postcodes: dict[int, list[str]] = defaultdict(list)
+        for pi, ci in zip(pt_idx, cand_idx):
+            cand_postcodes[ci].append(postcodes[pi])
+
+        pc_inspire_polys: dict[str, list[Polygon]] = defaultdict(list)
+        for ci, pc_list in cand_postcodes.items():
+            winner = Counter(pc_list).most_common(1)[0][0]
+            pc_inspire_polys[winner].append(inspire_candidates[ci])
+
+        for pc, polys in pc_inspire_polys.items():
+            merged = unary_union(polys)
+            clipped = merged.intersection(oa_geom)
+            if not clipped.is_empty:
+                if not clipped.is_valid:
+                    clipped = make_valid(clipped)
+                claimed[pc] = clipped
+
+    # Compute remaining area
+    if claimed:
+        all_claimed = unary_union(list(claimed.values()))
+        if not all_claimed.is_valid:
+            all_claimed = make_valid(all_claimed)
+        remaining = oa_geom.difference(all_claimed)
+        if not remaining.is_valid:
+            remaining = make_valid(remaining)
+    else:
+        remaining = oa_geom
+
+    # Distribute remaining area via Voronoi
+    if not remaining.is_empty and remaining.area > 0.01:
+        voronoi_result = compute_voronoi_regions(points, postcodes, remaining)
+    else:
+        voronoi_result = {}
+
+    # Combine claimed + voronoi
+    result: dict[str, list] = defaultdict(list)
+    for pc, geom in claimed.items():
+        result[pc].append(geom)
+    for pc, geom in voronoi_result.items():
+        result[pc].append(geom)
+
+    fragments = []
+    for pc, parts in result.items():
+        merged = unary_union(parts)
+        if not merged.is_empty:
+            if not merged.is_valid:
+                merged = make_valid(merged)
+            fragments.append((pc, merged))
+
+    return fragments
+
+
+# ---------------------------------------------------------------------------
+# Output: merge fragments and write GeoJSON
+# ---------------------------------------------------------------------------
+
+_to_wgs84 = None
+
+
+def _get_to_wgs84():
+    global _to_wgs84
+    if _to_wgs84 is None:
+        _to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
+    return _to_wgs84
+
+
+def to_wgs84_geojson(
+    geom: Polygon | MultiPolygon, tolerance: float = 1.0
+) -> dict | None:
+    """Simplify geometry in BNG, convert to WGS84, return GeoJSON dict."""
+    if geom.is_empty:
+        return None
+
+    simplified = geom.simplify(tolerance, preserve_topology=True)
+    if simplified.is_empty:
+        return None
+
+    transformer = _get_to_wgs84()
+
+    def transform_ring(coords):
+        xs, ys = zip(*coords)
+        lons, lats = transformer.transform(list(xs), list(ys))
+        return list(zip(lons, lats))
+
+    def transform_polygon(poly):
+        exterior = transform_ring(poly.exterior.coords)
+        holes = [transform_ring(h.coords) for h in poly.interiors]
+        return [exterior] + holes
+
+    if simplified.geom_type == "Polygon":
+        return {
+            "type": "Polygon",
+            "coordinates": transform_polygon(simplified),
+        }
+    elif simplified.geom_type == "MultiPolygon":
+        return {
+            "type": "MultiPolygon",
+            "coordinates": [transform_polygon(p) for p in simplified.geoms],
+        }
+    elif simplified.geom_type == "GeometryCollection":
+        polys = [
+            g for g in simplified.geoms if g.geom_type in ("Polygon", "MultiPolygon")
+        ]
+        if not polys:
+            return None
+        return to_wgs84_geojson(unary_union(polys), tolerance=0)
+    return None
+
+
+def merge_fragments(
+    all_fragments: list[tuple[str, Polygon | MultiPolygon]],
+) -> dict[str, Polygon | MultiPolygon]:
+    """Merge cross-OA fragments for postcodes spanning multiple OAs."""
+    by_postcode: dict[str, list] = defaultdict(list)
+    for pc, geom in all_fragments:
+        by_postcode[pc].append(geom)
+
+    merged = {}
+    for pc, parts in by_postcode.items():
+        combined = unary_union(parts)
+        if combined.is_empty:
+            continue
+        if not combined.is_valid:
+            combined = make_valid(combined)
+        # Close tiny gaps between adjacent OA boundary edges (float mismatches)
+        if combined.geom_type == "MultiPolygon":
+            combined = combined.buffer(1.0).buffer(-1.0)
+            if not combined.is_valid:
+                combined = make_valid(combined)
+        # Postcodes are contiguous delivery routes — keep only the largest
+        # polygon; small detached fragments are algorithm artifacts
+        if combined.geom_type == "MultiPolygon":
+            combined = max(combined.geoms, key=lambda g: g.area)
+        merged[pc] = combined
+    return merged
+
+
+def write_district_geojson(
+    postcodes: dict[str, Polygon | MultiPolygon], output_dir: Path
+) -> int:
+    """Group postcodes by district, write GeoJSON files. Returns file count."""
+    units_dir = output_dir / "units"
+    units_dir.mkdir(parents=True, exist_ok=True)
+
+    by_district: dict[str, list[tuple[str, Polygon | MultiPolygon]]] = defaultdict(list)
+    for pc, geom in postcodes.items():
+        parts = pc.split()
+        district = parts[0] if parts else pc[:4]
+        by_district[district].append((pc, geom))
+
+    file_count = 0
+    for district, entries in tqdm(
+        sorted(by_district.items()), desc="Writing GeoJSON", unit="file"
+    ):
+        features = []
+        for pc, geom in sorted(entries, key=lambda x: x[0]):
+            geojson_geom = to_wgs84_geojson(geom)
+            if geojson_geom is None:
+                continue
+            mapit_code = pc.replace(" ", "")
+            features.append(
+                {
+                    "type": "Feature",
+                    "geometry": geojson_geom,
+                    "properties": {
+                        "postcodes": pc,
+                        "mapit_code": mapit_code,
+                    },
+                }
+            )
+
+        if not features:
+            continue
+
+        collection = {"type": "FeatureCollection", "features": features}
+        out_path = units_dir / f"{district}.geojson"
+        with open(out_path, "w") as f:
+            json.dump(collection, f, separators=(",", ":"))
+        file_count += 1
+
+    return file_count
+
+
+# ---------------------------------------------------------------------------
+# Main orchestration
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Generate postcode boundary polygons from OA + INSPIRE + UPRN data"
+    )
+    parser.add_argument("--uprn", type=Path, required=True, help="UPRN lookup parquet")
+    parser.add_argument(
+        "--oa-boundaries", type=Path, required=True, help="OA boundaries GeoPackage"
+    )
+    parser.add_argument(
+        "--inspire", type=Path, required=True, help="INSPIRE ZIP directory"
+    )
+    parser.add_argument("--output", type=Path, required=True, help="Output directory")
+    parser.add_argument(
+        "--limit", type=int, default=0, help="Process only first N OAs (0=all)"
+    )
+    args = parser.parse_args()
+
+    # Phase 1: Load all data
+    print("=" * 60)
+    print("Phase 1: Loading data")
+    print("=" * 60)
+
+    oa_geoms = load_oa_boundaries(args.oa_boundaries)
+    uprn_df, uprn_offsets = load_uprns(args.uprn)
+
+    # Phase 2: Parse/load INSPIRE
+    print()
+    print("=" * 60)
+    print("Phase 2: INSPIRE data")
+    print("=" * 60)
+
+    inspire_cache_dir = args.output / "inspire_cache"
+    if not _inspire_cache_exists(inspire_cache_dir):
+        cache_inspire(args.inspire, inspire_cache_dir)
+    inspire_bboxes, inspire_offsets, inspire_coords = load_inspire(inspire_cache_dir)
+
+    # Phase 3: Process OAs
+    print()
+    print("=" * 60)
+    print("Phase 3: Processing OAs")
+    print("=" * 60)
+
+    # Build work list — precompute which OAs are single vs multi-postcode
+    oa_codes_with_data = sorted(set(oa_geoms.keys()) & set(uprn_offsets.keys()))
+    skipped_no_uprn = len(oa_geoms) - len(oa_codes_with_data)
+    skipped_no_boundary = len(uprn_offsets) - len(oa_codes_with_data)
+
+    if args.limit > 0:
+        oa_codes_with_data = oa_codes_with_data[: args.limit]
+
+    print(f"  OAs with UPRNs + boundaries: {len(oa_codes_with_data)}")
+    print(f"  Skipped (no UPRNs): {skipped_no_uprn}")
+    print(f"  Skipped (no boundary): {skipped_no_boundary}")
+
+    all_fragments: list[tuple[str, Polygon | MultiPolygon]] = []
+    single_count = 0
+    multi_count = 0
+
+    for oa_code in tqdm(
+        oa_codes_with_data,
+        desc="Processing OAs",
+        unit="OA",
+        smoothing=0.01,
+        miniters=100,
+    ):
+        oa_geom = oa_geoms[oa_code]
+        points, postcodes = get_oa_uprns(uprn_df, uprn_offsets, oa_code)
+
+        if len(set(postcodes)) == 1:
+            # Fast path: entire OA = one postcode
+            all_fragments.append((postcodes[0], oa_geom))
+            single_count += 1
+            continue
+
+        # Get INSPIRE candidates via bbox pre-filter
+        candidates = get_inspire_candidates(
+            oa_geom.bounds, inspire_bboxes, inspire_offsets, inspire_coords
+        )
+
+        fragments = process_oa(oa_geom, points, postcodes, candidates)
+        all_fragments.extend(fragments)
+        multi_count += 1
+
+    print(f"\n  Single-postcode OAs (fast path): {single_count}")
+    print(f"  Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
+    print(f"  Total fragments: {len(all_fragments)}")
+
+    # Free data no longer needed
+    del oa_geoms, uprn_df, uprn_offsets
+    del inspire_bboxes, inspire_offsets, inspire_coords
+
+    # Phase 4: Merge and write
+    print()
+    print("=" * 60)
+    print("Phase 4: Merging fragments and writing GeoJSON")
+    print("=" * 60)
+
+    merged = merge_fragments(all_fragments)
+    print(f"  Merged into {len(merged)} unique postcodes")
+
+    file_count = write_district_geojson(merged, args.output)
+    print(f"\n  Wrote {file_count} district GeoJSON files to {args.output / 'units'}")
+    print("Done!")
+
+
+if __name__ == "__main__":
+    main()