Add postcode boundary calculation

2026-02-07 21:22:53 +00:00 · 2026-02-07 21:22:53 +00:00 · f5e6894c0f
commit f5e6894c0f
parent f9bd218a3e
14 changed files with 1384 additions and 717 deletions
--- a/pipeline/transform/postcode_boundaries/inspire.py
+++ b/pipeline/transform/postcode_boundaries/inspire.py
@ -0,0 +1,155 @@
+import zipfile
+from pathlib import Path
+from xml.etree.ElementTree import iterparse
+
+import numpy as np
+from shapely import make_valid
+from shapely.geometry import Polygon
+from tqdm import tqdm
+
+# XML namespace prefixes written in the ``{uri}`` form, so they can be
+# prepended directly to a local tag name when comparing against parsed
+# element tags or building element search paths.
+_GML_NS = "{http://www.opengis.net/gml/3.2}"
+_LR_NS = "{www.landregistry.gov.uk}"
+
+
+def parse_inspire_zip(zip_path: Path) -> list[np.ndarray]:
+    """Parse a single INSPIRE ZIP → list of Nx2 coordinate arrays (easting, northing).
+
+    Only the first ``.gml`` member of the archive is read. For each
+    ``PREDEFINED`` element, the first ``posList`` found beneath it is split
+    on whitespace and reshaped into rows of two values. Rings with fewer
+    than three coordinate pairs are skipped.
+
+    Args:
+        zip_path: Path to the ZIP archive to read.
+
+    Returns:
+        One float64 array of shape (n, 2) per parsed ring, in document
+        order. Empty when the archive contains no ``.gml`` member.
+    """
+    results: list[np.ndarray] = []
+    predefined_tag = f"{_LR_NS}PREDEFINED"
+    pos_list_path = f".//{_GML_NS}posList"
+    with zipfile.ZipFile(zip_path) as zf:
+        gml_names = [n for n in zf.namelist() if n.endswith(".gml")]
+        if not gml_names:
+            return results
+        with zf.open(gml_names[0]) as f:
+            for _event, elem in iterparse(f, events=("end",)):
+                if elem.tag != predefined_tag:
+                    continue
+                pos_list = elem.find(pos_list_path)
+                if pos_list is not None and pos_list.text:
+                    vals = pos_list.text.split()
+                    n = len(vals) // 2
+                    if n >= 3:
+                        # Keep only complete pairs: a dangling unpaired value
+                        # would make reshape(n, 2) raise ValueError and abort
+                        # the whole archive.
+                        coords = np.array(vals[: n * 2], dtype=np.float64)
+                        results.append(coords.reshape(n, 2))
+                # Release the element's children and text once consumed so
+                # the parsed tree does not retain every feature.
+                elem.clear()
+    return results
+
+
+def cache_inspire(inspire_dir: Path, cache_dir: Path) -> None:
+    """Parse every INSPIRE ZIP in *inspire_dir* and write a binary cache.
+
+    Archives are handled one at a time in sorted filename order, and each
+    polygon's coordinates are appended to the coordinate file as soon as it
+    is parsed. The bbox and offset tables start with room for 25 million
+    rows and grow by a factor of 1.5 whenever they fill up.
+
+    Files written into *cache_dir* (created if missing):
+      - inspire_bboxes.npy: float64 array (N, 4) of [min_e, min_n, max_e, max_n]
+      - inspire_offsets.npy: int64 array (N, 2) of [byte_offset, n_points]
+      - inspire_coords.bin: flat binary of all float64 coordinate pairs
+    """
+    archives = sorted(inspire_dir.glob("*.zip"))
+    print(
+        f"Parsing {len(archives)} INSPIRE ZIP files (sequential, streaming to disk)..."
+    )
+    cache_dir.mkdir(parents=True, exist_ok=True)
+
+    # Index tables, over-allocated up front and trimmed before saving.
+    capacity = 25_000_000
+    bboxes = np.empty((capacity, 4), dtype=np.float64)
+    offsets = np.empty((capacity, 2), dtype=np.int64)
+    n_polys = 0
+    bytes_written = 0
+
+    with open(cache_dir / "inspire_coords.bin", "wb") as sink:
+        for archive in tqdm(archives, desc="INSPIRE ZIPs", unit="file"):
+            for ring in parse_inspire_zip(archive):
+                if n_polys >= capacity:
+                    # Tables are full: enlarge both in place.
+                    capacity = int(capacity * 1.5)
+                    bboxes.resize((capacity, 4), refcheck=False)
+                    offsets.resize((capacity, 2), refcheck=False)
+
+                eastings = ring[:, 0]
+                northings = ring[:, 1]
+                bboxes[n_polys] = (
+                    eastings.min(),
+                    northings.min(),
+                    eastings.max(),
+                    northings.max(),
+                )
+                # Record where this ring starts in the coordinate file and
+                # how many points it holds.
+                offsets[n_polys] = (bytes_written, len(ring))
+
+                payload = ring.astype(np.float64).tobytes()
+                sink.write(payload)
+                bytes_written += len(payload)
+                n_polys += 1
+
+    # Save only the rows that were actually filled.
+    np.save(cache_dir / "inspire_bboxes.npy", bboxes[:n_polys])
+    np.save(cache_dir / "inspire_offsets.npy", offsets[:n_polys])
+    size_mb = bytes_written / (1024 * 1024)
+    print(f"  Cached {n_polys:,} INSPIRE polygons (coords: {size_mb:.0f} MB)")
+
+
+def inspire_cache_exists(cache_dir: Path) -> bool:
+    """Return True only if all three INSPIRE cache files exist in *cache_dir*."""
+    required = ("inspire_bboxes.npy", "inspire_offsets.npy", "inspire_coords.bin")
+    for filename in required:
+        if not (cache_dir / filename).exists():
+            return False
+    return True
+
+
+def load_inspire(
+    cache_dir: Path,
+) -> tuple[np.ndarray, np.ndarray, np.memmap]:
+    """Load INSPIRE cache → (bboxes, offsets, coords_mmap).
+
+    The bbox and offset tables are read fully into memory with ``np.load``;
+    the coordinate file is opened read-only as a flat float64 memory map, so
+    its contents are not loaded here.
+
+    Args:
+        cache_dir: Directory holding the three ``inspire_*`` cache files.
+
+    Returns:
+        The bbox array, the offsets array and the coordinate memory map.
+    """
+    print(f"Loading INSPIRE cache from {cache_dir}...")
+    bytes_per_mb = 1024 * 1024
+
+    bboxes = np.load(cache_dir / "inspire_bboxes.npy")
+    offsets = np.load(cache_dir / "inspire_offsets.npy")
+    coords_file = cache_dir / "inspire_coords.bin"
+    coords_mmap = np.memmap(coords_file, dtype=np.float64, mode="r")
+
+    bbox_mb = bboxes.nbytes // bytes_per_mb
+    offsets_mb = offsets.nbytes // bytes_per_mb
+    print(f"  Loaded {len(bboxes):,} INSPIRE polygon bboxes (~{bbox_mb} MB)")
+    print(f"  Offsets: ~{offsets_mb} MB, coords: memory-mapped")
+    return bboxes, offsets, coords_mmap
+
+
+def _largest_polygon(geom):
+    """Return the largest-area Polygon inside *geom*, or None if it has none."""
+    if geom.geom_type == "Polygon":
+        return geom
+    best = None
+    # Multi-part geometries and collections expose members through ``geoms``;
+    # anything without that attribute contributes no polygon.
+    for part in getattr(geom, "geoms", ()):
+        found = _largest_polygon(part)
+        if found is not None and (best is None or found.area > best.area):
+            best = found
+    return best
+
+
+def get_inspire_candidates(
+    oa_bounds: tuple[float, float, float, float],
+    bboxes: np.ndarray,
+    offsets: np.ndarray,
+    coords_mmap: np.memmap,
+) -> list[Polygon]:
+    """Get INSPIRE polygons overlapping an OA via bbox pre-filter.
+
+    Every cached bbox is tested against *oa_bounds* in one vectorized pass;
+    Shapely polygons are then built only for rows whose bbox overlaps
+    (touching edges count as overlap). Coordinates are sliced from
+    *coords_mmap* per matching row.
+
+    Invalid polygons are repaired with ``make_valid``. When the repair yields
+    several parts (a MultiPolygon, or a GeometryCollection mixing polygons
+    with line/point debris), the largest polygon is kept. A repair result
+    containing no polygon at all is skipped.
+
+    Args:
+        oa_bounds: (min_e, min_n, max_e, max_n) of the area to match.
+        bboxes: (N, 4) array of [min_e, min_n, max_e, max_n] per polygon.
+        offsets: (N, 2) array of [byte_offset, n_points] per polygon.
+        coords_mmap: Flat float64 array holding all coordinate pairs.
+
+    Returns:
+        Non-empty polygons whose bbox overlaps *oa_bounds*.
+    """
+    min_e, min_n, max_e, max_n = oa_bounds
+
+    # Vectorized bbox overlap test
+    mask = (
+        (bboxes[:, 2] >= min_e)
+        & (bboxes[:, 0] <= max_e)
+        & (bboxes[:, 3] >= min_n)
+        & (bboxes[:, 1] <= max_n)
+    )
+    idxs = np.where(mask)[0]
+    if len(idxs) == 0:
+        return []
+
+    # Build Shapely polygons only for candidates (coords from mmap)
+    candidates = []
+    for i in idxs:
+        # Offsets are stored in bytes; the mmap is indexed in float64 items.
+        float_offset = offsets[i, 0] // 8
+        n_pts = offsets[i, 1]
+        coords = coords_mmap[float_offset : float_offset + n_pts * 2].reshape(-1, 2)
+        poly = Polygon(coords)
+        if not poly.is_valid:
+            # make_valid may return a MultiPolygon or a GeometryCollection;
+            # keep the largest polygonal part instead of dropping the parcel.
+            poly = _largest_polygon(make_valid(poly))
+            if poly is None:
+                continue
+        if not poly.is_empty:
+            candidates.append(poly)
+    return candidates