perfect-postcode/pipeline/transform/postcode_boundaries/inspire.py

import zipfile
from pathlib import Path
from xml.etree.ElementTree import iterparse

import numpy as np
from shapely import make_valid
from shapely.geometry import Polygon
from tqdm import tqdm

_GML_NS = "{http://www.opengis.net/gml/3.2}"
_LR_NS = "{www.landregistry.gov.uk}"


def parse_inspire_zip(zip_path: Path) -> list[np.ndarray]:
    """Parse a single INSPIRE ZIP → list of Nx2 coordinate arrays (easting, northing)."""
    results = []
    with zipfile.ZipFile(zip_path) as zf:
        gml_names = [n for n in zf.namelist() if n.endswith(".gml")]
        if not gml_names:
            return results
        with zf.open(gml_names[0]) as f:
            for event, elem in iterparse(f, events=("end",)):
                if elem.tag != f"{_LR_NS}PREDEFINED":
                    continue
                pos_list = elem.find(f".//{_GML_NS}posList")
                if pos_list is not None and pos_list.text:
                    vals = pos_list.text.split()
                    n = len(vals) // 2
                    if n >= 3:
                        coords = np.array(vals, dtype=np.float64).reshape(n, 2)
                        results.append(coords)
                elem.clear()
    return results


def cache_inspire(inspire_dir: Path, cache_dir: Path) -> None:
    """Parse all INSPIRE ZIPs and cache as memory-mappable binary files.

    Processes ZIPs sequentially to keep memory under control (~2GB peak).
    Each ZIP's polygons are streamed to disk immediately after parsing.

    Writes three files:
      - inspire_bboxes.npy: float64 array (N, 4) of [min_e, min_n, max_e, max_n]
      - inspire_offsets.npy: int64 array (N, 2) of [byte_offset, n_points]
      - inspire_coords.bin: flat binary of all float64 coordinate pairs
    """
    zip_files = sorted(inspire_dir.glob("*.zip"))
    print(
        f"Parsing {len(zip_files)} INSPIRE ZIP files (sequential, streaming to disk)..."
    )
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Pre-allocate arrays for bboxes and offsets (grow if needed)
    capacity = 25_000_000
    bboxes = np.empty((capacity, 4), dtype=np.float64)
    offsets = np.empty((capacity, 2), dtype=np.int64)
    count = 0
    byte_offset = 0

    coords_path = cache_dir / "inspire_coords.bin"
    with open(coords_path, "wb") as cf:
        for zip_path in tqdm(zip_files, desc="INSPIRE ZIPs", unit="file"):
            for coords in parse_inspire_zip(zip_path):
                if count >= capacity:
                    capacity = int(capacity * 1.5)
                    bboxes.resize((capacity, 4), refcheck=False)
                    offsets.resize((capacity, 2), refcheck=False)
                bboxes[count, 0] = coords[:, 0].min()
                bboxes[count, 1] = coords[:, 1].min()
                bboxes[count, 2] = coords[:, 0].max()
                bboxes[count, 3] = coords[:, 1].max()
                offsets[count, 0] = byte_offset
                offsets[count, 1] = len(coords)
                raw = coords.astype(np.float64).tobytes()
                cf.write(raw)
                byte_offset += len(raw)
                count += 1

    # Trim to actual size and save
    bboxes = bboxes[:count]
    offsets = offsets[:count]
    np.save(cache_dir / "inspire_bboxes.npy", bboxes)
    np.save(cache_dir / "inspire_offsets.npy", offsets)
    size_mb = byte_offset / (1024 * 1024)
    print(f"  Cached {count:,} INSPIRE polygons (coords: {size_mb:.0f} MB)")


def inspire_cache_exists(cache_dir: Path) -> bool:
    return all(
        (cache_dir / f).exists()
        for f in ("inspire_bboxes.npy", "inspire_offsets.npy", "inspire_coords.bin")
    )


def load_inspire(
    cache_dir: Path,
) -> tuple[np.ndarray, np.ndarray, np.memmap]:
    """Load INSPIRE cache → (bboxes, offsets, coords_mmap).

    Memory usage: ~1.1GB (bboxes ~777MB + offsets ~290MB, coords memory-mapped).
    """
    print(f"Loading INSPIRE cache from {cache_dir}...")
    bboxes = np.load(cache_dir / "inspire_bboxes.npy")
    offsets = np.load(cache_dir / "inspire_offsets.npy")
    coords_mmap = np.memmap(
        cache_dir / "inspire_coords.bin", dtype=np.float64, mode="r"
    )
    print(
        f"  Loaded {len(bboxes):,} INSPIRE polygon bboxes (~{bboxes.nbytes // (1024 * 1024)} MB)"
    )
    print(f"  Offsets: ~{offsets.nbytes // (1024 * 1024)} MB, coords: memory-mapped")
    return bboxes, offsets, coords_mmap


# Grid cell size (m) for the parcel spatial index. The median parcel is ~25 m
# and the 99th percentile ~540 m, so almost every parcel fits inside a single
# 1 km cell; the ~0.4% larger than a cell go to an overflow list tested on every
# query.
_GRID_CELL_SIZE = 1000.0


class InspireIndex:
    """Uniform-grid spatial index over INSPIRE parcel bounding boxes.

    The per-OA candidate lookup used to linear-scan all ~24M bboxes (O(N) per
    OA, ~4 h total over the country). This indexes parcels by grid cell so each
    lookup is O(cells_spanned + candidates). Parcels no larger than one cell are
    bucketed by their bbox min-corner cell in a CSR layout (parcel indices sorted
    by cell id, located with ``searchsorted``); the few parcels larger than a
    cell are kept in an overflow array tested directly on every query. An exact
    bbox test then runs on the gathered subset and the result is sorted, so the
    candidate set -- and its order -- is byte-for-byte identical to the old scan.
    """

    def __init__(
        self,
        bboxes: np.ndarray,
        offsets: np.ndarray,
        coords_mmap: np.memmap,
        cell_size: float = _GRID_CELL_SIZE,
    ) -> None:
        self._bboxes = bboxes
        self._offsets = offsets
        self._coords = coords_mmap
        self._cell_size = cell_size
        self._origin_x = float(bboxes[:, 0].min())
        self._origin_y = float(bboxes[:, 1].min())
        # Flattened cell id is ``cx * _ny + cy``; +2 leaves a guard row so the
        # query's one-cell low-edge widening can never collide with cx-1.
        self._ny = int((bboxes[:, 1].max() - self._origin_y) // cell_size) + 2

        width = bboxes[:, 2] - bboxes[:, 0]
        height = bboxes[:, 3] - bboxes[:, 1]
        small = np.where((width <= cell_size) & (height <= cell_size))[0]
        self._oversized = np.where((width > cell_size) | (height > cell_size))[0]
        self._oversized_bb = bboxes[self._oversized]

        cx = ((bboxes[small, 0] - self._origin_x) // cell_size).astype(np.int64)
        cy = ((bboxes[small, 1] - self._origin_y) // cell_size).astype(np.int64)
        cell_id = cx * self._ny + cy
        order = np.argsort(cell_id, kind="stable")
        self._sorted_cells = cell_id[order]
        self._cell_parcels = small[order]

    def candidate_indices(self, oa_bounds: tuple[float, float, float, float]) -> np.ndarray:
        """Parcel indices whose bbox overlaps ``oa_bounds`` (ascending order)."""
        min_e, min_n, max_e, max_n = oa_bounds
        cs = self._cell_size
        # A small parcel (<= one cell) overlapping the OA has its min-corner no
        # more than one cell below/left of the OA bbox, so widen the low edges by
        # a cell. This keeps the lookup free of false negatives.
        gx0 = int((min_e - cs - self._origin_x) // cs)
        gx1 = int((max_e - self._origin_x) // cs)
        gy_lo = int((min_n - cs - self._origin_y) // cs)
        gy_hi = int((max_n - self._origin_y) // cs)

        parts = []
        ob = self._oversized_bb
        if len(ob):
            mo = (
                (ob[:, 2] >= min_e)
                & (ob[:, 0] <= max_e)
                & (ob[:, 3] >= min_n)
                & (ob[:, 1] <= max_n)
            )
            if mo.any():
                parts.append(self._oversized[mo])

        for gx in range(gx0, gx1 + 1):
            base = gx * self._ny
            lo = np.searchsorted(self._sorted_cells, base + gy_lo, "left")
            hi = np.searchsorted(self._sorted_cells, base + gy_hi, "right")
            if hi > lo:
                parts.append(self._cell_parcels[lo:hi])

        if not parts:
            return np.empty(0, dtype=np.int64)
        cand = np.concatenate(parts)
        cb = self._bboxes[cand]
        mask = (
            (cb[:, 2] >= min_e)
            & (cb[:, 0] <= max_e)
            & (cb[:, 3] >= min_n)
            & (cb[:, 1] <= max_n)
        )
        # Sort so the candidate order matches the old full np.where scan exactly.
        return np.sort(cand[mask])

    def candidates(
        self, oa_bounds: tuple[float, float, float, float]
    ) -> list[Polygon]:
        """INSPIRE polygons overlapping an OA, built from the mmap on demand.

        Builds Shapely objects only for matches (typically 10-500 per OA).
        """
        candidates = []
        for i in self.candidate_indices(oa_bounds):
            byte_offset = self._offsets[i, 0]
            n_pts = self._offsets[i, 1]
            float_offset = byte_offset // 8  # float64 = 8 bytes
            coords = self._coords[float_offset : float_offset + n_pts * 2].reshape(-1, 2)
            poly = Polygon(coords)
            if not poly.is_valid:
                poly = make_valid(poly)
                if poly.geom_type == "MultiPolygon":
                    poly = max(poly.geoms, key=lambda g: g.area)
                elif poly.geom_type != "Polygon":
                    continue
            if not poly.is_empty:
                candidates.append(poly)
        return candidates


def build_inspire_index(
    bboxes: np.ndarray,
    offsets: np.ndarray,
    coords_mmap: np.memmap,
    cell_size: float = _GRID_CELL_SIZE,
) -> InspireIndex:
    """Build the grid spatial index used for per-OA candidate retrieval."""
    return InspireIndex(bboxes, offsets, coords_mmap, cell_size)