import zipfile from pathlib import Path from xml.etree.ElementTree import iterparse import numpy as np from shapely import make_valid from shapely.geometry import Polygon from tqdm import tqdm _GML_NS = "{http://www.opengis.net/gml/3.2}" _LR_NS = "{www.landregistry.gov.uk}" def parse_inspire_zip(zip_path: Path) -> list[np.ndarray]: """Parse a single INSPIRE ZIP → list of Nx2 coordinate arrays (easting, northing).""" results = [] with zipfile.ZipFile(zip_path) as zf: gml_names = [n for n in zf.namelist() if n.endswith(".gml")] if not gml_names: return results with zf.open(gml_names[0]) as f: for event, elem in iterparse(f, events=("end",)): if elem.tag != f"{_LR_NS}PREDEFINED": continue pos_list = elem.find(f".//{_GML_NS}posList") if pos_list is not None and pos_list.text: vals = pos_list.text.split() n = len(vals) // 2 if n >= 3: coords = np.array(vals, dtype=np.float64).reshape(n, 2) results.append(coords) elem.clear() return results def cache_inspire(inspire_dir: Path, cache_dir: Path) -> None: """Parse all INSPIRE ZIPs and cache as memory-mappable binary files. Processes ZIPs sequentially to keep memory under control (~2GB peak). Each ZIP's polygons are streamed to disk immediately after parsing. Writes three files: - inspire_bboxes.npy: float64 array (N, 4) of [min_e, min_n, max_e, max_n] - inspire_offsets.npy: int64 array (N, 2) of [byte_offset, n_points] - inspire_coords.bin: flat binary of all float64 coordinate pairs """ zip_files = sorted(inspire_dir.glob("*.zip")) print( f"Parsing {len(zip_files)} INSPIRE ZIP files (sequential, streaming to disk)..." ) cache_dir.mkdir(parents=True, exist_ok=True) # Pre-allocate arrays for bboxes and offsets (grow if needed) capacity = 25_000_000 bboxes = np.empty((capacity, 4), dtype=np.float64) offsets = np.empty((capacity, 2), dtype=np.int64) count = 0 byte_offset = 0 coords_path = cache_dir / "inspire_coords.bin" with open(coords_path, "wb") as cf: for zip_path in tqdm(zip_files, desc="INSPIRE ZIPs", unit="file"): for coords in parse_inspire_zip(zip_path): if count >= capacity: capacity = int(capacity * 1.5) bboxes.resize((capacity, 4), refcheck=False) offsets.resize((capacity, 2), refcheck=False) bboxes[count, 0] = coords[:, 0].min() bboxes[count, 1] = coords[:, 1].min() bboxes[count, 2] = coords[:, 0].max() bboxes[count, 3] = coords[:, 1].max() offsets[count, 0] = byte_offset offsets[count, 1] = len(coords) raw = coords.astype(np.float64).tobytes() cf.write(raw) byte_offset += len(raw) count += 1 # Trim to actual size and save bboxes = bboxes[:count] offsets = offsets[:count] np.save(cache_dir / "inspire_bboxes.npy", bboxes) np.save(cache_dir / "inspire_offsets.npy", offsets) size_mb = byte_offset / (1024 * 1024) print(f" Cached {count:,} INSPIRE polygons (coords: {size_mb:.0f} MB)") def inspire_cache_exists(cache_dir: Path) -> bool: return all( (cache_dir / f).exists() for f in ("inspire_bboxes.npy", "inspire_offsets.npy", "inspire_coords.bin") ) def load_inspire( cache_dir: Path, ) -> tuple[np.ndarray, np.ndarray, np.memmap]: """Load INSPIRE cache → (bboxes, offsets, coords_mmap). Memory usage: ~1.1GB (bboxes ~777MB + offsets ~290MB, coords memory-mapped). """ print(f"Loading INSPIRE cache from {cache_dir}...") bboxes = np.load(cache_dir / "inspire_bboxes.npy") offsets = np.load(cache_dir / "inspire_offsets.npy") coords_mmap = np.memmap( cache_dir / "inspire_coords.bin", dtype=np.float64, mode="r" ) print( f" Loaded {len(bboxes):,} INSPIRE polygon bboxes (~{bboxes.nbytes // (1024 * 1024)} MB)" ) print(f" Offsets: ~{offsets.nbytes // (1024 * 1024)} MB, coords: memory-mapped") return bboxes, offsets, coords_mmap # Grid cell size (m) for the parcel spatial index. The median parcel is ~25 m # and the 99th percentile ~540 m, so almost every parcel fits inside a single # 1 km cell; the ~0.4% larger than a cell go to an overflow list tested on every # query. _GRID_CELL_SIZE = 1000.0 class InspireIndex: """Uniform-grid spatial index over INSPIRE parcel bounding boxes. The per-OA candidate lookup used to linear-scan all ~24M bboxes (O(N) per OA, ~4 h total over the country). This indexes parcels by grid cell so each lookup is O(cells_spanned + candidates). Parcels no larger than one cell are bucketed by their bbox min-corner cell in a CSR layout (parcel indices sorted by cell id, located with ``searchsorted``); the few parcels larger than a cell are kept in an overflow array tested directly on every query. An exact bbox test then runs on the gathered subset and the result is sorted, so the candidate set -- and its order -- is byte-for-byte identical to the old scan. """ def __init__( self, bboxes: np.ndarray, offsets: np.ndarray, coords_mmap: np.memmap, cell_size: float = _GRID_CELL_SIZE, ) -> None: self._bboxes = bboxes self._offsets = offsets self._coords = coords_mmap self._cell_size = cell_size self._origin_x = float(bboxes[:, 0].min()) self._origin_y = float(bboxes[:, 1].min()) # Flattened cell id is ``cx * _ny + cy``; +2 leaves a guard row so the # query's one-cell low-edge widening can never collide with cx-1. self._ny = int((bboxes[:, 1].max() - self._origin_y) // cell_size) + 2 width = bboxes[:, 2] - bboxes[:, 0] height = bboxes[:, 3] - bboxes[:, 1] small = np.where((width <= cell_size) & (height <= cell_size))[0] self._oversized = np.where((width > cell_size) | (height > cell_size))[0] self._oversized_bb = bboxes[self._oversized] cx = ((bboxes[small, 0] - self._origin_x) // cell_size).astype(np.int64) cy = ((bboxes[small, 1] - self._origin_y) // cell_size).astype(np.int64) cell_id = cx * self._ny + cy order = np.argsort(cell_id, kind="stable") self._sorted_cells = cell_id[order] self._cell_parcels = small[order] def candidate_indices(self, oa_bounds: tuple[float, float, float, float]) -> np.ndarray: """Parcel indices whose bbox overlaps ``oa_bounds`` (ascending order).""" min_e, min_n, max_e, max_n = oa_bounds cs = self._cell_size # A small parcel (<= one cell) overlapping the OA has its min-corner no # more than one cell below/left of the OA bbox, so widen the low edges by # a cell. This keeps the lookup free of false negatives. gx0 = int((min_e - cs - self._origin_x) // cs) gx1 = int((max_e - self._origin_x) // cs) gy_lo = int((min_n - cs - self._origin_y) // cs) gy_hi = int((max_n - self._origin_y) // cs) parts = [] ob = self._oversized_bb if len(ob): mo = ( (ob[:, 2] >= min_e) & (ob[:, 0] <= max_e) & (ob[:, 3] >= min_n) & (ob[:, 1] <= max_n) ) if mo.any(): parts.append(self._oversized[mo]) for gx in range(gx0, gx1 + 1): base = gx * self._ny lo = np.searchsorted(self._sorted_cells, base + gy_lo, "left") hi = np.searchsorted(self._sorted_cells, base + gy_hi, "right") if hi > lo: parts.append(self._cell_parcels[lo:hi]) if not parts: return np.empty(0, dtype=np.int64) cand = np.concatenate(parts) cb = self._bboxes[cand] mask = ( (cb[:, 2] >= min_e) & (cb[:, 0] <= max_e) & (cb[:, 3] >= min_n) & (cb[:, 1] <= max_n) ) # Sort so the candidate order matches the old full np.where scan exactly. return np.sort(cand[mask]) def candidates( self, oa_bounds: tuple[float, float, float, float] ) -> list[Polygon]: """INSPIRE polygons overlapping an OA, built from the mmap on demand. Builds Shapely objects only for matches (typically 10-500 per OA). """ candidates = [] for i in self.candidate_indices(oa_bounds): byte_offset = self._offsets[i, 0] n_pts = self._offsets[i, 1] float_offset = byte_offset // 8 # float64 = 8 bytes coords = self._coords[float_offset : float_offset + n_pts * 2].reshape(-1, 2) poly = Polygon(coords) if not poly.is_valid: poly = make_valid(poly) if poly.geom_type == "MultiPolygon": poly = max(poly.geoms, key=lambda g: g.area) elif poly.geom_type != "Polygon": continue if not poly.is_empty: candidates.append(poly) return candidates def build_inspire_index( bboxes: np.ndarray, offsets: np.ndarray, coords_mmap: np.memmap, cell_size: float = _GRID_CELL_SIZE, ) -> InspireIndex: """Build the grid spatial index used for per-OA candidate retrieval.""" return InspireIndex(bboxes, offsets, coords_mmap, cell_size)