import zipfile
from pathlib import Path
from xml.etree.ElementTree import iterparse

import numpy as np
from shapely import make_valid
from shapely.geometry import Polygon
from tqdm import tqdm

_GML_NS = "{http://www.opengis.net/gml/3.2}"
_LR_NS = "{www.landregistry.gov.uk}"


def parse_inspire_zip(zip_path: Path) -> list[np.ndarray]:
    """Parse a single INSPIRE ZIP → list of Nx2 coordinate arrays (easting, northing).

    Only the first ``.gml`` member of the archive is read. For every
    ``PREDEFINED`` element the first ``posList`` found beneath it is split on
    whitespace and interpreted as consecutive coordinate pairs. Rings with
    fewer than three pairs are skipped. An empty list is returned when the
    archive contains no ``.gml`` member.
    """
    results: list[np.ndarray] = []
    # Built once: the parse loop below runs for every element in the document.
    feature_tag = f"{_LR_NS}PREDEFINED"
    pos_list_path = f".//{_GML_NS}posList"

    with zipfile.ZipFile(zip_path) as zf:
        gml_names = [n for n in zf.namelist() if n.endswith(".gml")]
        if not gml_names:
            return results

        with zf.open(gml_names[0]) as f:
            for _, elem in iterparse(f, events=("end",)):
                if elem.tag != feature_tag:
                    continue

                pos_list = elem.find(pos_list_path)
                if pos_list is not None and pos_list.text:
                    vals = pos_list.text.split()
                    n = len(vals) // 2
                    if n >= 3:
                        # Truncate to whole pairs: an odd token count would
                        # otherwise make reshape() raise ValueError.
                        coords = np.array(
                            vals[: n * 2], dtype=np.float64
                        ).reshape(n, 2)
                        results.append(coords)

                # Drop the finished feature's children so the tree does not
                # retain every parsed polygon.
                elem.clear()

    return results


def cache_inspire(inspire_dir: Path, cache_dir: Path) -> None:
    """Parse all INSPIRE ZIPs and cache as memory-mappable binary files.

    ZIPs are processed one at a time in sorted filename order, and each
    polygon's coordinates are appended to the coordinate file as soon as the
    ZIP has been parsed. The index arrays are preallocated for 25,000,000
    polygons (about 1.2 GB in total) and grown by 1.5x if that is exceeded.

    Writes three files into ``cache_dir`` (created if missing):
    - inspire_bboxes.npy: float64 array (N, 4) of [min_e, min_n, max_e, max_n]
    - inspire_offsets.npy: int64 array (N, 2) of [byte_offset, n_points]
    - inspire_coords.bin: flat binary of all float64 coordinate pairs
    """
    zip_files = sorted(inspire_dir.glob("*.zip"))
    print(
        f"Parsing {len(zip_files)} INSPIRE ZIP files (sequential, streaming to disk)..."
    )

    cache_dir.mkdir(parents=True, exist_ok=True)

    # Pre-allocate arrays for bboxes and offsets (grow if needed)
    capacity = 25_000_000
    bboxes = np.empty((capacity, 4), dtype=np.float64)
    offsets = np.empty((capacity, 2), dtype=np.int64)
    count = 0
    byte_offset = 0

    coords_path = cache_dir / "inspire_coords.bin"
    with open(coords_path, "wb") as cf:
        for zip_path in tqdm(zip_files, desc="INSPIRE ZIPs", unit="file"):
            for coords in parse_inspire_zip(zip_path):
                if count >= capacity:
                    capacity = int(capacity * 1.5)
                    # In-place resize keeps the rows already written because
                    # the arrays are C-ordered and the column count is fixed.
                    bboxes.resize((capacity, 4), refcheck=False)
                    offsets.resize((capacity, 2), refcheck=False)

                bboxes[count, 0] = coords[:, 0].min()
                bboxes[count, 1] = coords[:, 1].min()
                bboxes[count, 2] = coords[:, 0].max()
                bboxes[count, 3] = coords[:, 1].max()

                offsets[count, 0] = byte_offset
                offsets[count, 1] = len(coords)

                # ascontiguousarray is a no-op for arrays that are already
                # C-contiguous float64, avoiding an extra copy per polygon.
                raw = np.ascontiguousarray(coords, dtype=np.float64).tobytes()
                cf.write(raw)
                byte_offset += len(raw)
                count += 1

    # Trim to actual size and save
    bboxes = bboxes[:count]
    offsets = offsets[:count]

    np.save(cache_dir / "inspire_bboxes.npy", bboxes)
    np.save(cache_dir / "inspire_offsets.npy", offsets)

    size_mb = byte_offset / (1024 * 1024)
    print(f"  Cached {count:,} INSPIRE polygons (coords: {size_mb:.0f} MB)")


def inspire_cache_exists(cache_dir: Path) -> bool:
    """Return True when all three INSPIRE cache files exist in ``cache_dir``."""
    return all(
        (cache_dir / f).exists()
        for f in ("inspire_bboxes.npy", "inspire_offsets.npy", "inspire_coords.bin")
    )


def load_inspire(
    cache_dir: Path,
) -> tuple[np.ndarray, np.ndarray, np.memmap]:
    """Load INSPIRE cache → (bboxes, offsets, coords_mmap).

    ``bboxes`` and ``offsets`` are loaded fully into memory with ``np.load``.
    The coordinate file is opened read-only as a flat float64 memory map, so
    its contents are only read when sliced.
    """
    print(f"Loading INSPIRE cache from {cache_dir}...")
    bboxes = np.load(cache_dir / "inspire_bboxes.npy")
    offsets = np.load(cache_dir / "inspire_offsets.npy")
    coords_mmap = np.memmap(
        cache_dir / "inspire_coords.bin", dtype=np.float64, mode="r"
    )
    print(
        f"  Loaded {len(bboxes):,} INSPIRE polygon bboxes (~{bboxes.nbytes // (1024 * 1024)} MB)"
    )
    print(f"  Offsets: ~{offsets.nbytes // (1024 * 1024)} MB, coords: memory-mapped")
    return bboxes, offsets, coords_mmap


def _polygonal_parts(geom) -> list[Polygon]:
    """Return every Polygon contained in ``geom``, descending into collections."""
    if geom.geom_type == "Polygon":
        return [geom]
    parts: list[Polygon] = []
    # Multi-part geometries and collections expose ``geoms``; others do not.
    for part in getattr(geom, "geoms", ()):
        parts.extend(_polygonal_parts(part))
    return parts


def get_inspire_candidates(
    oa_bounds: tuple[float, float, float, float],
    bboxes: np.ndarray,
    offsets: np.ndarray,
    coords_mmap: np.memmap,
) -> list[Polygon]:
    """Get INSPIRE polygons overlapping an OA via bbox pre-filter.

    Builds Shapely objects only for polygons whose bounding box intersects
    ``oa_bounds`` (given as ``(min_e, min_n, max_e, max_n)``; touching boxes
    count as overlapping). Coordinate data is read on demand from the
    memory-mapped file.

    Invalid polygons are repaired with ``make_valid``. When the repair yields
    several parts, the largest polygonal part by area is kept. Geometries
    with no polygonal part, and empty polygons, are skipped.
    """
    min_e, min_n, max_e, max_n = oa_bounds

    # Vectorized bbox overlap test
    mask = (
        (bboxes[:, 2] >= min_e)
        & (bboxes[:, 0] <= max_e)
        & (bboxes[:, 3] >= min_n)
        & (bboxes[:, 1] <= max_n)
    )
    idxs = np.flatnonzero(mask)
    if idxs.size == 0:
        return []

    # Build Shapely polygons only for candidates (coords from mmap)
    candidates = []
    for i in idxs:
        byte_offset = offsets[i, 0]
        n_pts = offsets[i, 1]
        float_offset = byte_offset // 8  # float64 = 8 bytes
        coords = coords_mmap[float_offset : float_offset + n_pts * 2].reshape(-1, 2)

        poly = Polygon(coords)
        if not poly.is_valid:
            poly = make_valid(poly)

        # make_valid may return a MultiPolygon or a GeometryCollection mixing
        # polygons with stray line work; keep the largest polygonal part
        # rather than discarding the whole parcel.
        parts = _polygonal_parts(poly)
        if not parts:
            continue
        poly = max(parts, key=lambda g: g.area)

        if not poly.is_empty:
            candidates.append(poly)

    return candidates