perfect-postcode/pipeline/transform/postcode_boundaries/inspire.py
2026-06-02 13:46:18 +01:00

241 lines
9.5 KiB
Python

import zipfile
from pathlib import Path
from xml.etree.ElementTree import iterparse
import numpy as np
from shapely import make_valid
from shapely.geometry import Polygon
from tqdm import tqdm
_GML_NS = "{http://www.opengis.net/gml/3.2}"
_LR_NS = "{www.landregistry.gov.uk}"
def parse_inspire_zip(zip_path: Path) -> list[np.ndarray]:
"""Parse a single INSPIRE ZIP → list of Nx2 coordinate arrays (easting, northing)."""
results = []
with zipfile.ZipFile(zip_path) as zf:
gml_names = [n for n in zf.namelist() if n.endswith(".gml")]
if not gml_names:
return results
with zf.open(gml_names[0]) as f:
for event, elem in iterparse(f, events=("end",)):
if elem.tag != f"{_LR_NS}PREDEFINED":
continue
pos_list = elem.find(f".//{_GML_NS}posList")
if pos_list is not None and pos_list.text:
vals = pos_list.text.split()
n = len(vals) // 2
if n >= 3:
coords = np.array(vals, dtype=np.float64).reshape(n, 2)
results.append(coords)
elem.clear()
return results
def cache_inspire(inspire_dir: Path, cache_dir: Path) -> None:
"""Parse all INSPIRE ZIPs and cache as memory-mappable binary files.
Processes ZIPs sequentially to keep memory under control (~2GB peak).
Each ZIP's polygons are streamed to disk immediately after parsing.
Writes three files:
- inspire_bboxes.npy: float64 array (N, 4) of [min_e, min_n, max_e, max_n]
- inspire_offsets.npy: int64 array (N, 2) of [byte_offset, n_points]
- inspire_coords.bin: flat binary of all float64 coordinate pairs
"""
zip_files = sorted(inspire_dir.glob("*.zip"))
print(
f"Parsing {len(zip_files)} INSPIRE ZIP files (sequential, streaming to disk)..."
)
cache_dir.mkdir(parents=True, exist_ok=True)
# Pre-allocate arrays for bboxes and offsets (grow if needed)
capacity = 25_000_000
bboxes = np.empty((capacity, 4), dtype=np.float64)
offsets = np.empty((capacity, 2), dtype=np.int64)
count = 0
byte_offset = 0
coords_path = cache_dir / "inspire_coords.bin"
with open(coords_path, "wb") as cf:
for zip_path in tqdm(zip_files, desc="INSPIRE ZIPs", unit="file"):
for coords in parse_inspire_zip(zip_path):
if count >= capacity:
capacity = int(capacity * 1.5)
bboxes.resize((capacity, 4), refcheck=False)
offsets.resize((capacity, 2), refcheck=False)
bboxes[count, 0] = coords[:, 0].min()
bboxes[count, 1] = coords[:, 1].min()
bboxes[count, 2] = coords[:, 0].max()
bboxes[count, 3] = coords[:, 1].max()
offsets[count, 0] = byte_offset
offsets[count, 1] = len(coords)
raw = coords.astype(np.float64).tobytes()
cf.write(raw)
byte_offset += len(raw)
count += 1
# Trim to actual size and save
bboxes = bboxes[:count]
offsets = offsets[:count]
np.save(cache_dir / "inspire_bboxes.npy", bboxes)
np.save(cache_dir / "inspire_offsets.npy", offsets)
size_mb = byte_offset / (1024 * 1024)
print(f" Cached {count:,} INSPIRE polygons (coords: {size_mb:.0f} MB)")
def inspire_cache_exists(cache_dir: Path) -> bool:
return all(
(cache_dir / f).exists()
for f in ("inspire_bboxes.npy", "inspire_offsets.npy", "inspire_coords.bin")
)
def load_inspire(
cache_dir: Path,
) -> tuple[np.ndarray, np.ndarray, np.memmap]:
"""Load INSPIRE cache → (bboxes, offsets, coords_mmap).
Memory usage: ~1.1GB (bboxes ~777MB + offsets ~290MB, coords memory-mapped).
"""
print(f"Loading INSPIRE cache from {cache_dir}...")
bboxes = np.load(cache_dir / "inspire_bboxes.npy")
offsets = np.load(cache_dir / "inspire_offsets.npy")
coords_mmap = np.memmap(
cache_dir / "inspire_coords.bin", dtype=np.float64, mode="r"
)
print(
f" Loaded {len(bboxes):,} INSPIRE polygon bboxes (~{bboxes.nbytes // (1024 * 1024)} MB)"
)
print(f" Offsets: ~{offsets.nbytes // (1024 * 1024)} MB, coords: memory-mapped")
return bboxes, offsets, coords_mmap
# Grid cell size (m) for the parcel spatial index. The median parcel is ~25 m
# and the 99th percentile ~540 m, so almost every parcel fits inside a single
# 1 km cell; the ~0.4% larger than a cell go to an overflow list tested on every
# query.
_GRID_CELL_SIZE = 1000.0
class InspireIndex:
"""Uniform-grid spatial index over INSPIRE parcel bounding boxes.
The per-OA candidate lookup used to linear-scan all ~24M bboxes (O(N) per
OA, ~4 h total over the country). This indexes parcels by grid cell so each
lookup is O(cells_spanned + candidates). Parcels no larger than one cell are
bucketed by their bbox min-corner cell in a CSR layout (parcel indices sorted
by cell id, located with ``searchsorted``); the few parcels larger than a
cell are kept in an overflow array tested directly on every query. An exact
bbox test then runs on the gathered subset and the result is sorted, so the
candidate set -- and its order -- is byte-for-byte identical to the old scan.
"""
def __init__(
self,
bboxes: np.ndarray,
offsets: np.ndarray,
coords_mmap: np.memmap,
cell_size: float = _GRID_CELL_SIZE,
) -> None:
self._bboxes = bboxes
self._offsets = offsets
self._coords = coords_mmap
self._cell_size = cell_size
self._origin_x = float(bboxes[:, 0].min())
self._origin_y = float(bboxes[:, 1].min())
# Flattened cell id is ``cx * _ny + cy``; +2 leaves a guard row so the
# query's one-cell low-edge widening can never collide with cx-1.
self._ny = int((bboxes[:, 1].max() - self._origin_y) // cell_size) + 2
width = bboxes[:, 2] - bboxes[:, 0]
height = bboxes[:, 3] - bboxes[:, 1]
small = np.where((width <= cell_size) & (height <= cell_size))[0]
self._oversized = np.where((width > cell_size) | (height > cell_size))[0]
self._oversized_bb = bboxes[self._oversized]
cx = ((bboxes[small, 0] - self._origin_x) // cell_size).astype(np.int64)
cy = ((bboxes[small, 1] - self._origin_y) // cell_size).astype(np.int64)
cell_id = cx * self._ny + cy
order = np.argsort(cell_id, kind="stable")
self._sorted_cells = cell_id[order]
self._cell_parcels = small[order]
def candidate_indices(self, oa_bounds: tuple[float, float, float, float]) -> np.ndarray:
"""Parcel indices whose bbox overlaps ``oa_bounds`` (ascending order)."""
min_e, min_n, max_e, max_n = oa_bounds
cs = self._cell_size
# A small parcel (<= one cell) overlapping the OA has its min-corner no
# more than one cell below/left of the OA bbox, so widen the low edges by
# a cell. This keeps the lookup free of false negatives.
gx0 = int((min_e - cs - self._origin_x) // cs)
gx1 = int((max_e - self._origin_x) // cs)
gy_lo = int((min_n - cs - self._origin_y) // cs)
gy_hi = int((max_n - self._origin_y) // cs)
parts = []
ob = self._oversized_bb
if len(ob):
mo = (
(ob[:, 2] >= min_e)
& (ob[:, 0] <= max_e)
& (ob[:, 3] >= min_n)
& (ob[:, 1] <= max_n)
)
if mo.any():
parts.append(self._oversized[mo])
for gx in range(gx0, gx1 + 1):
base = gx * self._ny
lo = np.searchsorted(self._sorted_cells, base + gy_lo, "left")
hi = np.searchsorted(self._sorted_cells, base + gy_hi, "right")
if hi > lo:
parts.append(self._cell_parcels[lo:hi])
if not parts:
return np.empty(0, dtype=np.int64)
cand = np.concatenate(parts)
cb = self._bboxes[cand]
mask = (
(cb[:, 2] >= min_e)
& (cb[:, 0] <= max_e)
& (cb[:, 3] >= min_n)
& (cb[:, 1] <= max_n)
)
# Sort so the candidate order matches the old full np.where scan exactly.
return np.sort(cand[mask])
def candidates(
self, oa_bounds: tuple[float, float, float, float]
) -> list[Polygon]:
"""INSPIRE polygons overlapping an OA, built from the mmap on demand.
Builds Shapely objects only for matches (typically 10-500 per OA).
"""
candidates = []
for i in self.candidate_indices(oa_bounds):
byte_offset = self._offsets[i, 0]
n_pts = self._offsets[i, 1]
float_offset = byte_offset // 8 # float64 = 8 bytes
coords = self._coords[float_offset : float_offset + n_pts * 2].reshape(-1, 2)
poly = Polygon(coords)
if not poly.is_valid:
poly = make_valid(poly)
if poly.geom_type == "MultiPolygon":
poly = max(poly.geoms, key=lambda g: g.area)
elif poly.geom_type != "Polygon":
continue
if not poly.is_empty:
candidates.append(poly)
return candidates
def build_inspire_index(
bboxes: np.ndarray,
offsets: np.ndarray,
coords_mmap: np.memmap,
cell_size: float = _GRID_CELL_SIZE,
) -> InspireIndex:
"""Build the grid spatial index used for per-OA candidate retrieval."""
return InspireIndex(bboxes, offsets, coords_mmap, cell_size)