241 lines
9.5 KiB
Python
241 lines
9.5 KiB
Python
import zipfile
|
|
from pathlib import Path
|
|
from xml.etree.ElementTree import iterparse
|
|
|
|
import numpy as np
|
|
from shapely import make_valid
|
|
from shapely.geometry import Polygon
|
|
from tqdm import tqdm
|
|
|
|
_GML_NS = "{http://www.opengis.net/gml/3.2}"
|
|
_LR_NS = "{www.landregistry.gov.uk}"
|
|
|
|
|
|
def parse_inspire_zip(zip_path: Path) -> list[np.ndarray]:
|
|
"""Parse a single INSPIRE ZIP → list of Nx2 coordinate arrays (easting, northing)."""
|
|
results = []
|
|
with zipfile.ZipFile(zip_path) as zf:
|
|
gml_names = [n for n in zf.namelist() if n.endswith(".gml")]
|
|
if not gml_names:
|
|
return results
|
|
with zf.open(gml_names[0]) as f:
|
|
for event, elem in iterparse(f, events=("end",)):
|
|
if elem.tag != f"{_LR_NS}PREDEFINED":
|
|
continue
|
|
pos_list = elem.find(f".//{_GML_NS}posList")
|
|
if pos_list is not None and pos_list.text:
|
|
vals = pos_list.text.split()
|
|
n = len(vals) // 2
|
|
if n >= 3:
|
|
coords = np.array(vals, dtype=np.float64).reshape(n, 2)
|
|
results.append(coords)
|
|
elem.clear()
|
|
return results
|
|
|
|
|
|
def cache_inspire(inspire_dir: Path, cache_dir: Path) -> None:
|
|
"""Parse all INSPIRE ZIPs and cache as memory-mappable binary files.
|
|
|
|
Processes ZIPs sequentially to keep memory under control (~2GB peak).
|
|
Each ZIP's polygons are streamed to disk immediately after parsing.
|
|
|
|
Writes three files:
|
|
- inspire_bboxes.npy: float64 array (N, 4) of [min_e, min_n, max_e, max_n]
|
|
- inspire_offsets.npy: int64 array (N, 2) of [byte_offset, n_points]
|
|
- inspire_coords.bin: flat binary of all float64 coordinate pairs
|
|
"""
|
|
zip_files = sorted(inspire_dir.glob("*.zip"))
|
|
print(
|
|
f"Parsing {len(zip_files)} INSPIRE ZIP files (sequential, streaming to disk)..."
|
|
)
|
|
cache_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Pre-allocate arrays for bboxes and offsets (grow if needed)
|
|
capacity = 25_000_000
|
|
bboxes = np.empty((capacity, 4), dtype=np.float64)
|
|
offsets = np.empty((capacity, 2), dtype=np.int64)
|
|
count = 0
|
|
byte_offset = 0
|
|
|
|
coords_path = cache_dir / "inspire_coords.bin"
|
|
with open(coords_path, "wb") as cf:
|
|
for zip_path in tqdm(zip_files, desc="INSPIRE ZIPs", unit="file"):
|
|
for coords in parse_inspire_zip(zip_path):
|
|
if count >= capacity:
|
|
capacity = int(capacity * 1.5)
|
|
bboxes.resize((capacity, 4), refcheck=False)
|
|
offsets.resize((capacity, 2), refcheck=False)
|
|
bboxes[count, 0] = coords[:, 0].min()
|
|
bboxes[count, 1] = coords[:, 1].min()
|
|
bboxes[count, 2] = coords[:, 0].max()
|
|
bboxes[count, 3] = coords[:, 1].max()
|
|
offsets[count, 0] = byte_offset
|
|
offsets[count, 1] = len(coords)
|
|
raw = coords.astype(np.float64).tobytes()
|
|
cf.write(raw)
|
|
byte_offset += len(raw)
|
|
count += 1
|
|
|
|
# Trim to actual size and save
|
|
bboxes = bboxes[:count]
|
|
offsets = offsets[:count]
|
|
np.save(cache_dir / "inspire_bboxes.npy", bboxes)
|
|
np.save(cache_dir / "inspire_offsets.npy", offsets)
|
|
size_mb = byte_offset / (1024 * 1024)
|
|
print(f" Cached {count:,} INSPIRE polygons (coords: {size_mb:.0f} MB)")
|
|
|
|
|
|
def inspire_cache_exists(cache_dir: Path) -> bool:
|
|
return all(
|
|
(cache_dir / f).exists()
|
|
for f in ("inspire_bboxes.npy", "inspire_offsets.npy", "inspire_coords.bin")
|
|
)
|
|
|
|
|
|
def load_inspire(
|
|
cache_dir: Path,
|
|
) -> tuple[np.ndarray, np.ndarray, np.memmap]:
|
|
"""Load INSPIRE cache → (bboxes, offsets, coords_mmap).
|
|
|
|
Memory usage: ~1.1GB (bboxes ~777MB + offsets ~290MB, coords memory-mapped).
|
|
"""
|
|
print(f"Loading INSPIRE cache from {cache_dir}...")
|
|
bboxes = np.load(cache_dir / "inspire_bboxes.npy")
|
|
offsets = np.load(cache_dir / "inspire_offsets.npy")
|
|
coords_mmap = np.memmap(
|
|
cache_dir / "inspire_coords.bin", dtype=np.float64, mode="r"
|
|
)
|
|
print(
|
|
f" Loaded {len(bboxes):,} INSPIRE polygon bboxes (~{bboxes.nbytes // (1024 * 1024)} MB)"
|
|
)
|
|
print(f" Offsets: ~{offsets.nbytes // (1024 * 1024)} MB, coords: memory-mapped")
|
|
return bboxes, offsets, coords_mmap
|
|
|
|
|
|
# Grid cell size (m) for the parcel spatial index. The median parcel is ~25 m
|
|
# and the 99th percentile ~540 m, so almost every parcel fits inside a single
|
|
# 1 km cell; the ~0.4% larger than a cell go to an overflow list tested on every
|
|
# query.
|
|
_GRID_CELL_SIZE = 1000.0
|
|
|
|
|
|
class InspireIndex:
|
|
"""Uniform-grid spatial index over INSPIRE parcel bounding boxes.
|
|
|
|
The per-OA candidate lookup used to linear-scan all ~24M bboxes (O(N) per
|
|
OA, ~4 h total over the country). This indexes parcels by grid cell so each
|
|
lookup is O(cells_spanned + candidates). Parcels no larger than one cell are
|
|
bucketed by their bbox min-corner cell in a CSR layout (parcel indices sorted
|
|
by cell id, located with ``searchsorted``); the few parcels larger than a
|
|
cell are kept in an overflow array tested directly on every query. An exact
|
|
bbox test then runs on the gathered subset and the result is sorted, so the
|
|
candidate set -- and its order -- is byte-for-byte identical to the old scan.
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
bboxes: np.ndarray,
|
|
offsets: np.ndarray,
|
|
coords_mmap: np.memmap,
|
|
cell_size: float = _GRID_CELL_SIZE,
|
|
) -> None:
|
|
self._bboxes = bboxes
|
|
self._offsets = offsets
|
|
self._coords = coords_mmap
|
|
self._cell_size = cell_size
|
|
self._origin_x = float(bboxes[:, 0].min())
|
|
self._origin_y = float(bboxes[:, 1].min())
|
|
# Flattened cell id is ``cx * _ny + cy``; +2 leaves a guard row so the
|
|
# query's one-cell low-edge widening can never collide with cx-1.
|
|
self._ny = int((bboxes[:, 1].max() - self._origin_y) // cell_size) + 2
|
|
|
|
width = bboxes[:, 2] - bboxes[:, 0]
|
|
height = bboxes[:, 3] - bboxes[:, 1]
|
|
small = np.where((width <= cell_size) & (height <= cell_size))[0]
|
|
self._oversized = np.where((width > cell_size) | (height > cell_size))[0]
|
|
self._oversized_bb = bboxes[self._oversized]
|
|
|
|
cx = ((bboxes[small, 0] - self._origin_x) // cell_size).astype(np.int64)
|
|
cy = ((bboxes[small, 1] - self._origin_y) // cell_size).astype(np.int64)
|
|
cell_id = cx * self._ny + cy
|
|
order = np.argsort(cell_id, kind="stable")
|
|
self._sorted_cells = cell_id[order]
|
|
self._cell_parcels = small[order]
|
|
|
|
def candidate_indices(self, oa_bounds: tuple[float, float, float, float]) -> np.ndarray:
|
|
"""Parcel indices whose bbox overlaps ``oa_bounds`` (ascending order)."""
|
|
min_e, min_n, max_e, max_n = oa_bounds
|
|
cs = self._cell_size
|
|
# A small parcel (<= one cell) overlapping the OA has its min-corner no
|
|
# more than one cell below/left of the OA bbox, so widen the low edges by
|
|
# a cell. This keeps the lookup free of false negatives.
|
|
gx0 = int((min_e - cs - self._origin_x) // cs)
|
|
gx1 = int((max_e - self._origin_x) // cs)
|
|
gy_lo = int((min_n - cs - self._origin_y) // cs)
|
|
gy_hi = int((max_n - self._origin_y) // cs)
|
|
|
|
parts = []
|
|
ob = self._oversized_bb
|
|
if len(ob):
|
|
mo = (
|
|
(ob[:, 2] >= min_e)
|
|
& (ob[:, 0] <= max_e)
|
|
& (ob[:, 3] >= min_n)
|
|
& (ob[:, 1] <= max_n)
|
|
)
|
|
if mo.any():
|
|
parts.append(self._oversized[mo])
|
|
|
|
for gx in range(gx0, gx1 + 1):
|
|
base = gx * self._ny
|
|
lo = np.searchsorted(self._sorted_cells, base + gy_lo, "left")
|
|
hi = np.searchsorted(self._sorted_cells, base + gy_hi, "right")
|
|
if hi > lo:
|
|
parts.append(self._cell_parcels[lo:hi])
|
|
|
|
if not parts:
|
|
return np.empty(0, dtype=np.int64)
|
|
cand = np.concatenate(parts)
|
|
cb = self._bboxes[cand]
|
|
mask = (
|
|
(cb[:, 2] >= min_e)
|
|
& (cb[:, 0] <= max_e)
|
|
& (cb[:, 3] >= min_n)
|
|
& (cb[:, 1] <= max_n)
|
|
)
|
|
# Sort so the candidate order matches the old full np.where scan exactly.
|
|
return np.sort(cand[mask])
|
|
|
|
def candidates(
|
|
self, oa_bounds: tuple[float, float, float, float]
|
|
) -> list[Polygon]:
|
|
"""INSPIRE polygons overlapping an OA, built from the mmap on demand.
|
|
|
|
Builds Shapely objects only for matches (typically 10-500 per OA).
|
|
"""
|
|
candidates = []
|
|
for i in self.candidate_indices(oa_bounds):
|
|
byte_offset = self._offsets[i, 0]
|
|
n_pts = self._offsets[i, 1]
|
|
float_offset = byte_offset // 8 # float64 = 8 bytes
|
|
coords = self._coords[float_offset : float_offset + n_pts * 2].reshape(-1, 2)
|
|
poly = Polygon(coords)
|
|
if not poly.is_valid:
|
|
poly = make_valid(poly)
|
|
if poly.geom_type == "MultiPolygon":
|
|
poly = max(poly.geoms, key=lambda g: g.area)
|
|
elif poly.geom_type != "Polygon":
|
|
continue
|
|
if not poly.is_empty:
|
|
candidates.append(poly)
|
|
return candidates
|
|
|
|
|
|
def build_inspire_index(
|
|
bboxes: np.ndarray,
|
|
offsets: np.ndarray,
|
|
coords_mmap: np.memmap,
|
|
cell_size: float = _GRID_CELL_SIZE,
|
|
) -> InspireIndex:
|
|
"""Build the grid spatial index used for per-OA candidate retrieval."""
|
|
return InspireIndex(bboxes, offsets, coords_mmap, cell_size)
|