perfect-postcode/pipeline/transform/postcode_boundaries/inspire.py

155 lines
5.6 KiB
Python

import zipfile
from pathlib import Path
from xml.etree.ElementTree import iterparse
import numpy as np
from shapely import make_valid
from shapely.geometry import Polygon
from tqdm import tqdm
# XML namespace prefixes in ElementTree's "{uri}" form; they are prepended to
# local tag names (e.g. "posList", "PREDEFINED") when matching parsed elements.
_GML_NS = "{http://www.opengis.net/gml/3.2}"
_LR_NS = "{www.landregistry.gov.uk}"
def parse_inspire_zip(zip_path: Path) -> list[np.ndarray]:
    """Parse a single INSPIRE ZIP into a list of Nx2 coordinate arrays.

    Only the first archive member whose name ends in ``.gml`` is read. The
    file is streamed with ``iterparse``; for every ``PREDEFINED`` element the
    first ``posList`` descendant is split on whitespace and interpreted as
    consecutive (easting, northing) pairs.

    Args:
        zip_path: Path to the INSPIRE ZIP archive.

    Returns:
        One float64 array of shape (N, 2) per parsed ring. Rings with fewer
        than three points, or with an odd number of coordinate tokens (a
        dangling value means the pair structure cannot be trusted), are
        skipped. An archive without a ``.gml`` member yields an empty list.

    Raises:
        ValueError: If a coordinate token is not a valid float.
    """
    results: list[np.ndarray] = []
    # Build the qualified names once instead of once per parsed element.
    predefined_tag = f"{_LR_NS}PREDEFINED"
    pos_list_path = f".//{_GML_NS}posList"

    with zipfile.ZipFile(zip_path) as zf:
        gml_names = [n for n in zf.namelist() if n.endswith(".gml")]
        if not gml_names:
            return results
        with zf.open(gml_names[0]) as f:
            for _, elem in iterparse(f, events=("end",)):
                if elem.tag != predefined_tag:
                    continue
                pos_list = elem.find(pos_list_path)
                if pos_list is not None and pos_list.text:
                    vals = pos_list.text.split()
                    n, dangling = divmod(len(vals), 2)
                    # An odd token count would make reshape() raise and abort
                    # the whole run; treat it like any other unusable ring.
                    if n >= 3 and not dangling:
                        coords = np.array(vals, dtype=np.float64).reshape(n, 2)
                        results.append(coords)
                # Drop the element's children/text now that it has been read,
                # so the parsed tree does not keep every geometry in memory.
                elem.clear()
    return results
def cache_inspire(inspire_dir: Path, cache_dir: Path) -> None:
    """Build the on-disk INSPIRE polygon cache from a directory of ZIP archives.

    Archives matching ``*.zip`` in ``inspire_dir`` are processed one at a time
    in sorted order, and each polygon's coordinates are appended to the
    coordinate file as soon as the archive has been parsed.

    Three files are written into ``cache_dir`` (created if missing):
    - inspire_bboxes.npy: float64 array (N, 4) of [min_e, min_n, max_e, max_n]
    - inspire_offsets.npy: int64 array (N, 2) of [byte_offset, n_points]
    - inspire_coords.bin: flat binary of all float64 coordinate pairs
    """
    zip_files = sorted(inspire_dir.glob("*.zip"))
    print(
        f"Parsing {len(zip_files)} INSPIRE ZIP files (sequential, streaming to disk)..."
    )
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Index arrays start with a fixed number of rows and are enlarged by 50%
    # whenever they fill up.
    slots = 25_000_000
    bboxes = np.empty((slots, 4), dtype=np.float64)
    offsets = np.empty((slots, 2), dtype=np.int64)
    count = 0
    written = 0  # bytes appended to the coordinate file so far

    with open(cache_dir / "inspire_coords.bin", "wb") as sink:
        for archive in tqdm(zip_files, desc="INSPIRE ZIPs", unit="file"):
            for ring in parse_inspire_zip(archive):
                if count >= slots:
                    slots = int(slots * 1.5)
                    bboxes.resize((slots, 4), refcheck=False)
                    offsets.resize((slots, 2), refcheck=False)

                # Column-wise extremes give (min_e, min_n) and (max_e, max_n).
                bboxes[count, :2] = ring.min(axis=0)
                bboxes[count, 2:] = ring.max(axis=0)
                offsets[count] = (written, len(ring))

                payload = ring.astype(np.float64).tobytes()
                sink.write(payload)
                written += len(payload)
                count += 1

    # Only the filled rows are persisted.
    np.save(cache_dir / "inspire_bboxes.npy", bboxes[:count])
    np.save(cache_dir / "inspire_offsets.npy", offsets[:count])

    size_mb = written / (1024 * 1024)
    print(f" Cached {count:,} INSPIRE polygons (coords: {size_mb:.0f} MB)")
def inspire_cache_exists(cache_dir: Path) -> bool:
    """Report whether all three INSPIRE cache files are present in ``cache_dir``."""
    required = ("inspire_bboxes.npy", "inspire_offsets.npy", "inspire_coords.bin")
    for filename in required:
        if not (cache_dir / filename).exists():
            return False
    return True
def load_inspire(
    cache_dir: Path,
) -> tuple[np.ndarray, np.ndarray, np.memmap]:
    """Open the INSPIRE cache in ``cache_dir`` and return its three parts.

    The bbox and offset arrays are loaded with ``np.load``; the coordinate
    file is opened as a read-only, flat float64 memory map, so its contents
    are only read when sliced.

    Returns:
        ``(bboxes, offsets, coords_mmap)``.
    """
    print(f"Loading INSPIRE cache from {cache_dir}...")

    bbox_file = cache_dir / "inspire_bboxes.npy"
    offset_file = cache_dir / "inspire_offsets.npy"
    coord_file = cache_dir / "inspire_coords.bin"

    bboxes = np.load(bbox_file)
    offsets = np.load(offset_file)
    coords_mmap = np.memmap(coord_file, dtype=np.float64, mode="r")

    print(
        f" Loaded {len(bboxes):,} INSPIRE polygon bboxes (~{bboxes.nbytes // (1024 * 1024)} MB)"
    )
    print(f" Offsets: ~{offsets.nbytes // (1024 * 1024)} MB, coords: memory-mapped")

    return (bboxes, offsets, coords_mmap)
def _largest_polygon(geom):
    """Return the largest non-empty Polygon contained in ``geom``, or None.

    A Polygon is returned as-is; a MultiPolygon or GeometryCollection is
    searched recursively; any other geometry type has no polygonal part.
    """
    if geom.geom_type == "Polygon":
        return None if geom.is_empty else geom
    if geom.geom_type in ("MultiPolygon", "GeometryCollection"):
        parts = [p for p in map(_largest_polygon, geom.geoms) if p is not None]
        return max(parts, key=lambda g: g.area, default=None)
    return None


def get_inspire_candidates(
    oa_bounds: tuple[float, float, float, float],
    bboxes: np.ndarray,
    offsets: np.ndarray,
    coords_mmap: np.memmap,
) -> list[Polygon]:
    """Get INSPIRE polygons whose bounding box overlaps an OA's bounds.

    The overlap test is vectorized over all cached bboxes; Shapely objects are
    built only for the rows that pass it, reading their coordinates on demand
    from ``coords_mmap``.

    Args:
        oa_bounds: ``(min_e, min_n, max_e, max_n)`` of the OA.
        bboxes: (N, 4) array of ``[min_e, min_n, max_e, max_n]`` per polygon.
        offsets: (N, 2) array of ``[byte_offset, n_points]`` per polygon,
            where ``byte_offset`` is the position in the coordinate file.
        coords_mmap: Flat float64 view of the coordinate file.

    Returns:
        One valid, non-empty Polygon per matching row. Invalid rings are
        repaired with ``make_valid``; when the repair yields several pieces
        (a MultiPolygon, or a GeometryCollection mixing polygons with lines
        or points) the largest polygon by area is kept. Rows whose geometry
        has no polygonal part are omitted.
    """
    min_e, min_n, max_e, max_n = oa_bounds

    # Vectorized bbox overlap test (touching boxes count as overlapping).
    mask = (
        (bboxes[:, 2] >= min_e)
        & (bboxes[:, 0] <= max_e)
        & (bboxes[:, 3] >= min_n)
        & (bboxes[:, 1] <= max_n)
    )
    idxs = np.flatnonzero(mask)
    if idxs.size == 0:
        return []

    # Build Shapely polygons only for candidates (coords from mmap).
    candidates = []
    for i in idxs:
        byte_offset, n_pts = offsets[i]
        float_offset = byte_offset // 8  # float64 = 8 bytes
        coords = coords_mmap[float_offset : float_offset + n_pts * 2].reshape(-1, 2)
        poly = Polygon(coords)
        if not poly.is_valid:
            poly = make_valid(poly)
        # make_valid may return a collection; keep only its largest polygon
        # rather than dropping the parcel because of stray lines/points.
        poly = _largest_polygon(poly)
        if poly is not None:
            candidates.append(poly)
    return candidates