155 lines
5.6 KiB
Python
155 lines
5.6 KiB
Python
import zipfile
|
|
from pathlib import Path
|
|
from xml.etree.ElementTree import iterparse
|
|
|
|
import numpy as np
|
|
from shapely import make_valid
|
|
from shapely.geometry import Polygon
|
|
from tqdm import tqdm
|
|
|
|
# XML namespace of GML 3.2 in ElementTree's "{uri}" form; prepended to a local
# tag name (e.g. "posList") when searching parsed elements.
_GML_NS = "{http://www.opengis.net/gml/3.2}"
|
|
# XML namespace used for the "PREDEFINED" feature elements, in the same
# "{uri}" form (the URI is the landregistry.gov.uk one shown below).
_LR_NS = "{www.landregistry.gov.uk}"
|
|
|
|
|
|
def parse_inspire_zip(zip_path: Path) -> list[np.ndarray]:
    """Parse a single INSPIRE ZIP into a list of Nx2 coordinate arrays.

    Only the first ``*.gml`` member of the archive is read. The GML is
    streamed with ``iterparse``; for every ``PREDEFINED`` feature element the
    first ``posList`` found beneath it is split on whitespace and interpreted
    as consecutive (easting, northing) pairs.

    Args:
        zip_path: Path to the ZIP archive.

    Returns:
        One float64 array of shape (n, 2) per feature whose posList holds at
        least three coordinate pairs. An archive with no ``.gml`` member
        yields an empty list.

    Raises:
        ValueError: If a posList token cannot be converted to float.
    """
    polygons: list[np.ndarray] = []
    # Build the qualified names once instead of on every parse event.
    feature_tag = f"{_LR_NS}PREDEFINED"
    pos_list_path = f".//{_GML_NS}posList"

    with zipfile.ZipFile(zip_path) as zf:
        gml_names = [name for name in zf.namelist() if name.endswith(".gml")]
        if not gml_names:
            return polygons

        with zf.open(gml_names[0]) as f:
            for _, elem in iterparse(f, events=("end",)):
                if elem.tag != feature_tag:
                    continue

                pos_list = elem.find(pos_list_path)
                if pos_list is not None and pos_list.text:
                    vals = pos_list.text.split()
                    n = len(vals) // 2
                    if n >= 3:
                        # Drop a dangling odd token so the reshape cannot
                        # fail on a posList with an odd number of values.
                        coords = np.array(
                            vals[: 2 * n], dtype=np.float64
                        ).reshape(n, 2)
                        polygons.append(coords)

                # Release the finished feature's subtree to bound memory.
                elem.clear()

    return polygons
|
|
|
|
|
|
def cache_inspire(inspire_dir: Path, cache_dir: Path) -> None:
    """Convert every INSPIRE ZIP in a directory into a binary polygon cache.

    Archives are handled one at a time in sorted filename order. Each
    polygon's coordinates are appended to the coordinate file as soon as its
    archive has been parsed; only the per-polygon bounding boxes and offsets
    are held in memory until the end.

    Files written to ``cache_dir`` (created if missing):
      - inspire_bboxes.npy: float64 array (N, 4) of [min_e, min_n, max_e, max_n]
      - inspire_offsets.npy: int64 array (N, 2) of [byte_offset, n_points]
      - inspire_coords.bin: flat binary of all float64 coordinate pairs

    Args:
        inspire_dir: Directory searched (non-recursively) for ``*.zip`` files.
        cache_dir: Destination directory for the three cache files.
    """
    zip_files = sorted(inspire_dir.glob("*.zip"))
    print(
        f"Parsing {len(zip_files)} INSPIRE ZIP files (sequential, streaming to disk)..."
    )
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Index arrays start large and are enlarged by 50% whenever they fill up.
    capacity = 25_000_000
    bboxes = np.empty((capacity, 4), dtype=np.float64)
    offsets = np.empty((capacity, 2), dtype=np.int64)
    count = 0
    bytes_written = 0

    with open(cache_dir / "inspire_coords.bin", "wb") as coords_file:
        for archive in tqdm(zip_files, desc="INSPIRE ZIPs", unit="file"):
            for ring in parse_inspire_zip(archive):
                if count >= capacity:
                    capacity = int(capacity * 1.5)
                    bboxes.resize((capacity, 4), refcheck=False)
                    offsets.resize((capacity, 2), refcheck=False)

                # Column-wise extremes give (min_e, min_n) and (max_e, max_n).
                bboxes[count, :2] = ring.min(axis=0)
                bboxes[count, 2:] = ring.max(axis=0)

                # Record where this ring starts in the file and its length.
                offsets[count, 0] = bytes_written
                offsets[count, 1] = len(ring)

                payload = ring.astype(np.float64).tobytes()
                coords_file.write(payload)
                bytes_written += len(payload)
                count += 1

    # Persist only the rows that were actually filled.
    np.save(cache_dir / "inspire_bboxes.npy", bboxes[:count])
    np.save(cache_dir / "inspire_offsets.npy", offsets[:count])

    size_mb = bytes_written / (1024 * 1024)
    print(f" Cached {count:,} INSPIRE polygons (coords: {size_mb:.0f} MB)")
|
|
|
|
|
|
def inspire_cache_exists(cache_dir: Path) -> bool:
    """Report whether all three INSPIRE cache files are present.

    Args:
        cache_dir: Directory expected to hold the cache.

    Returns:
        True only if the bbox, offset and coordinate files all exist in
        ``cache_dir``; False as soon as one is missing. File contents are
        not inspected.
    """
    required = ("inspire_bboxes.npy", "inspire_offsets.npy", "inspire_coords.bin")
    for filename in required:
        if not (cache_dir / filename).exists():
            return False
    return True
|
|
|
|
|
|
def load_inspire(
    cache_dir: Path,
) -> tuple[np.ndarray, np.ndarray, np.memmap]:
    """Open a previously written INSPIRE cache.

    The bbox and offset arrays are read fully into memory with ``np.load``;
    the coordinate file is opened read-only as a flat float64 memory map, so
    coordinate data is only paged in when sliced. Progress and array sizes
    are printed to stdout.

    Args:
        cache_dir: Directory containing ``inspire_bboxes.npy``,
            ``inspire_offsets.npy`` and ``inspire_coords.bin``.

    Returns:
        A ``(bboxes, offsets, coords_mmap)`` tuple.
    """
    print(f"Loading INSPIRE cache from {cache_dir}...")

    bbox_file = cache_dir / "inspire_bboxes.npy"
    offset_file = cache_dir / "inspire_offsets.npy"
    coords_file = cache_dir / "inspire_coords.bin"

    bboxes = np.load(bbox_file)
    offsets = np.load(offset_file)
    coords_mmap = np.memmap(coords_file, dtype=np.float64, mode="r")

    print(
        f" Loaded {len(bboxes):,} INSPIRE polygon bboxes (~{bboxes.nbytes // (1024 * 1024)} MB)"
    )
    print(f" Offsets: ~{offsets.nbytes // (1024 * 1024)} MB, coords: memory-mapped")

    return bboxes, offsets, coords_mmap
|
|
|
|
|
|
def _iter_polygons(geom):
    """Yield every Polygon contained in *geom*, descending into collections."""
    if geom.geom_type == "Polygon":
        yield geom
    elif hasattr(geom, "geoms"):
        # MultiPolygon / GeometryCollection: visit the parts in order.
        for part in geom.geoms:
            yield from _iter_polygons(part)


def _largest_polygon(geom):
    """Return the largest-area Polygon inside *geom*, or None if there is none."""
    return max(_iter_polygons(geom), key=lambda g: g.area, default=None)


def get_inspire_candidates(
    oa_bounds: tuple[float, float, float, float],
    bboxes: np.ndarray,
    offsets: np.ndarray,
    coords_mmap: np.memmap,
) -> list[Polygon]:
    """Get INSPIRE polygons whose bounding box overlaps an OA's bounds.

    A vectorized bbox test selects candidate rows; Shapely polygons are then
    built only for those rows, reading their coordinates on demand from the
    flat float64 coordinate array. Boxes that merely touch the bounds count
    as overlapping (the comparisons are inclusive).

    Invalid polygons are repaired with ``make_valid``. The repair may return
    a MultiPolygon or a GeometryCollection mixing polygons with lines or
    points; in every case the largest polygonal part is kept. A repaired
    geometry with no polygonal part, or an empty polygon, is skipped.

    Args:
        oa_bounds: (min_e, min_n, max_e, max_n) of the query area.
        bboxes: (N, 4) array of [min_e, min_n, max_e, max_n] per polygon.
        offsets: (N, 2) array of [byte_offset, n_points] per polygon.
        coords_mmap: Flat float64 array of all coordinate pairs.

    Returns:
        The candidate polygons, in ascending row order; empty if no bounding
        box overlaps.
    """
    min_e, min_n, max_e, max_n = oa_bounds

    # Vectorized bbox overlap test
    mask = (
        (bboxes[:, 2] >= min_e)
        & (bboxes[:, 0] <= max_e)
        & (bboxes[:, 3] >= min_n)
        & (bboxes[:, 1] <= max_n)
    )
    idxs = np.where(mask)[0]
    if len(idxs) == 0:
        return []

    candidates = []
    for i in idxs:
        # Offsets are stored in bytes; the array is indexed in float64 items.
        float_offset = int(offsets[i, 0]) // 8
        n_pts = int(offsets[i, 1])
        coords = coords_mmap[float_offset : float_offset + n_pts * 2].reshape(-1, 2)

        poly = Polygon(coords)
        if not poly.is_valid:
            # Keep the polygonal part even when the repair yields a mixed
            # GeometryCollection, rather than dropping the parcel entirely.
            poly = _largest_polygon(make_valid(poly))
            if poly is None:
                continue

        if not poly.is_empty:
            candidates.append(poly)

    return candidates
|