Add postcode boundary calculation
This commit is contained in:
parent
f9bd218a3e
commit
f5e6894c0f
14 changed files with 1384 additions and 717 deletions
155
pipeline/transform/postcode_boundaries/inspire.py
Normal file
155
pipeline/transform/postcode_boundaries/inspire.py
Normal file
|
|
@ -0,0 +1,155 @@
|
|||
import zipfile
|
||||
from pathlib import Path
|
||||
from xml.etree.ElementTree import iterparse
|
||||
|
||||
import numpy as np
|
||||
from shapely import make_valid
|
||||
from shapely.geometry import Polygon
|
||||
from tqdm import tqdm
|
||||
|
||||
# XML namespace prefixes in ElementTree's "{uri}" form, prepended to local tag names.
_GML_NS = "{http://www.opengis.net/gml/3.2}"
_LR_NS = "{www.landregistry.gov.uk}"


def parse_inspire_zip(zip_path: Path) -> list[np.ndarray]:
    """Parse a single INSPIRE ZIP → list of Nx2 coordinate arrays (easting, northing).

    Only the first ``.gml`` member of the archive (matched case-insensitively)
    is read. The GML is streamed with ``iterparse``; for every ``PREDEFINED``
    element the first ``posList`` found beneath it is converted to a float64
    array of shape (n, 2).

    Features are skipped when they have no ``posList`` text, fewer than three
    coordinate pairs, or an odd number of values (a dangling half pair), so a
    single malformed record does not abort the whole archive.

    Args:
        zip_path: Path to the ZIP archive to read.

    Returns:
        One array per accepted feature, in document order. Empty when the
        archive contains no ``.gml`` member.
    """
    results: list[np.ndarray] = []
    feature_tag = f"{_LR_NS}PREDEFINED"
    pos_list_path = f".//{_GML_NS}posList"

    with zipfile.ZipFile(zip_path) as zf:
        gml_names = [n for n in zf.namelist() if n.lower().endswith(".gml")]
        if not gml_names:
            return results

        with zf.open(gml_names[0]) as f:
            context = iterparse(f, events=("start", "end"))
            # The first event is the start of the document root; keep a handle
            # on it so processed features can be detached from the tree.
            _, root = next(context, (None, None))
            for event, elem in context:
                if event != "end" or elem.tag != feature_tag:
                    continue

                pos_list = elem.find(pos_list_path)
                if pos_list is not None and pos_list.text:
                    vals = pos_list.text.split()
                    n, dangling = divmod(len(vals), 2)
                    if n >= 3 and not dangling:
                        coords = np.array(vals, dtype=np.float64).reshape(n, 2)
                        results.append(coords)

                # Clearing only the feature would leave its emptied element and
                # wrapper attached to the root, so memory would grow with the
                # number of features. Clearing the root drops them all.
                root.clear()

    return results
|
||||
|
||||
|
||||
def cache_inspire(inspire_dir: Path, cache_dir: Path) -> None:
    """Parse all INSPIRE ZIPs and cache as memory-mappable binary files.

    ZIPs are processed one at a time, in sorted filename order, and each
    polygon's coordinates are appended to the coords file as soon as its ZIP
    has been parsed, so only one ZIP's polygons are held in memory at once.

    Writes three files into ``cache_dir``:
    - inspire_bboxes.npy: float64 array (N, 4) of [min_e, min_n, max_e, max_n]
    - inspire_offsets.npy: int64 array (N, 2) of [byte_offset, n_points]
    - inspire_coords.bin: flat binary of all float64 coordinate pairs

    Any existing cache files are removed first and the new ones are written
    under temporary ``*.tmp`` names, then renamed into place only after all
    three are complete. An interrupted run therefore never leaves a set of
    files that looks like a complete cache but mixes old and new data.

    Args:
        inspire_dir: Directory containing the INSPIRE ``*.zip`` downloads.
        cache_dir: Output directory; created if it does not exist.
    """
    zip_files = sorted(inspire_dir.glob("*.zip"))
    print(
        f"Parsing {len(zip_files)} INSPIRE ZIP files (sequential, streaming to disk)..."
    )
    cache_dir.mkdir(parents=True, exist_ok=True)

    final_paths = (
        cache_dir / "inspire_bboxes.npy",
        cache_dir / "inspire_offsets.npy",
        cache_dir / "inspire_coords.bin",
    )
    tmp_paths = tuple(p.with_name(p.name + ".tmp") for p in final_paths)
    bboxes_tmp, offsets_tmp, coords_tmp = tmp_paths

    # Invalidate any previous cache up front so a failed rebuild cannot be
    # mistaken for a complete one.
    for path in final_paths:
        path.unlink(missing_ok=True)

    # Pre-allocate arrays for bboxes and offsets (grow if needed)
    capacity = 25_000_000
    bboxes = np.empty((capacity, 4), dtype=np.float64)
    offsets = np.empty((capacity, 2), dtype=np.int64)
    count = 0
    byte_offset = 0

    with open(coords_tmp, "wb") as cf:
        for zip_path in tqdm(zip_files, desc="INSPIRE ZIPs", unit="file"):
            for coords in parse_inspire_zip(zip_path):
                if count >= capacity:
                    capacity = int(capacity * 1.5)
                    # refcheck=False: these arrays own their data and are only
                    # referenced locally, so in-place growth is safe.
                    bboxes.resize((capacity, 4), refcheck=False)
                    offsets.resize((capacity, 2), refcheck=False)

                # Column-wise extrema: [min_e, min_n] then [max_e, max_n].
                bboxes[count, :2] = coords.min(axis=0)
                bboxes[count, 2:] = coords.max(axis=0)
                offsets[count, 0] = byte_offset
                offsets[count, 1] = len(coords)

                # copy=False avoids duplicating arrays that are already float64.
                raw = coords.astype(np.float64, copy=False).tobytes()
                cf.write(raw)
                byte_offset += len(raw)
                count += 1

    # Trim to actual size and save. Writing through a file handle stops
    # np.save from appending ".npy" to the temporary name.
    with open(bboxes_tmp, "wb") as fh:
        np.save(fh, bboxes[:count])
    with open(offsets_tmp, "wb") as fh:
        np.save(fh, offsets[:count])

    # Publish the finished cache.
    for tmp_path, final_path in zip(tmp_paths, final_paths):
        tmp_path.replace(final_path)

    size_mb = byte_offset / (1024 * 1024)
    print(f" Cached {count:,} INSPIRE polygons (coords: {size_mb:.0f} MB)")
|
||||
|
||||
|
||||
def inspire_cache_exists(cache_dir: Path) -> bool:
    """Return True only if every INSPIRE cache file is present in *cache_dir*.

    Checks for existence only; file contents are not inspected.
    """
    required = ("inspire_bboxes.npy", "inspire_offsets.npy", "inspire_coords.bin")
    for filename in required:
        if not (cache_dir / filename).exists():
            return False
    return True
|
||||
|
||||
|
||||
def load_inspire(
    cache_dir: Path,
) -> tuple[np.ndarray, np.ndarray, np.memmap]:
    """Load INSPIRE cache → (bboxes, offsets, coords_mmap).

    The bbox and offset arrays are read fully into memory with ``np.load``.
    The coordinate file is opened as a read-only float64 memory map, so its
    contents are paged in only when sliced.

    Args:
        cache_dir: Directory holding the three cache files.

    Returns:
        ``(bboxes, offsets, coords_mmap)`` in that order.
    """
    bboxes_file = cache_dir / "inspire_bboxes.npy"
    offsets_file = cache_dir / "inspire_offsets.npy"
    coords_file = cache_dir / "inspire_coords.bin"

    print(f"Loading INSPIRE cache from {cache_dir}...")

    bboxes = np.load(bboxes_file)
    offsets = np.load(offsets_file)
    coords_mmap = np.memmap(coords_file, dtype=np.float64, mode="r")

    # Report the in-memory footprint of the two index arrays.
    print(
        f" Loaded {len(bboxes):,} INSPIRE polygon bboxes (~{bboxes.nbytes // (1024 * 1024)} MB)"
    )
    print(f" Offsets: ~{offsets.nbytes // (1024 * 1024)} MB, coords: memory-mapped")

    return bboxes, offsets, coords_mmap
|
||||
|
||||
|
||||
def _largest_polygon(geom):
    """Return the largest-area Polygon contained in *geom*, or None if it has none."""
    if geom.geom_type == "Polygon":
        return geom
    best = None
    # Multi-part geometries and collections expose their members via ``.geoms``;
    # single lines and points do not and therefore yield nothing.
    for part in getattr(geom, "geoms", ()):
        candidate = _largest_polygon(part)
        if candidate is not None and (best is None or candidate.area > best.area):
            best = candidate
    return best


def get_inspire_candidates(
    oa_bounds: tuple[float, float, float, float],
    bboxes: np.ndarray,
    offsets: np.ndarray,
    coords_mmap: np.memmap,
) -> list[Polygon]:
    """Get INSPIRE polygons overlapping an OA via bbox pre-filter.

    A vectorized bounding-box overlap test (touching boxes count as
    overlapping) selects candidate rows. Shapely polygons are then built only
    for those rows, reading their coordinates on demand from the memory map.

    Invalid polygons are repaired with ``make_valid``. If the repair yields a
    multi-part geometry or a mixed collection, the largest polygonal part is
    kept. Geometries with no polygonal part, and empty polygons, are skipped.

    Args:
        oa_bounds: ``(min_e, min_n, max_e, max_n)`` of the area of interest.
        bboxes: (N, 4) array of per-polygon ``[min_e, min_n, max_e, max_n]``.
        offsets: (N, 2) array of per-polygon ``[byte_offset, n_points]`` into
            the coordinate file.
        coords_mmap: Flat float64 view of the coordinate file.

    Returns:
        The candidate polygons, in cache order. Empty when nothing overlaps.
    """
    min_e, min_n, max_e, max_n = oa_bounds

    # Vectorized bbox overlap test
    mask = (
        (bboxes[:, 2] >= min_e)
        & (bboxes[:, 0] <= max_e)
        & (bboxes[:, 3] >= min_n)
        & (bboxes[:, 1] <= max_n)
    )
    idxs = np.flatnonzero(mask)
    if idxs.size == 0:
        return []

    # Build Shapely polygons only for candidates (coords from mmap)
    candidates = []
    for i in idxs:
        byte_offset = offsets[i, 0]
        n_pts = offsets[i, 1]
        float_offset = byte_offset // 8  # float64 = 8 bytes
        coords = coords_mmap[float_offset : float_offset + n_pts * 2].reshape(-1, 2)

        poly = Polygon(coords)
        if not poly.is_valid:
            # make_valid may return a MultiPolygon or a collection mixing
            # polygons with stray lines/points; keep the dominant polygon
            # instead of discarding the parcel.
            poly = _largest_polygon(make_valid(poly))
            if poly is None:
                continue

        if not poly.is_empty:
            candidates.append(poly)

    return candidates
|
||||
Loading…
Add table
Add a link
Reference in a new issue