Add postcode mapping
This commit is contained in:
parent
e7f2d1ffc3
commit
4506263e5b
5 changed files with 966 additions and 0 deletions
715
pipeline/transform/postcode_boundaries.py
Normal file
715
pipeline/transform/postcode_boundaries.py
Normal file
|
|
@ -0,0 +1,715 @@
|
|||
"""Generate postcode boundary polygons from OA boundaries, INSPIRE parcels, and UPRN data.
|
||||
|
||||
Produces per-district GeoJSON files compatible with the Rust server's postcode loader.
|
||||
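Within each Output Area (OA), postcode fragments are clipped to the OA boundary and are
intended to cover the OA without overlapping one another. Fragments are then merged per
postcode across OAs and only the largest contiguous polygon is kept for each postcode,
so small detached pieces are dropped from the final output.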
|
||||
|
||||
Algorithm per OA:
|
||||
1. Single-postcode OA → entire OA polygon assigned to that postcode
|
||||
2. Multi-postcode OA:
|
||||
a. Assign INSPIRE parcels to postcodes via UPRN point-in-polygon majority vote
|
||||
b. Union INSPIRE parcels per postcode, clip to OA → "claimed" area
|
||||
c. Distribute remaining (unclaimed) OA area via Voronoi of UPRN points
|
||||
d. Final polygon = claimed + Voronoi share
|
||||
|
||||
Memory-efficient design (<12GB total):
|
||||
- INSPIRE polygons stored as raw coordinate bytes in parquet; Shapely objects built
|
||||
lazily per-OA via numpy bbox pre-filter (~100-500 candidates at a time)
|
||||
- UPRNs kept as sorted polars DataFrame with offset dict (Arrow storage, ~1.2GB)
|
||||
- OA processing runs sequentially (no multiprocess INSPIRE duplication)
|
||||
|
||||
Output format: {output}/units/{DISTRICT}.geojson with properties.postcodes and
|
||||
properties.mapit_code fields matching server-rs/src/data/postcodes.rs expectations.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import ctypes
|
||||
import gc
|
||||
import json
|
||||
import sqlite3
|
||||
import zipfile
|
||||
from collections import Counter, defaultdict
|
||||
from pathlib import Path
|
||||
from xml.etree.ElementTree import iterparse
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
from pyproj import Transformer
|
||||
from scipy.spatial import Voronoi
|
||||
from shapely import STRtree, make_valid, wkb
|
||||
from shapely.geometry import MultiPolygon, Polygon
|
||||
from shapely.ops import unary_union
|
||||
from tqdm import tqdm
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# GeoPackage helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _release_memory() -> None:
|
||||
"""Force Python + glibc to release freed memory back to the OS."""
|
||||
gc.collect()
|
||||
try:
|
||||
ctypes.CDLL("libc.so.6").malloc_trim(0)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
# Byte length of the optional envelope that follows the 8-byte GeoPackage
# geometry header, keyed by the 3-bit envelope indicator taken from the flags byte.
_ENVELOPE_SIZES = {0: 0, 1: 32, 2: 48, 3: 48, 4: 64}


def parse_gpkg_geometry(blob: bytes):
    """Extract a Shapely geometry from a GeoPackage binary blob.

    The blob is read as an 8-byte header (starting with the ``GP`` magic, flags
    in byte 3), an optional envelope whose size is selected by bits 1-3 of the
    flags byte, and then a WKB payload that is handed to ``wkb.loads``.

    Raises:
        ValueError: if the blob is shorter than the 8-byte header, does not
            start with ``GP``, or carries an envelope indicator that is not in
            ``_ENVELOPE_SIZES``.
    """
    if len(blob) < 8 or blob[:2] != b"GP":
        raise ValueError("not a GeoPackage geometry blob (missing 8-byte 'GP' header)")

    flags = blob[3]
    envelope_type = (flags >> 1) & 0x07
    envelope_size = _ENVELOPE_SIZES.get(envelope_type)
    if envelope_size is None:
        raise ValueError(
            f"invalid GeoPackage envelope indicator {envelope_type} in flags byte"
        )

    return wkb.loads(blob[8 + envelope_size :])
|
||||
|
||||
|
||||
def load_oa_boundaries(gpkg_path: Path) -> dict[str, Polygon | MultiPolygon]:
    """Load OA boundary polygons from a GeoPackage. Geometry is already in BNG.

    Reads the ``OA21CD`` and ``SHAPE`` columns of the ``OA_2021_EW_BGC_V2``
    table and decodes each geometry blob with ``parse_gpkg_geometry``.
    A MultiPolygon with exactly one member is unwrapped to a plain Polygon.
    Rows whose geometry is NULL are skipped.

    Returns:
        Mapping of OA code to its boundary geometry.
    """
    print("Loading OA boundaries...")

    oa_geoms: dict[str, Polygon | MultiPolygon] = {}
    conn = sqlite3.connect(str(gpkg_path))
    try:
        cur = conn.execute("SELECT OA21CD, SHAPE FROM OA_2021_EW_BGC_V2")
        for oa_code, blob in cur:
            if blob is None:
                # No geometry stored for this row; bytes(None) would raise.
                continue
            geom = parse_gpkg_geometry(bytes(blob))
            if geom.geom_type == "MultiPolygon" and len(geom.geoms) == 1:
                geom = geom.geoms[0]
            oa_geoms[oa_code] = geom
    finally:
        # Release the database handle even when a blob fails to decode.
        conn.close()

    print(f" Loaded {len(oa_geoms)} OA boundaries")
    return oa_geoms
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# UPRN loading (memory-efficient: sorted polars DataFrame + offset dict)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
    """Load UPRNs as a sorted polars DataFrame with OA offset lookup.

    Rows are kept only when the OA code does not start with "S", both grid
    coordinates are present, and the postcode is non-empty after surrounding
    whitespace is stripped. The data is sorted by OA code so that every OA
    occupies one contiguous row range.

    Returns (df, offsets) where offsets[oa_code] = (start_row, end_row) is a
    half-open row range into df, and df holds only the GRIDGB1E, GRIDGB1N and
    PCDS columns.

    Author's estimate: peak ~5GB during sort, steady state ~1.5GB (Arrow
    columnar with compact strings).
    """
    import tempfile

    print("Loading UPRN lookup...")

    # Reserve a temp path and close the handle straight away; polars writes to
    # the path itself.
    with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
        tmp_path = Path(tmp.name)

    try:
        # Sort via streaming sink to avoid polars doubling memory during in-memory sort
        (
            pl.scan_parquet(uprn_path)
            .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
            .filter(~pl.col("OA21CD").str.starts_with("S"))
            .filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null())
            # Strip before testing for emptiness so whitespace-only postcodes
            # are rejected instead of surviving as "".
            .with_columns(pl.col("PCDS").str.strip_chars())
            .filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
            .sort("OA21CD")
            .sink_parquet(tmp_path)
        )
        _release_memory()

        # Read the sorted data — only one copy in memory (~2GB)
        df = pl.read_parquet(tmp_path)
    finally:
        # The file was created with delete=False, so nothing else removes it.
        tmp_path.unlink(missing_ok=True)

    n = len(df)
    print(f" Loaded {n:,} UPRNs (England & Wales)")

    # Compute OA group offsets using polars (avoids 37M Python string creation)
    oa_col = pl.col("OA21CD")
    boundary_df = (
        df.lazy()
        .with_row_index("_i")
        # Row 0 always opens a group. Comparing it with the shifted (null)
        # value yields null, and filter() discards null predicates, so it has
        # to be selected explicitly or the first OA would be lost.
        .filter((pl.col("_i") == 0) | (oa_col != oa_col.shift(1)))
        .select("_i", "OA21CD")
        .collect()
    )
    starts = boundary_df["_i"].to_list()
    oa_codes = boundary_df["OA21CD"].to_list()
    del boundary_df

    # Each group ends where the next one starts; the last one ends at n.
    ends = starts[1:] + [n]
    offsets: dict[str, tuple[int, int]] = {
        oa: (start, end) for oa, start, end in zip(oa_codes, starts, ends)
    }
    del starts, ends, oa_codes

    # Drop OA column (no longer needed) to save ~400MB
    df = df.select("GRIDGB1E", "GRIDGB1N", "PCDS")
    _release_memory()

    print(f" Grouped into {len(offsets)} OAs")
    return df, offsets
|
||||
|
||||
|
||||
def get_oa_uprns(
    df: pl.DataFrame, offsets: dict[str, tuple[int, int]], oa_code: str
) -> tuple[np.ndarray, list[str]]:
    """Slice out the UPRN rows belonging to one OA.

    Looks up the (start, end) row range for *oa_code* in *offsets* and returns
    (points_nx2, postcodes_list): column 0 of the array comes from GRIDGB1E,
    column 1 from GRIDGB1N, and the list holds the PCDS value of each row.

    Raises KeyError when *oa_code* is not present in *offsets*.
    """
    start, stop = offsets[oa_code]
    rows = df[start:stop]
    eastings = rows["GRIDGB1E"].to_numpy()
    northings = rows["GRIDGB1N"].to_numpy()
    xy = np.column_stack([eastings, northings])
    return xy, rows["PCDS"].to_list()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# INSPIRE GML parsing and caching
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# ElementTree-style "{namespace}" prefixes for the tags looked up in INSPIRE GML.
_GML_NS = "{http://www.opengis.net/gml/3.2}"
_LR_NS = "{www.landregistry.gov.uk}"


def parse_inspire_zip(zip_path: Path) -> list[np.ndarray]:
    """Parse a single INSPIRE ZIP → list of Nx2 coordinate arrays (easting, northing).

    Only the first ``.gml`` member of the archive is read, and for every
    ``PREDEFINED`` element only the first ``posList`` found beneath it is used.
    A parcel is kept when its posList holds at least three coordinate pairs.
    A posList with an odd number of values cannot be split into pairs and is
    skipped rather than aborting the whole archive.

    Returns an empty list when the archive contains no ``.gml`` member.
    """
    results: list[np.ndarray] = []
    parcel_tag = f"{_LR_NS}PREDEFINED"
    pos_list_path = f".//{_GML_NS}posList"

    with zipfile.ZipFile(zip_path) as zf:
        gml_names = [name for name in zf.namelist() if name.endswith(".gml")]
        if not gml_names:
            return results

        with zf.open(gml_names[0]) as f:
            for _event, elem in iterparse(f, events=("end",)):
                if elem.tag != parcel_tag:
                    continue
                pos_list = elem.find(pos_list_path)
                if pos_list is not None and pos_list.text:
                    vals = pos_list.text.split()
                    n, dangling = divmod(len(vals), 2)
                    # dangling != 0 means a truncated/malformed coordinate list.
                    if n >= 3 and not dangling:
                        coords = np.array(vals, dtype=np.float64).reshape(n, 2)
                        results.append(coords)
                # Free the parsed subtree; the file is streamed parcel by parcel.
                elem.clear()
    return results
|
||||
|
||||
|
||||
def cache_inspire(inspire_dir: Path, cache_dir: Path) -> None:
    """Parse every ``*.zip`` in *inspire_dir* and write a binary cache to *cache_dir*.

    ZIPs are handled one after another in sorted order; each parsed polygon's
    coordinates are appended to the coordinate file as soon as it is seen.

    Files written:
    - inspire_bboxes.npy: float64 array (N, 4) of [min_e, min_n, max_e, max_n]
    - inspire_offsets.npy: int64 array (N, 2) of [byte_offset, n_points]
    - inspire_coords.bin: flat binary of all float64 coordinate pairs
    """
    zip_files = sorted(inspire_dir.glob("*.zip"))
    print(
        f"Parsing {len(zip_files)} INSPIRE ZIP files (sequential, streaming to disk)..."
    )
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Index arrays start large and are grown by 1.5x whenever they fill up.
    capacity = 25_000_000
    bboxes = np.empty((capacity, 4), dtype=np.float64)
    offsets = np.empty((capacity, 2), dtype=np.int64)
    count = 0
    byte_offset = 0

    with open(cache_dir / "inspire_coords.bin", "wb") as coords_file:
        for zip_path in tqdm(zip_files, desc="INSPIRE ZIPs", unit="file"):
            for coords in parse_inspire_zip(zip_path):
                if count >= capacity:
                    capacity = int(capacity * 1.5)
                    bboxes.resize((capacity, 4), refcheck=False)
                    offsets.resize((capacity, 2), refcheck=False)

                # Column-wise extremes give [min_e, min_n] and [max_e, max_n].
                bboxes[count, :2] = coords.min(axis=0)
                bboxes[count, 2:] = coords.max(axis=0)
                offsets[count] = (byte_offset, len(coords))

                payload = coords.astype(np.float64).tobytes()
                coords_file.write(payload)
                byte_offset += len(payload)
                count += 1

    # Persist only the rows that were actually filled.
    np.save(cache_dir / "inspire_bboxes.npy", bboxes[:count])
    np.save(cache_dir / "inspire_offsets.npy", offsets[:count])
    size_mb = byte_offset / (1024 * 1024)
    print(f" Cached {count:,} INSPIRE polygons (coords: {size_mb:.0f} MB)")
|
||||
|
||||
|
||||
def _inspire_cache_exists(cache_dir: Path) -> bool:
|
||||
return all(
|
||||
(cache_dir / f).exists()
|
||||
for f in ("inspire_bboxes.npy", "inspire_offsets.npy", "inspire_coords.bin")
|
||||
)
|
||||
|
||||
|
||||
def load_inspire(
    cache_dir: Path,
) -> tuple[np.ndarray, np.ndarray, np.memmap]:
    """Load INSPIRE cache → (bboxes, offsets, coords_mmap).

    The bbox and offset arrays are read fully into memory with ``np.load``;
    the coordinate file is opened as a read-only float64 memory map, so its
    contents are paged in only when sliced. Sizes of the in-memory arrays are
    printed for reference.
    """
    print(f"Loading INSPIRE cache from {cache_dir}...")
    bboxes, offsets = (
        np.load(cache_dir / name)
        for name in ("inspire_bboxes.npy", "inspire_offsets.npy")
    )
    coords_mmap = np.memmap(
        cache_dir / "inspire_coords.bin", dtype=np.float64, mode="r"
    )

    bytes_per_mb = 1024 * 1024
    print(
        f" Loaded {len(bboxes):,} INSPIRE polygon bboxes (~{bboxes.nbytes // bytes_per_mb} MB)"
    )
    print(f" Offsets: ~{offsets.nbytes // bytes_per_mb} MB, coords: memory-mapped")
    return bboxes, offsets, coords_mmap
|
||||
|
||||
|
||||
def _largest_polygon(geom):
    """Return the largest-area Polygon inside *geom*, or None if it contains none."""
    if geom.geom_type == "Polygon":
        return geom
    polygons = []
    # Multi-part geometries and collections expose .geoms; anything else has no parts.
    for part in getattr(geom, "geoms", ()):
        if part.geom_type == "Polygon":
            polygons.append(part)
        elif part.geom_type == "MultiPolygon":
            polygons.extend(part.geoms)
    return max(polygons, key=lambda g: g.area, default=None)


def get_inspire_candidates(
    oa_bounds: tuple[float, float, float, float],
    bboxes: np.ndarray,
    offsets: np.ndarray,
    coords_mmap: np.memmap,
) -> list[Polygon]:
    """Get INSPIRE polygons overlapping an OA via bbox pre-filter.

    Args:
        oa_bounds: (min_e, min_n, max_e, max_n) of the OA.
        bboxes: (N, 4) array of parcel bounding boxes in the same order.
        offsets: (N, 2) array of [byte_offset, n_points] into the coordinate file.
        coords_mmap: flat float64 view of the coordinate file.

    Shapely objects are built only for parcels whose bounding box intersects
    *oa_bounds*; their coordinates are read on demand from *coords_mmap*.
    Invalid parcels are repaired with ``make_valid`` and reduced to their
    largest polygonal part; this includes repairs that come back as a
    GeometryCollection mixing a polygon with line or point debris. Parcels
    whose repair contains no polygon at all, and empty polygons, are dropped.
    """
    min_e, min_n, max_e, max_n = oa_bounds

    # Vectorized bbox overlap test
    overlaps = (
        (bboxes[:, 2] >= min_e)
        & (bboxes[:, 0] <= max_e)
        & (bboxes[:, 3] >= min_n)
        & (bboxes[:, 1] <= max_n)
    )
    idxs = np.flatnonzero(overlaps)

    # Build Shapely polygons only for candidates (coords from mmap)
    candidates: list[Polygon] = []
    for i in idxs:
        byte_offset, n_pts = offsets[i]
        start = byte_offset // 8  # float64 = 8 bytes
        coords = coords_mmap[start : start + n_pts * 2].reshape(-1, 2)
        poly = Polygon(coords)
        if not poly.is_valid:
            poly = _largest_polygon(make_valid(poly))
            if poly is None:
                continue
        if not poly.is_empty:
            candidates.append(poly)
    return candidates
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Voronoi computation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def compute_voronoi_regions(
    points: np.ndarray, postcodes: list[str], boundary: Polygon | MultiPolygon
) -> dict[str, Polygon | MultiPolygon]:
    """Split *boundary* between postcodes using Voronoi cells of their points.

    Points sharing identical coordinates are collapsed to one site carrying
    the postcode of the first occurrence. Four far-away helper sites are added
    around the real ones before the diagram is built. Each real site's cell is
    intersected with *boundary* and the pieces are unioned per postcode.

    Returns an empty dict for no points. With a single point (or a single
    distinct site), or when the Voronoi construction raises, the whole
    *boundary* is assigned to the first postcode.
    """
    total = len(points)
    if total == 0:
        return {}
    if total == 1:
        return {postcodes[0]: boundary}

    # Collapse coincident points (e.g. flats at the same coords); first one wins.
    first_index_at: dict[tuple[float, float], int] = {}
    for idx, row in enumerate(points):
        first_index_at.setdefault((row[0], row[1]), idx)
    kept = list(first_index_at.values())
    site_pcs = [postcodes[k] for k in kept]

    if len(kept) == 1:
        return {site_pcs[0]: boundary}

    sites = np.array([points[k] for k in kept])
    lo_e, lo_n = sites.min(axis=0)
    hi_e, hi_n = sites.max(axis=0)
    span = max(hi_e - lo_e, hi_n - lo_n, 100)
    pad = span * 10

    # Helper sites well outside the data extent.
    far_corners = np.array(
        [
            [lo_e - pad, lo_n - pad],
            [hi_e + pad, lo_n - pad],
            [lo_e - pad, hi_n + pad],
            [hi_e + pad, hi_n + pad],
        ]
    )

    try:
        vor = Voronoi(np.vstack([sites, far_corners]))
    except Exception:
        return {site_pcs[0]: boundary}

    cells_by_pc: dict[str, list[Polygon]] = defaultdict(list)
    # zip stops after the real sites, so the helper sites' regions are never visited.
    for pc, region_idx in zip(site_pcs, vor.point_region):
        region = vor.regions[region_idx]
        if -1 in region or len(region) < 3:
            # Unbounded or degenerate cell: nothing usable to clip.
            continue
        cell = Polygon(vor.vertices[region])
        if not cell.is_valid:
            cell = make_valid(cell)
        piece = cell.intersection(boundary)
        if not piece.is_empty:
            cells_by_pc[pc].append(piece)

    regions = {}
    for pc, pieces in cells_by_pc.items():
        union = unary_union(pieces)
        if union.is_empty:
            continue
        regions[pc] = union
    return regions
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-OA processing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def process_oa(
    oa_geom: Polygon | MultiPolygon,
    points: np.ndarray,
    postcodes: list[str],
    inspire_candidates: list[Polygon],
) -> list[tuple[str, Polygon | MultiPolygon]]:
    """Split one OA between its postcodes → list of (postcode, geometry) fragments.

    When every UPRN shares one postcode the whole OA goes to it. Otherwise:
    1. each INSPIRE candidate that intersects at least one UPRN point is given
       to the postcode holding the most points on it;
    2. the parcels of each postcode are unioned and clipped to the OA;
    3. whatever part of the OA is left (if larger than 0.01 in area) is shared
       out by ``compute_voronoi_regions``;
    4. both contributions are unioned per postcode.
    """
    distinct = set(postcodes)
    if len(distinct) == 1:
        return [(next(iter(distinct)), oa_geom)]

    # Step 1-2: parcel-based claims
    claimed: dict[str, Polygon | MultiPolygon] = {}

    if inspire_candidates:
        tree = STRtree(inspire_candidates)

        from shapely import points as shp_points

        hit_points, hit_parcels = tree.query(
            shp_points(points), predicate="intersects"
        )

        # Collect the postcode of every UPRN that lands on each parcel.
        ballots: dict[int, list[str]] = defaultdict(list)
        for point_i, parcel_i in zip(hit_points, hit_parcels):
            ballots[parcel_i].append(postcodes[point_i])

        parcels_by_pc: dict[str, list[Polygon]] = defaultdict(list)
        for parcel_i, ballot in ballots.items():
            (winner, _votes), = Counter(ballot).most_common(1)
            parcels_by_pc[winner].append(inspire_candidates[parcel_i])

        for pc, parcels in parcels_by_pc.items():
            clipped = unary_union(parcels).intersection(oa_geom)
            if clipped.is_empty:
                continue
            claimed[pc] = clipped if clipped.is_valid else make_valid(clipped)

    # Step 3: area not covered by any claim
    remaining = oa_geom
    if claimed:
        covered = unary_union(list(claimed.values()))
        if not covered.is_valid:
            covered = make_valid(covered)
        remaining = oa_geom.difference(covered)
        if not remaining.is_valid:
            remaining = make_valid(remaining)

    voronoi_shares = {}
    if not remaining.is_empty and remaining.area > 0.01:
        voronoi_shares = compute_voronoi_regions(points, postcodes, remaining)

    # Step 4: claims first, then Voronoi shares, so output order follows the claims.
    parts_by_pc: dict[str, list] = defaultdict(list)
    for source in (claimed, voronoi_shares):
        for pc, geom in source.items():
            parts_by_pc[pc].append(geom)

    fragments = []
    for pc, parts in parts_by_pc.items():
        union = unary_union(parts)
        if union.is_empty:
            continue
        if not union.is_valid:
            union = make_valid(union)
        fragments.append((pc, union))

    return fragments
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Output: merge fragments and write GeoJSON
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Shared EPSG:27700 → EPSG:4326 transformer; stays None until first requested.
_to_wgs84 = None


def _get_to_wgs84():
    """Return the module-wide BNG → WGS84 transformer, building it on first call."""
    global _to_wgs84
    if _to_wgs84 is not None:
        return _to_wgs84
    # always_xy=True keeps (easting, northing) in and (lon, lat) out.
    _to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
    return _to_wgs84
|
||||
|
||||
|
||||
def to_wgs84_geojson(
    geom: Polygon | MultiPolygon, tolerance: float = 1.0
) -> dict | None:
    """Simplify *geom*, reproject it, and return a GeoJSON geometry dict.

    The geometry is simplified with *tolerance* (topology preserved) before
    its rings are passed through the transformer from ``_get_to_wgs84``.
    Polygon and MultiPolygon map to the matching GeoJSON type. For a
    GeometryCollection only its Polygon/MultiPolygon members are kept, unioned
    and converted again with tolerance 0.

    Returns None when the geometry is empty before or after simplification,
    or when it contains nothing polygonal.
    """
    if geom.is_empty:
        return None

    simplified = geom.simplify(tolerance, preserve_topology=True)
    if simplified.is_empty:
        return None

    transformer = _get_to_wgs84()

    def ring_to_lonlat(ring):
        eastings, northings = zip(*ring.coords)
        lons, lats = transformer.transform(list(eastings), list(northings))
        return list(zip(lons, lats))

    def polygon_rings(poly):
        # GeoJSON ring order: exterior first, then any holes.
        rings = [ring_to_lonlat(poly.exterior)]
        rings.extend(ring_to_lonlat(hole) for hole in poly.interiors)
        return rings

    kind = simplified.geom_type
    if kind == "Polygon":
        return {"type": "Polygon", "coordinates": polygon_rings(simplified)}
    if kind == "MultiPolygon":
        return {
            "type": "MultiPolygon",
            "coordinates": [polygon_rings(part) for part in simplified.geoms],
        }
    if kind == "GeometryCollection":
        areal = [
            part
            for part in simplified.geoms
            if part.geom_type in ("Polygon", "MultiPolygon")
        ]
        if not areal:
            return None
        return to_wgs84_geojson(unary_union(areal), tolerance=0)
    return None
|
||||
|
||||
|
||||
def merge_fragments(
    all_fragments: list[tuple[str, Polygon | MultiPolygon]],
) -> dict[str, Polygon | MultiPolygon]:
    """Combine the per-OA fragments of each postcode into one geometry.

    Fragments are grouped by postcode and unioned. If the union is a
    MultiPolygon it is buffered out and back in by 1.0 to bridge hairline gaps
    between neighbouring OA edges; if it is still a MultiPolygon afterwards,
    only the largest member by area is kept. Postcodes whose union is empty
    are omitted.
    """
    grouped: dict[str, list] = defaultdict(list)
    for pc, geom in all_fragments:
        grouped[pc].append(geom)

    def repaired(g):
        return g if g.is_valid else make_valid(g)

    merged = {}
    for pc, parts in grouped.items():
        combined = unary_union(parts)
        if combined.is_empty:
            continue
        combined = repaired(combined)

        # Close tiny gaps between adjacent OA boundary edges (float mismatches)
        if combined.geom_type == "MultiPolygon":
            combined = repaired(combined.buffer(1.0).buffer(-1.0))

        # Postcodes are treated as contiguous: of several detached parts only
        # the largest survives, the rest are regarded as artifacts.
        if combined.geom_type == "MultiPolygon":
            combined = max(combined.geoms, key=lambda part: part.area)

        merged[pc] = combined
    return merged
|
||||
|
||||
|
||||
def write_district_geojson(
    postcodes: dict[str, Polygon | MultiPolygon], output_dir: Path
) -> int:
    """Write one GeoJSON FeatureCollection per postcode district.

    The district is the first whitespace-separated token of the postcode (or
    its first four characters when there is no token). Files go to
    ``{output_dir}/units/{district}.geojson``; each feature carries
    ``postcodes`` (the postcode as given) and ``mapit_code`` (the postcode
    with spaces removed). Postcodes whose geometry converts to nothing are
    left out, and districts with no features produce no file.

    Returns the number of files written.
    """
    units_dir = output_dir / "units"
    units_dir.mkdir(parents=True, exist_ok=True)

    by_district: dict[str, list[tuple[str, Polygon | MultiPolygon]]] = defaultdict(list)
    for pc, geom in postcodes.items():
        tokens = pc.split()
        key = tokens[0] if tokens else pc[:4]
        by_district[key].append((pc, geom))

    written = 0
    for district, entries in tqdm(
        sorted(by_district.items()), desc="Writing GeoJSON", unit="file"
    ):
        features = []
        for pc, geom in sorted(entries, key=lambda entry: entry[0]):
            geometry = to_wgs84_geojson(geom)
            if geometry is not None:
                features.append(
                    {
                        "type": "Feature",
                        "geometry": geometry,
                        "properties": {
                            "postcodes": pc,
                            "mapit_code": pc.replace(" ", ""),
                        },
                    }
                )

        if features:
            collection = {"type": "FeatureCollection", "features": features}
            with open(units_dir / f"{district}.geojson", "w") as f:
                # Compact separators keep the files small.
                json.dump(collection, f, separators=(",", ":"))
            written += 1

    return written
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main orchestration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _print_phase(title: str, *, leading_blank: bool = True) -> None:
    """Print a phase banner: optional blank line, a 60-char rule, the title, a rule."""
    if leading_blank:
        print()
    rule = "=" * 60
    print(rule)
    print(title)
    print(rule)


def main() -> None:
    """Command-line entry point: load inputs, split every OA, write district GeoJSON.

    Phases:
    1. load OA boundaries and UPRNs;
    2. build the INSPIRE cache under ``{output}/inspire_cache`` if it is
       missing, then load it;
    3. for every OA that has both a boundary and UPRNs, emit postcode
       fragments (whole OA when it has a single postcode, otherwise via
       ``process_oa``);
    4. merge fragments per postcode and write the per-district files.
    """
    parser = argparse.ArgumentParser(
        description="Generate postcode boundary polygons from OA + INSPIRE + UPRN data"
    )
    parser.add_argument("--uprn", type=Path, required=True, help="UPRN lookup parquet")
    parser.add_argument(
        "--oa-boundaries", type=Path, required=True, help="OA boundaries GeoPackage"
    )
    parser.add_argument(
        "--inspire", type=Path, required=True, help="INSPIRE ZIP directory"
    )
    parser.add_argument("--output", type=Path, required=True, help="Output directory")
    parser.add_argument(
        "--limit", type=int, default=0, help="Process only first N OAs (0=all)"
    )
    args = parser.parse_args()

    # Phase 1: Load all data
    _print_phase("Phase 1: Loading data", leading_blank=False)
    oa_geoms = load_oa_boundaries(args.oa_boundaries)
    uprn_df, uprn_offsets = load_uprns(args.uprn)

    # Phase 2: Parse/load INSPIRE
    _print_phase("Phase 2: INSPIRE data")
    inspire_cache_dir = args.output / "inspire_cache"
    if not _inspire_cache_exists(inspire_cache_dir):
        cache_inspire(args.inspire, inspire_cache_dir)
    inspire_bboxes, inspire_offsets, inspire_coords = load_inspire(inspire_cache_dir)

    # Phase 3: Process OAs
    _print_phase("Phase 3: Processing OAs")

    # Only OAs present in both inputs can be processed.
    oa_codes_with_data = sorted(set(oa_geoms.keys()) & set(uprn_offsets.keys()))
    usable = len(oa_codes_with_data)
    skipped_no_uprn = len(oa_geoms) - usable
    skipped_no_boundary = len(uprn_offsets) - usable

    if args.limit > 0:
        oa_codes_with_data = oa_codes_with_data[: args.limit]

    print(f" OAs with UPRNs + boundaries: {len(oa_codes_with_data)}")
    print(f" Skipped (no UPRNs): {skipped_no_uprn}")
    print(f" Skipped (no boundary): {skipped_no_boundary}")

    all_fragments: list[tuple[str, Polygon | MultiPolygon]] = []
    single_count = 0
    multi_count = 0

    progress = tqdm(
        oa_codes_with_data,
        desc="Processing OAs",
        unit="OA",
        smoothing=0.01,
        miniters=100,
    )
    for oa_code in progress:
        oa_geom = oa_geoms[oa_code]
        points, postcodes = get_oa_uprns(uprn_df, uprn_offsets, oa_code)

        if len(set(postcodes)) == 1:
            # Fast path: entire OA = one postcode, no INSPIRE lookup needed.
            all_fragments.append((postcodes[0], oa_geom))
            single_count += 1
            continue

        # Get INSPIRE candidates via bbox pre-filter
        candidates = get_inspire_candidates(
            oa_geom.bounds, inspire_bboxes, inspire_offsets, inspire_coords
        )
        all_fragments.extend(process_oa(oa_geom, points, postcodes, candidates))
        multi_count += 1

    print(f"\n Single-postcode OAs (fast path): {single_count}")
    print(f" Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
    print(f" Total fragments: {len(all_fragments)}")

    # Free data no longer needed
    del oa_geoms, uprn_df, uprn_offsets
    del inspire_bboxes, inspire_offsets, inspire_coords

    # Phase 4: Merge and write
    _print_phase("Phase 4: Merging fragments and writing GeoJSON")
    merged = merge_fragments(all_fragments)
    print(f" Merged into {len(merged)} unique postcodes")

    file_count = write_district_geojson(merged, args.output)
    print(f"\n Wrote {file_count} district GeoJSON files to {args.output / 'units'}")
    print("Done!")


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue