Add postcode mapping

This commit is contained in:
Andras Schmelczer 2026-02-07 19:28:57 +00:00
parent e7f2d1ffc3
commit 4506263e5b
5 changed files with 966 additions and 0 deletions

View file

@ -0,0 +1,715 @@
"""Generate postcode boundary polygons from OA boundaries, INSPIRE parcels, and UPRN data.
Produces per-district GeoJSON files compatible with the Rust server's postcode loader.
Each postcode gets a polygon (or MultiPolygon) guaranteed to be contained within its
Output Area(s), with 100% OA coverage and no overlaps between postcodes within an OA.
Algorithm per OA:
1. Single-postcode OA entire OA polygon assigned to that postcode
2. Multi-postcode OA:
a. Assign INSPIRE parcels to postcodes via UPRN point-in-polygon majority vote
b. Union INSPIRE parcels per postcode, clip to OA "claimed" area
c. Distribute remaining (unclaimed) OA area via Voronoi of UPRN points
d. Final polygon = claimed + Voronoi share
Memory-efficient design (<12GB total):
- INSPIRE polygons stored as raw coordinate bytes in parquet; Shapely objects built
lazily per-OA via numpy bbox pre-filter (~100-500 candidates at a time)
- UPRNs kept as sorted polars DataFrame with offset dict (Arrow storage, ~1.2GB)
- OA processing runs sequentially (no multiprocess INSPIRE duplication)
Output format: {output}/units/{DISTRICT}.geojson with properties.postcodes and
properties.mapit_code fields matching server-rs/src/data/postcodes.rs expectations.
"""
import argparse
import ctypes
import gc
import json
import sqlite3
import zipfile
from collections import Counter, defaultdict
from pathlib import Path
from xml.etree.ElementTree import iterparse
import numpy as np
import polars as pl
from pyproj import Transformer
from scipy.spatial import Voronoi
from shapely import STRtree, make_valid, wkb
from shapely.geometry import MultiPolygon, Polygon
from shapely.ops import unary_union
from tqdm import tqdm
# ---------------------------------------------------------------------------
# GeoPackage helpers
# ---------------------------------------------------------------------------
def _release_memory() -> None:
"""Force Python + glibc to release freed memory back to the OS."""
gc.collect()
try:
ctypes.CDLL("libc.so.6").malloc_trim(0)
except OSError:
pass
_ENVELOPE_SIZES = {0: 0, 1: 32, 2: 48, 3: 48, 4: 64}
def parse_gpkg_geometry(blob: bytes):
"""Extract a Shapely geometry from a GeoPackage binary blob."""
flags = blob[3]
envelope_type = (flags >> 1) & 0x07
header_size = 8 + _ENVELOPE_SIZES[envelope_type]
return wkb.loads(blob[header_size:])
def load_oa_boundaries(gpkg_path: Path) -> dict[str, Polygon | MultiPolygon]:
"""Load OA boundary polygons from a GeoPackage. Geometry is already in BNG."""
print("Loading OA boundaries...")
conn = sqlite3.connect(str(gpkg_path))
cur = conn.cursor()
cur.execute("SELECT OA21CD, SHAPE FROM OA_2021_EW_BGC_V2")
oa_geoms: dict[str, Polygon | MultiPolygon] = {}
for oa_code, blob in cur:
geom = parse_gpkg_geometry(bytes(blob))
if geom.geom_type == "MultiPolygon" and len(geom.geoms) == 1:
geom = geom.geoms[0]
oa_geoms[oa_code] = geom
conn.close()
print(f" Loaded {len(oa_geoms)} OA boundaries")
return oa_geoms
# ---------------------------------------------------------------------------
# UPRN loading (memory-efficient: sorted polars DataFrame + offset dict)
# ---------------------------------------------------------------------------
def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
"""Load UPRNs as a sorted polars DataFrame with OA offset lookup.
Returns (df, offsets) where offsets[oa_code] = (start_row, end_row).
Peak ~5GB during sort, steady state ~1.5GB (Arrow columnar with compact strings).
"""
import tempfile
print("Loading UPRN lookup...")
# Sort via streaming sink to avoid polars doubling memory during in-memory sort
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
tmp_path = Path(tmp.name)
(
pl.scan_parquet(uprn_path)
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
.filter(~pl.col("OA21CD").str.starts_with("S"))
.filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null())
.filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
.with_columns(pl.col("PCDS").str.strip_chars())
.sort("OA21CD")
.sink_parquet(tmp_path)
)
_release_memory()
# Read the sorted data — only one copy in memory (~2GB)
df = pl.read_parquet(tmp_path)
tmp_path.unlink()
n = len(df)
print(f" Loaded {n:,} UPRNs (England & Wales)")
# Compute OA group offsets using polars (avoids 37M Python string creation)
boundary_df = (
df.lazy()
.with_row_index("_i")
.filter(pl.col("OA21CD") != pl.col("OA21CD").shift(1))
.select("_i", "OA21CD")
.collect()
)
starts_list = boundary_df["_i"].to_list()
oa_list = boundary_df["OA21CD"].to_list()
del boundary_df
offsets: dict[str, tuple[int, int]] = {}
for j in range(len(starts_list)):
end = starts_list[j + 1] if j + 1 < len(starts_list) else n
offsets[oa_list[j]] = (starts_list[j], end)
del starts_list, oa_list
# Drop OA column (no longer needed) to save ~400MB
df = df.select("GRIDGB1E", "GRIDGB1N", "PCDS")
_release_memory()
print(f" Grouped into {len(offsets)} OAs")
return df, offsets
def get_oa_uprns(
df: pl.DataFrame, offsets: dict[str, tuple[int, int]], oa_code: str
) -> tuple[np.ndarray, list[str]]:
"""Get UPRN coordinates and postcodes for a single OA.
Returns (points_nx2, postcodes_list).
"""
s, e = offsets[oa_code]
sub = df[s:e]
points = np.column_stack(
[
sub["GRIDGB1E"].to_numpy(),
sub["GRIDGB1N"].to_numpy(),
]
)
postcodes = sub["PCDS"].to_list()
return points, postcodes
# ---------------------------------------------------------------------------
# INSPIRE GML parsing and caching
# ---------------------------------------------------------------------------
_GML_NS = "{http://www.opengis.net/gml/3.2}"
_LR_NS = "{www.landregistry.gov.uk}"
def parse_inspire_zip(zip_path: Path) -> list[np.ndarray]:
"""Parse a single INSPIRE ZIP → list of Nx2 coordinate arrays (easting, northing)."""
results = []
with zipfile.ZipFile(zip_path) as zf:
gml_names = [n for n in zf.namelist() if n.endswith(".gml")]
if not gml_names:
return results
with zf.open(gml_names[0]) as f:
for event, elem in iterparse(f, events=("end",)):
if elem.tag != f"{_LR_NS}PREDEFINED":
continue
pos_list = elem.find(f".//{_GML_NS}posList")
if pos_list is not None and pos_list.text:
vals = pos_list.text.split()
n = len(vals) // 2
if n >= 3:
coords = np.array(vals, dtype=np.float64).reshape(n, 2)
results.append(coords)
elem.clear()
return results
def cache_inspire(inspire_dir: Path, cache_dir: Path) -> None:
"""Parse all INSPIRE ZIPs and cache as memory-mappable binary files.
Processes ZIPs sequentially to keep memory under control (~2GB peak).
Each ZIP's polygons are streamed to disk immediately after parsing.
Writes three files:
- inspire_bboxes.npy: float64 array (N, 4) of [min_e, min_n, max_e, max_n]
- inspire_offsets.npy: int64 array (N, 2) of [byte_offset, n_points]
- inspire_coords.bin: flat binary of all float64 coordinate pairs
"""
zip_files = sorted(inspire_dir.glob("*.zip"))
print(
f"Parsing {len(zip_files)} INSPIRE ZIP files (sequential, streaming to disk)..."
)
cache_dir.mkdir(parents=True, exist_ok=True)
# Pre-allocate arrays for bboxes and offsets (grow if needed)
capacity = 25_000_000
bboxes = np.empty((capacity, 4), dtype=np.float64)
offsets = np.empty((capacity, 2), dtype=np.int64)
count = 0
byte_offset = 0
coords_path = cache_dir / "inspire_coords.bin"
with open(coords_path, "wb") as cf:
for zip_path in tqdm(zip_files, desc="INSPIRE ZIPs", unit="file"):
for coords in parse_inspire_zip(zip_path):
if count >= capacity:
capacity = int(capacity * 1.5)
bboxes.resize((capacity, 4), refcheck=False)
offsets.resize((capacity, 2), refcheck=False)
bboxes[count, 0] = coords[:, 0].min()
bboxes[count, 1] = coords[:, 1].min()
bboxes[count, 2] = coords[:, 0].max()
bboxes[count, 3] = coords[:, 1].max()
offsets[count, 0] = byte_offset
offsets[count, 1] = len(coords)
raw = coords.astype(np.float64).tobytes()
cf.write(raw)
byte_offset += len(raw)
count += 1
# Trim to actual size and save
bboxes = bboxes[:count]
offsets = offsets[:count]
np.save(cache_dir / "inspire_bboxes.npy", bboxes)
np.save(cache_dir / "inspire_offsets.npy", offsets)
size_mb = byte_offset / (1024 * 1024)
print(f" Cached {count:,} INSPIRE polygons (coords: {size_mb:.0f} MB)")
def _inspire_cache_exists(cache_dir: Path) -> bool:
return all(
(cache_dir / f).exists()
for f in ("inspire_bboxes.npy", "inspire_offsets.npy", "inspire_coords.bin")
)
def load_inspire(
cache_dir: Path,
) -> tuple[np.ndarray, np.ndarray, np.memmap]:
"""Load INSPIRE cache → (bboxes, offsets, coords_mmap).
Memory usage: ~1.1GB (bboxes ~777MB + offsets ~290MB, coords memory-mapped).
"""
print(f"Loading INSPIRE cache from {cache_dir}...")
bboxes = np.load(cache_dir / "inspire_bboxes.npy")
offsets = np.load(cache_dir / "inspire_offsets.npy")
coords_mmap = np.memmap(
cache_dir / "inspire_coords.bin", dtype=np.float64, mode="r"
)
print(
f" Loaded {len(bboxes):,} INSPIRE polygon bboxes (~{bboxes.nbytes // (1024 * 1024)} MB)"
)
print(f" Offsets: ~{offsets.nbytes // (1024 * 1024)} MB, coords: memory-mapped")
return bboxes, offsets, coords_mmap
def get_inspire_candidates(
oa_bounds: tuple[float, float, float, float],
bboxes: np.ndarray,
offsets: np.ndarray,
coords_mmap: np.memmap,
) -> list[Polygon]:
"""Get INSPIRE polygons overlapping an OA via bbox pre-filter.
Builds Shapely objects only for matches (typically 10-500 per OA).
Reads coordinate data on-demand from memory-mapped file.
"""
min_e, min_n, max_e, max_n = oa_bounds
# Vectorized bbox overlap test
mask = (
(bboxes[:, 2] >= min_e)
& (bboxes[:, 0] <= max_e)
& (bboxes[:, 3] >= min_n)
& (bboxes[:, 1] <= max_n)
)
idxs = np.where(mask)[0]
if len(idxs) == 0:
return []
# Build Shapely polygons only for candidates (coords from mmap)
candidates = []
for i in idxs:
byte_offset = offsets[i, 0]
n_pts = offsets[i, 1]
float_offset = byte_offset // 8 # float64 = 8 bytes
coords = coords_mmap[float_offset : float_offset + n_pts * 2].reshape(-1, 2)
poly = Polygon(coords)
if not poly.is_valid:
poly = make_valid(poly)
if poly.geom_type == "MultiPolygon":
poly = max(poly.geoms, key=lambda g: g.area)
elif poly.geom_type != "Polygon":
continue
if not poly.is_empty:
candidates.append(poly)
return candidates
# ---------------------------------------------------------------------------
# Voronoi computation
# ---------------------------------------------------------------------------
def compute_voronoi_regions(
points: np.ndarray, postcodes: list[str], boundary: Polygon | MultiPolygon
) -> dict[str, Polygon | MultiPolygon]:
"""Compute Voronoi regions for points, clipped to boundary, grouped by postcode."""
if len(points) == 0:
return {}
if len(points) == 1:
return {postcodes[0]: boundary}
# Deduplicate points per postcode (flats at same coords)
seen: dict[tuple[float, float], str] = {}
unique_pts = []
unique_pcs = []
for i in range(len(points)):
key = (points[i, 0], points[i, 1])
if key not in seen:
seen[key] = postcodes[i]
unique_pts.append(points[i])
unique_pcs.append(postcodes[i])
if len(unique_pts) == 1:
return {unique_pcs[0]: boundary}
pts = np.array(unique_pts)
min_e, min_n = pts.min(axis=0)
max_e, max_n = pts.max(axis=0)
span = max(max_e - min_e, max_n - min_n, 100)
dummy = np.array(
[
[min_e - span * 10, min_n - span * 10],
[max_e + span * 10, min_n - span * 10],
[min_e - span * 10, max_n + span * 10],
[max_e + span * 10, max_n + span * 10],
]
)
all_points = np.vstack([pts, dummy])
try:
vor = Voronoi(all_points)
except Exception:
return {unique_pcs[0]: boundary}
n_real = len(pts)
pc_polys: dict[str, list[Polygon]] = defaultdict(list)
for i in range(n_real):
region_idx = vor.point_region[i]
region = vor.regions[region_idx]
if -1 in region or len(region) < 3:
continue
vertices = vor.vertices[region]
poly = Polygon(vertices)
if not poly.is_valid:
poly = make_valid(poly)
clipped = poly.intersection(boundary)
if not clipped.is_empty:
pc_polys[unique_pcs[i]].append(clipped)
result = {}
for pc, parts in pc_polys.items():
merged = unary_union(parts)
if not merged.is_empty:
result[pc] = merged
return result
# ---------------------------------------------------------------------------
# Per-OA processing
# ---------------------------------------------------------------------------
def process_oa(
oa_geom: Polygon | MultiPolygon,
points: np.ndarray,
postcodes: list[str],
inspire_candidates: list[Polygon],
) -> list[tuple[str, Polygon | MultiPolygon]]:
"""Process a single OA → list of (postcode, geometry) fragments."""
unique_pcs = set(postcodes)
if len(unique_pcs) == 1:
return [(next(iter(unique_pcs)), oa_geom)]
# Try INSPIRE-based assignment
claimed: dict[str, Polygon | MultiPolygon] = {}
if inspire_candidates:
cand_tree = STRtree(inspire_candidates)
from shapely import points as shp_points
uprn_pts = shp_points(points)
pt_idx, cand_idx = cand_tree.query(uprn_pts, predicate="intersects")
# Majority vote per candidate polygon
cand_postcodes: dict[int, list[str]] = defaultdict(list)
for pi, ci in zip(pt_idx, cand_idx):
cand_postcodes[ci].append(postcodes[pi])
pc_inspire_polys: dict[str, list[Polygon]] = defaultdict(list)
for ci, pc_list in cand_postcodes.items():
winner = Counter(pc_list).most_common(1)[0][0]
pc_inspire_polys[winner].append(inspire_candidates[ci])
for pc, polys in pc_inspire_polys.items():
merged = unary_union(polys)
clipped = merged.intersection(oa_geom)
if not clipped.is_empty:
if not clipped.is_valid:
clipped = make_valid(clipped)
claimed[pc] = clipped
# Compute remaining area
if claimed:
all_claimed = unary_union(list(claimed.values()))
if not all_claimed.is_valid:
all_claimed = make_valid(all_claimed)
remaining = oa_geom.difference(all_claimed)
if not remaining.is_valid:
remaining = make_valid(remaining)
else:
remaining = oa_geom
# Distribute remaining area via Voronoi
if not remaining.is_empty and remaining.area > 0.01:
voronoi_result = compute_voronoi_regions(points, postcodes, remaining)
else:
voronoi_result = {}
# Combine claimed + voronoi
result: dict[str, list] = defaultdict(list)
for pc, geom in claimed.items():
result[pc].append(geom)
for pc, geom in voronoi_result.items():
result[pc].append(geom)
fragments = []
for pc, parts in result.items():
merged = unary_union(parts)
if not merged.is_empty:
if not merged.is_valid:
merged = make_valid(merged)
fragments.append((pc, merged))
return fragments
# ---------------------------------------------------------------------------
# Output: merge fragments and write GeoJSON
# ---------------------------------------------------------------------------
_to_wgs84 = None
def _get_to_wgs84():
global _to_wgs84
if _to_wgs84 is None:
_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
return _to_wgs84
def to_wgs84_geojson(
geom: Polygon | MultiPolygon, tolerance: float = 1.0
) -> dict | None:
"""Simplify geometry in BNG, convert to WGS84, return GeoJSON dict."""
if geom.is_empty:
return None
simplified = geom.simplify(tolerance, preserve_topology=True)
if simplified.is_empty:
return None
transformer = _get_to_wgs84()
def transform_ring(coords):
xs, ys = zip(*coords)
lons, lats = transformer.transform(list(xs), list(ys))
return list(zip(lons, lats))
def transform_polygon(poly):
exterior = transform_ring(poly.exterior.coords)
holes = [transform_ring(h.coords) for h in poly.interiors]
return [exterior] + holes
if simplified.geom_type == "Polygon":
return {
"type": "Polygon",
"coordinates": transform_polygon(simplified),
}
elif simplified.geom_type == "MultiPolygon":
return {
"type": "MultiPolygon",
"coordinates": [transform_polygon(p) for p in simplified.geoms],
}
elif simplified.geom_type == "GeometryCollection":
polys = [
g for g in simplified.geoms if g.geom_type in ("Polygon", "MultiPolygon")
]
if not polys:
return None
return to_wgs84_geojson(unary_union(polys), tolerance=0)
return None
def merge_fragments(
all_fragments: list[tuple[str, Polygon | MultiPolygon]],
) -> dict[str, Polygon | MultiPolygon]:
"""Merge cross-OA fragments for postcodes spanning multiple OAs."""
by_postcode: dict[str, list] = defaultdict(list)
for pc, geom in all_fragments:
by_postcode[pc].append(geom)
merged = {}
for pc, parts in by_postcode.items():
combined = unary_union(parts)
if combined.is_empty:
continue
if not combined.is_valid:
combined = make_valid(combined)
# Close tiny gaps between adjacent OA boundary edges (float mismatches)
if combined.geom_type == "MultiPolygon":
combined = combined.buffer(1.0).buffer(-1.0)
if not combined.is_valid:
combined = make_valid(combined)
# Postcodes are contiguous delivery routes — keep only the largest
# polygon; small detached fragments are algorithm artifacts
if combined.geom_type == "MultiPolygon":
combined = max(combined.geoms, key=lambda g: g.area)
merged[pc] = combined
return merged
def write_district_geojson(
postcodes: dict[str, Polygon | MultiPolygon], output_dir: Path
) -> int:
"""Group postcodes by district, write GeoJSON files. Returns file count."""
units_dir = output_dir / "units"
units_dir.mkdir(parents=True, exist_ok=True)
by_district: dict[str, list[tuple[str, Polygon | MultiPolygon]]] = defaultdict(list)
for pc, geom in postcodes.items():
parts = pc.split()
district = parts[0] if parts else pc[:4]
by_district[district].append((pc, geom))
file_count = 0
for district, entries in tqdm(
sorted(by_district.items()), desc="Writing GeoJSON", unit="file"
):
features = []
for pc, geom in sorted(entries, key=lambda x: x[0]):
geojson_geom = to_wgs84_geojson(geom)
if geojson_geom is None:
continue
mapit_code = pc.replace(" ", "")
features.append(
{
"type": "Feature",
"geometry": geojson_geom,
"properties": {
"postcodes": pc,
"mapit_code": mapit_code,
},
}
)
if not features:
continue
collection = {"type": "FeatureCollection", "features": features}
out_path = units_dir / f"{district}.geojson"
with open(out_path, "w") as f:
json.dump(collection, f, separators=(",", ":"))
file_count += 1
return file_count
# ---------------------------------------------------------------------------
# Main orchestration
# ---------------------------------------------------------------------------
def main() -> None:
parser = argparse.ArgumentParser(
description="Generate postcode boundary polygons from OA + INSPIRE + UPRN data"
)
parser.add_argument("--uprn", type=Path, required=True, help="UPRN lookup parquet")
parser.add_argument(
"--oa-boundaries", type=Path, required=True, help="OA boundaries GeoPackage"
)
parser.add_argument(
"--inspire", type=Path, required=True, help="INSPIRE ZIP directory"
)
parser.add_argument("--output", type=Path, required=True, help="Output directory")
parser.add_argument(
"--limit", type=int, default=0, help="Process only first N OAs (0=all)"
)
args = parser.parse_args()
# Phase 1: Load all data
print("=" * 60)
print("Phase 1: Loading data")
print("=" * 60)
oa_geoms = load_oa_boundaries(args.oa_boundaries)
uprn_df, uprn_offsets = load_uprns(args.uprn)
# Phase 2: Parse/load INSPIRE
print()
print("=" * 60)
print("Phase 2: INSPIRE data")
print("=" * 60)
inspire_cache_dir = args.output / "inspire_cache"
if not _inspire_cache_exists(inspire_cache_dir):
cache_inspire(args.inspire, inspire_cache_dir)
inspire_bboxes, inspire_offsets, inspire_coords = load_inspire(inspire_cache_dir)
# Phase 3: Process OAs
print()
print("=" * 60)
print("Phase 3: Processing OAs")
print("=" * 60)
# Build work list — precompute which OAs are single vs multi-postcode
oa_codes_with_data = sorted(set(oa_geoms.keys()) & set(uprn_offsets.keys()))
skipped_no_uprn = len(oa_geoms) - len(oa_codes_with_data)
skipped_no_boundary = len(uprn_offsets) - len(oa_codes_with_data)
if args.limit > 0:
oa_codes_with_data = oa_codes_with_data[: args.limit]
print(f" OAs with UPRNs + boundaries: {len(oa_codes_with_data)}")
print(f" Skipped (no UPRNs): {skipped_no_uprn}")
print(f" Skipped (no boundary): {skipped_no_boundary}")
all_fragments: list[tuple[str, Polygon | MultiPolygon]] = []
single_count = 0
multi_count = 0
for oa_code in tqdm(
oa_codes_with_data,
desc="Processing OAs",
unit="OA",
smoothing=0.01,
miniters=100,
):
oa_geom = oa_geoms[oa_code]
points, postcodes = get_oa_uprns(uprn_df, uprn_offsets, oa_code)
if len(set(postcodes)) == 1:
# Fast path: entire OA = one postcode
all_fragments.append((postcodes[0], oa_geom))
single_count += 1
continue
# Get INSPIRE candidates via bbox pre-filter
candidates = get_inspire_candidates(
oa_geom.bounds, inspire_bboxes, inspire_offsets, inspire_coords
)
fragments = process_oa(oa_geom, points, postcodes, candidates)
all_fragments.extend(fragments)
multi_count += 1
print(f"\n Single-postcode OAs (fast path): {single_count}")
print(f" Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
print(f" Total fragments: {len(all_fragments)}")
# Free data no longer needed
del oa_geoms, uprn_df, uprn_offsets
del inspire_bboxes, inspire_offsets, inspire_coords
# Phase 4: Merge and write
print()
print("=" * 60)
print("Phase 4: Merging fragments and writing GeoJSON")
print("=" * 60)
merged = merge_fragments(all_fragments)
print(f" Merged into {len(merged)} unique postcodes")
file_count = write_district_geojson(merged, args.output)
print(f"\n Wrote {file_count} district GeoJSON files to {args.output / 'units'}")
print("Done!")
if __name__ == "__main__":
main()