perfect-postcode/pipeline/transform/postcode_boundaries/__main__.py
2026-06-02 13:46:18 +01:00

323 lines
12 KiB
Python

import argparse
import multiprocessing as mp
import os
from pathlib import Path
import numpy as np
import shapely
from shapely.geometry import MultiPolygon, Polygon
from tqdm import tqdm
from .fragments_cache import (
fragments_cache_is_fresh,
load_fragments,
save_fragments,
)
from .inspire import (
build_inspire_index,
cache_inspire,
inspire_cache_exists,
load_inspire,
)
from .memory import release_memory
from .oa_boundaries import load_oa_boundaries
from .output import merge_fragments, write_district_geojson
from .process_oa import process_oa
from .uprn import extract_uprn_arrays, get_oa_uprns_arrays, load_uprns
# A fragment pairs a postcode string with one piece of geometry attributed to
# it. A postcode can own several fragments; they are only unioned per postcode
# later, by ``merge_fragments``.
Fragment = tuple[str, Polygon | MultiPolygon]
def _oa_fragments(
oa_code, oa_geoms, east, north, postcodes_arr, offsets, index
) -> tuple[list[Fragment], bool]:
"""Process one OA into ``(postcode, geometry)`` fragments.
Returns ``(fragments, is_single)``; ``is_single`` flags the single-postcode
fast path. Shared by the sequential and parallel drivers so both produce
identical output. Any failure is re-raised tagged with the OA code so a single
bad OA is attributable instead of an anonymous worker abort hours in.
"""
try:
oa_geom = oa_geoms[oa_code]
points, postcodes = get_oa_uprns_arrays(
east, north, postcodes_arr, offsets, oa_code
)
if len(set(postcodes)) == 1:
return [(postcodes[0], oa_geom)], True
candidates = index.candidates(oa_geom.bounds)
return process_oa(oa_geom, points, postcodes, candidates), False
except Exception as exc:
raise RuntimeError(f"Failed processing OA {oa_code}: {exc!r}") from exc
# Worker-shared state. Populated in the parent before the pool forks; children
# inherit it copy-on-write (the numpy/Arrow buffers + coords mmap stay shared,
# never duplicated per worker). Read-only in workers.
# Keys written by ``_process_oas``: "oa_geoms", "east", "north", "postcodes",
# "offsets" and "index". ``_process_oas`` clears the dict again once the pool
# has finished (or failed), so it is empty outside a parallel run.
_WORKER_STATE: dict = {}
def _process_oa_chunk(oa_codes: list[str]):
    """Worker entry point: convert a chunk of OA codes into encoded fragments.

    Inputs other than the OA codes are read from the module-level
    ``_WORKER_STATE`` dict rather than passed as arguments.

    Returns a 4-tuple ``(postcodes, wkb, single_count, n_oas)``:

    * ``postcodes`` -- one postcode per fragment, in production order;
    * ``wkb`` -- object array of WKB blobs parallel to ``postcodes`` (an empty
      object array when the chunk produced no fragments);
    * ``single_count`` -- how many OAs in the chunk took the single-postcode path;
    * ``n_oas`` -- ``len(oa_codes)``, so the caller can advance its progress bar.

    Geometries travel back as WKB instead of Shapely objects to keep the
    payload returned to the parent process small.
    """
    shared = _WORKER_STATE
    fragment_postcodes: list[str] = []
    fragment_geoms: list = []
    n_single = 0

    for code in oa_codes:
        # Look the shared inputs up per OA (not once up front) so an empty chunk
        # never touches the state dict.
        produced, fast_path = _oa_fragments(
            code,
            *(
                shared[key]
                for key in ("oa_geoms", "east", "north", "postcodes", "offsets", "index")
            ),
        )
        for postcode, geom in produced:
            fragment_postcodes.append(postcode)
            fragment_geoms.append(geom)
        n_single += fast_path

    if not fragment_geoms:
        return [], np.empty(0, dtype=object), n_single, len(oa_codes)

    encoded = shapely.to_wkb(np.array(fragment_geoms, dtype=object))
    return fragment_postcodes, encoded, n_single, len(oa_codes)
def _resolve_workers(requested: int) -> int:
"""Worker count: the explicit value if >0, otherwise all available CPUs."""
if requested and requested > 0:
return requested
try:
return max(1, len(os.sched_getaffinity(0)))
except AttributeError:
return max(1, os.cpu_count() or 1)
def _process_oas(
    oa_codes, oa_geoms, east, north, postcodes_arr, offsets, index, workers
) -> tuple[list[Fragment], int]:
    """Run Phase 3 for every OA, sequentially or across ``workers`` processes.

    The sequential path is taken when ``workers <= 1`` or when the platform
    offers no ``fork`` start method. Otherwise the inputs are stored in
    ``_WORKER_STATE`` *before* a fork-context pool is created, so the workers
    find them in their own copy of the module instead of receiving them as task
    arguments; only OA codes go out and ``(postcodes, WKB)`` results come back.

    Chunks are consumed in completion order. That is safe because fragment order
    does not affect the result (``merge_fragments`` unions per postcode).

    Returns ``(fragments, single_count)`` where ``single_count`` is the number of
    OAs that took the single-postcode fast path.
    """
    collected: list[Fragment] = []
    n_single = 0

    parallel = workers > 1 and "fork" in mp.get_all_start_methods()
    if not parallel:
        progress = tqdm(
            oa_codes, desc="Processing OAs", unit="OA", smoothing=0.01, miniters=100
        )
        for code in progress:
            produced, fast_path = _oa_fragments(
                code, oa_geoms, east, north, postcodes_arr, offsets, index
            )
            collected.extend(produced)
            n_single += fast_path
        return collected, n_single

    _WORKER_STATE.update(
        {
            "oa_geoms": oa_geoms,
            "east": east,
            "north": north,
            "postcodes": postcodes_arr,
            "offsets": offsets,
            "index": index,
        }
    )

    # Roughly 16 chunks per worker: small enough that a few expensive OAs do not
    # leave other workers idle, while each chunk stays a contiguous slice of the
    # (sorted) OA list.
    n_total = len(oa_codes)
    chunk_size = max(1, n_total // (workers * 16))
    chunks = [
        oa_codes[start : start + chunk_size]
        for start in range(0, n_total, chunk_size)
    ]
    print(f" Parallel: {workers} workers, {len(chunks)} chunks of ~{chunk_size} OAs")

    fork_ctx = mp.get_context("fork")
    try:
        with fork_ctx.Pool(processes=workers) as pool, tqdm(
            total=n_total, desc="Processing OAs", unit="OA", smoothing=0.01
        ) as bar:
            finished = pool.imap_unordered(_process_oa_chunk, chunks)
            for pcs, wkb, chunk_single, chunk_oas in finished:
                if len(wkb) > 0:
                    # Decode the WKB blobs back into geometries and re-pair them
                    # with their postcodes.
                    collected.extend(zip(pcs, shapely.from_wkb(wkb)))
                n_single += chunk_single
                bar.update(chunk_oas)
    finally:
        # Release the references whether or not the pool succeeded, so the big
        # inputs are not kept alive through Phase 4.
        _WORKER_STATE.clear()

    return collected, n_single
def build_fragments(args: argparse.Namespace) -> list[Fragment]:
    """Execute Phases 1-3 and return every ``(postcode, geometry)`` fragment.

    Phase 1 loads OA boundaries and UPRNs, Phase 2 loads the INSPIRE data
    (building its on-disk cache under ``args.output / "inspire_cache"`` first if
    it is missing), and Phase 3 processes each OA that has both a boundary and
    UPRNs.

    Reads ``args.oa_boundaries``, ``args.uprn``, ``args.arcgis``,
    ``args.inspire``, ``args.output``, ``args.limit`` and ``args.workers``. A
    positive ``args.limit`` keeps only the first N OAs (in sorted code order)
    and forces a single worker.

    All large intermediates are local to this function, so nothing here keeps
    them referenced once it returns.
    """
    rule = "=" * 60

    def banner(title: str, *, blank_before: bool = True) -> None:
        """Print a phase heading framed by two horizontal rules."""
        if blank_before:
            print()
        print(rule)
        print(title)
        print(rule)

    # ---- Phase 1: load inputs -------------------------------------------
    banner("Phase 1: Loading data", blank_before=False)
    oa_geoms = load_oa_boundaries(args.oa_boundaries)
    uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
    # Pull plain arrays out of the UPRN frame up front so that the parallel
    # workers only ever touch arrays, never the dataframe library.
    uprn_east, uprn_north, uprn_postcodes = extract_uprn_arrays(uprn_df)

    # ---- Phase 2: INSPIRE -----------------------------------------------
    banner("Phase 2: INSPIRE data")
    inspire_cache_dir = args.output / "inspire_cache"
    if not inspire_cache_exists(inspire_cache_dir):
        cache_inspire(args.inspire, inspire_cache_dir)
    inspire_bboxes, inspire_offsets, inspire_coords = load_inspire(inspire_cache_dir)
    inspire_index = build_inspire_index(inspire_bboxes, inspire_offsets, inspire_coords)

    # ---- Phase 3: per-OA processing --------------------------------------
    banner("Phase 3: Processing OAs")
    # Only OAs present in both datasets can be processed. The "skipped" figures
    # are taken before --limit truncates the list, so they describe the inputs.
    work_list = sorted(set(oa_geoms.keys()).intersection(uprn_offsets.keys()))
    n_without_uprns = len(oa_geoms) - len(work_list)
    n_without_boundary = len(uprn_offsets) - len(work_list)

    limited = args.limit > 0
    if limited:
        work_list = work_list[: args.limit]

    print(f" OAs with UPRNs + boundaries: {len(work_list)}")
    print(f" Skipped (no UPRNs): {n_without_uprns}")
    print(f" Skipped (no boundary): {n_without_boundary}")

    # A limited run is a debugging aid, so keep it single-process.
    workers = 1 if limited else _resolve_workers(args.workers)
    fragments, n_single = _process_oas(
        work_list,
        oa_geoms,
        uprn_east,
        uprn_north,
        uprn_postcodes,
        uprn_offsets,
        inspire_index,
        workers,
    )

    n_multi = len(work_list) - n_single
    print(f"\n Single-postcode OAs (fast path): {n_single}")
    print(f" Multi-postcode OAs (INSPIRE+Voronoi): {n_multi}")
    print(f" Total fragments: {len(fragments)}")
    return fragments
def main() -> None:
    """CLI entry point: obtain fragments, merge them, and write district GeoJSON.

    Flow:

    1. Parse the command line.
    2. Obtain the Phase-3 fragments, either from ``fragments_cache.parquet`` in
       the output directory (when it is fresh with respect to the fragment
       inputs) or by running ``build_fragments``. A freshly built full fragment
       set is saved to that cache before anything else happens.
    3. Phase 4: optionally load greenspace/water polygons, merge the fragments
       per postcode and write the per-district GeoJSON files.
    4. After a successful write, delete the fragment cache.

    The cache is bypassed entirely for limited (``--limit N`` with N > 0) runs,
    because those yield only a partial fragment set.
    """
    parser = argparse.ArgumentParser(
        description="Generate postcode boundary polygons from OA + INSPIRE + UPRN data"
    )
    parser.add_argument("--uprn", type=Path, required=True, help="UPRN lookup parquet")
    parser.add_argument(
        "--arcgis",
        type=Path,
        default=None,
        help="Optional ArcGIS postcode parquet used to remap terminated postcodes",
    )
    parser.add_argument(
        "--oa-boundaries", type=Path, required=True, help="OA boundaries GeoPackage"
    )
    parser.add_argument(
        "--inspire", type=Path, required=True, help="INSPIRE ZIP directory"
    )
    parser.add_argument("--output", type=Path, required=True, help="Output directory")
    parser.add_argument(
        "--limit", type=int, default=0, help="Process only first N OAs (0=all)"
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=0,
        help="Parallel worker processes for OA processing (0=all CPUs, 1=sequential)",
    )
    parser.add_argument(
        "--greenspace",
        type=Path,
        default=None,
        help="Greenspace/water parquet for boundary trimming (optional)",
    )
    args = parser.parse_args()

    fragments_cache = args.output / "fragments_cache.parquet"
    # Phase 3 depends only on these inputs; greenspace is applied later (Phase 4),
    # so a greenspace change must not invalidate the fragment cache.
    # NOTE(review): args.arcgis is None when --arcgis is omitted; presumably
    # fragments_cache_is_fresh tolerates a None entry -- confirm.
    fragment_inputs = [args.uprn, args.arcgis, args.oa_boundaries, args.inspire]
    # --limit yields a partial fragment set; never read or write the shared cache.
    # build_fragments only truncates when limit > 0, so a negative limit is still
    # a full run and must keep the crash-recovery cache, hence ``<= 0`` rather
    # than ``== 0``.
    use_cache = args.limit <= 0

    if use_cache and fragments_cache_is_fresh(fragments_cache, fragment_inputs):
        print("=" * 60)
        print("Phase 3 cache hit — loading fragments (skipping Phases 1-3)")
        print("=" * 60)
        all_fragments = load_fragments(fragments_cache)
        print(
            f" Loaded {len(all_fragments):,} cached fragments from {fragments_cache}"
        )
    else:
        all_fragments = build_fragments(args)
        if use_cache:
            # Persist the expensive Phase-3 output before the cheap-but-fragile
            # merge/write so any failure there resumes in seconds, not ~10 hours.
            save_fragments(fragments_cache, all_fragments)
            print(f" Cached {len(all_fragments):,} fragments to {fragments_cache}")
        # Free Phase-1-3 intermediates (build_fragments' locals) back to the OS.
        release_memory()

    # Phase 4: Merge and write
    print()
    print("=" * 60)
    print("Phase 4: Merging fragments and writing GeoJSON")
    print("=" * 60)
    greenspace_tree = None
    greenspace_geoms = None
    if args.greenspace and args.greenspace.exists():
        # Imported lazily: only needed when greenspace trimming is requested.
        from .greenspace import load_greenspace

        print(f" Loading greenspace/water from {args.greenspace}...")
        greenspace_tree, greenspace_geoms = load_greenspace(args.greenspace)
        print(f" Loaded {len(greenspace_geoms)} greenspace/water polygons")
    elif args.greenspace:
        # Stay best-effort (no crash), but do not silently ignore a path the
        # user explicitly asked for.
        print(
            f" Warning: greenspace file {args.greenspace} not found; "
            "boundaries will not be trimmed"
        )

    merged = merge_fragments(
        all_fragments,
        greenspace_tree=greenspace_tree,
        greenspace_geoms=greenspace_geoms,
    )
    print(f" Merged into {len(merged)} unique postcodes")
    file_count = write_district_geojson(merged, args.output)
    print(f"\n Wrote {file_count} district GeoJSON files to {args.output / 'units'}")

    # The cache exists only to survive a crash between Phase 3 and a clean write.
    # Now that the output is complete, drop it so a later input change can never
    # be served from a stale cache.
    if use_cache:
        fragments_cache.unlink(missing_ok=True)
    print("Done!")
# Run the pipeline only when this module is executed as a script; importing it
# must not start any work.
if __name__ == "__main__":
    main()