"""Generate postcode boundary polygons from OA + INSPIRE + UPRN data.

Pipeline driver. Phases 1-3 (``build_fragments``) load the inputs and turn
every Output Area (OA) into ``(postcode, geometry)`` fragments; Phase 4
(in ``main``) merges the fragments per postcode and writes district GeoJSON.
"""

import argparse
import multiprocessing as mp
import os
from pathlib import Path

import numpy as np
import shapely
from shapely.geometry import MultiPolygon, Polygon
from tqdm import tqdm

from .fragments_cache import (
    fragments_cache_is_fresh,
    load_fragments,
    save_fragments,
)
from .inspire import (
    build_inspire_index,
    cache_inspire,
    inspire_cache_exists,
    load_inspire,
)
from .memory import release_memory
from .oa_boundaries import load_oa_boundaries
from .output import merge_fragments, write_district_geojson
from .process_oa import process_oa
from .uprn import extract_uprn_arrays, get_oa_uprns_arrays, load_uprns

# One unit of Phase-3 output: a postcode and the geometry assigned to it.
Fragment = tuple[str, Polygon | MultiPolygon]


def _print_banner(title: str, *, leading_blank: bool = False) -> None:
    """Print ``title`` framed by two 60-character ``=`` rules.

    Args:
        title: Text printed between the rules.
        leading_blank: When true, emit an empty line before the first rule.
    """
    if leading_blank:
        print()
    rule = "=" * 60
    print(rule)
    print(title)
    print(rule)


def _oa_fragments(
    oa_code, oa_geoms, east, north, postcodes_arr, offsets, index
) -> tuple[list[Fragment], bool]:
    """Process one OA into ``(postcode, geometry)`` fragments.

    Shared by the sequential and parallel drivers so both produce identical
    output.

    Args:
        oa_code: Code of the OA to process; used as the key into ``oa_geoms``.
        oa_geoms: Mapping from OA code to that OA's geometry.
        east, north, postcodes_arr, offsets: UPRN arrays, forwarded unchanged
            to ``get_oa_uprns_arrays`` to select this OA's points/postcodes.
        index: Object exposing ``candidates(bounds)``; queried with the OA
            geometry's bounding box on the multi-postcode path.

    Returns:
        ``(fragments, is_single)``; ``is_single`` flags the single-postcode
        fast path, where the whole OA geometry is assigned to its one
        postcode without consulting ``index``.

    Raises:
        RuntimeError: On any failure, re-raised tagged with the OA code so a
            single bad OA is attributable instead of an anonymous worker
            abort hours in. The original exception is chained as the cause.
    """
    try:
        oa_geom = oa_geoms[oa_code]
        points, postcodes = get_oa_uprns_arrays(
            east, north, postcodes_arr, offsets, oa_code
        )
        if len(set(postcodes)) == 1:
            return [(postcodes[0], oa_geom)], True
        candidates = index.candidates(oa_geom.bounds)
        return process_oa(oa_geom, points, postcodes, candidates), False
    except Exception as exc:
        raise RuntimeError(f"Failed processing OA {oa_code}: {exc!r}") from exc


# Worker-shared state. Populated in the parent before the pool forks; children
# inherit it copy-on-write (the numpy/Arrow buffers + coords mmap stay shared,
# never duplicated per worker). Read-only in workers.
_WORKER_STATE: dict = {}


def _process_oa_chunk(oa_codes: list[str]) -> tuple[list[str], np.ndarray, int, int]:
    """Worker: turn a chunk of OA codes into WKB-encoded fragments.

    Geometries are returned as WKB (compact and lossless) rather than pickled
    Shapely objects, to keep the IPC payload small.

    Args:
        oa_codes: OA codes to process; inputs are read from ``_WORKER_STATE``.

    Returns:
        ``(postcodes, wkb, single_count, n_oas)`` where ``postcodes`` and
        ``wkb`` are parallel sequences (one entry per fragment),
        ``single_count`` is how many OAs took the single-postcode fast path
        and ``n_oas`` is ``len(oa_codes)``.
    """
    state = _WORKER_STATE
    frags: list[Fragment] = []
    single = 0
    for oa_code in oa_codes:
        oa_frags, is_single = _oa_fragments(
            oa_code,
            state["oa_geoms"],
            state["east"],
            state["north"],
            state["postcodes"],
            state["offsets"],
            state["index"],
        )
        frags.extend(oa_frags)
        single += is_single  # bool counts as 0/1

    if frags:
        pcs = [pc for pc, _ in frags]
        wkb = shapely.to_wkb(np.array([g for _, g in frags], dtype=object))
    else:
        pcs, wkb = [], np.empty(0, dtype=object)
    return pcs, wkb, single, len(oa_codes)


def _resolve_workers(requested: int) -> int:
    """Worker count: the explicit value if >0, otherwise all available CPUs.

    Prefers the process's CPU affinity mask and falls back to
    ``os.cpu_count()`` on platforms without ``os.sched_getaffinity``. The
    result is always at least 1.
    """
    if requested and requested > 0:
        return requested
    try:
        return max(1, len(os.sched_getaffinity(0)))
    except AttributeError:
        return max(1, os.cpu_count() or 1)


def _process_oas(
    oa_codes, oa_geoms, east, north, postcodes_arr, offsets, index, workers
) -> tuple[list[Fragment], int]:
    """Drive Phase 3 over every OA, fanning out across `workers` processes.

    OAs are independent, so the loop parallelises cleanly. ``fork`` lets
    workers share the big read-only inputs (INSPIRE arrays + coords mmap, UPRN
    arrays, OA geometries) copy-on-write instead of duplicating ~2GB each.
    Fragment order does not affect the result (``merge_fragments`` unions per
    postcode), so chunks are collected as they finish.

    Runs sequentially in-process when ``workers <= 1`` or the ``fork`` start
    method is unavailable on this platform.

    Returns ``(fragments, single_count)``.
    """
    all_fragments: list[Fragment] = []
    single_count = 0

    if workers <= 1 or "fork" not in mp.get_all_start_methods():
        for oa_code in tqdm(
            oa_codes, desc="Processing OAs", unit="OA", smoothing=0.01, miniters=100
        ):
            oa_frags, is_single = _oa_fragments(
                oa_code, oa_geoms, east, north, postcodes_arr, offsets, index
            )
            all_fragments.extend(oa_frags)
            single_count += is_single
        return all_fragments, single_count

    ctx = mp.get_context("fork")
    try:
        # Populated inside the try so the finally below always clears it, even
        # if chunking or pool start-up fails. Must happen before the pool
        # forks so the children inherit the populated state.
        _WORKER_STATE.update(
            oa_geoms=oa_geoms,
            east=east,
            north=north,
            postcodes=postcodes_arr,
            offsets=offsets,
            index=index,
        )

        # Many small contiguous chunks → dynamic load balancing across workers
        # (rural OAs cost far more than urban ones) while preserving mmap read
        # locality.
        chunk_size = max(1, len(oa_codes) // (workers * 16))
        chunks = [
            oa_codes[i : i + chunk_size] for i in range(0, len(oa_codes), chunk_size)
        ]
        print(f" Parallel: {workers} workers, {len(chunks)} chunks of ~{chunk_size} OAs")

        with ctx.Pool(processes=workers) as pool:
            with tqdm(
                total=len(oa_codes), desc="Processing OAs", unit="OA", smoothing=0.01
            ) as bar:
                for pcs, wkb, single, n_oas in pool.imap_unordered(
                    _process_oa_chunk, chunks
                ):
                    if len(wkb):
                        all_fragments.extend(zip(pcs, shapely.from_wkb(wkb)))
                    single_count += single
                    bar.update(n_oas)
    finally:
        # Drop references so Phase 4 doesn't keep the big inputs alive.
        _WORKER_STATE.clear()

    return all_fragments, single_count


def build_fragments(args: argparse.Namespace) -> list[Fragment]:
    """Run Phases 1-3: load data, parse INSPIRE, process every OA into fragments.

    Returns the full ``(postcode, geometry)`` fragment list. The large
    intermediate structures (OA/UPRN/INSPIRE arrays) are locals here, so they
    are freed as soon as this function returns -- before the fragments are
    cached or merged.

    Args:
        args: Parsed CLI namespace. Reads ``oa_boundaries``, ``uprn``,
            ``arcgis``, ``inspire``, ``output``, ``limit`` and ``workers``.
    """
    # Phase 1: Load all data
    _print_banner("Phase 1: Loading data")

    oa_geoms = load_oa_boundaries(args.oa_boundaries)
    uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
    # Convert UPRNs to fork-shareable numpy/Arrow arrays so parallel workers never
    # call polars (avoids the fork-after-threads hazard of its rayon pool).
    uprn_east, uprn_north, uprn_postcodes = extract_uprn_arrays(uprn_df)

    # Phase 2: Parse/load INSPIRE
    _print_banner("Phase 2: INSPIRE data", leading_blank=True)

    inspire_cache_dir = args.output / "inspire_cache"
    if not inspire_cache_exists(inspire_cache_dir):
        cache_inspire(args.inspire, inspire_cache_dir)

    inspire_bboxes, inspire_offsets, inspire_coords = load_inspire(inspire_cache_dir)
    inspire_index = build_inspire_index(inspire_bboxes, inspire_offsets, inspire_coords)

    # Phase 3: Process OAs
    _print_banner("Phase 3: Processing OAs", leading_blank=True)

    # Build work list: only OAs that have both a boundary and UPRNs. The skip
    # counts are taken before --limit truncates the list.
    oa_codes_with_data = sorted(set(oa_geoms.keys()) & set(uprn_offsets.keys()))
    skipped_no_uprn = len(oa_geoms) - len(oa_codes_with_data)
    skipped_no_boundary = len(uprn_offsets) - len(oa_codes_with_data)

    if args.limit > 0:
        oa_codes_with_data = oa_codes_with_data[: args.limit]

    print(f" OAs with UPRNs + boundaries: {len(oa_codes_with_data)}")
    print(f" Skipped (no UPRNs): {skipped_no_uprn}")
    print(f" Skipped (no boundary): {skipped_no_boundary}")

    # --limit is a debug mode → force deterministic single-process.
    workers = 1 if args.limit > 0 else _resolve_workers(args.workers)
    all_fragments, single_count = _process_oas(
        oa_codes_with_data,
        oa_geoms,
        uprn_east,
        uprn_north,
        uprn_postcodes,
        uprn_offsets,
        inspire_index,
        workers,
    )

    multi_count = len(oa_codes_with_data) - single_count
    print(f"\n Single-postcode OAs (fast path): {single_count}")
    print(f" Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
    print(f" Total fragments: {len(all_fragments)}")

    return all_fragments


def main() -> None:
    """CLI entry point: build (or load cached) fragments, merge, write GeoJSON.

    Phase-3 output is cached to ``<output>/fragments_cache.parquet`` before
    the merge/write step and removed again once the output has been written.
    The cache is bypassed entirely when ``--limit`` restricts the run.
    """
    parser = argparse.ArgumentParser(
        description="Generate postcode boundary polygons from OA + INSPIRE + UPRN data"
    )
    parser.add_argument("--uprn", type=Path, required=True, help="UPRN lookup parquet")
    parser.add_argument(
        "--arcgis",
        type=Path,
        default=None,
        help="Optional ArcGIS postcode parquet used to remap terminated postcodes",
    )
    parser.add_argument(
        "--oa-boundaries", type=Path, required=True, help="OA boundaries GeoPackage"
    )
    parser.add_argument(
        "--inspire", type=Path, required=True, help="INSPIRE ZIP directory"
    )
    parser.add_argument("--output", type=Path, required=True, help="Output directory")
    parser.add_argument(
        "--limit", type=int, default=0, help="Process only first N OAs (0=all)"
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=0,
        help="Parallel worker processes for OA processing (0=all CPUs, 1=sequential)",
    )
    parser.add_argument(
        "--greenspace",
        type=Path,
        default=None,
        help="Greenspace/water parquet for boundary trimming (optional)",
    )
    args = parser.parse_args()

    fragments_cache = args.output / "fragments_cache.parquet"
    # Phase 3 depends only on these inputs; greenspace is applied later (Phase 4),
    # so a greenspace change must not invalidate the fragment cache.
    fragment_inputs = [args.uprn, args.arcgis, args.oa_boundaries, args.inspire]
    # --limit yields a partial fragment set; never read or write the shared cache.
    # build_fragments only truncates when --limit > 0, so a non-positive value
    # is a full run and must be cached like one.
    use_cache = args.limit <= 0

    if use_cache and fragments_cache_is_fresh(fragments_cache, fragment_inputs):
        _print_banner("Phase 3 cache hit — loading fragments (skipping Phases 1-3)")
        all_fragments = load_fragments(fragments_cache)
        print(
            f" Loaded {len(all_fragments):,} cached fragments from {fragments_cache}"
        )
    else:
        all_fragments = build_fragments(args)
        if use_cache:
            # Persist the expensive Phase-3 output before the cheap-but-fragile
            # merge/write so any failure there resumes in seconds, not ~10 hours.
            save_fragments(fragments_cache, all_fragments)
            print(f" Cached {len(all_fragments):,} fragments to {fragments_cache}")

    # Free Phase-1-3 intermediates (build_fragments' locals) back to the OS.
    release_memory()

    # Phase 4: Merge and write
    _print_banner("Phase 4: Merging fragments and writing GeoJSON", leading_blank=True)

    greenspace_tree = None
    greenspace_geoms = None
    if args.greenspace and args.greenspace.exists():
        # Imported lazily: only needed when greenspace trimming is requested.
        from .greenspace import load_greenspace

        print(f" Loading greenspace/water from {args.greenspace}...")
        greenspace_tree, greenspace_geoms = load_greenspace(args.greenspace)
        print(f" Loaded {len(greenspace_geoms)} greenspace/water polygons")
    elif args.greenspace:
        # A path was given but is missing: keep going (trimming is optional),
        # but say so rather than silently producing untrimmed boundaries.
        print(
            f" WARNING: greenspace file {args.greenspace} not found; "
            "skipping boundary trimming"
        )

    merged = merge_fragments(
        all_fragments,
        greenspace_tree=greenspace_tree,
        greenspace_geoms=greenspace_geoms,
    )
    print(f" Merged into {len(merged)} unique postcodes")

    file_count = write_district_geojson(merged, args.output)
    print(f"\n Wrote {file_count} district GeoJSON files to {args.output / 'units'}")

    # The cache exists only to survive a crash between Phase 3 and a clean write.
    # Now that the output is complete, drop it so a later input change can never
    # be served from a stale cache.
    if use_cache:
        fragments_cache.unlink(missing_ok=True)
    print("Done!")


if __name__ == "__main__":
    main()