idk

2026-06-02 13:46:18 +01:00 · 2026-06-02 13:46:18 +01:00 · d43da9708c
commit d43da9708c
parent a04ac2d857
47 changed files with 4120 additions and 573 deletions
--- a/pipeline/transform/postcode_boundaries/main.py
+++ b/pipeline/transform/postcode_boundaries/main.py
@ -1,12 +1,21 @@
 import argparse
+import multiprocessing as mp
+import os
 from pathlib import Path

+import numpy as np
+import shapely
 from shapely.geometry import MultiPolygon, Polygon
 from tqdm import tqdm

+from .fragments_cache import (
+    fragments_cache_is_fresh,
+    load_fragments,
+    save_fragments,
+)
 from .inspire import (
+    build_inspire_index,
    cache_inspire,
-    get_inspire_candidates,
    inspire_cache_exists,
    load_inspire,
 )
@ -14,7 +23,206 @@ from .memory import release_memory
 from .oa_boundaries import load_oa_boundaries
 from .output import merge_fragments, write_district_geojson
 from .process_oa import process_oa
-from .uprn import get_oa_uprns, load_uprns
+from .uprn import extract_uprn_arrays, get_oa_uprns_arrays, load_uprns
+
+Fragment = tuple[str, Polygon | MultiPolygon]
+
+
+def _oa_fragments(
+    oa_code, oa_geoms, east, north, postcodes_arr, offsets, index
+) -> tuple[list[Fragment], bool]:
+    """Process one OA into ``(postcode, geometry)`` fragments.
+
+    Returns ``(fragments, is_single)``; ``is_single`` flags the single-postcode
+    fast path. Shared by the sequential and parallel drivers so both produce
+    identical output. Any failure is re-raised tagged with the OA code so a single
+    bad OA is attributable instead of an anonymous worker abort hours in.
+    """
+    try:
+        oa_geom = oa_geoms[oa_code]
+        points, postcodes = get_oa_uprns_arrays(
+            east, north, postcodes_arr, offsets, oa_code
+        )
+        if len(set(postcodes)) == 1:
+            return [(postcodes[0], oa_geom)], True
+        candidates = index.candidates(oa_geom.bounds)
+        return process_oa(oa_geom, points, postcodes, candidates), False
+    except Exception as exc:
+        raise RuntimeError(f"Failed processing OA {oa_code}: {exc!r}") from exc
+
+
+# Worker-shared state. Populated in the parent before the pool forks; children
+# inherit it copy-on-write (the numpy/Arrow buffers + coords mmap stay shared,
+# never duplicated per worker). Read-only in workers.
+_WORKER_STATE: dict = {}
+
+
+def _process_oa_chunk(oa_codes: list[str]):
+    """Worker: turn a chunk of OA codes into WKB-encoded fragments.
+
+    Geometries are returned as WKB (compact and lossless) rather than pickled
+    Shapely objects, to keep the IPC payload small.
+    """
+    state = _WORKER_STATE
+    frags: list[Fragment] = []
+    single = 0
+    for oa_code in oa_codes:
+        oa_frags, is_single = _oa_fragments(
+            oa_code,
+            state["oa_geoms"],
+            state["east"],
+            state["north"],
+            state["postcodes"],
+            state["offsets"],
+            state["index"],
+        )
+        frags.extend(oa_frags)
+        single += is_single
+
+    if frags:
+        pcs = [pc for pc, _ in frags]
+        wkb = shapely.to_wkb(np.array([g for _, g in frags], dtype=object))
+    else:
+        pcs, wkb = [], np.empty(0, dtype=object)
+    return pcs, wkb, single, len(oa_codes)
+
+
+def _resolve_workers(requested: int) -> int:
+    """Worker count: the explicit value if >0, otherwise all available CPUs."""
+    if requested and requested > 0:
+        return requested
+    try:
+        return max(1, len(os.sched_getaffinity(0)))
+    except AttributeError:
+        return max(1, os.cpu_count() or 1)
+
+
+def _process_oas(
+    oa_codes, oa_geoms, east, north, postcodes_arr, offsets, index, workers
+) -> tuple[list[Fragment], int]:
+    """Drive Phase 3 over every OA, fanning out across `workers` processes.
+
+    OAs are independent, so the loop parallelises cleanly. ``fork`` lets workers
+    share the big read-only inputs (INSPIRE arrays + coords mmap, UPRN arrays, OA
+    geometries) copy-on-write instead of duplicating ~2GB each. Fragment order
+    does not affect the result (``merge_fragments`` unions per postcode), so
+    chunks are collected as they finish. Returns ``(fragments, single_count)``.
+    """
+    all_fragments: list[Fragment] = []
+    single_count = 0
+
+    if workers <= 1 or "fork" not in mp.get_all_start_methods():
+        for oa_code in tqdm(
+            oa_codes, desc="Processing OAs", unit="OA", smoothing=0.01, miniters=100
+        ):
+            oa_frags, is_single = _oa_fragments(
+                oa_code, oa_geoms, east, north, postcodes_arr, offsets, index
+            )
+            all_fragments.extend(oa_frags)
+            single_count += is_single
+        return all_fragments, single_count
+
+    _WORKER_STATE.update(
+        oa_geoms=oa_geoms,
+        east=east,
+        north=north,
+        postcodes=postcodes_arr,
+        offsets=offsets,
+        index=index,
+    )
+    # Many small contiguous chunks → dynamic load balancing across workers (rural
+    # OAs cost far more than urban ones) while preserving mmap read locality.
+    chunk_size = max(1, len(oa_codes) // (workers * 16))
+    chunks = [oa_codes[i : i + chunk_size] for i in range(0, len(oa_codes), chunk_size)]
+    print(f"  Parallel: {workers} workers, {len(chunks)} chunks of ~{chunk_size} OAs")
+
+    ctx = mp.get_context("fork")
+    try:
+        with ctx.Pool(processes=workers) as pool:
+            with tqdm(
+                total=len(oa_codes), desc="Processing OAs", unit="OA", smoothing=0.01
+            ) as bar:
+                for pcs, wkb, single, n_oas in pool.imap_unordered(
+                    _process_oa_chunk, chunks
+                ):
+                    if len(wkb):
+                        all_fragments.extend(zip(pcs, shapely.from_wkb(wkb)))
+                    single_count += single
+                    bar.update(n_oas)
+    finally:
+        # Drop references so Phase 4 doesn't keep the big inputs alive.
+        _WORKER_STATE.clear()
+    return all_fragments, single_count
+
+
+def build_fragments(args: argparse.Namespace) -> list[Fragment]:
+    """Run Phases 1-3: load data, parse INSPIRE, process every OA into fragments.
+
+    Returns the full ``(postcode, geometry)`` fragment list. The large
+    intermediate structures (OA/UPRN/INSPIRE arrays) are locals here, so they are
+    freed as soon as this function returns -- before the fragments are cached or
+    merged.
+    """
+    # Phase 1: Load all data
+    print("=" * 60)
+    print("Phase 1: Loading data")
+    print("=" * 60)
+
+    oa_geoms = load_oa_boundaries(args.oa_boundaries)
+    uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
+    # Convert UPRNs to fork-shareable numpy/Arrow arrays so parallel workers never
+    # call polars (avoids the fork-after-threads hazard of its rayon pool).
+    uprn_east, uprn_north, uprn_postcodes = extract_uprn_arrays(uprn_df)
+
+    # Phase 2: Parse/load INSPIRE
+    print()
+    print("=" * 60)
+    print("Phase 2: INSPIRE data")
+    print("=" * 60)
+
+    inspire_cache_dir = args.output / "inspire_cache"
+    if not inspire_cache_exists(inspire_cache_dir):
+        cache_inspire(args.inspire, inspire_cache_dir)
+    inspire_bboxes, inspire_offsets, inspire_coords = load_inspire(inspire_cache_dir)
+    inspire_index = build_inspire_index(inspire_bboxes, inspire_offsets, inspire_coords)
+
+    # Phase 3: Process OAs
+    print()
+    print("=" * 60)
+    print("Phase 3: Processing OAs")
+    print("=" * 60)
+
+    # Build work list — precompute which OAs are single vs multi-postcode
+    oa_codes_with_data = sorted(set(oa_geoms.keys()) & set(uprn_offsets.keys()))
+    skipped_no_uprn = len(oa_geoms) - len(oa_codes_with_data)
+    skipped_no_boundary = len(uprn_offsets) - len(oa_codes_with_data)
+
+    if args.limit > 0:
+        oa_codes_with_data = oa_codes_with_data[: args.limit]
+
+    print(f"  OAs with UPRNs + boundaries: {len(oa_codes_with_data)}")
+    print(f"  Skipped (no UPRNs): {skipped_no_uprn}")
+    print(f"  Skipped (no boundary): {skipped_no_boundary}")
+
+    # --limit is a debug mode → force deterministic single-process.
+    workers = 1 if args.limit > 0 else _resolve_workers(args.workers)
+    all_fragments, single_count = _process_oas(
+        oa_codes_with_data,
+        oa_geoms,
+        uprn_east,
+        uprn_north,
+        uprn_postcodes,
+        uprn_offsets,
+        inspire_index,
+        workers,
+    )
+    multi_count = len(oa_codes_with_data) - single_count
+
+    print(f"\n  Single-postcode OAs (fast path): {single_count}")
+    print(f"  Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
+    print(f"  Total fragments: {len(all_fragments)}")
+
+    return all_fragments


 def main() -> None:
@ -38,6 +246,12 @@ def main() -> None:
    parser.add_argument(
        "--limit", type=int, default=0, help="Process only first N OAs (0=all)"
    )
+    parser.add_argument(
+        "--workers",
+        type=int,
+        default=0,
+        help="Parallel worker processes for OA processing (0=all CPUs, 1=sequential)",
+    )
    parser.add_argument(
        "--greenspace",
        type=Path,
@ -46,79 +260,30 @@ def main() -> None:
    )
    args = parser.parse_args()

-    # Phase 1: Load all data
-    print("=" * 60)
-    print("Phase 1: Loading data")
-    print("=" * 60)
+    fragments_cache = args.output / "fragments_cache.parquet"
+    # Phase 3 depends only on these inputs; greenspace is applied later (Phase 4),
+    # so a greenspace change must not invalidate the fragment cache.
+    fragment_inputs = [args.uprn, args.arcgis, args.oa_boundaries, args.inspire]
+    # --limit yields a partial fragment set; never read or write the shared cache.
+    use_cache = args.limit == 0

-    oa_geoms = load_oa_boundaries(args.oa_boundaries)
-    uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
-
-    # Phase 2: Parse/load INSPIRE
-    print()
-    print("=" * 60)
-    print("Phase 2: INSPIRE data")
-    print("=" * 60)
-
-    inspire_cache_dir = args.output / "inspire_cache"
-    if not inspire_cache_exists(inspire_cache_dir):
-        cache_inspire(args.inspire, inspire_cache_dir)
-    inspire_bboxes, inspire_offsets, inspire_coords = load_inspire(inspire_cache_dir)
-
-    # Phase 3: Process OAs
-    print()
-    print("=" * 60)
-    print("Phase 3: Processing OAs")
-    print("=" * 60)
-
-    # Build work list — precompute which OAs are single vs multi-postcode
-    oa_codes_with_data = sorted(set(oa_geoms.keys()) & set(uprn_offsets.keys()))
-    skipped_no_uprn = len(oa_geoms) - len(oa_codes_with_data)
-    skipped_no_boundary = len(uprn_offsets) - len(oa_codes_with_data)
-
-    if args.limit > 0:
-        oa_codes_with_data = oa_codes_with_data[: args.limit]
-
-    print(f"  OAs with UPRNs + boundaries: {len(oa_codes_with_data)}")
-    print(f"  Skipped (no UPRNs): {skipped_no_uprn}")
-    print(f"  Skipped (no boundary): {skipped_no_boundary}")
-
-    all_fragments: list[tuple[str, Polygon | MultiPolygon]] = []
-    single_count = 0
-    multi_count = 0
-
-    for oa_code in tqdm(
-        oa_codes_with_data,
-        desc="Processing OAs",
-        unit="OA",
-        smoothing=0.01,
-        miniters=100,
-    ):
-        oa_geom = oa_geoms[oa_code]
-        points, postcodes = get_oa_uprns(uprn_df, uprn_offsets, oa_code)
-
-        if len(set(postcodes)) == 1:
-            # Fast path: entire OA = one postcode
-            all_fragments.append((postcodes[0], oa_geom))
-            single_count += 1
-            continue
-
-        # Get INSPIRE candidates via bbox pre-filter
-        candidates = get_inspire_candidates(
-            oa_geom.bounds, inspire_bboxes, inspire_offsets, inspire_coords
+    if use_cache and fragments_cache_is_fresh(fragments_cache, fragment_inputs):
+        print("=" * 60)
+        print("Phase 3 cache hit — loading fragments (skipping Phases 1-3)")
+        print("=" * 60)
+        all_fragments = load_fragments(fragments_cache)
+        print(
+            f"  Loaded {len(all_fragments):,} cached fragments from {fragments_cache}"
        )
+    else:
+        all_fragments = build_fragments(args)
+        if use_cache:
+            # Persist the expensive Phase-3 output before the cheap-but-fragile
+            # merge/write so any failure there resumes in seconds, not ~10 hours.
+            save_fragments(fragments_cache, all_fragments)
+            print(f"  Cached {len(all_fragments):,} fragments to {fragments_cache}")

-        fragments = process_oa(oa_geom, points, postcodes, candidates)
-        all_fragments.extend(fragments)
-        multi_count += 1
-
-    print(f"\n  Single-postcode OAs (fast path): {single_count}")
-    print(f"  Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
-    print(f"  Total fragments: {len(all_fragments)}")
-
-    # Free data no longer needed
-    del oa_geoms, uprn_df, uprn_offsets
-    del inspire_bboxes, inspire_offsets, inspire_coords
+    # Free Phase-1-3 intermediates (build_fragments' locals) back to the OS.
    release_memory()

    # Phase 4: Merge and write
@ -145,6 +310,12 @@ def main() -> None:

    file_count = write_district_geojson(merged, args.output)
    print(f"\n  Wrote {file_count} district GeoJSON files to {args.output / 'units'}")
+
+    # The cache exists only to survive a crash between Phase 3 and a clean write.
+    # Now that the output is complete, drop it so a later input change can never
+    # be served from a stale cache.
+    if use_cache:
+        fragments_cache.unlink(missing_ok=True)
    print("Done!")