idk
This commit is contained in:
parent
a04ac2d857
commit
d43da9708c
47 changed files with 4120 additions and 573 deletions
|
|
@ -1,12 +1,21 @@
|
|||
import argparse
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import shapely
|
||||
from shapely.geometry import MultiPolygon, Polygon
|
||||
from tqdm import tqdm
|
||||
|
||||
from .fragments_cache import (
|
||||
fragments_cache_is_fresh,
|
||||
load_fragments,
|
||||
save_fragments,
|
||||
)
|
||||
from .inspire import (
|
||||
build_inspire_index,
|
||||
cache_inspire,
|
||||
get_inspire_candidates,
|
||||
inspire_cache_exists,
|
||||
load_inspire,
|
||||
)
|
||||
|
|
@ -14,7 +23,206 @@ from .memory import release_memory
|
|||
from .oa_boundaries import load_oa_boundaries
|
||||
from .output import merge_fragments, write_district_geojson
|
||||
from .process_oa import process_oa
|
||||
from .uprn import get_oa_uprns, load_uprns
|
||||
from .uprn import extract_uprn_arrays, get_oa_uprns_arrays, load_uprns
|
||||
|
||||
# One output fragment: a postcode string paired with the polygon (or
# multipolygon) geometry attributed to that postcode.
Fragment = tuple[str, Polygon | MultiPolygon]
|
||||
|
||||
|
||||
def _oa_fragments(
    oa_code, oa_geoms, east, north, postcodes_arr, offsets, index
) -> tuple[list[Fragment], bool]:
    """Turn a single OA into its list of ``(postcode, geometry)`` fragments.

    The sequential and the parallel driver both go through this function, so
    the two code paths cannot drift apart.

    Args:
        oa_code: Code of the OA to process; used as the key into ``oa_geoms``.
        oa_geoms: Mapping from OA code to that OA's boundary geometry.
        east, north, postcodes_arr, offsets: UPRN data, forwarded unchanged to
            ``get_oa_uprns_arrays``.
        index: Object exposing ``candidates(bounds)``; only consulted when the
            OA holds more than one distinct postcode.

    Returns:
        ``(fragments, is_single)``. ``is_single`` is True when the postcodes
        returned for the OA contain exactly one distinct value; the whole OA
        geometry is then emitted as one fragment and ``process_oa`` is skipped.

    Raises:
        RuntimeError: Wraps any exception raised while handling the OA. The
            message names the OA code and the original exception is chained
            as the cause, so a failure can be traced to one OA.
    """
    try:
        boundary = oa_geoms[oa_code]
        points, postcodes = get_oa_uprns_arrays(
            east, north, postcodes_arr, offsets, oa_code
        )
        distinct_postcodes = set(postcodes)
        if len(distinct_postcodes) != 1:
            # General case: split the OA using the nearby candidates.
            nearby = index.candidates(boundary.bounds)
            return process_oa(boundary, points, postcodes, nearby), False
        # Fast path: the whole OA belongs to a single postcode.
        return [(postcodes[0], boundary)], True
    except Exception as err:
        raise RuntimeError(f"Failed processing OA {oa_code}: {err!r}") from err
|
||||
|
||||
|
||||
# Worker-shared state. Populated in the parent before the pool forks; children
|
||||
# inherit it copy-on-write (the numpy/Arrow buffers + coords mmap stay shared,
|
||||
# never duplicated per worker). Read-only in workers.
|
||||
# Keys written by _process_oas before the pool is created: "oa_geoms", "east",
# "north", "postcodes", "offsets", "index". Emptied again once the pool exits.
_WORKER_STATE: dict = {}
|
||||
|
||||
|
||||
def _process_oa_chunk(oa_codes: list[str]):
    """Pool worker: process one chunk of OA codes.

    Inputs other than the OA codes are read from the module-level
    ``_WORKER_STATE`` dict. Geometries are serialised with ``shapely.to_wkb``
    before being returned, so the parent receives WKB rather than pickled
    Shapely objects.

    Returns:
        ``(postcodes, wkb, single_count, n_oas)`` where ``postcodes`` is a list
        parallel to the ``wkb`` object array, ``single_count`` is how many OAs
        in the chunk took the single-postcode fast path, and ``n_oas`` is
        ``len(oa_codes)``.
    """
    shared = _WORKER_STATE
    collected: list[Fragment] = []
    fast_path_hits = 0

    for code in oa_codes:
        produced, was_single = _oa_fragments(
            code,
            *(
                shared[key]
                for key in ("oa_geoms", "east", "north", "postcodes", "offsets", "index")
            ),
        )
        collected += produced
        fast_path_hits += was_single

    if not collected:
        return [], np.empty(0, dtype=object), fast_path_hits, len(oa_codes)

    # Split the (postcode, geometry) pairs into two parallel sequences.
    postcodes, geometries = zip(*collected)
    encoded = shapely.to_wkb(np.array(geometries, dtype=object))
    return list(postcodes), encoded, fast_path_hits, len(oa_codes)
|
||||
|
||||
|
||||
def _resolve_workers(requested: int) -> int:
|
||||
"""Worker count: the explicit value if >0, otherwise all available CPUs."""
|
||||
if requested and requested > 0:
|
||||
return requested
|
||||
try:
|
||||
return max(1, len(os.sched_getaffinity(0)))
|
||||
except AttributeError:
|
||||
return max(1, os.cpu_count() or 1)
|
||||
|
||||
|
||||
def _process_oas(
    oa_codes, oa_geoms, east, north, postcodes_arr, offsets, index, workers
) -> tuple[list[Fragment], int]:
    """Run Phase 3 for every OA, sequentially or across a process pool.

    The work runs in-process when ``workers`` is 1 or less, or when the
    ``fork`` start method is unavailable. Otherwise the inputs are published
    through ``_WORKER_STATE`` and a ``fork``-context pool is created, so the
    children see them without having them passed as task arguments. Results
    are consumed with ``imap_unordered``: per the original author's note,
    fragment order does not matter because ``merge_fragments`` unions per
    postcode.

    Returns:
        ``(fragments, single_count)``: every ``(postcode, geometry)`` fragment
        produced, and the number of OAs that took the single-postcode fast
        path.
    """
    collected: list[Fragment] = []
    fast_path_total = 0
    bar_options = {"desc": "Processing OAs", "unit": "OA", "smoothing": 0.01}

    run_parallel = workers > 1 and "fork" in mp.get_all_start_methods()
    if not run_parallel:
        for code in tqdm(oa_codes, miniters=100, **bar_options):
            produced, was_single = _oa_fragments(
                code, oa_geoms, east, north, postcodes_arr, offsets, index
            )
            collected += produced
            fast_path_total += was_single
        return collected, fast_path_total

    # Publish the inputs before the pool forks so the children inherit them.
    _WORKER_STATE.update(
        {
            "oa_geoms": oa_geoms,
            "east": east,
            "north": north,
            "postcodes": postcodes_arr,
            "offsets": offsets,
            "index": index,
        }
    )

    # Aim for about 16 contiguous chunks per worker so that expensive OAs are
    # spread across the pool rather than stalling one process.
    total = len(oa_codes)
    chunk_size = max(1, total // (workers * 16))
    chunks = [
        oa_codes[start : start + chunk_size] for start in range(0, total, chunk_size)
    ]
    print(f" Parallel: {workers} workers, {len(chunks)} chunks of ~{chunk_size} OAs")

    fork_ctx = mp.get_context("fork")
    try:
        with fork_ctx.Pool(processes=workers) as pool, tqdm(
            total=total, **bar_options
        ) as progress:
            results = pool.imap_unordered(_process_oa_chunk, chunks)
            for chunk_postcodes, chunk_wkb, chunk_single, chunk_len in results:
                if len(chunk_wkb):
                    geometries = shapely.from_wkb(chunk_wkb)
                    collected.extend(zip(chunk_postcodes, geometries))
                fast_path_total += chunk_single
                progress.update(chunk_len)
    finally:
        # Release the module-level references so later phases do not keep the
        # large inputs alive.
        _WORKER_STATE.clear()
    return collected, fast_path_total
|
||||
|
||||
|
||||
def build_fragments(args: argparse.Namespace) -> list[Fragment]:
    """Execute Phases 1-3 and return every ``(postcode, geometry)`` fragment.

    Phase 1 loads the OA boundaries and UPRNs, Phase 2 loads the INSPIRE data
    (building its on-disk cache first if it is missing), and Phase 3 processes
    each OA that has both a boundary and UPRNs.

    All large intermediates are locals of this function, so nothing here keeps
    them referenced once the fragment list has been returned.

    Args:
        args: Parsed command-line arguments. Reads ``oa_boundaries``, ``uprn``,
            ``arcgis``, ``inspire``, ``output``, ``limit`` and ``workers``.

    Returns:
        The complete list of fragments produced in Phase 3.
    """

    def banner(title: str, *, blank_before: bool = False) -> None:
        """Print ``title`` between two 60-character ``=`` rules."""
        if blank_before:
            print()
        rule = "=" * 60
        print(rule)
        print(title)
        print(rule)

    # ---- Phase 1: load inputs -------------------------------------------
    banner("Phase 1: Loading data")

    oa_geoms = load_oa_boundaries(args.oa_boundaries)
    uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
    # Pull the UPRN columns out as plain arrays up front so that forked workers
    # never need to call polars (avoids the fork-after-threads hazard of its
    # rayon pool).
    uprn_east, uprn_north, uprn_postcodes = extract_uprn_arrays(uprn_df)

    # ---- Phase 2: INSPIRE ------------------------------------------------
    banner("Phase 2: INSPIRE data", blank_before=True)

    inspire_cache_dir = args.output / "inspire_cache"
    if not inspire_cache_exists(inspire_cache_dir):
        cache_inspire(args.inspire, inspire_cache_dir)
    inspire_bboxes, inspire_offsets, inspire_coords = load_inspire(inspire_cache_dir)
    inspire_index = build_inspire_index(inspire_bboxes, inspire_offsets, inspire_coords)

    # ---- Phase 3: per-OA processing --------------------------------------
    banner("Phase 3: Processing OAs", blank_before=True)

    # Work list: OA codes that have both a boundary and UPRNs, in sorted order.
    shared_codes = set(oa_geoms.keys()).intersection(uprn_offsets.keys())
    work_list = sorted(shared_codes)
    matched = len(work_list)
    skipped_no_uprn = len(oa_geoms) - matched
    skipped_no_boundary = len(uprn_offsets) - matched

    limited = args.limit > 0
    if limited:
        work_list = work_list[: args.limit]

    print(f" OAs with UPRNs + boundaries: {len(work_list)}")
    print(f" Skipped (no UPRNs): {skipped_no_uprn}")
    print(f" Skipped (no boundary): {skipped_no_boundary}")

    # --limit is a debugging aid, so it always runs in a single process.
    if limited:
        workers = 1
    else:
        workers = _resolve_workers(args.workers)

    fragments, single_count = _process_oas(
        work_list,
        oa_geoms,
        uprn_east,
        uprn_north,
        uprn_postcodes,
        uprn_offsets,
        inspire_index,
        workers,
    )
    multi_count = len(work_list) - single_count

    print(f"\n Single-postcode OAs (fast path): {single_count}")
    print(f" Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
    print(f" Total fragments: {len(fragments)}")

    return fragments
|
||||
|
||||
|
||||
def main() -> None:
|
||||
|
|
@ -38,6 +246,12 @@ def main() -> None:
|
|||
parser.add_argument(
|
||||
"--limit", type=int, default=0, help="Process only first N OAs (0=all)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--workers",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Parallel worker processes for OA processing (0=all CPUs, 1=sequential)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--greenspace",
|
||||
type=Path,
|
||||
|
|
@ -46,79 +260,30 @@ def main() -> None:
|
|||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Phase 1: Load all data
|
||||
print("=" * 60)
|
||||
print("Phase 1: Loading data")
|
||||
print("=" * 60)
|
||||
fragments_cache = args.output / "fragments_cache.parquet"
|
||||
# Phase 3 depends only on these inputs; greenspace is applied later (Phase 4),
|
||||
# so a greenspace change must not invalidate the fragment cache.
|
||||
fragment_inputs = [args.uprn, args.arcgis, args.oa_boundaries, args.inspire]
|
||||
# --limit yields a partial fragment set; never read or write the shared cache.
|
||||
use_cache = args.limit == 0
|
||||
|
||||
oa_geoms = load_oa_boundaries(args.oa_boundaries)
|
||||
uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
|
||||
|
||||
# Phase 2: Parse/load INSPIRE
|
||||
print()
|
||||
print("=" * 60)
|
||||
print("Phase 2: INSPIRE data")
|
||||
print("=" * 60)
|
||||
|
||||
inspire_cache_dir = args.output / "inspire_cache"
|
||||
if not inspire_cache_exists(inspire_cache_dir):
|
||||
cache_inspire(args.inspire, inspire_cache_dir)
|
||||
inspire_bboxes, inspire_offsets, inspire_coords = load_inspire(inspire_cache_dir)
|
||||
|
||||
# Phase 3: Process OAs
|
||||
print()
|
||||
print("=" * 60)
|
||||
print("Phase 3: Processing OAs")
|
||||
print("=" * 60)
|
||||
|
||||
# Build work list — precompute which OAs are single vs multi-postcode
|
||||
oa_codes_with_data = sorted(set(oa_geoms.keys()) & set(uprn_offsets.keys()))
|
||||
skipped_no_uprn = len(oa_geoms) - len(oa_codes_with_data)
|
||||
skipped_no_boundary = len(uprn_offsets) - len(oa_codes_with_data)
|
||||
|
||||
if args.limit > 0:
|
||||
oa_codes_with_data = oa_codes_with_data[: args.limit]
|
||||
|
||||
print(f" OAs with UPRNs + boundaries: {len(oa_codes_with_data)}")
|
||||
print(f" Skipped (no UPRNs): {skipped_no_uprn}")
|
||||
print(f" Skipped (no boundary): {skipped_no_boundary}")
|
||||
|
||||
all_fragments: list[tuple[str, Polygon | MultiPolygon]] = []
|
||||
single_count = 0
|
||||
multi_count = 0
|
||||
|
||||
for oa_code in tqdm(
|
||||
oa_codes_with_data,
|
||||
desc="Processing OAs",
|
||||
unit="OA",
|
||||
smoothing=0.01,
|
||||
miniters=100,
|
||||
):
|
||||
oa_geom = oa_geoms[oa_code]
|
||||
points, postcodes = get_oa_uprns(uprn_df, uprn_offsets, oa_code)
|
||||
|
||||
if len(set(postcodes)) == 1:
|
||||
# Fast path: entire OA = one postcode
|
||||
all_fragments.append((postcodes[0], oa_geom))
|
||||
single_count += 1
|
||||
continue
|
||||
|
||||
# Get INSPIRE candidates via bbox pre-filter
|
||||
candidates = get_inspire_candidates(
|
||||
oa_geom.bounds, inspire_bboxes, inspire_offsets, inspire_coords
|
||||
if use_cache and fragments_cache_is_fresh(fragments_cache, fragment_inputs):
|
||||
print("=" * 60)
|
||||
print("Phase 3 cache hit — loading fragments (skipping Phases 1-3)")
|
||||
print("=" * 60)
|
||||
all_fragments = load_fragments(fragments_cache)
|
||||
print(
|
||||
f" Loaded {len(all_fragments):,} cached fragments from {fragments_cache}"
|
||||
)
|
||||
else:
|
||||
all_fragments = build_fragments(args)
|
||||
if use_cache:
|
||||
# Persist the expensive Phase-3 output before the cheap-but-fragile
|
||||
# merge/write so any failure there resumes in seconds, not ~10 hours.
|
||||
save_fragments(fragments_cache, all_fragments)
|
||||
print(f" Cached {len(all_fragments):,} fragments to {fragments_cache}")
|
||||
|
||||
fragments = process_oa(oa_geom, points, postcodes, candidates)
|
||||
all_fragments.extend(fragments)
|
||||
multi_count += 1
|
||||
|
||||
print(f"\n Single-postcode OAs (fast path): {single_count}")
|
||||
print(f" Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
|
||||
print(f" Total fragments: {len(all_fragments)}")
|
||||
|
||||
# Free data no longer needed
|
||||
del oa_geoms, uprn_df, uprn_offsets
|
||||
del inspire_bboxes, inspire_offsets, inspire_coords
|
||||
# Free Phase-1-3 intermediates (build_fragments' locals) back to the OS.
|
||||
release_memory()
|
||||
|
||||
# Phase 4: Merge and write
|
||||
|
|
@ -145,6 +310,12 @@ def main() -> None:
|
|||
|
||||
file_count = write_district_geojson(merged, args.output)
|
||||
print(f"\n Wrote {file_count} district GeoJSON files to {args.output / 'units'}")
|
||||
|
||||
# The cache exists only to survive a crash between Phase 3 and a clean write.
|
||||
# Now that the output is complete, drop it so a later input change can never
|
||||
# be served from a stale cache.
|
||||
if use_cache:
|
||||
fragments_cache.unlink(missing_ok=True)
|
||||
print("Done!")
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue