perfect-postcode/pipeline/transform/postcode_boundaries/__main__.py
2026-02-15 22:39:53 +00:00

146 lines
4.6 KiB
Python

import argparse
from pathlib import Path
from shapely.geometry import MultiPolygon, Polygon
from tqdm import tqdm
from .inspire import (
cache_inspire,
get_inspire_candidates,
inspire_cache_exists,
load_inspire,
)
from .memory import release_memory
from .oa_boundaries import load_oa_boundaries
from .output import merge_fragments, write_district_geojson
from .process_oa import process_oa
from .uprn import get_oa_uprns, load_uprns
def _print_phase(title: str, *, spacer: bool = True) -> None:
    """Print a phase heading framed above and below by a 60-character rule.

    Args:
        title: Heading text printed between the two rules.
        spacer: When true, emit one blank line before the heading.
    """
    if spacer:
        print()
    rule = "=" * 60
    print(rule)
    print(title)
    print(rule)


def _build_parser() -> argparse.ArgumentParser:
    """Return the command-line parser for the postcode boundary pipeline."""
    parser = argparse.ArgumentParser(
        description="Generate postcode boundary polygons from OA + INSPIRE + UPRN data"
    )
    parser.add_argument("--uprn", type=Path, required=True, help="UPRN lookup parquet")
    parser.add_argument(
        "--oa-boundaries", type=Path, required=True, help="OA boundaries GeoPackage"
    )
    parser.add_argument(
        "--inspire", type=Path, required=True, help="INSPIRE ZIP directory"
    )
    parser.add_argument("--output", type=Path, required=True, help="Output directory")
    parser.add_argument(
        "--limit", type=int, default=0, help="Process only first N OAs (0=all)"
    )
    parser.add_argument(
        "--greenspace",
        type=Path,
        default=None,
        help="Greenspace/water parquet for boundary trimming (optional)",
    )
    return parser


def _collect_fragments(
    oa_codes,
    oa_geoms,
    uprn_df,
    uprn_offsets,
    inspire_bboxes,
    inspire_offsets,
    inspire_coords,
) -> "tuple[list[tuple[str, Polygon | MultiPolygon]], int, int]":
    """Turn each OA in *oa_codes* into ``(postcode, geometry)`` fragments.

    An OA whose UPRNs all carry the same postcode contributes its whole
    geometry as a single fragment. Every other OA is handed to ``process_oa``
    together with the INSPIRE candidates selected by the OA's bounds.

    Returns:
        ``(fragments, single_count, multi_count)`` where the two counters are
        the number of OAs that took the single-postcode and the multi-postcode
        path respectively.
    """
    fragments: "list[tuple[str, Polygon | MultiPolygon]]" = []
    single_count = 0
    multi_count = 0

    progress = tqdm(
        oa_codes,
        desc="Processing OAs",
        unit="OA",
        smoothing=0.01,
        miniters=100,
    )
    for oa_code in progress:
        oa_geom = oa_geoms[oa_code]
        points, postcodes = get_oa_uprns(uprn_df, uprn_offsets, oa_code)

        if len(set(postcodes)) == 1:
            # Fast path: the entire OA belongs to one postcode.
            fragments.append((postcodes[0], oa_geom))
            single_count += 1
            continue

        # Narrow the INSPIRE data to candidates via a bbox pre-filter.
        candidates = get_inspire_candidates(
            oa_geom.bounds, inspire_bboxes, inspire_offsets, inspire_coords
        )
        fragments.extend(process_oa(oa_geom, points, postcodes, candidates))
        multi_count += 1

    return fragments, single_count, multi_count


def _load_greenspace_index(path: "Path | None") -> tuple:
    """Load the optional greenspace/water data used when merging fragments.

    Args:
        path: Value of ``--greenspace``; ``None`` when the option was omitted.

    Returns:
        ``(tree, geoms)`` as returned by ``load_greenspace``, or
        ``(None, None)`` when no path was given or the path does not exist.
    """
    if path is None:
        return None, None
    if not path.exists():
        # A missing file stays non-fatal, but say so: otherwise the run
        # quietly produces untrimmed boundaries despite --greenspace.
        print(
            f" Warning: greenspace file {path} not found; "
            "continuing without boundary trimming"
        )
        return None, None

    # Imported lazily so the module is only loaded when the option is used.
    from .greenspace import load_greenspace

    print(f" Loading greenspace/water from {path}...")
    tree, geoms = load_greenspace(path)
    print(f" Loaded {len(geoms)} greenspace/water polygons")
    return tree, geoms


def main() -> None:
    """Run the postcode boundary pipeline from the command line.

    Phases, in order: load OA boundaries and UPRNs; build (if absent) and load
    the INSPIRE cache under ``<output>/inspire_cache``; convert each OA that
    has both a boundary and UPRNs into postcode fragments; merge the fragments
    per postcode and write district GeoJSON files to the output directory.

    Exits via ``argparse`` (status 2) when required arguments are missing.
    """
    args = _build_parser().parse_args()

    # Phase 1: Load all data
    _print_phase("Phase 1: Loading data", spacer=False)
    oa_geoms = load_oa_boundaries(args.oa_boundaries)
    uprn_df, uprn_offsets = load_uprns(args.uprn)

    # Phase 2: Parse/load INSPIRE
    _print_phase("Phase 2: INSPIRE data")
    inspire_cache_dir = args.output / "inspire_cache"
    if not inspire_cache_exists(inspire_cache_dir):
        cache_inspire(args.inspire, inspire_cache_dir)
    inspire_bboxes, inspire_offsets, inspire_coords = load_inspire(inspire_cache_dir)

    # Phase 3: Process OAs
    _print_phase("Phase 3: Processing OAs")
    # Work list: OA codes present in both the boundary set and the UPRN
    # index, in sorted order so that --limit selects a stable prefix.
    oa_codes_with_data = sorted(set(oa_geoms.keys()) & set(uprn_offsets.keys()))
    # Skip counts are taken before --limit truncates the work list.
    skipped_no_uprn = len(oa_geoms) - len(oa_codes_with_data)
    skipped_no_boundary = len(uprn_offsets) - len(oa_codes_with_data)
    if args.limit > 0:
        oa_codes_with_data = oa_codes_with_data[: args.limit]
    print(f" OAs with UPRNs + boundaries: {len(oa_codes_with_data)}")
    print(f" Skipped (no UPRNs): {skipped_no_uprn}")
    print(f" Skipped (no boundary): {skipped_no_boundary}")

    all_fragments, single_count, multi_count = _collect_fragments(
        oa_codes_with_data,
        oa_geoms,
        uprn_df,
        uprn_offsets,
        inspire_bboxes,
        inspire_offsets,
        inspire_coords,
    )
    print(f"\n Single-postcode OAs (fast path): {single_count}")
    print(f" Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
    print(f" Total fragments: {len(all_fragments)}")

    # Free data no longer needed before the merge phase allocates.
    del oa_geoms, uprn_df, uprn_offsets
    del inspire_bboxes, inspire_offsets, inspire_coords
    release_memory()

    # Phase 4: Merge and write
    _print_phase("Phase 4: Merging fragments and writing GeoJSON")
    greenspace_tree, greenspace_geoms = _load_greenspace_index(args.greenspace)
    merged = merge_fragments(
        all_fragments,
        greenspace_tree=greenspace_tree,
        greenspace_geoms=greenspace_geoms,
    )
    print(f" Merged into {len(merged)} unique postcodes")
    file_count = write_district_geojson(merged, args.output)
    print(f"\n Wrote {file_count} district GeoJSON files to {args.output / 'units'}")
    print("Done!")
# Run the pipeline only when this module is executed as a script;
# importing it must not trigger any work.
if __name__ == "__main__":
    main()