"""Load per-district postcode boundary GeoJSONs as EPSG:27700 polygons. The postcode-boundary pipeline (:mod:`output`) writes one WGS84 GeoJSON per postcode district under ``units/{district}.geojson``, each feature carrying a ``postcodes`` (full unit string, e.g. "AL1 1AG") property. Spatial transforms that test points against postcode geometry want those polygons back in British National Grid (EPSG:27700) so buffers/distances are in metres. :func:`load_postcode_polygons` reads the files, reprojects WGS84→27700, repairs invalid rings, and returns parallel ``(postcodes, polygons)`` arrays sorted by postcode so callers can use the array index as a stable postcode id -- the same "buffer index == postcode index" convention used by ``tree_density``. """ from __future__ import annotations import json from pathlib import Path import numpy as np import shapely from pyproj import Transformer def _read_district( path: Path, transformer: Transformer ) -> tuple[np.ndarray, np.ndarray]: """Return (postcodes, polygons_27700) for one district GeoJSON.""" with path.open() as file: collection = json.load(file) features = collection.get("features", []) if not features: return np.empty(0, dtype=object), np.empty(0, dtype=object) postcodes = np.array( [feature["properties"]["postcodes"] for feature in features], dtype=object ) geom_json = np.array( [json.dumps(feature["geometry"]) for feature in features], dtype=object ) geoms = shapely.from_geojson(geom_json) # Reproject every vertex in a single pyproj call, then rebuild the polygons. coords = shapely.get_coordinates(geoms) if coords.size: x, y = transformer.transform(coords[:, 0], coords[:, 1]) geoms = shapely.set_coordinates(geoms, np.column_stack([x, y])) invalid = ~shapely.is_valid(geoms) if invalid.any(): geoms[invalid] = shapely.make_valid(geoms[invalid]) return postcodes, geoms def load_postcode_polygons( units_dir: Path, max_postcodes: int | None = None ) -> tuple[np.ndarray, np.ndarray]: """Load all postcode polygons under ``units_dir`` reprojected to EPSG:27700. Returns ``(postcodes, polygons)`` parallel object arrays sorted by postcode. ``max_postcodes`` (testing) keeps only the lexicographically-first N postcodes, reading just enough district files to reach the cap. """ units_dir = Path(units_dir) files = sorted(units_dir.glob("*.geojson")) if not files: raise FileNotFoundError(f"No postcode-boundary GeoJSONs found in {units_dir}") transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True) postcode_chunks: list[np.ndarray] = [] geom_chunks: list[np.ndarray] = [] total = 0 for path in files: postcodes, geoms = _read_district(path, transformer) if len(postcodes) == 0: continue postcode_chunks.append(postcodes) geom_chunks.append(geoms) total += len(postcodes) if max_postcodes is not None and total >= max_postcodes: break if not postcode_chunks: raise ValueError(f"No postcode features found in {units_dir}") postcodes = np.concatenate(postcode_chunks) geoms = np.concatenate(geom_chunks) # Stable postcode order makes "index == postcode id" deterministic; dedupe # defensively (a postcode lives in exactly one district file). order = np.argsort(postcodes, kind="stable") postcodes = postcodes[order] geoms = geoms[order] _, first = np.unique(postcodes, return_index=True) postcodes = postcodes[first] geoms = geoms[first] if max_postcodes is not None and len(postcodes) > max_postcodes: postcodes = postcodes[:max_postcodes] geoms = geoms[:max_postcodes] print(f"Loaded {len(postcodes):,} postcode polygons from {units_dir}") return postcodes, geoms