# perfect-postcode/pipeline/transform/postcode_boundaries/loader.py

"""Load per-district postcode boundary GeoJSONs as EPSG:27700 polygons.

The postcode-boundary pipeline (:mod:`output`) writes one WGS84 GeoJSON per
postcode district under ``units/{district}.geojson``, each feature carrying a
``postcodes`` (full unit string, e.g. "AL1 1AG") property. Spatial transforms
that test points against postcode geometry want those polygons back in British
National Grid (EPSG:27700) so buffers/distances are in metres.

:func:`load_postcode_polygons` reads the files, reprojects WGS84→27700, repairs
invalid rings, and returns parallel ``(postcodes, polygons)`` arrays sorted by
postcode so callers can use the array index as a stable postcode id -- the same
"buffer index == postcode index" convention used by ``tree_density``.
"""

from __future__ import annotations

import json
from pathlib import Path

import numpy as np
import shapely
from pyproj import Transformer


def _read_district(
    path: Path, transformer: Transformer
) -> tuple[np.ndarray, np.ndarray]:
    """Return (postcodes, polygons_27700) for one district GeoJSON."""
    with path.open() as file:
        collection = json.load(file)

    features = collection.get("features", [])
    if not features:
        return np.empty(0, dtype=object), np.empty(0, dtype=object)

    postcodes = np.array(
        [feature["properties"]["postcodes"] for feature in features], dtype=object
    )
    geom_json = np.array(
        [json.dumps(feature["geometry"]) for feature in features], dtype=object
    )
    geoms = shapely.from_geojson(geom_json)

    # Reproject every vertex in a single pyproj call, then rebuild the polygons.
    coords = shapely.get_coordinates(geoms)
    if coords.size:
        x, y = transformer.transform(coords[:, 0], coords[:, 1])
        geoms = shapely.set_coordinates(geoms, np.column_stack([x, y]))

    invalid = ~shapely.is_valid(geoms)
    if invalid.any():
        geoms[invalid] = shapely.make_valid(geoms[invalid])

    return postcodes, geoms


def load_postcode_polygons(
    units_dir: Path, max_postcodes: int | None = None
) -> tuple[np.ndarray, np.ndarray]:
    """Load all postcode polygons under ``units_dir`` reprojected to EPSG:27700.

    Every ``*.geojson`` file directly inside ``units_dir`` is read in sorted
    path order, reprojected from EPSG:4326 (lon/lat) and concatenated.

    Returns ``(postcodes, polygons)`` parallel object arrays sorted by postcode
    with duplicates removed (the first occurrence in read order wins), so the
    array index can serve as a stable postcode id.

    ``max_postcodes`` (testing) keeps only the lexicographically-first N
    postcodes, reading just enough district files to collect N distinct
    postcodes. This relies on the sorted file order matching postcode order,
    which the code does not verify.

    Raises ``FileNotFoundError`` when ``units_dir`` holds no ``*.geojson``
    files, and ``ValueError`` when ``max_postcodes`` is negative or the files
    contain no features at all.
    """
    # A negative cap would otherwise fall through to ``[:max_postcodes]`` and
    # silently drop features from the end instead of limiting the result.
    if max_postcodes is not None and max_postcodes < 0:
        raise ValueError(f"max_postcodes must be non-negative, got {max_postcodes}")

    units_dir = Path(units_dir)
    files = sorted(units_dir.glob("*.geojson"))
    if not files:
        raise FileNotFoundError(f"No postcode-boundary GeoJSONs found in {units_dir}")

    transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
    postcode_chunks: list[np.ndarray] = []
    geom_chunks: list[np.ndarray] = []
    # Only populated when a cap is set: the early exit must count distinct
    # postcodes, otherwise duplicates could end reading before N unique
    # postcodes were collected.
    seen: set = set()
    for path in files:
        postcodes, geoms = _read_district(path, transformer)
        if len(postcodes) == 0:
            continue
        postcode_chunks.append(postcodes)
        geom_chunks.append(geoms)
        if max_postcodes is not None:
            seen.update(postcodes.tolist())
            if len(seen) >= max_postcodes:
                break

    if not postcode_chunks:
        raise ValueError(f"No postcode features found in {units_dir}")

    all_postcodes = np.concatenate(postcode_chunks)
    all_geoms = np.concatenate(geom_chunks)

    # Stable postcode order makes "index == postcode id" deterministic; dedupe
    # defensively (a postcode lives in exactly one district file). np.unique
    # returns the sorted distinct postcodes together with the index of each
    # one's first occurrence, so a single call both sorts and dedupes.
    postcodes, first = np.unique(all_postcodes, return_index=True)
    geoms = all_geoms[first]

    if max_postcodes is not None and len(postcodes) > max_postcodes:
        postcodes = postcodes[:max_postcodes]
        geoms = geoms[:max_postcodes]

    print(f"Loaded {len(postcodes):,} postcode polygons from {units_dir}")
    return postcodes, geoms