# perfect-postcode/pipeline/utils/postcode_mapping.py

"""Map terminated postcodes to their nearest active successor using OS grid coordinates."""

from pathlib import Path

import numpy as np
import polars as pl
from scipy.spatial import cKDTree

# Remap cut-off, in OS National Grid metres: a terminated postcode is only re-homed onto
# its nearest active postcode when that neighbour lies within this distance.
#
# Why a cut-off at all: past this range the postcode is treated as having no legitimate
# successor (e.g. demolished/redeveloped land). Re-homing it anyway would attach it to a
# postcode that is merely the geometrically closest one -- possibly on a different
# street/estate/LSOA -- and so pollute that successor's crime/deprivation/school/noise/rent
# and price stats.
#
# Why 1km: it is a conservative value that keeps legitimate adjacent remaps while
# discarding gross misattributions. Postcodes that are not remapped keep their terminated
# code and fall out at the active-postcode filter downstream (the honest outcome
# confirmed by the merge audit).
MAX_REMAP_DISTANCE_M = 1_000.0


def _grid_coords(postcodes: pl.DataFrame) -> np.ndarray:
    """Return an (n, 2) float64 array of (east1m, north1m) for each row of *postcodes*.

    Casting to Float64 first keeps missing coordinates as nulls on the polars side, which
    are expected to surface as NaN in the numpy array so callers can mask them out.
    """
    return np.column_stack(
        [
            postcodes["east1m"].cast(pl.Float64).to_numpy(),
            postcodes["north1m"].cast(pl.Float64).to_numpy(),
        ]
    ).astype(np.float64, copy=False)


def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
    """Build a mapping from terminated England postcodes to their nearest active postcode.

    Uses OS National Grid coordinates (east1m, north1m) which are Cartesian metres,
    so Euclidean distance via cKDTree gives accurate results without projection.

    Rows are restricted to ``ctry25cd == "E92000001"``. A postcode counts as active when
    its ``doterm`` is null and as terminated otherwise. A terminated postcode is only
    mapped when an active postcode lies within ``MAX_REMAP_DISTANCE_M``; otherwise it is
    left out of the mapping. Postcodes (active or terminated) whose coordinates are
    missing or non-finite cannot take part in the nearest-neighbour search and are
    excluded from it instead of aborting the whole build.

    Args:
        arcgis_path: Path to the parquet file holding the ``pcds``, ``east1m``,
            ``north1m``, ``doterm`` and ``ctry25cd`` columns.

    Returns:
        A DataFrame with string columns ``old_postcode`` (terminated) and
        ``new_postcode`` (nearest active postcode within range). Empty when there is
        nothing to remap.
    """
    arcgis = (
        pl.scan_parquet(arcgis_path)
        .filter(pl.col("ctry25cd") == "E92000001")
        .with_columns(pl.col("doterm").cast(pl.Utf8).alias("doterm"))
    )

    active = (
        arcgis.filter(pl.col("doterm").is_null())
        .select("pcds", "east1m", "north1m")
        .collect()
    )
    terminated = (
        arcgis.filter(pl.col("doterm").is_not_null())
        .select("pcds", "east1m", "north1m")
        .collect()
    )

    print(
        f"Active postcodes: {active.height}, terminated postcodes: {terminated.height}"
    )

    if terminated.height == 0:
        return pl.DataFrame(
            {
                "old_postcode": pl.Series([], dtype=pl.Utf8),
                "new_postcode": pl.Series([], dtype=pl.Utf8),
            }
        )

    active_coords = _grid_coords(active)
    terminated_coords = _grid_coords(terminated)

    # cKDTree rejects non-finite coordinates (both when building and when querying), so a
    # single postcode without a grid reference would otherwise fail the entire mapping.
    active_located = np.isfinite(active_coords).all(axis=1)
    terminated_located = np.isfinite(terminated_coords).all(axis=1)
    active_unlocated = int((~active_located).sum())
    terminated_unlocated = int((~terminated_located).sum())
    if active_unlocated or terminated_unlocated:
        print(
            f"Postcode mapping: ignoring {active_unlocated} active and "
            f"{terminated_unlocated} terminated postcodes without grid coordinates"
        )

    active_postcodes = active["pcds"].filter(pl.Series(active_located))
    old_postcodes = terminated["pcds"].filter(pl.Series(terminated_located))
    active_coords = active_coords[active_located]
    terminated_coords = terminated_coords[terminated_located]

    if len(active_coords) and len(terminated_coords):
        tree = cKDTree(active_coords)
        distances, indices = tree.query(
            terminated_coords, distance_upper_bound=MAX_REMAP_DISTANCE_M
        )
    else:
        # Nothing to search (no usable active or terminated points): mark every remaining
        # terminated postcode as having no neighbour, exactly like an out-of-range query.
        distances = np.full(len(terminated_coords), np.inf)
        indices = np.zeros(len(terminated_coords), dtype=np.intp)

    # cKDTree returns distance=inf and index==len(active) for points with no neighbour
    # within the bound. Drop those terminated postcodes rather than gather an out-of-range
    # index; they keep their terminated code and fall out at the active-postcode filter.
    within_bound = np.isfinite(distances)
    dropped = int((~within_bound).sum())

    mapping = pl.DataFrame(
        {
            "old_postcode": old_postcodes.filter(pl.Series(within_bound)),
            "new_postcode": active_postcodes.gather(indices[within_bound]),
        }
    )

    kept_distances = distances[within_bound]
    if kept_distances.size:
        summary = (
            f"Postcode mapping: {dropped} terminated postcodes dropped (> {MAX_REMAP_DISTANCE_M:.0f}m), "
            f"max distance = {kept_distances.max():.0f}m, median = {np.median(kept_distances):.0f}m"
        )
    else:
        # max()/median() are undefined on an empty array, so report the empty case plainly.
        summary = f"Postcode mapping: {dropped} terminated postcodes dropped (> {MAX_REMAP_DISTANCE_M:.0f}m), none remapped"
    print(summary)

    return mapping