perfect-postcode/pipeline/utils/postcode_mapping.py
2026-03-15 21:22:28 +00:00

63 lines
1.9 KiB
Python

"""Map terminated postcodes to their nearest active successor using OS grid coordinates."""
from pathlib import Path
import numpy as np
import polars as pl
from scipy.spatial import cKDTree
def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
"""Build a mapping from terminated England postcodes to their nearest active postcode.
Uses OS National Grid coordinates (oseast1m, osnrth1m) which are Cartesian metres,
so Euclidean distance via cKDTree gives accurate results without projection.
"""
arcgis = pl.scan_parquet(arcgis_path).filter(pl.col("ctry") == "E92000001")
active = (
arcgis.filter(pl.col("doterm").is_null())
.select("pcds", "oseast1m", "osnrth1m")
.collect()
)
terminated = (
arcgis.filter(pl.col("doterm").is_not_null())
.select("pcds", "oseast1m", "osnrth1m")
.collect()
)
print(
f"Active postcodes: {active.height}, terminated postcodes: {terminated.height}"
)
if terminated.height == 0:
return pl.DataFrame(
{
"old_postcode": pl.Series([], dtype=pl.Utf8),
"new_postcode": pl.Series([], dtype=pl.Utf8),
}
)
active_coords = np.column_stack(
[active["oseast1m"].to_numpy(), active["osnrth1m"].to_numpy()]
)
terminated_coords = np.column_stack(
[terminated["oseast1m"].to_numpy(), terminated["osnrth1m"].to_numpy()]
)
tree = cKDTree(active_coords)
distances, indices = tree.query(terminated_coords)
active_postcodes = active["pcds"]
mapping = pl.DataFrame(
{
"old_postcode": terminated["pcds"],
"new_postcode": active_postcodes.gather(indices),
}
)
print(
f"Postcode mapping: max distance = {distances.max():.0f}m, median = {np.median(distances):.0f}m"
)
return mapping