This commit is contained in:
Andras Schmelczer 2026-02-15 22:39:49 +00:00
parent 03445188ea
commit 524580eb25
102 changed files with 36625 additions and 1295 deletions

View file

@ -0,0 +1,40 @@
"""Map terminated postcodes to their nearest active successor using OS grid coordinates."""
from pathlib import Path
import numpy as np
import polars as pl
from scipy.spatial import cKDTree
def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
"""Build a mapping from terminated England postcodes to their nearest active postcode.
Uses OS National Grid coordinates (oseast1m, osnrth1m) which are Cartesian meters,
so Euclidean distance via cKDTree gives accurate results without projection.
"""
arcgis = pl.scan_parquet(arcgis_path).filter(pl.col("ctry") == "E92000001")
active = arcgis.filter(pl.col("doterm").is_null()).select("pcds", "oseast1m", "osnrth1m").collect()
terminated = arcgis.filter(pl.col("doterm").is_not_null()).select("pcds", "oseast1m", "osnrth1m").collect()
print(f"Active postcodes: {active.height}, terminated postcodes: {terminated.height}")
if terminated.height == 0:
return pl.DataFrame({"old_postcode": pl.Series([], dtype=pl.Utf8), "new_postcode": pl.Series([], dtype=pl.Utf8)})
active_coords = np.column_stack([active["oseast1m"].to_numpy(), active["osnrth1m"].to_numpy()])
terminated_coords = np.column_stack([terminated["oseast1m"].to_numpy(), terminated["osnrth1m"].to_numpy()])
tree = cKDTree(active_coords)
distances, indices = tree.query(terminated_coords)
active_postcodes = active["pcds"]
mapping = pl.DataFrame({
"old_postcode": terminated["pcds"],
"new_postcode": active_postcodes.gather(indices),
})
print(f"Postcode mapping: max distance = {distances.max():.0f}m, median = {np.median(distances):.0f}m")
return mapping