"""Map terminated postcodes to their nearest active successor using OS grid coordinates.""" from pathlib import Path import numpy as np import polars as pl from scipy.spatial import cKDTree def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame: """Build a mapping from terminated England postcodes to their nearest active postcode. Uses OS National Grid coordinates (oseast1m, osnrth1m) which are Cartesian metres, so Euclidean distance via cKDTree gives accurate results without projection. """ arcgis = pl.scan_parquet(arcgis_path).filter(pl.col("ctry") == "E92000001") active = ( arcgis.filter(pl.col("doterm").is_null()) .select("pcds", "oseast1m", "osnrth1m") .collect() ) terminated = ( arcgis.filter(pl.col("doterm").is_not_null()) .select("pcds", "oseast1m", "osnrth1m") .collect() ) print( f"Active postcodes: {active.height}, terminated postcodes: {terminated.height}" ) if terminated.height == 0: return pl.DataFrame( { "old_postcode": pl.Series([], dtype=pl.Utf8), "new_postcode": pl.Series([], dtype=pl.Utf8), } ) active_coords = np.column_stack( [active["oseast1m"].to_numpy(), active["osnrth1m"].to_numpy()] ) terminated_coords = np.column_stack( [terminated["oseast1m"].to_numpy(), terminated["osnrth1m"].to_numpy()] ) tree = cKDTree(active_coords) distances, indices = tree.query(terminated_coords) active_postcodes = active["pcds"] mapping = pl.DataFrame( { "old_postcode": terminated["pcds"], "new_postcode": active_postcodes.gather(indices), } ) print( f"Postcode mapping: max distance = {distances.max():.0f}m, median = {np.median(distances):.0f}m" ) return mapping