63 lines
1.8 KiB
Python
63 lines
1.8 KiB
Python
"""Map terminated postcodes to their nearest active successor using OS grid coordinates."""
|
|
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import polars as pl
|
|
from scipy.spatial import cKDTree
|
|
|
|
|
|
def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
|
|
"""Build a mapping from terminated England postcodes to their nearest active postcode.
|
|
|
|
Uses OS National Grid coordinates (east1m, north1m) which are Cartesian metres,
|
|
so Euclidean distance via cKDTree gives accurate results without projection.
|
|
"""
|
|
arcgis = pl.scan_parquet(arcgis_path).filter(pl.col("ctry25cd") == "E92000001")
|
|
|
|
active = (
|
|
arcgis.filter(pl.col("doterm").is_null())
|
|
.select("pcds", "east1m", "north1m")
|
|
.collect()
|
|
)
|
|
terminated = (
|
|
arcgis.filter(pl.col("doterm").is_not_null())
|
|
.select("pcds", "east1m", "north1m")
|
|
.collect()
|
|
)
|
|
|
|
print(
|
|
f"Active postcodes: {active.height}, terminated postcodes: {terminated.height}"
|
|
)
|
|
|
|
if terminated.height == 0:
|
|
return pl.DataFrame(
|
|
{
|
|
"old_postcode": pl.Series([], dtype=pl.Utf8),
|
|
"new_postcode": pl.Series([], dtype=pl.Utf8),
|
|
}
|
|
)
|
|
|
|
active_coords = np.column_stack(
|
|
[active["east1m"].to_numpy(), active["north1m"].to_numpy()]
|
|
)
|
|
terminated_coords = np.column_stack(
|
|
[terminated["east1m"].to_numpy(), terminated["north1m"].to_numpy()]
|
|
)
|
|
|
|
tree = cKDTree(active_coords)
|
|
distances, indices = tree.query(terminated_coords)
|
|
|
|
active_postcodes = active["pcds"]
|
|
mapping = pl.DataFrame(
|
|
{
|
|
"old_postcode": terminated["pcds"],
|
|
"new_postcode": active_postcodes.gather(indices),
|
|
}
|
|
)
|
|
|
|
print(
|
|
f"Postcode mapping: max distance = {distances.max():.0f}m, median = {np.median(distances):.0f}m"
|
|
)
|
|
|
|
return mapping
|