This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@ -6,6 +6,16 @@ import numpy as np
import polars as pl
from scipy.spatial import cKDTree
# Upper limit, in OS National Grid metres, on how far a terminated postcode may sit from
# its nearest active successor and still be remapped onto it.
#
# Past this limit the postcode is treated as having no legitimate successor (for example
# demolished or redeveloped land). Re-homing it onto whichever postcode merely happens to
# be geometrically nearest (on a different street, estate or LSOA) would pollute that
# successor's crime, deprivation, school, noise, rent and price statistics.
#
# 1 km is a conservative choice: legitimate adjacent remaps survive while gross
# misattributions are dropped. A dropped postcode keeps its terminated code and falls out
# at the downstream active-postcode filter (the honest outcome confirmed by the merge audit).
MAX_REMAP_DISTANCE_M = 1_000.0
def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
"""Build a mapping from terminated England postcodes to their nearest active postcode.
@ -50,18 +60,30 @@ def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
)
tree = cKDTree(active_coords)
distances, indices = tree.query(terminated_coords)
distances, indices = tree.query(
terminated_coords, distance_upper_bound=MAX_REMAP_DISTANCE_M
)
# cKDTree returns distance=inf and index==len(active) for points with no neighbour
# within the bound. Drop those terminated postcodes rather than gather an out-of-range
# index; they keep their terminated code and fall out at the active-postcode filter.
within_bound = np.isfinite(distances)
dropped = int((~within_bound).sum())
active_postcodes = active["pcds"]
mapping = pl.DataFrame(
{
"old_postcode": terminated["pcds"],
"new_postcode": active_postcodes.gather(indices),
"old_postcode": terminated["pcds"].filter(pl.Series(within_bound)),
"new_postcode": active_postcodes.gather(indices[within_bound]),
}
)
kept_distances = distances[within_bound]
print(
f"Postcode mapping: max distance = {distances.max():.0f}m, median = {np.median(distances):.0f}m"
f"Postcode mapping: {dropped} terminated postcodes dropped (> {MAX_REMAP_DISTANCE_M:.0f}m), "
f"max distance = {kept_distances.max():.0f}m, median = {np.median(kept_distances):.0f}m"
if kept_distances.size
else f"Postcode mapping: {dropped} terminated postcodes dropped (> {MAX_REMAP_DISTANCE_M:.0f}m), none remapped"
)
return mapping