89 lines
3.4 KiB
Python
89 lines
3.4 KiB
Python
"""Map terminated postcodes to their nearest active successor using OS grid coordinates."""
|
|
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import polars as pl
|
|
from scipy.spatial import cKDTree
|
|
|
|
# Maximum distance (in OS National Grid metres) a terminated postcode may be from its
|
|
# nearest active successor to be remapped. Beyond this we treat the postcode as having no
|
|
# legitimate successor (e.g. demolished/redeveloped land) rather than re-homing it onto a
|
|
# geometrically-nearest-but-unrelated postcode on a different street/estate/LSOA, which
|
|
# would pollute the successor's crime/deprivation/school/noise/rent and price stats.
|
|
# 1km is conservative: it keeps legitimate adjacent remaps while dropping gross
|
|
# misattributions; dropped postcodes keep their terminated code and fall out at the
|
|
# active-postcode filter downstream (the honest outcome confirmed by the merge audit).
|
|
MAX_REMAP_DISTANCE_M: float = 1000.0  # OS National Grid metres (1 km)
|
|
|
|
|
|
def _grid_coords(frame: pl.DataFrame) -> np.ndarray:
    """Return an (n, 2) float64 array of (east1m, north1m) pairs for *frame*."""
    return np.column_stack(
        [frame["east1m"].to_numpy(), frame["north1m"].to_numpy()]
    ).astype(np.float64, copy=False)


def _empty_mapping() -> pl.DataFrame:
    """Return a zero-row mapping frame with the standard two string columns."""
    return pl.DataFrame(
        {
            "old_postcode": pl.Series([], dtype=pl.Utf8),
            "new_postcode": pl.Series([], dtype=pl.Utf8),
        }
    )


def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
    """Build a mapping from terminated England postcodes to their nearest active postcode.

    Uses OS National Grid coordinates (east1m, north1m) which are Cartesian metres,
    so Euclidean distance via cKDTree gives accurate results without projection.

    Rows are read from the parquet file at ``arcgis_path``, restricted to country
    code ``E92000001``, and split on ``doterm``: null means active, non-null means
    terminated. Each terminated postcode is matched to the nearest active postcode
    within ``MAX_REMAP_DISTANCE_M``; terminated postcodes with no active neighbour
    inside that radius are left out of the mapping.

    Postcodes whose easting or northing is missing (null/NaN) are excluded from
    both sides before matching, because the KD-tree cannot index or query
    non-finite points. Excluded terminated postcodes are simply not remapped.

    Args:
        arcgis_path: Path to a parquet file providing at least the columns
            ``ctry25cd``, ``doterm``, ``pcds``, ``east1m`` and ``north1m``.

    Returns:
        A DataFrame with columns ``old_postcode`` (terminated ``pcds``) and
        ``new_postcode`` (nearest active ``pcds``), one row per remapped
        postcode. The frame is empty when nothing can be remapped.
    """
    arcgis = (
        pl.scan_parquet(arcgis_path)
        .filter(pl.col("ctry25cd") == "E92000001")
        # Normalise the termination date to a string so the null test below does
        # not depend on the column's stored dtype.
        .with_columns(pl.col("doterm").cast(pl.Utf8).alias("doterm"))
    )

    active = (
        arcgis.filter(pl.col("doterm").is_null())
        .select("pcds", "east1m", "north1m")
        .collect()
    )
    terminated = (
        arcgis.filter(pl.col("doterm").is_not_null())
        .select("pcds", "east1m", "north1m")
        .collect()
    )

    print(
        f"Active postcodes: {active.height}, terminated postcodes: {terminated.height}"
    )

    if terminated.height == 0:
        return _empty_mapping()

    active_coords = _grid_coords(active)
    terminated_coords = _grid_coords(terminated)

    # Null grid references surface as NaN after conversion to NumPy. Remove them
    # from both sides: a NaN point can neither be a valid successor nor have a
    # meaningful nearest neighbour.
    active_located = np.isfinite(active_coords).all(axis=1)
    terminated_located = np.isfinite(terminated_coords).all(axis=1)
    active_missing = int((~active_located).sum())
    terminated_missing = int((~terminated_located).sum())
    if active_missing or terminated_missing:
        print(
            f"Postcode mapping: skipped {active_missing} active and "
            f"{terminated_missing} terminated postcodes with no grid reference"
        )

    active_postcodes = active["pcds"].filter(pl.Series(active_located))
    terminated_postcodes = terminated["pcds"].filter(pl.Series(terminated_located))
    active_coords = active_coords[active_located]
    terminated_coords = terminated_coords[terminated_located]

    if active_coords.shape[0] == 0 or terminated_coords.shape[0] == 0:
        # Nothing to match against (or nothing to match): no remaps are possible.
        print("Postcode mapping: no located postcodes to match, none remapped")
        return _empty_mapping()

    tree = cKDTree(active_coords)
    distances, indices = tree.query(
        terminated_coords, distance_upper_bound=MAX_REMAP_DISTANCE_M
    )

    # cKDTree returns distance=inf and index==len(active) for points with no neighbour
    # within the bound. Drop those terminated postcodes rather than gather an out-of-range
    # index; they keep their terminated code and fall out at the active-postcode filter.
    within_bound = np.isfinite(distances)
    dropped = int((~within_bound).sum())

    mapping = pl.DataFrame(
        {
            "old_postcode": terminated_postcodes.filter(pl.Series(within_bound)),
            "new_postcode": active_postcodes.gather(indices[within_bound]),
        }
    )

    kept_distances = distances[within_bound]
    if kept_distances.size:
        summary = (
            f"Postcode mapping: {dropped} terminated postcodes dropped (> {MAX_REMAP_DISTANCE_M:.0f}m), "
            f"max distance = {kept_distances.max():.0f}m, median = {np.median(kept_distances):.0f}m"
        )
    else:
        # max()/median() are undefined on an empty array, so report without them.
        summary = f"Postcode mapping: {dropped} terminated postcodes dropped (> {MAX_REMAP_DISTANCE_M:.0f}m), none remapped"
    print(summary)

    return mapping
|