This commit is contained in:
Andras Schmelczer 2026-03-15 21:22:28 +00:00
parent 479ef92236
commit c38d654ac7
44 changed files with 2526 additions and 701 deletions

View file

@ -15,26 +15,49 @@ def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
"""
arcgis = pl.scan_parquet(arcgis_path).filter(pl.col("ctry") == "E92000001")
active = arcgis.filter(pl.col("doterm").is_null()).select("pcds", "oseast1m", "osnrth1m").collect()
terminated = arcgis.filter(pl.col("doterm").is_not_null()).select("pcds", "oseast1m", "osnrth1m").collect()
active = (
arcgis.filter(pl.col("doterm").is_null())
.select("pcds", "oseast1m", "osnrth1m")
.collect()
)
terminated = (
arcgis.filter(pl.col("doterm").is_not_null())
.select("pcds", "oseast1m", "osnrth1m")
.collect()
)
print(f"Active postcodes: {active.height}, terminated postcodes: {terminated.height}")
print(
f"Active postcodes: {active.height}, terminated postcodes: {terminated.height}"
)
if terminated.height == 0:
return pl.DataFrame({"old_postcode": pl.Series([], dtype=pl.Utf8), "new_postcode": pl.Series([], dtype=pl.Utf8)})
return pl.DataFrame(
{
"old_postcode": pl.Series([], dtype=pl.Utf8),
"new_postcode": pl.Series([], dtype=pl.Utf8),
}
)
active_coords = np.column_stack([active["oseast1m"].to_numpy(), active["osnrth1m"].to_numpy()])
terminated_coords = np.column_stack([terminated["oseast1m"].to_numpy(), terminated["osnrth1m"].to_numpy()])
active_coords = np.column_stack(
[active["oseast1m"].to_numpy(), active["osnrth1m"].to_numpy()]
)
terminated_coords = np.column_stack(
[terminated["oseast1m"].to_numpy(), terminated["osnrth1m"].to_numpy()]
)
tree = cKDTree(active_coords)
distances, indices = tree.query(terminated_coords)
active_postcodes = active["pcds"]
mapping = pl.DataFrame({
"old_postcode": terminated["pcds"],
"new_postcode": active_postcodes.gather(indices),
})
mapping = pl.DataFrame(
{
"old_postcode": terminated["pcds"],
"new_postcode": active_postcodes.gather(indices),
}
)
print(f"Postcode mapping: max distance = {distances.max():.0f}m, median = {np.median(distances):.0f}m")
print(
f"Postcode mapping: max distance = {distances.max():.0f}m, median = {np.median(distances):.0f}m"
)
return mapping