This commit is contained in:
Andras Schmelczer 2026-02-15 22:39:49 +00:00
parent 03445188ea
commit 524580eb25
102 changed files with 36625 additions and 1295 deletions

View file

@ -2,6 +2,7 @@ from .download import download, extract_zip
from .fuzzy_join import fuzzy_join_on_postcode
from .haversine import haversine_km, haversine_km_expr
from .poi_counts import count_pois_per_postcode
from .postcode_mapping import build_postcode_mapping
__all__ = [
"download",
@ -10,4 +11,5 @@ __all__ = [
"haversine_km",
"haversine_km_expr",
"count_pois_per_postcode",
"build_postcode_mapping",
]

View file

@ -154,14 +154,16 @@ def fuzzy_join_on_postcode(
left_cached = pl.scan_parquet(left_path)
right_cached = pl.scan_parquet(right_path)
return (
result = (
left_cached.join(mapping, on="_left_idx", how="left")
.join(right_cached, on="_right_idx", how="left")
.drop("_left_idx", "_right_idx")
.collect(engine="streaming")
)
except BaseException:
finally:
shutil.rmtree(tmpdir, ignore_errors=True)
raise
return result.lazy()
def _numbers_compatible(a: str, b: str) -> bool:
@ -180,7 +182,7 @@ def _numbers_compatible(a: str, b: str) -> bool:
def _score_bucket(
args: tuple[list[tuple[int, str]], list[tuple[int, str]], int],
args: tuple[list[tuple[int, str]], list[tuple[int, str]]],
) -> list[tuple[int, int, int]]:
"""Score all address pairs within a single postcode bucket."""
left_entries, right_entries = args

View file

@ -0,0 +1,40 @@
"""Map terminated postcodes to their nearest active successor using OS grid coordinates."""
from pathlib import Path
import numpy as np
import polars as pl
from scipy.spatial import cKDTree
def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
    """Build a mapping from terminated England postcodes to their nearest active postcode.

    Uses OS National Grid coordinates (oseast1m, osnrth1m) which are Cartesian meters,
    so Euclidean distance via cKDTree gives accurate results without projection.

    Rows whose easting or northing is null are ignored on both sides: a null
    becomes NaN once converted to NumPy, and cKDTree cannot build from or be
    queried with non-finite points.

    Args:
        arcgis_path: Parquet file providing at least the columns ``ctry``,
            ``doterm``, ``pcds``, ``oseast1m`` and ``osnrth1m``.

    Returns:
        A DataFrame with one row per terminated postcode that has coordinates,
        with columns ``old_postcode`` (the terminated postcode) and
        ``new_postcode`` (the closest active postcode). The frame is empty when
        there are no usable terminated postcodes or no usable active postcodes.
    """
    coord_cols = ["oseast1m", "osnrth1m"]

    # Restrict to rows whose country code is E92000001 (England).
    arcgis = pl.scan_parquet(arcgis_path).filter(pl.col("ctry") == "E92000001")

    # A null termination date marks a postcode that is still active.
    active = arcgis.filter(pl.col("doterm").is_null()).select("pcds", *coord_cols).collect()
    terminated = arcgis.filter(pl.col("doterm").is_not_null()).select("pcds", *coord_cols).collect()

    print(f"Active postcodes: {active.height}, terminated postcodes: {terminated.height}")

    # Postcodes without a grid reference cannot take part in a nearest-neighbour
    # search; drop them instead of feeding NaN coordinates to the KD-tree.
    active = active.drop_nulls(subset=coord_cols)
    terminated = terminated.drop_nulls(subset=coord_cols)

    # With nothing to map, or nothing to map onto, the tree query would either
    # be pointless or return out-of-range "no neighbour" indices.
    if terminated.height == 0 or active.height == 0:
        return pl.DataFrame({"old_postcode": pl.Series([], dtype=pl.Utf8), "new_postcode": pl.Series([], dtype=pl.Utf8)})

    active_coords = np.column_stack([active["oseast1m"].to_numpy(), active["osnrth1m"].to_numpy()])
    terminated_coords = np.column_stack([terminated["oseast1m"].to_numpy(), terminated["osnrth1m"].to_numpy()])

    # Index the active postcodes once, then look up the single nearest
    # neighbour for every terminated postcode in one vectorised query.
    tree = cKDTree(active_coords)
    distances, indices = tree.query(terminated_coords)

    active_postcodes = active["pcds"]
    mapping = pl.DataFrame({
        "old_postcode": terminated["pcds"],
        "new_postcode": active_postcodes.gather(indices),
    })

    print(f"Postcode mapping: max distance = {distances.max():.0f}m, median = {np.median(distances):.0f}m")
    return mapping

View file

@ -1,7 +1,14 @@
import polars as pl
import pytest
from pipeline.utils.poi_counts import POI_GROUPS, count_pois_within_radius
from pipeline.utils.poi_counts import count_pois_per_postcode
POI_GROUPS = {
"restaurants": ["Restaurant", "Fast Food"],
"groceries": ["Supermarket"],
"parks": ["Park"],
"public_transport": ["Station"],
}
@pytest.fixture
@ -24,41 +31,39 @@ def pois():
@pytest.fixture
def properties():
"""Two properties at the same postcode near central London, one at a distant postcode."""
def postcodes():
"""Two postcodes: one near central London, one far away."""
return pl.DataFrame(
{
"postcode": ["EC1A 1BB", "EC1A 1BB", "ZZ99 9ZZ"],
"lat": [51.5074, 51.5074, 55.0],
"lon": [-0.1278, -0.1278, -3.0],
"postcode": ["EC1A 1BB", "ZZ99 9ZZ"],
"lat": [51.5074, 55.0],
"lon": [-0.1278, -3.0],
}
)
def test_counts_pois_within_radius(properties, pois):
result = count_pois_within_radius(properties, pois, radius_km=2.0)
def test_counts_pois_within_radius(postcodes, pois):
result = count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=2.0)
assert set(result.keys()) == {f"{g}_2km" for g in POI_GROUPS}
expected_cols = {f"{g}_2km" for g in POI_GROUPS}
assert expected_cols.issubset(set(result.columns))
# Result Series must be aligned to properties (3 rows)
for col, series in result.items():
assert len(series) == 3, f"{col} has {len(series)} rows, expected 3"
# Result must be aligned to postcodes (2 rows)
assert len(result) == 2
# First two rows share a postcode near the central London cluster
assert result["restaurants_2km"][0] == 2 # Restaurant + Fast Food
assert result["groceries_2km"][0] == 1 # Supermarket
assert result["parks_2km"][0] == 1 # Park
assert result["public_transport_2km"][0] == 1 # Station
ec1a = result.filter(pl.col("postcode") == "EC1A 1BB")
assert ec1a["restaurants_2km"][0] == 2 # Restaurant + Fast Food
assert ec1a["groceries_2km"][0] == 1 # Supermarket
assert ec1a["parks_2km"][0] == 1 # Park
assert ec1a["public_transport_2km"][0] == 1 # Station
# Second row is the same postcode, so same counts
assert result["restaurants_2km"][1] == result["restaurants_2km"][0]
# Third row (ZZ99 9ZZ) is far from all POIs → zero counts
# Far-away postcode should have zero counts
zz99 = result.filter(pl.col("postcode") == "ZZ99 9ZZ")
for group in POI_GROUPS:
assert result[f"{group}_2km"][2] == 0
assert zz99[f"{group}_2km"][0] == 0
def test_no_pois_returns_zeros(properties):
def test_no_pois_returns_zeros(postcodes):
empty_pois = pl.DataFrame(
{
"lat": pl.Series([], dtype=pl.Float64),
@ -66,17 +71,17 @@ def test_no_pois_returns_zeros(properties):
"category": pl.Series([], dtype=pl.String),
}
)
result = count_pois_within_radius(properties, empty_pois, radius_km=2.0)
result = count_pois_per_postcode(postcodes, empty_pois, groups=POI_GROUPS, radius_km=2.0)
for group in POI_GROUPS:
col = f"{group}_2km"
assert col in result
assert result[col].to_list() == [0, 0, 0]
assert col in result.columns
assert result[col].to_list() == [0, 0]
def test_custom_radius(pois):
"""A tiny radius should exclude POIs that are even slightly away."""
properties = pl.DataFrame(
postcodes = pl.DataFrame(
{
"postcode": ["EC1A 1BB"],
"lat": [51.5074],
@ -85,7 +90,7 @@ def test_custom_radius(pois):
)
# 0.01 km = 10m — only the POI at the exact same location should match
result = count_pois_within_radius(properties, pois, radius_km=0.01)
result = count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=0.01)
# The Restaurant at (51.5074, -0.1278) is at distance 0
assert result["restaurants_0km"][0] >= 1
# POIs >100m away should not be counted