lmao
This commit is contained in:
parent
03445188ea
commit
524580eb25
102 changed files with 36625 additions and 1295 deletions
|
|
@ -2,6 +2,7 @@ from .download import download, extract_zip
|
|||
from .fuzzy_join import fuzzy_join_on_postcode
|
||||
from .haversine import haversine_km, haversine_km_expr
|
||||
from .poi_counts import count_pois_per_postcode
|
||||
from .postcode_mapping import build_postcode_mapping
|
||||
|
||||
__all__ = [
|
||||
"download",
|
||||
|
|
@ -10,4 +11,5 @@ __all__ = [
|
|||
"haversine_km",
|
||||
"haversine_km_expr",
|
||||
"count_pois_per_postcode",
|
||||
"build_postcode_mapping",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -154,14 +154,16 @@ def fuzzy_join_on_postcode(
|
|||
left_cached = pl.scan_parquet(left_path)
|
||||
right_cached = pl.scan_parquet(right_path)
|
||||
|
||||
return (
|
||||
result = (
|
||||
left_cached.join(mapping, on="_left_idx", how="left")
|
||||
.join(right_cached, on="_right_idx", how="left")
|
||||
.drop("_left_idx", "_right_idx")
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
except BaseException:
|
||||
finally:
|
||||
shutil.rmtree(tmpdir, ignore_errors=True)
|
||||
raise
|
||||
|
||||
return result.lazy()
|
||||
|
||||
|
||||
def _numbers_compatible(a: str, b: str) -> bool:
|
||||
|
|
@ -180,7 +182,7 @@ def _numbers_compatible(a: str, b: str) -> bool:
|
|||
|
||||
|
||||
def _score_bucket(
|
||||
args: tuple[list[tuple[int, str]], list[tuple[int, str]], int],
|
||||
args: tuple[list[tuple[int, str]], list[tuple[int, str]]],
|
||||
) -> list[tuple[int, int, int]]:
|
||||
"""Score all address pairs within a single postcode bucket."""
|
||||
left_entries, right_entries = args
|
||||
|
|
|
|||
40
pipeline/utils/postcode_mapping.py
Normal file
40
pipeline/utils/postcode_mapping.py
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
"""Map terminated postcodes to their nearest active successor using OS grid coordinates."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
from scipy.spatial import cKDTree
|
||||
|
||||
|
||||
def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
    """Build a mapping from terminated England postcodes to their nearest active postcode.

    Uses OS National Grid coordinates (oseast1m, osnrth1m) which are Cartesian meters,
    so Euclidean distance via cKDTree gives accurate results without projection.

    Rows with a null easting or northing are excluded on both sides, because a
    postcode without a grid reference can neither be indexed nor matched by
    distance. Such terminated postcodes are therefore absent from the result.

    Args:
        arcgis_path: Parquet file providing at least the columns ``ctry``,
            ``doterm``, ``pcds``, ``oseast1m`` and ``osnrth1m``.

    Returns:
        A DataFrame with string columns ``old_postcode`` and ``new_postcode``,
        one row per terminated postcode that has coordinates. The frame is
        empty when there are no terminated postcodes or no active postcodes
        to map them to.
    """
    coord_cols = ["oseast1m", "osnrth1m"]
    arcgis = pl.scan_parquet(arcgis_path).filter(pl.col("ctry") == "E92000001")

    # A null coordinate would surface as NaN in the NumPy arrays below, which
    # the KD-tree cannot work with, so drop those rows before collecting.
    active = (
        arcgis.filter(pl.col("doterm").is_null())
        .select("pcds", *coord_cols)
        .drop_nulls(subset=coord_cols)
        .collect()
    )
    terminated = (
        arcgis.filter(pl.col("doterm").is_not_null())
        .select("pcds", *coord_cols)
        .drop_nulls(subset=coord_cols)
        .collect()
    )

    print(f"Active postcodes: {active.height}, terminated postcodes: {terminated.height}")

    # Nothing to map, or nothing to map onto: return an empty, correctly typed
    # frame instead of querying an empty tree (whose "missing neighbour" index
    # would be out of range for the gather below).
    if terminated.height == 0 or active.height == 0:
        return pl.DataFrame(
            {
                "old_postcode": pl.Series([], dtype=pl.Utf8),
                "new_postcode": pl.Series([], dtype=pl.Utf8),
            }
        )

    active_coords = np.column_stack([active["oseast1m"].to_numpy(), active["osnrth1m"].to_numpy()])
    terminated_coords = np.column_stack(
        [terminated["oseast1m"].to_numpy(), terminated["osnrth1m"].to_numpy()]
    )

    # Nearest-neighbour lookup: for every terminated postcode, the row index
    # (into ``active``) of the closest active postcode and its distance.
    tree = cKDTree(active_coords)
    distances, indices = tree.query(terminated_coords)

    active_postcodes = active["pcds"]
    mapping = pl.DataFrame(
        {
            "old_postcode": terminated["pcds"],
            "new_postcode": active_postcodes.gather(indices),
        }
    )

    print(f"Postcode mapping: max distance = {distances.max():.0f}m, median = {np.median(distances):.0f}m")

    return mapping
|
||||
|
|
@ -1,7 +1,14 @@
|
|||
import polars as pl
|
||||
import pytest
|
||||
|
||||
from pipeline.utils.poi_counts import POI_GROUPS, count_pois_within_radius
|
||||
from pipeline.utils.poi_counts import count_pois_per_postcode
|
||||
|
||||
POI_GROUPS = {
|
||||
"restaurants": ["Restaurant", "Fast Food"],
|
||||
"groceries": ["Supermarket"],
|
||||
"parks": ["Park"],
|
||||
"public_transport": ["Station"],
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
|
@ -24,41 +31,39 @@ def pois():
|
|||
|
||||
|
||||
@pytest.fixture
|
||||
def properties():
|
||||
"""Two properties at the same postcode near central London, one at a distant postcode."""
|
||||
def postcodes():
|
||||
"""Two postcodes: one near central London, one far away."""
|
||||
return pl.DataFrame(
|
||||
{
|
||||
"postcode": ["EC1A 1BB", "EC1A 1BB", "ZZ99 9ZZ"],
|
||||
"lat": [51.5074, 51.5074, 55.0],
|
||||
"lon": [-0.1278, -0.1278, -3.0],
|
||||
"postcode": ["EC1A 1BB", "ZZ99 9ZZ"],
|
||||
"lat": [51.5074, 55.0],
|
||||
"lon": [-0.1278, -3.0],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def test_counts_pois_within_radius(properties, pois):
|
||||
result = count_pois_within_radius(properties, pois, radius_km=2.0)
|
||||
def test_counts_pois_within_radius(postcodes, pois):
|
||||
result = count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=2.0)
|
||||
|
||||
assert set(result.keys()) == {f"{g}_2km" for g in POI_GROUPS}
|
||||
expected_cols = {f"{g}_2km" for g in POI_GROUPS}
|
||||
assert expected_cols.issubset(set(result.columns))
|
||||
|
||||
# Result Series must be aligned to properties (3 rows)
|
||||
for col, series in result.items():
|
||||
assert len(series) == 3, f"{col} has {len(series)} rows, expected 3"
|
||||
# Result must be aligned to postcodes (2 rows)
|
||||
assert len(result) == 2
|
||||
|
||||
# First two rows share a postcode near the central London cluster
|
||||
assert result["restaurants_2km"][0] == 2 # Restaurant + Fast Food
|
||||
assert result["groceries_2km"][0] == 1 # Supermarket
|
||||
assert result["parks_2km"][0] == 1 # Park
|
||||
assert result["public_transport_2km"][0] == 1 # Station
|
||||
ec1a = result.filter(pl.col("postcode") == "EC1A 1BB")
|
||||
assert ec1a["restaurants_2km"][0] == 2 # Restaurant + Fast Food
|
||||
assert ec1a["groceries_2km"][0] == 1 # Supermarket
|
||||
assert ec1a["parks_2km"][0] == 1 # Park
|
||||
assert ec1a["public_transport_2km"][0] == 1 # Station
|
||||
|
||||
# Second row is the same postcode, so same counts
|
||||
assert result["restaurants_2km"][1] == result["restaurants_2km"][0]
|
||||
|
||||
# Third row (ZZ99 9ZZ) is far from all POIs → zero counts
|
||||
# Far-away postcode should have zero counts
|
||||
zz99 = result.filter(pl.col("postcode") == "ZZ99 9ZZ")
|
||||
for group in POI_GROUPS:
|
||||
assert result[f"{group}_2km"][2] == 0
|
||||
assert zz99[f"{group}_2km"][0] == 0
|
||||
|
||||
|
||||
def test_no_pois_returns_zeros(properties):
|
||||
def test_no_pois_returns_zeros(postcodes):
|
||||
empty_pois = pl.DataFrame(
|
||||
{
|
||||
"lat": pl.Series([], dtype=pl.Float64),
|
||||
|
|
@ -66,17 +71,17 @@ def test_no_pois_returns_zeros(properties):
|
|||
"category": pl.Series([], dtype=pl.String),
|
||||
}
|
||||
)
|
||||
result = count_pois_within_radius(properties, empty_pois, radius_km=2.0)
|
||||
result = count_pois_per_postcode(postcodes, empty_pois, groups=POI_GROUPS, radius_km=2.0)
|
||||
|
||||
for group in POI_GROUPS:
|
||||
col = f"{group}_2km"
|
||||
assert col in result
|
||||
assert result[col].to_list() == [0, 0, 0]
|
||||
assert col in result.columns
|
||||
assert result[col].to_list() == [0, 0]
|
||||
|
||||
|
||||
def test_custom_radius(pois):
|
||||
"""A tiny radius should exclude POIs that are even slightly away."""
|
||||
properties = pl.DataFrame(
|
||||
postcodes = pl.DataFrame(
|
||||
{
|
||||
"postcode": ["EC1A 1BB"],
|
||||
"lat": [51.5074],
|
||||
|
|
@ -85,7 +90,7 @@ def test_custom_radius(pois):
|
|||
)
|
||||
|
||||
# 0.01 km = 10m — only the POI at the exact same location should match
|
||||
result = count_pois_within_radius(properties, pois, radius_km=0.01)
|
||||
result = count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=0.01)
|
||||
# The Restaurant at (51.5074, -0.1278) is at distance 0
|
||||
assert result["restaurants_0km"][0] >= 1
|
||||
# POIs >100m away should not be counted
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue