This commit is contained in:
Andras Schmelczer 2026-03-12 22:11:00 +00:00
parent 14a3555cf1
commit 7e92bf112e
34 changed files with 1214437 additions and 224 deletions

View file

@ -249,8 +249,18 @@ def _build(
.when(has_epc)
.then(pl.col("epc_property_type"))
.otherwise(pl.col("pp_property_type"))
# Unify EPC's "Flat"/"Maisonette" with price-paid's "Flats/Maisonettes"
.replace({"Flat": "Flats/Maisonettes", "Maisonette": "Flats/Maisonettes"})
# Unify EPC's "Flat"/"Maisonette" with price-paid's "Flats/Maisonettes",
# collapse terrace sub-types, and fold rare types into "Other"
.replace({
"Flat": "Flats/Maisonettes",
"Maisonette": "Flats/Maisonettes",
"End-Terrace": "Terraced",
"Mid-Terrace": "Terraced",
"Enclosed End-Terrace": "Terraced",
"Enclosed Mid-Terrace": "Terraced",
"Bungalow": "Other",
"Park home": "Other",
})
.alias("property_type")
)

View file

@ -1,4 +1,4 @@
"""Count POIs within a radius of properties, optimized via postcode deduplication."""
"""Count POIs within a radius of properties, optimised via postcode deduplication."""
import numpy as np
import polars as pl
@ -6,6 +6,49 @@ import polars as pl
from .haversine import haversine_km
def _build_poi_grid(
    pois: pl.DataFrame, grid_size: float = 0.05
) -> tuple[np.ndarray, np.ndarray, np.ndarray, dict]:
    """Build a spatial grid index over the POIs.

    Each POI is assigned to the cell
    ``(floor(lat / grid_size), floor(lng / grid_size))``.

    Args:
        pois: Frame with ``lat``, ``lng`` and ``category`` columns.
        grid_size: Cell edge length, in the same units as ``lat``/``lng``.

    Returns:
        ``(lats, lngs, cats, grid)``. The first three are the POI columns as
        NumPy arrays. ``grid`` maps every occupied ``(lat_cell, lng_cell)``
        key (plain Python ints) to an ``int32`` array of POI row indices in
        ascending order.
    """
    poi_lats = pois["lat"].to_numpy()
    poi_lngs = pois["lng"].to_numpy()
    poi_cats = pois["category"].to_numpy()

    # .tolist() converts the cell coordinates to native ints once, so the loop
    # below does not create a NumPy scalar per element access and the keys have
    # the same type as the ints _get_nearby_indices builds for its lookups.
    lat_cells = np.floor(poi_lats / grid_size).astype(np.int32).tolist()
    lng_cells = np.floor(poi_lngs / grid_size).astype(np.int32).tolist()

    members_by_cell: dict[tuple[int, int], list[int]] = {}
    for poi_idx, cell in enumerate(zip(lat_cells, lng_cells)):
        members_by_cell.setdefault(cell, []).append(poi_idx)

    # Arrays rather than lists so callers can concatenate and fancy-index them.
    poi_grid: dict[tuple[int, int], np.ndarray] = {
        cell: np.array(members, dtype=np.int32)
        for cell, members in members_by_cell.items()
    }
    return poi_lats, poi_lngs, poi_cats, poi_grid
def _get_nearby_indices(
pc_lat: float, pc_lon: float, poi_grid: dict, grid_size: float = 0.05
) -> np.ndarray | None:
"""Get POI indices from grid cells near the given coordinate."""
grid_lat = int(np.floor(pc_lat / grid_size))
grid_lng = int(np.floor(pc_lon / grid_size))
nearby_indices = []
for dlat in [-1, 0, 1]:
for dlng in [-1, 0, 1]:
cell_key = (grid_lat + dlat, grid_lng + dlng)
if cell_key in poi_grid:
nearby_indices.append(poi_grid[cell_key])
if not nearby_indices:
return None
return np.concatenate(nearby_indices)
def count_pois_per_postcode(
postcodes_df: pl.DataFrame,
pois: pl.DataFrame,
@ -22,31 +65,9 @@ def count_pois_per_postcode(
n_pois = len(pois)
print(f" {n_postcodes:,} postcodes, {n_pois:,} POIs")
# Build spatial grid for POIs (0.05 degree cells ~5.5km)
grid_size = 0.05
print(" Building POI spatial grid...")
# Convert to numpy arrays
poi_lats = pois["lat"].to_numpy()
poi_lngs = pois["lng"].to_numpy()
poi_cats = pois["category"].to_numpy()
# Compute grid coordinates for all POIs
poi_grid_lats = np.floor(poi_lats / grid_size).astype(np.int32)
poi_grid_lngs = np.floor(poi_lngs / grid_size).astype(np.int32)
# Build grid cell lookup using numpy indexing
poi_grid = {}
for i in range(n_pois):
key = (poi_grid_lats[i], poi_grid_lngs[i])
if key not in poi_grid:
poi_grid[key] = []
poi_grid[key].append(i)
# Convert grid values to numpy arrays for faster indexing
for key in poi_grid:
poi_grid[key] = np.array(poi_grid[key], dtype=np.int32)
poi_lats, poi_lngs, poi_cats, poi_grid = _build_poi_grid(pois, grid_size)
print(f" POI grid has {len(poi_grid):,} occupied cells")
# Pre-compute category masks
@ -81,38 +102,18 @@ def count_pois_per_postcode(
# Process batch
for i in range(start_idx, end_idx):
pc_lat = pc_lats[i]
pc_lon = pc_lons[i]
# Find grid cells to check (3x3 grid)
grid_lat = int(np.floor(pc_lat / grid_size))
grid_lng = int(np.floor(pc_lon / grid_size))
# Collect nearby POI indices
nearby_indices = []
for dlat in [-1, 0, 1]:
for dlng in [-1, 0, 1]:
cell_key = (grid_lat + dlat, grid_lng + dlng)
if cell_key in poi_grid:
nearby_indices.append(poi_grid[cell_key])
if not nearby_indices:
nearby = _get_nearby_indices(pc_lats[i], pc_lons[i], poi_grid, grid_size)
if nearby is None:
continue
# Concatenate all nearby POI indices
nearby = np.concatenate(nearby_indices)
distances = haversine_km(poi_lats[nearby], poi_lngs[nearby], pc_lats[i], pc_lons[i])
# Vectorized distance calculation for all nearby POIs
distances = haversine_km(poi_lats[nearby], poi_lngs[nearby], pc_lat, pc_lon)
# Filter by radius
within_mask = distances <= radius_km
within_indices = nearby[within_mask]
if len(within_indices) == 0:
continue
# Count by category group using pre-computed masks
for group, cat_mask in category_masks.items():
result_counts[group][i] = cat_mask[within_indices].sum()
@ -124,3 +125,71 @@ def count_pois_per_postcode(
result = pl.DataFrame(result_data)
print(" Completed POI counting")
return result
def min_distance_per_postcode(
    postcodes_df: pl.DataFrame,
    pois: pl.DataFrame,
    groups: dict[str, list[str]],
) -> pl.DataFrame:
    """
    Compute, for every postcode, the distance in km to its nearest POI per group.

    Only POIs in the 3x3 block of 0.05-degree grid cells around a postcode are
    considered (a cell is about 5.5 km north-south). A group with no POI in
    that block is reported as NaN for that postcode.

    Returns a frame with a ``postcode`` column plus one ``{group}_nearest_km``
    column per entry in ``groups``.
    """
    print("Computing minimum POI distances per postcode...")

    n_postcodes = len(postcodes_df)
    n_pois = len(pois)
    print(f" {n_postcodes:,} postcodes, {n_pois:,} POIs")

    grid_size = 0.05
    print(" Building POI spatial grid...")
    poi_lats, poi_lngs, poi_cats, poi_grid = _build_poi_grid(pois, grid_size)
    print(f" POI grid has {len(poi_grid):,} occupied cells")

    # One boolean mask over all POIs per group, computed once up front.
    category_masks = {
        group: np.isin(poi_cats, categories) for group, categories in groups.items()
    }
    for group, mask in category_masks.items():
        print(f" {group}: {mask.sum():,} POIs")

    pc_lats = postcodes_df["lat"].to_numpy()
    pc_lons = postcodes_df["lon"].to_numpy()
    pc_codes = postcodes_df["postcode"].to_list()

    # NaN marks "no POI of this group found" until a distance is written.
    result_min_dist = {}
    for group in groups:
        result_min_dist[group] = np.full(n_postcodes, np.nan, dtype=np.float32)

    batch_size = 50000
    n_batches = (n_postcodes + batch_size - 1) // batch_size
    print(f" Processing {n_postcodes:,} postcodes in {n_batches} batches...")

    # Batches exist only to pace the progress output.
    for batch_idx, start_idx in enumerate(range(0, n_postcodes, batch_size)):
        end_idx = min(start_idx + batch_size, n_postcodes)
        if batch_idx % 5 == 0:
            print(
                f" Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}"
            )

        for i in range(start_idx, end_idx):
            lat_i = pc_lats[i]
            lon_i = pc_lons[i]

            nearby = _get_nearby_indices(lat_i, lon_i, poi_grid, grid_size)
            if nearby is None:
                continue

            distances = haversine_km(poi_lats[nearby], poi_lngs[nearby], lat_i, lon_i)

            for group, cat_mask in category_masks.items():
                in_group = cat_mask[nearby]
                if not in_group.any():
                    continue
                result_min_dist[group][i] = distances[in_group].min()

    result_data = {"postcode": pc_codes}
    result_data.update(
        {f"{group}_nearest_km": result_min_dist[group] for group in groups}
    )
    result = pl.DataFrame(result_data)

    print(" Completed minimum distance computation")
    return result

View file

@ -10,7 +10,7 @@ from scipy.spatial import cKDTree
def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
"""Build a mapping from terminated England postcodes to their nearest active postcode.
Uses OS National Grid coordinates (oseast1m, osnrth1m) which are Cartesian meters,
Uses OS National Grid coordinates (oseast1m, osnrth1m) which are Cartesian metres,
so Euclidean distance via cKDTree gives accurate results without projection.
"""
arcgis = pl.scan_parquet(arcgis_path).filter(pl.col("ctry") == "E92000001")

View file

@ -1,13 +1,14 @@
import numpy as np
import polars as pl
import pytest
from pipeline.utils.poi_counts import count_pois_per_postcode
from pipeline.utils.poi_counts import count_pois_per_postcode, min_distance_per_postcode
POI_GROUPS = {
"restaurants": ["Restaurant", "Fast Food"],
"groceries": ["Supermarket"],
"parks": ["Park"],
"public_transport": ["Station"],
"train_tube": ["Rail station", "Metro or Tram stop"],
}
@ -23,7 +24,7 @@ def pois():
"Fast Food",
"Supermarket",
"Park",
"Station",
"Rail station",
"Restaurant", # too far from any property
],
}
@ -55,7 +56,7 @@ def test_counts_pois_within_radius(postcodes, pois):
assert ec1a["restaurants_2km"][0] == 2 # Restaurant + Fast Food
assert ec1a["groceries_2km"][0] == 1 # Supermarket
assert ec1a["parks_2km"][0] == 1 # Park
assert ec1a["public_transport_2km"][0] == 1 # Station
assert ec1a["train_tube_2km"][0] == 1 # Rail station
# Far-away postcode should have zero counts
zz99 = result.filter(pl.col("postcode") == "ZZ99 9ZZ")
@ -96,3 +97,35 @@ def test_custom_radius(pois):
# POIs >100m away should not be counted
total = sum(result[f"{g}_0km"][0] for g in POI_GROUPS)
assert total <= 2 # at most the co-located POIs
def test_min_distance_finds_nearest(postcodes, pois):
    """Nearest-POI distances are tiny for the central postcode and NaN for the remote one."""
    nearest = min_distance_per_postcode(postcodes, pois, groups=POI_GROUPS)
    assert len(nearest) == 2

    central = nearest.filter(pl.col("postcode") == "EC1A 1BB")
    station_km = central["train_tube_nearest_km"][0]
    restaurant_km = central["restaurants_nearest_km"][0]

    # Rail station at (51.5073, -0.1277) vs postcode at (51.5074, -0.1278): under 50m apart
    assert station_km < 0.05
    # The restaurant shares the postcode's location, so the distance is ~0
    assert restaurant_km < 0.01

    # The remote postcode has no POIs inside the grid search range, so it stays NaN
    remote = nearest.filter(pl.col("postcode") == "ZZ99 9ZZ")
    remote_station_km = remote["train_tube_nearest_km"][0]
    assert np.isnan(remote_station_km)
def test_min_distance_no_pois_returns_nan(postcodes):
    """An empty POI frame yields a NaN nearest-distance for every postcode."""
    empty_columns = {
        "lat": pl.Series([], dtype=pl.Float64),
        "lng": pl.Series([], dtype=pl.Float64),
        "category": pl.Series([], dtype=pl.String),
    }
    empty_pois = pl.DataFrame(empty_columns)
    groups = {"train_tube": ["Rail station"]}

    result = min_distance_per_postcode(postcodes, empty_pois, groups=groups)

    assert "train_tube_nearest_km" in result.columns
    nearest_values = result["train_tube_nearest_km"].to_list()
    for value in nearest_values:
        assert np.isnan(value)