Rerun data pipelines

This commit is contained in:
Andras Schmelczer 2026-05-10 14:49:53 +01:00
parent 4c95815dc8
commit fc10381692
27 changed files with 2143 additions and 215 deletions

View file

@ -2,9 +2,12 @@
import numpy as np
import polars as pl
from scipy.spatial import cKDTree
from .haversine import haversine_km
EARTH_RADIUS_KM = 6371.0088
def _build_poi_grid(
pois: pl.DataFrame, grid_size: float = 0.05
@ -49,6 +52,21 @@ def _get_nearby_indices(
return np.concatenate(nearby_indices)
def _project_lat_lng_km(
lats: np.ndarray, lngs: np.ndarray, origin_lat: float
) -> np.ndarray:
"""Project WGS84 coordinates to local km coordinates for nearest-neighbour lookup."""
lat_rad = np.radians(lats)
lng_rad = np.radians(lngs)
origin_lat_rad = np.radians(origin_lat)
return np.column_stack(
(
EARTH_RADIUS_KM * lng_rad * np.cos(origin_lat_rad),
EARTH_RADIUS_KM * lat_rad,
)
)
def count_pois_per_postcode(
postcodes_df: pl.DataFrame,
pois: pl.DataFrame,
@ -136,7 +154,7 @@ def min_distance_per_postcode(
) -> pl.DataFrame:
"""
For each postcode, compute the distance (km) to the closest POI per group.
Returns NaN where no POI of that group exists within the grid search range (~5.5km).
Returns NaN where no POI of that group exists.
"""
print("Computing minimum POI distances per postcode...")
@ -144,51 +162,84 @@ def min_distance_per_postcode(
n_pois = len(pois)
print(f" {n_postcodes:,} postcodes, {n_pois:,} POIs")
grid_size = 0.05
print(" Building POI spatial grid...")
poi_lats, poi_lngs, poi_cats, poi_grid = _build_poi_grid(pois, grid_size)
print(f" POI grid has {len(poi_grid):,} occupied cells")
category_masks = {}
for group, categories in groups.items():
mask = np.isin(poi_cats, categories)
category_masks[group] = mask
print(f" {group}: {mask.sum():,} POIs")
pc_lats = postcodes_df["lat"].to_numpy()
pc_lons = postcodes_df["lon"].to_numpy()
pc_codes = postcodes_df["postcode"].to_list()
valid_pc_mask = np.isfinite(pc_lats) & np.isfinite(pc_lons)
valid_pc_indices = np.flatnonzero(valid_pc_mask)
result_min_dist = {
group: np.full(n_postcodes, np.nan, dtype=np.float32) for group in groups
}
batch_size = 50000
n_batches = (n_postcodes + batch_size - 1) // batch_size
print(f" Processing {n_postcodes:,} postcodes in {n_batches} batches...")
if n_pois == 0 or len(valid_pc_indices) == 0:
print(" No valid postcode/POI coordinates; returning NaN distances")
return pl.DataFrame(
{
"postcode": pc_codes,
**{
f"{group}_nearest_km": values
for group, values in result_min_dist.items()
},
}
)
for batch_idx in range(n_batches):
start_idx = batch_idx * batch_size
end_idx = min(start_idx + batch_size, n_postcodes)
poi_lats = pois["lat"].to_numpy()
poi_lngs = pois["lng"].to_numpy()
poi_cats = pois["category"].to_numpy()
valid_poi_mask = np.isfinite(poi_lats) & np.isfinite(poi_lngs)
origin_lat = float(np.nanmean(pc_lats[valid_pc_mask]))
query_xy = _project_lat_lng_km(
pc_lats[valid_pc_indices], pc_lons[valid_pc_indices], origin_lat
)
if batch_idx % 5 == 0:
print(
f" Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}"
)
batch_size = 200_000
n_batches = (len(valid_pc_indices) + batch_size - 1) // batch_size
for i in range(start_idx, end_idx):
nearby = _get_nearby_indices(pc_lats[i], pc_lons[i], poi_grid, grid_size)
if nearby is None:
continue
for group, categories in groups.items():
group_indices = np.flatnonzero(valid_poi_mask & np.isin(poi_cats, categories))
print(f" {group}: {len(group_indices):,} POIs")
if len(group_indices) == 0:
continue
distances = haversine_km(
poi_lats[nearby], poi_lngs[nearby], pc_lats[i], pc_lons[i]
)
poi_xy = _project_lat_lng_km(
poi_lats[group_indices], poi_lngs[group_indices], origin_lat
)
tree = cKDTree(poi_xy)
k = min(8, len(group_indices))
for group, cat_mask in category_masks.items():
group_mask = cat_mask[nearby]
if group_mask.any():
result_min_dist[group][i] = distances[group_mask].min()
for batch_idx in range(n_batches):
start_idx = batch_idx * batch_size
end_idx = min(start_idx + batch_size, len(valid_pc_indices))
batch_pc_indices = valid_pc_indices[start_idx:end_idx]
batch_xy = query_xy[start_idx:end_idx]
if batch_idx == 0 or (batch_idx + 1) % 5 == 0:
print(
f" Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}"
)
_, nearest = tree.query(batch_xy, k=k)
nearest = np.asarray(nearest)
if k == 1:
candidate_indices = group_indices[nearest]
distances = haversine_km(
poi_lats[candidate_indices],
poi_lngs[candidate_indices],
pc_lats[batch_pc_indices],
pc_lons[batch_pc_indices],
)
else:
candidate_indices = group_indices[nearest]
distances = haversine_km(
poi_lats[candidate_indices],
poi_lngs[candidate_indices],
pc_lats[batch_pc_indices, None],
pc_lons[batch_pc_indices, None],
).min(axis=1)
result_min_dist[group][batch_pc_indices] = distances.astype(np.float32)
result_data = {"postcode": pc_codes}
for group in groups:

View file

@ -113,9 +113,9 @@ def test_min_distance_finds_nearest(postcodes, pois):
# Restaurant is co-located — distance ~0
assert ec1a["restaurants_nearest_km"][0] < 0.01
# Far-away postcode should have NaN (no POIs within grid range)
# Far-away postcode should still get the global nearest distance.
zz99 = result.filter(pl.col("postcode") == "ZZ99 9ZZ")
assert np.isnan(zz99["train_tube_nearest_km"][0])
assert zz99["train_tube_nearest_km"][0] > 300
def test_min_distance_no_pois_returns_nan(postcodes):