Rerun data pipelines
This commit is contained in:
parent
4c95815dc8
commit
fc10381692
27 changed files with 2143 additions and 215 deletions
|
|
@ -2,9 +2,12 @@
|
|||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
from scipy.spatial import cKDTree
|
||||
|
||||
from .haversine import haversine_km
|
||||
|
||||
EARTH_RADIUS_KM = 6371.0088
|
||||
|
||||
|
||||
def _build_poi_grid(
|
||||
pois: pl.DataFrame, grid_size: float = 0.05
|
||||
|
|
@ -49,6 +52,21 @@ def _get_nearby_indices(
|
|||
return np.concatenate(nearby_indices)
|
||||
|
||||
|
||||
def _project_lat_lng_km(
|
||||
lats: np.ndarray, lngs: np.ndarray, origin_lat: float
|
||||
) -> np.ndarray:
|
||||
"""Project WGS84 coordinates to local km coordinates for nearest-neighbour lookup."""
|
||||
lat_rad = np.radians(lats)
|
||||
lng_rad = np.radians(lngs)
|
||||
origin_lat_rad = np.radians(origin_lat)
|
||||
return np.column_stack(
|
||||
(
|
||||
EARTH_RADIUS_KM * lng_rad * np.cos(origin_lat_rad),
|
||||
EARTH_RADIUS_KM * lat_rad,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def count_pois_per_postcode(
|
||||
postcodes_df: pl.DataFrame,
|
||||
pois: pl.DataFrame,
|
||||
|
|
@ -136,7 +154,7 @@ def min_distance_per_postcode(
|
|||
) -> pl.DataFrame:
|
||||
"""
|
||||
For each postcode, compute the distance (km) to the closest POI per group.
|
||||
Returns NaN where no POI of that group exists within the grid search range (~5.5km).
|
||||
Returns NaN where no POI of that group exists.
|
||||
"""
|
||||
print("Computing minimum POI distances per postcode...")
|
||||
|
||||
|
|
@ -144,51 +162,84 @@ def min_distance_per_postcode(
|
|||
n_pois = len(pois)
|
||||
print(f" {n_postcodes:,} postcodes, {n_pois:,} POIs")
|
||||
|
||||
grid_size = 0.05
|
||||
print(" Building POI spatial grid...")
|
||||
poi_lats, poi_lngs, poi_cats, poi_grid = _build_poi_grid(pois, grid_size)
|
||||
print(f" POI grid has {len(poi_grid):,} occupied cells")
|
||||
|
||||
category_masks = {}
|
||||
for group, categories in groups.items():
|
||||
mask = np.isin(poi_cats, categories)
|
||||
category_masks[group] = mask
|
||||
print(f" {group}: {mask.sum():,} POIs")
|
||||
|
||||
pc_lats = postcodes_df["lat"].to_numpy()
|
||||
pc_lons = postcodes_df["lon"].to_numpy()
|
||||
pc_codes = postcodes_df["postcode"].to_list()
|
||||
valid_pc_mask = np.isfinite(pc_lats) & np.isfinite(pc_lons)
|
||||
valid_pc_indices = np.flatnonzero(valid_pc_mask)
|
||||
|
||||
result_min_dist = {
|
||||
group: np.full(n_postcodes, np.nan, dtype=np.float32) for group in groups
|
||||
}
|
||||
|
||||
batch_size = 50000
|
||||
n_batches = (n_postcodes + batch_size - 1) // batch_size
|
||||
print(f" Processing {n_postcodes:,} postcodes in {n_batches} batches...")
|
||||
if n_pois == 0 or len(valid_pc_indices) == 0:
|
||||
print(" No valid postcode/POI coordinates; returning NaN distances")
|
||||
return pl.DataFrame(
|
||||
{
|
||||
"postcode": pc_codes,
|
||||
**{
|
||||
f"{group}_nearest_km": values
|
||||
for group, values in result_min_dist.items()
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
for batch_idx in range(n_batches):
|
||||
start_idx = batch_idx * batch_size
|
||||
end_idx = min(start_idx + batch_size, n_postcodes)
|
||||
poi_lats = pois["lat"].to_numpy()
|
||||
poi_lngs = pois["lng"].to_numpy()
|
||||
poi_cats = pois["category"].to_numpy()
|
||||
valid_poi_mask = np.isfinite(poi_lats) & np.isfinite(poi_lngs)
|
||||
origin_lat = float(np.nanmean(pc_lats[valid_pc_mask]))
|
||||
query_xy = _project_lat_lng_km(
|
||||
pc_lats[valid_pc_indices], pc_lons[valid_pc_indices], origin_lat
|
||||
)
|
||||
|
||||
if batch_idx % 5 == 0:
|
||||
print(
|
||||
f" Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}"
|
||||
)
|
||||
batch_size = 200_000
|
||||
n_batches = (len(valid_pc_indices) + batch_size - 1) // batch_size
|
||||
|
||||
for i in range(start_idx, end_idx):
|
||||
nearby = _get_nearby_indices(pc_lats[i], pc_lons[i], poi_grid, grid_size)
|
||||
if nearby is None:
|
||||
continue
|
||||
for group, categories in groups.items():
|
||||
group_indices = np.flatnonzero(valid_poi_mask & np.isin(poi_cats, categories))
|
||||
print(f" {group}: {len(group_indices):,} POIs")
|
||||
if len(group_indices) == 0:
|
||||
continue
|
||||
|
||||
distances = haversine_km(
|
||||
poi_lats[nearby], poi_lngs[nearby], pc_lats[i], pc_lons[i]
|
||||
)
|
||||
poi_xy = _project_lat_lng_km(
|
||||
poi_lats[group_indices], poi_lngs[group_indices], origin_lat
|
||||
)
|
||||
tree = cKDTree(poi_xy)
|
||||
k = min(8, len(group_indices))
|
||||
|
||||
for group, cat_mask in category_masks.items():
|
||||
group_mask = cat_mask[nearby]
|
||||
if group_mask.any():
|
||||
result_min_dist[group][i] = distances[group_mask].min()
|
||||
for batch_idx in range(n_batches):
|
||||
start_idx = batch_idx * batch_size
|
||||
end_idx = min(start_idx + batch_size, len(valid_pc_indices))
|
||||
batch_pc_indices = valid_pc_indices[start_idx:end_idx]
|
||||
batch_xy = query_xy[start_idx:end_idx]
|
||||
|
||||
if batch_idx == 0 or (batch_idx + 1) % 5 == 0:
|
||||
print(
|
||||
f" Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}"
|
||||
)
|
||||
|
||||
_, nearest = tree.query(batch_xy, k=k)
|
||||
nearest = np.asarray(nearest)
|
||||
|
||||
if k == 1:
|
||||
candidate_indices = group_indices[nearest]
|
||||
distances = haversine_km(
|
||||
poi_lats[candidate_indices],
|
||||
poi_lngs[candidate_indices],
|
||||
pc_lats[batch_pc_indices],
|
||||
pc_lons[batch_pc_indices],
|
||||
)
|
||||
else:
|
||||
candidate_indices = group_indices[nearest]
|
||||
distances = haversine_km(
|
||||
poi_lats[candidate_indices],
|
||||
poi_lngs[candidate_indices],
|
||||
pc_lats[batch_pc_indices, None],
|
||||
pc_lons[batch_pc_indices, None],
|
||||
).min(axis=1)
|
||||
|
||||
result_min_dist[group][batch_pc_indices] = distances.astype(np.float32)
|
||||
|
||||
result_data = {"postcode": pc_codes}
|
||||
for group in groups:
|
||||
|
|
|
|||
|
|
@ -113,9 +113,9 @@ def test_min_distance_finds_nearest(postcodes, pois):
|
|||
# Restaurant is co-located — distance ~0
|
||||
assert ec1a["restaurants_nearest_km"][0] < 0.01
|
||||
|
||||
# Far-away postcode should have NaN (no POIs within grid range)
|
||||
# Far-away postcode should still get the global nearest distance.
|
||||
zz99 = result.filter(pl.col("postcode") == "ZZ99 9ZZ")
|
||||
assert np.isnan(zz99["train_tube_nearest_km"][0])
|
||||
assert zz99["train_tube_nearest_km"][0] > 300
|
||||
|
||||
|
||||
def test_min_distance_no_pois_returns_nan(postcodes):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue