Rerun data pipelines

2026-05-10 14:49:53 +01:00 · 2026-05-10 14:49:53 +01:00 · fc10381692
commit fc10381692
parent 4c95815dc8
27 changed files with 2143 additions and 215 deletions
--- a/pipeline/utils/poi_counts.py
+++ b/pipeline/utils/poi_counts.py
@ -2,9 +2,12 @@

 import numpy as np
 import polars as pl
+from scipy.spatial import cKDTree

 from .haversine import haversine_km

+EARTH_RADIUS_KM = 6371.0088
+

 def _build_poi_grid(
    pois: pl.DataFrame, grid_size: float = 0.05
@ -49,6 +52,21 @@ def _get_nearby_indices(
    return np.concatenate(nearby_indices)


+def _project_lat_lng_km(
+    lats: np.ndarray, lngs: np.ndarray, origin_lat: float
+) -> np.ndarray:
+    """Project WGS84 coordinates to local km coordinates for nearest-neighbour lookup."""
+    lat_rad = np.radians(lats)
+    lng_rad = np.radians(lngs)
+    origin_lat_rad = np.radians(origin_lat)
+    return np.column_stack(
+        (
+            EARTH_RADIUS_KM * lng_rad * np.cos(origin_lat_rad),
+            EARTH_RADIUS_KM * lat_rad,
+        )
+    )
+
+
 def count_pois_per_postcode(
    postcodes_df: pl.DataFrame,
    pois: pl.DataFrame,
@ -136,7 +154,7 @@ def min_distance_per_postcode(
 ) -> pl.DataFrame:
    """
    For each postcode, compute the distance (km) to the closest POI per group.
-    Returns NaN where no POI of that group exists within the grid search range (~5.5km).
+    Returns NaN where no POI of that group exists.
    """
    print("Computing minimum POI distances per postcode...")

@ -144,51 +162,84 @@ def min_distance_per_postcode(
    n_pois = len(pois)
    print(f"  {n_postcodes:,} postcodes, {n_pois:,} POIs")

-    grid_size = 0.05
-    print("  Building POI spatial grid...")
-    poi_lats, poi_lngs, poi_cats, poi_grid = _build_poi_grid(pois, grid_size)
-    print(f"  POI grid has {len(poi_grid):,} occupied cells")
-
-    category_masks = {}
-    for group, categories in groups.items():
-        mask = np.isin(poi_cats, categories)
-        category_masks[group] = mask
-        print(f"  {group}: {mask.sum():,} POIs")
-
    pc_lats = postcodes_df["lat"].to_numpy()
    pc_lons = postcodes_df["lon"].to_numpy()
    pc_codes = postcodes_df["postcode"].to_list()
+    valid_pc_mask = np.isfinite(pc_lats) & np.isfinite(pc_lons)
+    valid_pc_indices = np.flatnonzero(valid_pc_mask)

    result_min_dist = {
        group: np.full(n_postcodes, np.nan, dtype=np.float32) for group in groups
    }

-    batch_size = 50000
-    n_batches = (n_postcodes + batch_size - 1) // batch_size
-    print(f"  Processing {n_postcodes:,} postcodes in {n_batches} batches...")
+    if n_pois == 0 or len(valid_pc_indices) == 0:
+        print("  No valid postcode/POI coordinates; returning NaN distances")
+        return pl.DataFrame(
+            {
+                "postcode": pc_codes,
+                **{
+                    f"{group}_nearest_km": values
+                    for group, values in result_min_dist.items()
+                },
+            }
+        )

-    for batch_idx in range(n_batches):
-        start_idx = batch_idx * batch_size
-        end_idx = min(start_idx + batch_size, n_postcodes)
+    poi_lats = pois["lat"].to_numpy()
+    poi_lngs = pois["lng"].to_numpy()
+    poi_cats = pois["category"].to_numpy()
+    valid_poi_mask = np.isfinite(poi_lats) & np.isfinite(poi_lngs)
+    origin_lat = float(np.nanmean(pc_lats[valid_pc_mask]))
+    query_xy = _project_lat_lng_km(
+        pc_lats[valid_pc_indices], pc_lons[valid_pc_indices], origin_lat
+    )

-        if batch_idx % 5 == 0:
-            print(
-                f"  Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}"
-            )
+    batch_size = 200_000
+    n_batches = (len(valid_pc_indices) + batch_size - 1) // batch_size

-        for i in range(start_idx, end_idx):
-            nearby = _get_nearby_indices(pc_lats[i], pc_lons[i], poi_grid, grid_size)
-            if nearby is None:
-                continue
+    for group, categories in groups.items():
+        group_indices = np.flatnonzero(valid_poi_mask & np.isin(poi_cats, categories))
+        print(f"  {group}: {len(group_indices):,} POIs")
+        if len(group_indices) == 0:
+            continue

-            distances = haversine_km(
-                poi_lats[nearby], poi_lngs[nearby], pc_lats[i], pc_lons[i]
-            )
+        poi_xy = _project_lat_lng_km(
+            poi_lats[group_indices], poi_lngs[group_indices], origin_lat
+        )
+        tree = cKDTree(poi_xy)
+        k = min(8, len(group_indices))

-            for group, cat_mask in category_masks.items():
-                group_mask = cat_mask[nearby]
-                if group_mask.any():
-                    result_min_dist[group][i] = distances[group_mask].min()
+        for batch_idx in range(n_batches):
+            start_idx = batch_idx * batch_size
+            end_idx = min(start_idx + batch_size, len(valid_pc_indices))
+            batch_pc_indices = valid_pc_indices[start_idx:end_idx]
+            batch_xy = query_xy[start_idx:end_idx]
+
+            if batch_idx == 0 or (batch_idx + 1) % 5 == 0:
+                print(
+                    f"    Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}"
+                )
+
+            _, nearest = tree.query(batch_xy, k=k)
+            nearest = np.asarray(nearest)
+
+            if k == 1:
+                candidate_indices = group_indices[nearest]
+                distances = haversine_km(
+                    poi_lats[candidate_indices],
+                    poi_lngs[candidate_indices],
+                    pc_lats[batch_pc_indices],
+                    pc_lons[batch_pc_indices],
+                )
+            else:
+                candidate_indices = group_indices[nearest]
+                distances = haversine_km(
+                    poi_lats[candidate_indices],
+                    poi_lngs[candidate_indices],
+                    pc_lats[batch_pc_indices, None],
+                    pc_lons[batch_pc_indices, None],
+                ).min(axis=1)
+
+            result_min_dist[group][batch_pc_indices] = distances.astype(np.float32)

    result_data = {"postcode": pc_codes}
    for group in groups:
--- a/pipeline/utils/test_poi_counts.py
+++ b/pipeline/utils/test_poi_counts.py
@ -113,9 +113,9 @@ def test_min_distance_finds_nearest(postcodes, pois):
    # Restaurant is co-located — distance ~0
    assert ec1a["restaurants_nearest_km"][0] < 0.01

-    # Far-away postcode should have NaN (no POIs within grid range)
+    # Far-away postcode should still get the global nearest distance.
    zz99 = result.filter(pl.col("postcode") == "ZZ99 9ZZ")
-    assert np.isnan(zz99["train_tube_nearest_km"][0])
+    assert zz99["train_tube_nearest_km"][0] > 300


 def test_min_distance_no_pois_returns_nan(postcodes):