Extract utils
This commit is contained in:
parent
0153e46478
commit
e1b38a1b95
8 changed files with 458 additions and 25 deletions
192
pipeline/utils/poi_counts.py
Normal file
192
pipeline/utils/poi_counts.py
Normal file
|
|
@ -0,0 +1,192 @@
|
|||
"""Count POIs within a radius of properties, optimized via postcode deduplication."""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from .haversine import haversine_km
|
||||
|
||||
# Proximity-count groups: each key becomes the prefix of an output count
# column, and each value lists the POI "category" strings counted under it.
# Insertion order matters because it fixes the order of the output columns.
POI_GROUPS: dict[str, list[str]] = {
    "restaurants": [
        "Restaurant",
        "Fast Food",
    ],
    "groceries": [
        "Greengrocer",
        "Grocery Shop",
        "Supermarket",
        "Convenience Store",
    ],
    "parks": [
        "Park",
        "Garden",
        "Nature Reserve",
    ],
    "public_transport": [
        "Station",
        "Stop",
        "Bus Station",
    ],
}
|
||||
|
||||
|
||||
def _count_pois_per_postcode(
    postcodes_df: pl.DataFrame, pois: pl.DataFrame, radius_km: float = 2.0
) -> pl.DataFrame:
    """
    For each unique postcode, count POIs within radius_km by category group.

    POIs are bucketed into a fixed lat/lng grid so that each postcode is only
    compared against POIs in nearby cells; the exact distance test is then a
    single vectorized haversine call per postcode.

    The number of neighbouring cells searched is derived from radius_km and
    the postcode's latitude, so radii larger than one cell are counted
    correctly. Cells are not wrapped across the +/-180 degree meridian.

    Args:
        postcodes_df: Frame with "postcode", "lat" and "lon" columns, one row
            per postcode.
        pois: Frame with "lat", "lng" and "category" columns.
        radius_km: Search radius in kilometres.

    Returns:
        A frame with one row per input postcode (same order): a "postcode"
        column plus one int32 "<group>_<int(radius_km)>km" column for each
        entry of POI_GROUPS. Note the radius is truncated to an integer in
        the column name (e.g. 2.5 -> "2km").
    """
    print(f"Counting POIs within {radius_km}km per postcode...")

    n_postcodes = len(postcodes_df)
    n_pois = len(pois)
    print(f" {n_postcodes:,} postcodes, {n_pois:,} POIs")

    # Spatial grid of 0.05 degree cells: ~5.5km tall everywhere, but only
    # ~5.5km * cos(latitude) wide (about 3.4km at 52N).
    grid_size = 0.05
    # Deliberately low km-per-degree figure so the search window errs on the
    # large side; extra candidates are discarded by the exact distance test.
    # NOTE(review): assumes haversine_km uses a mean Earth radius of roughly
    # 6371km (~111.2 km/degree) - confirm.
    km_per_degree = 110.0
    cell_km = grid_size * km_per_degree
    radius_deg = radius_km / km_per_degree
    print(" Building POI spatial grid...")

    # Convert to numpy arrays
    poi_lats = pois["lat"].to_numpy()
    poi_lngs = pois["lng"].to_numpy()
    poi_cats = pois["category"].to_numpy()

    # Compute grid coordinates for all POIs
    poi_grid_lats = np.floor(poi_lats / grid_size).astype(np.int32)
    poi_grid_lngs = np.floor(poi_lngs / grid_size).astype(np.int32)

    # Map (cell_lat, cell_lng) -> array of POI row indices in that cell.
    # Keys are plain Python ints so they match the keys built per postcode.
    cell_members = {}
    for poi_idx, key in enumerate(zip(poi_grid_lats.tolist(), poi_grid_lngs.tolist())):
        cell_members.setdefault(key, []).append(poi_idx)
    poi_grid = {
        key: np.array(members, dtype=np.int32)
        for key, members in cell_members.items()
    }

    print(f" POI grid has {len(poi_grid):,} occupied cells")

    # Pre-compute one boolean mask over all POIs per category group
    category_masks = {}
    for group, categories in POI_GROUPS.items():
        mask = np.isin(poi_cats, categories)
        category_masks[group] = mask
        print(f" {group}: {mask.sum():,} POIs")

    # Extract postcode coordinates as numpy arrays
    pc_lats = postcodes_df["lat"].to_numpy()
    pc_lons = postcodes_df["lon"].to_numpy()
    pc_codes = postcodes_df["postcode"].to_list()

    # How many cells to look at on each side of the postcode's own cell.
    # Latitude reach is constant; longitude reach grows with latitude because
    # cells get narrower. The latitude used is pushed radius_deg towards the
    # pole (worst case inside the search circle) and clamped to 89 degrees so
    # the cosine stays positive.
    lat_reach = max(1, int(np.ceil(radius_km / cell_km)))
    worst_abs_lats = np.minimum(np.abs(pc_lats) + radius_deg, 89.0)
    lng_cell_km = cell_km * np.cos(np.radians(worst_abs_lats))
    lng_reaches = np.maximum(1, np.ceil(radius_km / lng_cell_km))

    # Initialize result arrays (postcodes with no nearby POIs stay at zero)
    result_counts = {group: np.zeros(n_postcodes, dtype=np.int32) for group in POI_GROUPS}

    # Batching only drives the progress output; work is done per postcode.
    batch_size = 50000
    n_batches = (n_postcodes + batch_size - 1) // batch_size

    print(f" Processing {n_postcodes:,} postcodes in {n_batches} batches...")

    for batch_idx in range(n_batches):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, n_postcodes)

        if batch_idx % 5 == 0:
            print(f" Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}")

        for i in range(start_idx, end_idx):
            pc_lat = pc_lats[i]
            pc_lon = pc_lons[i]

            # Grid cell containing this postcode
            grid_lat = int(np.floor(pc_lat / grid_size))
            grid_lng = int(np.floor(pc_lon / grid_size))
            lng_reach = int(lng_reaches[i])

            # Collect POI indices from every cell the search circle can touch
            nearby_indices = []
            for cell_lat in range(grid_lat - lat_reach, grid_lat + lat_reach + 1):
                for cell_lng in range(grid_lng - lng_reach, grid_lng + lng_reach + 1):
                    members = poi_grid.get((cell_lat, cell_lng))
                    if members is not None:
                        nearby_indices.append(members)

            if not nearby_indices:
                continue

            nearby = np.concatenate(nearby_indices)

            # Vectorized distance calculation for all candidate POIs
            distances = haversine_km(
                poi_lats[nearby],
                poi_lngs[nearby],
                pc_lat,
                pc_lon,
            )

            # Keep only candidates that are really inside the radius
            within_indices = nearby[distances <= radius_km]
            if within_indices.size == 0:
                continue

            # Count by category group using pre-computed masks
            for group, cat_mask in category_masks.items():
                result_counts[group][i] = cat_mask[within_indices].sum()

    # Build result dataframe
    result_data = {"postcode": pc_codes}
    for group in POI_GROUPS:
        result_data[f"{group}_{int(radius_km)}km"] = result_counts[group]

    result = pl.DataFrame(result_data)
    print(" Completed POI counting")
    return result
|
||||
|
||||
|
||||
def count_pois_within_radius(
    properties: pl.DataFrame, pois: pl.DataFrame, radius_km: float = 2.0
) -> dict[str, pl.Series]:
    """
    Count POIs within radius for properties, optimized by deduplicating postcodes.

    Distances are computed once per distinct postcode and the counts are then
    joined back onto every property row. Properties whose postcode has no
    match in the per-postcode counts get 0.

    Args:
        properties: Frame with "postcode", "lat" and "lon" columns.
        pois: POI frame, passed through to _count_pois_per_postcode.
        radius_km: Search radius in kilometres.

    Returns:
        Dict of {column_name: count_series}, one
        "<group>_<int(radius_km)>km" entry per POI_GROUPS key, intended to be
        row-aligned with `properties`.
    """
    # One row per postcode. If a postcode appears with several coordinates,
    # only one of them is used (presumably an arbitrary one - verify against
    # the Polars `unique` default if that matters for the data).
    print("Deduplicating postcodes...")
    unique_postcodes = properties.select(["postcode", "lat", "lon"]).unique(
        subset=["postcode"]
    )

    print(f" {len(properties):,} properties → {len(unique_postcodes):,} unique postcodes")

    # Count POIs per postcode
    postcode_counts = _count_pois_per_postcode(unique_postcodes, pois, radius_km)
    count_cols = [f"{group}_{int(radius_km)}km" for group in POI_GROUPS]

    # Round-trip the counts through a temp parquet file so the join can scan
    # them lazily instead of holding a second in-memory copy.
    print(" Writing postcode counts to temp file...")
    # Reserve a path and close the handle before writing, so the file is not
    # held open while it is rewritten by name.
    with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        postcode_counts.write_parquet(tmp_path)
        del postcode_counts  # Free memory

        print(" Joining counts back to properties (lazy)...")
        # NOTE(review): alignment with `properties` relies on the left join
        # keeping the left frame's row order - confirm for the pinned Polars
        # version.
        result_df = (
            properties.lazy()
            .select("postcode")
            .join(
                pl.scan_parquet(tmp_path),
                on="postcode",
                how="left",
            )
            .select(count_cols)
            .fill_null(0)
            .collect()
        )
    finally:
        # Always remove the temp file, including when the write or the join
        # raises (delete=False means nothing else will clean it up).
        os.unlink(tmp_path)

    # Extract as dict of Series
    return {col: result_df[col] for col in count_cols}
|
||||
Loading…
Add table
Add a link
Reference in a new issue