Working
This commit is contained in:
parent
14a3555cf1
commit
7e92bf112e
34 changed files with 1214437 additions and 224 deletions
|
|
@ -249,8 +249,18 @@ def _build(
|
|||
.when(has_epc)
|
||||
.then(pl.col("epc_property_type"))
|
||||
.otherwise(pl.col("pp_property_type"))
|
||||
# Unify EPC's "Flat"/"Maisonette" with price-paid's "Flats/Maisonettes"
|
||||
.replace({"Flat": "Flats/Maisonettes", "Maisonette": "Flats/Maisonettes"})
|
||||
# Unify EPC's "Flat"/"Maisonette" with price-paid's "Flats/Maisonettes",
|
||||
# collapse terrace sub-types, and fold rare types into "Other"
|
||||
.replace({
|
||||
"Flat": "Flats/Maisonettes",
|
||||
"Maisonette": "Flats/Maisonettes",
|
||||
"End-Terrace": "Terraced",
|
||||
"Mid-Terrace": "Terraced",
|
||||
"Enclosed End-Terrace": "Terraced",
|
||||
"Enclosed Mid-Terrace": "Terraced",
|
||||
"Bungalow": "Other",
|
||||
"Park home": "Other",
|
||||
})
|
||||
.alias("property_type")
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
"""Count POIs within a radius of properties, optimized via postcode deduplication."""
|
||||
"""Count POIs within a radius of properties, optimised via postcode deduplication."""
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
|
@ -6,6 +6,49 @@ import polars as pl
|
|||
from .haversine import haversine_km
|
||||
|
||||
|
||||
def _build_poi_grid(
    pois: pl.DataFrame, grid_size: float = 0.05
) -> tuple[np.ndarray, np.ndarray, np.ndarray, dict[tuple[int, int], np.ndarray]]:
    """Build a uniform lat/lng grid index over the POIs.

    Each POI is assigned to the cell ``(floor(lat / grid_size),
    floor(lng / grid_size))``.

    Args:
        pois: Frame with ``lat``, ``lng`` and ``category`` columns.
        grid_size: Cell edge length, in the same units as ``lat``/``lng``.

    Returns:
        ``(lats, lngs, cats, grid)`` where the first three are the POI columns
        as NumPy arrays and ``grid`` maps a cell key to an int32 array of row
        indices (ascending) into those arrays.

    Raises:
        ValueError: If ``grid_size`` is not a positive number.
    """
    # Written as "not >" so that NaN is rejected as well as zero/negatives.
    if not grid_size > 0:
        raise ValueError(f"grid_size must be positive, got {grid_size!r}")

    poi_lats = pois["lat"].to_numpy()
    poi_lngs = pois["lng"].to_numpy()
    poi_cats = pois["category"].to_numpy()

    # POIs with NaN/inf coordinates cannot be placed in a cell (casting them to
    # an integer is undefined), so they are left out of the index. The returned
    # arrays are untouched, so indices still refer to the original rows.
    usable = np.flatnonzero(np.isfinite(poi_lats) & np.isfinite(poi_lngs))
    cell_rows = np.floor(poi_lats[usable] / grid_size).astype(np.int32)
    cell_cols = np.floor(poi_lngs[usable] / grid_size).astype(np.int32)

    # .tolist() yields plain Python ints, so the keys are ordinary int tuples
    # rather than NumPy scalars.
    buckets: dict[tuple[int, int], list[int]] = {}
    for poi_idx, row, col in zip(
        usable.tolist(), cell_rows.tolist(), cell_cols.tolist()
    ):
        buckets.setdefault((row, col), []).append(poi_idx)

    # Arrays allow fancy-indexing into the coordinate arrays later on.
    poi_grid = {
        cell: np.array(members, dtype=np.int32) for cell, members in buckets.items()
    }
    return poi_lats, poi_lngs, poi_cats, poi_grid
|
||||
|
||||
|
||||
def _get_nearby_indices(
|
||||
pc_lat: float, pc_lon: float, poi_grid: dict, grid_size: float = 0.05
|
||||
) -> np.ndarray | None:
|
||||
"""Get POI indices from grid cells near the given coordinate."""
|
||||
grid_lat = int(np.floor(pc_lat / grid_size))
|
||||
grid_lng = int(np.floor(pc_lon / grid_size))
|
||||
|
||||
nearby_indices = []
|
||||
for dlat in [-1, 0, 1]:
|
||||
for dlng in [-1, 0, 1]:
|
||||
cell_key = (grid_lat + dlat, grid_lng + dlng)
|
||||
if cell_key in poi_grid:
|
||||
nearby_indices.append(poi_grid[cell_key])
|
||||
|
||||
if not nearby_indices:
|
||||
return None
|
||||
return np.concatenate(nearby_indices)
|
||||
|
||||
|
||||
def count_pois_per_postcode(
|
||||
postcodes_df: pl.DataFrame,
|
||||
pois: pl.DataFrame,
|
||||
|
|
@ -22,31 +65,9 @@ def count_pois_per_postcode(
|
|||
n_pois = len(pois)
|
||||
print(f" {n_postcodes:,} postcodes, {n_pois:,} POIs")
|
||||
|
||||
# Build spatial grid for POIs (0.05 degree cells ~5.5km)
|
||||
grid_size = 0.05
|
||||
print(" Building POI spatial grid...")
|
||||
|
||||
# Convert to numpy arrays
|
||||
poi_lats = pois["lat"].to_numpy()
|
||||
poi_lngs = pois["lng"].to_numpy()
|
||||
poi_cats = pois["category"].to_numpy()
|
||||
|
||||
# Compute grid coordinates for all POIs
|
||||
poi_grid_lats = np.floor(poi_lats / grid_size).astype(np.int32)
|
||||
poi_grid_lngs = np.floor(poi_lngs / grid_size).astype(np.int32)
|
||||
|
||||
# Build grid cell lookup using numpy indexing
|
||||
poi_grid = {}
|
||||
for i in range(n_pois):
|
||||
key = (poi_grid_lats[i], poi_grid_lngs[i])
|
||||
if key not in poi_grid:
|
||||
poi_grid[key] = []
|
||||
poi_grid[key].append(i)
|
||||
|
||||
# Convert grid values to numpy arrays for faster indexing
|
||||
for key in poi_grid:
|
||||
poi_grid[key] = np.array(poi_grid[key], dtype=np.int32)
|
||||
|
||||
poi_lats, poi_lngs, poi_cats, poi_grid = _build_poi_grid(pois, grid_size)
|
||||
print(f" POI grid has {len(poi_grid):,} occupied cells")
|
||||
|
||||
# Pre-compute category masks
|
||||
|
|
@ -81,38 +102,18 @@ def count_pois_per_postcode(
|
|||
|
||||
# Process batch
|
||||
for i in range(start_idx, end_idx):
|
||||
pc_lat = pc_lats[i]
|
||||
pc_lon = pc_lons[i]
|
||||
|
||||
# Find grid cells to check (3x3 grid)
|
||||
grid_lat = int(np.floor(pc_lat / grid_size))
|
||||
grid_lng = int(np.floor(pc_lon / grid_size))
|
||||
|
||||
# Collect nearby POI indices
|
||||
nearby_indices = []
|
||||
for dlat in [-1, 0, 1]:
|
||||
for dlng in [-1, 0, 1]:
|
||||
cell_key = (grid_lat + dlat, grid_lng + dlng)
|
||||
if cell_key in poi_grid:
|
||||
nearby_indices.append(poi_grid[cell_key])
|
||||
|
||||
if not nearby_indices:
|
||||
nearby = _get_nearby_indices(pc_lats[i], pc_lons[i], poi_grid, grid_size)
|
||||
if nearby is None:
|
||||
continue
|
||||
|
||||
# Concatenate all nearby POI indices
|
||||
nearby = np.concatenate(nearby_indices)
|
||||
distances = haversine_km(poi_lats[nearby], poi_lngs[nearby], pc_lats[i], pc_lons[i])
|
||||
|
||||
# Vectorized distance calculation for all nearby POIs
|
||||
distances = haversine_km(poi_lats[nearby], poi_lngs[nearby], pc_lat, pc_lon)
|
||||
|
||||
# Filter by radius
|
||||
within_mask = distances <= radius_km
|
||||
within_indices = nearby[within_mask]
|
||||
|
||||
if len(within_indices) == 0:
|
||||
continue
|
||||
|
||||
# Count by category group using pre-computed masks
|
||||
for group, cat_mask in category_masks.items():
|
||||
result_counts[group][i] = cat_mask[within_indices].sum()
|
||||
|
||||
|
|
@ -124,3 +125,71 @@ def count_pois_per_postcode(
|
|||
result = pl.DataFrame(result_data)
|
||||
print(" Completed POI counting")
|
||||
return result
|
||||
|
||||
|
||||
def min_distance_per_postcode(
    postcodes_df: pl.DataFrame,
    pois: pl.DataFrame,
    groups: dict[str, list[str]],
) -> pl.DataFrame:
    """Find, per postcode, the distance (km) to the closest POI of each group.

    Only POIs in the 3x3 block of grid cells (0.05 degrees per cell) around a
    postcode are examined. A group's value stays NaN when none of its POIs fall
    inside that block, and a closer POI lying just outside the block is not
    considered.

    Args:
        postcodes_df: Frame with ``postcode``, ``lat`` and ``lon`` columns.
        pois: Frame with ``lat``, ``lng`` and ``category`` columns.
        groups: Maps a group name to the POI categories that belong to it.

    Returns:
        A frame with a ``postcode`` column plus one float32
        ``{group}_nearest_km`` column per group.
    """
    print("Computing minimum POI distances per postcode...")

    n_postcodes = len(postcodes_df)
    n_pois = len(pois)
    print(f" {n_postcodes:,} postcodes, {n_pois:,} POIs")

    grid_size = 0.05
    print(" Building POI spatial grid...")
    poi_lats, poi_lngs, poi_cats, poi_grid = _build_poi_grid(pois, grid_size)
    print(f" POI grid has {len(poi_grid):,} occupied cells")

    # One boolean mask over all POIs per group, computed once up front.
    category_masks = {}
    for group, categories in groups.items():
        category_masks[group] = np.isin(poi_cats, categories)
        print(f" {group}: {category_masks[group].sum():,} POIs")

    pc_lats = postcodes_df["lat"].to_numpy()
    pc_lons = postcodes_df["lon"].to_numpy()
    pc_codes = postcodes_df["postcode"].to_list()

    # NaN marks "no POI of this group found near the postcode".
    nearest = {}
    for group in groups:
        nearest[group] = np.full(n_postcodes, np.nan, dtype=np.float32)

    # Batches exist only to pace the progress output.
    batch_size = 50000
    n_batches = (n_postcodes + batch_size - 1) // batch_size
    print(f" Processing {n_postcodes:,} postcodes in {n_batches} batches...")

    for batch_no, start in enumerate(range(0, n_postcodes, batch_size)):
        stop = min(start + batch_size, n_postcodes)

        if batch_no % 5 == 0:
            print(
                f" Batch {batch_no + 1}/{n_batches}: postcodes {start:,} - {stop:,}"
            )

        for row in range(start, stop):
            nearby = _get_nearby_indices(pc_lats[row], pc_lons[row], poi_grid, grid_size)
            if nearby is None:
                continue

            distances = haversine_km(
                poi_lats[nearby], poi_lngs[nearby], pc_lats[row], pc_lons[row]
            )

            for group, cat_mask in category_masks.items():
                in_group = cat_mask[nearby]
                if in_group.any():
                    nearest[group][row] = distances[in_group].min()

    result_data = {"postcode": pc_codes}
    result_data.update({f"{group}_nearest_km": nearest[group] for group in groups})

    result = pl.DataFrame(result_data)
    print(" Completed minimum distance computation")
    return result
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ from scipy.spatial import cKDTree
|
|||
def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
|
||||
"""Build a mapping from terminated England postcodes to their nearest active postcode.
|
||||
|
||||
Uses OS National Grid coordinates (oseast1m, osnrth1m) which are Cartesian meters,
|
||||
Uses OS National Grid coordinates (oseast1m, osnrth1m) which are Cartesian metres,
|
||||
so Euclidean distance via cKDTree gives accurate results without projection.
|
||||
"""
|
||||
arcgis = pl.scan_parquet(arcgis_path).filter(pl.col("ctry") == "E92000001")
|
||||
|
|
|
|||
|
|
@ -1,13 +1,14 @@
|
|||
import numpy as np
|
||||
import polars as pl
|
||||
import pytest
|
||||
|
||||
from pipeline.utils.poi_counts import count_pois_per_postcode
|
||||
from pipeline.utils.poi_counts import count_pois_per_postcode, min_distance_per_postcode
|
||||
|
||||
POI_GROUPS = {
|
||||
"restaurants": ["Restaurant", "Fast Food"],
|
||||
"groceries": ["Supermarket"],
|
||||
"parks": ["Park"],
|
||||
"public_transport": ["Station"],
|
||||
"train_tube": ["Rail station", "Metro or Tram stop"],
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -23,7 +24,7 @@ def pois():
|
|||
"Fast Food",
|
||||
"Supermarket",
|
||||
"Park",
|
||||
"Station",
|
||||
"Rail station",
|
||||
"Restaurant", # too far from any property
|
||||
],
|
||||
}
|
||||
|
|
@ -55,7 +56,7 @@ def test_counts_pois_within_radius(postcodes, pois):
|
|||
assert ec1a["restaurants_2km"][0] == 2 # Restaurant + Fast Food
|
||||
assert ec1a["groceries_2km"][0] == 1 # Supermarket
|
||||
assert ec1a["parks_2km"][0] == 1 # Park
|
||||
assert ec1a["public_transport_2km"][0] == 1 # Station
|
||||
assert ec1a["train_tube_2km"][0] == 1 # Rail station
|
||||
|
||||
# Far-away postcode should have zero counts
|
||||
zz99 = result.filter(pl.col("postcode") == "ZZ99 9ZZ")
|
||||
|
|
@ -96,3 +97,35 @@ def test_custom_radius(pois):
|
|||
# POIs >100m away should not be counted
|
||||
total = sum(result[f"{g}_0km"][0] for g in POI_GROUPS)
|
||||
assert total <= 2 # at most the co-located POIs
|
||||
|
||||
|
||||
def test_min_distance_finds_nearest(postcodes, pois):
    """Each group's column holds the distance to that group's closest POI."""
    distances = min_distance_per_postcode(postcodes, pois, groups=POI_GROUPS)

    # One output row per input postcode.
    assert len(distances) == 2

    central = distances.filter(pl.col("postcode") == "EC1A 1BB")
    # Rail station is at (51.5073, -0.1277), postcode at (51.5074, -0.1278) — very close
    nearest_rail_km = central["train_tube_nearest_km"][0]
    assert nearest_rail_km < 0.05  # within 50m
    # Restaurant is co-located — distance ~0
    nearest_restaurant_km = central["restaurants_nearest_km"][0]
    assert nearest_restaurant_km < 0.01

    # Far-away postcode should have NaN (no POIs within grid range)
    remote = distances.filter(pl.col("postcode") == "ZZ99 9ZZ")
    assert np.isnan(remote["train_tube_nearest_km"][0])
|
||||
|
||||
|
||||
def test_min_distance_no_pois_returns_nan(postcodes):
    """An empty POI frame yields NaN for every postcode."""
    # Typed empty columns so the frame has the schema the function reads.
    no_pois = pl.DataFrame(
        {
            "lat": pl.Series([], dtype=pl.Float64),
            "lng": pl.Series([], dtype=pl.Float64),
            "category": pl.Series([], dtype=pl.String),
        }
    )
    only_rail = {"train_tube": ["Rail station"]}

    result = min_distance_per_postcode(postcodes, no_pois, groups=only_rail)

    assert "train_tube_nearest_km" in result.columns
    nearest_values = result["train_tube_nearest_km"].to_list()
    assert all(np.isnan(value) for value in nearest_values)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue