135 lines
4.4 KiB
Python
135 lines
4.4 KiB
Python
import numpy as np
|
|
import polars as pl
|
|
import pytest
|
|
|
|
from pipeline.utils.poi_counts import count_pois_per_postcode, min_distance_per_postcode
|
|
|
|
POI_GROUPS = {
|
|
"restaurants": ["Restaurant", "Fast Food"],
|
|
"groceries": ["Supermarket"],
|
|
"parks": ["Park"],
|
|
"train_tube": ["Rail station", "Metro or Tram stop"],
|
|
}
|
|
|
|
|
|
@pytest.fixture
|
|
def pois():
|
|
"""POIs clustered around two locations: central London and 10km away."""
|
|
return pl.DataFrame(
|
|
{
|
|
"lat": [51.5074, 51.5075, 51.5080, 51.5076, 51.5073, 51.60],
|
|
"lng": [-0.1278, -0.1280, -0.1275, -0.1279, -0.1277, -0.20],
|
|
"category": [
|
|
"Restaurant",
|
|
"Fast Food",
|
|
"Supermarket",
|
|
"Park",
|
|
"Rail station",
|
|
"Restaurant", # too far from any property
|
|
],
|
|
}
|
|
)
|
|
|
|
|
|
@pytest.fixture
|
|
def postcodes():
|
|
"""Two postcodes: one near central London, one far away."""
|
|
return pl.DataFrame(
|
|
{
|
|
"postcode": ["EC1A 1BB", "ZZ99 9ZZ"],
|
|
"lat": [51.5074, 55.0],
|
|
"lon": [-0.1278, -3.0],
|
|
}
|
|
)
|
|
|
|
|
|
def test_counts_pois_within_radius(postcodes, pois):
|
|
result = count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=2.0)
|
|
|
|
expected_cols = {f"{g}_2km" for g in POI_GROUPS}
|
|
assert expected_cols.issubset(set(result.columns))
|
|
|
|
# Result must be aligned to postcodes (2 rows)
|
|
assert len(result) == 2
|
|
|
|
ec1a = result.filter(pl.col("postcode") == "EC1A 1BB")
|
|
assert ec1a["restaurants_2km"][0] == 2 # Restaurant + Fast Food
|
|
assert ec1a["groceries_2km"][0] == 1 # Supermarket
|
|
assert ec1a["parks_2km"][0] == 1 # Park
|
|
assert ec1a["train_tube_2km"][0] == 1 # Rail station
|
|
|
|
# Far-away postcode should have zero counts
|
|
zz99 = result.filter(pl.col("postcode") == "ZZ99 9ZZ")
|
|
for group in POI_GROUPS:
|
|
assert zz99[f"{group}_2km"][0] == 0
|
|
|
|
|
|
def test_no_pois_returns_zeros(postcodes):
|
|
empty_pois = pl.DataFrame(
|
|
{
|
|
"lat": pl.Series([], dtype=pl.Float64),
|
|
"lng": pl.Series([], dtype=pl.Float64),
|
|
"category": pl.Series([], dtype=pl.String),
|
|
}
|
|
)
|
|
result = count_pois_per_postcode(
|
|
postcodes, empty_pois, groups=POI_GROUPS, radius_km=2.0
|
|
)
|
|
|
|
for group in POI_GROUPS:
|
|
col = f"{group}_2km"
|
|
assert col in result.columns
|
|
assert result[col].to_list() == [0, 0]
|
|
|
|
|
|
def test_custom_radius(pois):
|
|
"""A tiny radius should exclude POIs that are even slightly away."""
|
|
postcodes = pl.DataFrame(
|
|
{
|
|
"postcode": ["EC1A 1BB"],
|
|
"lat": [51.5074],
|
|
"lon": [-0.1278],
|
|
}
|
|
)
|
|
|
|
# 0.01 km = 10m — only the POI at the exact same location should match
|
|
result = count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=0.01)
|
|
# The Restaurant at (51.5074, -0.1278) is at distance 0
|
|
assert result["restaurants_0km"][0] >= 1
|
|
# POIs >100m away should not be counted
|
|
total = sum(result[f"{g}_0km"][0] for g in POI_GROUPS)
|
|
assert total <= 2 # at most the co-located POIs
|
|
|
|
|
|
def test_min_distance_finds_nearest(postcodes, pois):
|
|
"""min_distance_per_postcode returns distance to closest POI per group."""
|
|
result = min_distance_per_postcode(postcodes, pois, groups=POI_GROUPS)
|
|
|
|
assert len(result) == 2
|
|
|
|
ec1a = result.filter(pl.col("postcode") == "EC1A 1BB")
|
|
# Rail station is at (51.5073, -0.1277), postcode at (51.5074, -0.1278) — very close
|
|
assert ec1a["train_tube_nearest_km"][0] < 0.05 # within 50m
|
|
# Restaurant is co-located — distance ~0
|
|
assert ec1a["restaurants_nearest_km"][0] < 0.01
|
|
|
|
# Far-away postcode should have NaN (no POIs within grid range)
|
|
zz99 = result.filter(pl.col("postcode") == "ZZ99 9ZZ")
|
|
assert np.isnan(zz99["train_tube_nearest_km"][0])
|
|
|
|
|
|
def test_min_distance_no_pois_returns_nan(postcodes):
|
|
"""With no POIs, all distances should be NaN."""
|
|
empty_pois = pl.DataFrame(
|
|
{
|
|
"lat": pl.Series([], dtype=pl.Float64),
|
|
"lng": pl.Series([], dtype=pl.Float64),
|
|
"category": pl.Series([], dtype=pl.String),
|
|
}
|
|
)
|
|
result = min_distance_per_postcode(
|
|
postcodes, empty_pois, groups={"train_tube": ["Rail station"]}
|
|
)
|
|
|
|
assert "train_tube_nearest_km" in result.columns
|
|
assert all(np.isnan(v) for v in result["train_tube_nearest_km"].to_list())
|