Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@ -1,9 +1,11 @@
import polars as pl
from pipeline.transform.poi_proximity import (
GREENSPACE_PARK_FUNCTIONS,
POI_GROUPS_2KM,
_build_poi_category_groups,
_dynamic_poi_metric_renames,
_greenspace_count_frame,
_groceries_categories,
)
from pipeline.utils.poi_counts import count_pois_per_postcode
@ -88,3 +90,84 @@ def test_dynamic_poi_metric_renames_support_park_count_options() -> None:
"parks_2km": "Number of amenities (Park) within 2km",
"parks_5km": "Number of amenities (Park) within 5km",
}
def test_groceries_categories_exclude_speciality_food_retail() -> None:
    """Check which categories feed the static groceries metric.

    Speciality food retail (bakeries, butchers, delis, off-licences —
    roughly a third of the group) must be left out, while Supermarket,
    Convenience Store, Greengrocer and GEOLYTIX brands stay in.
    """
    # Every category below is tagged with the "Groceries" group…
    grocery_categories = [
        "Tesco",
        "Supermarket",
        "Convenience Store",
        "Greengrocer",
        "Bakery",
        "Butcher & Fishmonger",
        "Deli & Specialty",
        "Off-Licence",
    ]
    # …and this one belongs to a different group altogether.
    leisure_categories = ["Café"]

    all_categories = grocery_categories + leisure_categories
    row_count = len(all_categories)
    group_labels = ["Groceries"] * len(grocery_categories) + ["Leisure"] * len(
        leisure_categories
    )
    pois = pl.DataFrame(
        {
            "category": all_categories,
            "group": group_labels,
            "lat": [51.5] * row_count,
            "lng": [-0.1] * row_count,
        }
    )

    expected = ["Convenience Store", "Greengrocer", "Supermarket", "Tesco"]
    assert _groceries_categories(pois) == expected
def test_park_group_excludes_playgrounds_and_play_space() -> None:
    """Pin the greenspace functions that make up the "parks" group."""
    # Playgrounds ("Play Space") are deliberately absent: only Public Park Or
    # Garden and Playing Field (open recreation grounds) count as a Park.
    expected_functions = {"parks": ["Public Park Or Garden", "Playing Field"]}
    assert GREENSPACE_PARK_FUNCTIONS == expected_functions
def test_greenspace_count_frame_collapses_to_one_row_per_site() -> None:
    """Access points sharing a site_id must collapse to a single row."""
    # Fixture rows, in order:
    #   0-2  three gates of site-a, each carrying the same site centroid
    #   3    the only gate of site-b, which has no centroid
    #   4    a centroid-fallback row whose site_id is null
    park = "Public Park Or Garden"
    centroid_lat, centroid_lng = 51.505, -0.105
    greenspace = pl.DataFrame(
        {
            "lat": [51.50, 51.51, 51.52, 53.0, 54.0],
            "lng": [-0.10, -0.11, -0.12, -2.0, -3.0],
            "category": [park, park, park, "Playing Field", park],
            "site_id": ["site-a"] * 3 + ["site-b", None],
            "site_lat": [centroid_lat] * 3 + [None, None],
            "site_lng": [centroid_lng] * 3 + [None, None],
        }
    )

    collapsed = _greenspace_count_frame(greenspace)
    result = collapsed.sort("lat")

    # site-a shrinks from three rows to one and the null-site row is kept,
    # so three rows remain in total.
    assert result.height == 3

    # With a centroid available, it is used as the representative point.
    is_site_a = pl.col("site_id") == "site-a"
    site_a = result.filter(is_site_a)
    assert site_a["lat"].to_list() == [centroid_lat]
    assert site_a["lng"].to_list() == [centroid_lng]

    # Without a centroid, the first access point stands in for the site.
    is_site_b = pl.col("site_id") == "site-b"
    site_b = result.filter(is_site_b)
    assert site_b["lat"].to_list() == [53.0]
def test_greenspace_count_frame_passes_legacy_parquet_through() -> None:
    """A frame without a site_id column must come back unchanged."""
    # The parquet currently shipped was written before site_id existed, so
    # counting has to tolerate it (staying at access-point grain until the
    # file is regenerated) rather than crash.
    legacy_columns = {
        "lat": [51.50, 51.51],
        "lng": [-0.10, -0.11],
        "category": ["Public Park Or Garden", "Play Space"],
    }
    legacy = pl.DataFrame(legacy_columns)

    passthrough = _greenspace_count_frame(legacy)
    assert passthrough.equals(legacy)