Fix data pipelines once and for all
This commit is contained in:
parent
08560476c5
commit
4012e4e047
46 changed files with 4508 additions and 855 deletions
|
|
@ -1,9 +1,11 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.poi_proximity import (
|
||||
GREENSPACE_PARK_FUNCTIONS,
|
||||
POI_GROUPS_2KM,
|
||||
_build_poi_category_groups,
|
||||
_dynamic_poi_metric_renames,
|
||||
_greenspace_count_frame,
|
||||
_groceries_categories,
|
||||
)
|
||||
from pipeline.utils.poi_counts import count_pois_per_postcode
|
||||
|
|
@ -88,3 +90,84 @@ def test_dynamic_poi_metric_renames_support_park_count_options() -> None:
|
|||
"parks_2km": "Number of amenities (Park) within 2km",
|
||||
"parks_5km": "Number of amenities (Park) within 5km",
|
||||
}
|
||||
|
||||
|
||||
def test_groceries_categories_exclude_speciality_food_retail() -> None:
    """Check that the static groceries metric skips speciality food retail.

    Bakeries, butchers, delis and off-licences (speciality retail, ~a third
    of the group) must not be counted, while Supermarket, Convenience Store,
    Greengrocer and GEOLYTIX brands are kept.
    """
    # Every category tagged "Groceries" in the fixture, kept and excluded alike.
    grocery_categories = [
        "Tesco",
        "Supermarket",
        "Convenience Store",
        "Greengrocer",
        "Bakery",
        "Butcher & Fishmonger",
        "Deli & Specialty",
        "Off-Licence",
    ]
    # One non-grocery row so the fixture also carries a different group.
    categories = [*grocery_categories, "Café"]
    row_count = len(categories)

    pois = pl.DataFrame(
        {
            "category": categories,
            "group": ["Groceries"] * len(grocery_categories) + ["Leisure"],
            "lat": [51.5] * row_count,
            "lng": [-0.1] * row_count,
        }
    )

    expected = ["Convenience Store", "Greengrocer", "Supermarket", "Tesco"]
    assert _groceries_categories(pois) == expected
|
||||
|
||||
|
||||
def test_park_group_excludes_playgrounds_and_play_space() -> None:
    """Pin the greenspace functions that make up the "parks" group.

    "Play Space" (playgrounds) must not count as a Park; Public Park Or
    Garden and Playing Field (open recreation grounds) are in scope.
    """
    in_scope_functions = ["Public Park Or Garden", "Playing Field"]
    assert GREENSPACE_PARK_FUNCTIONS == {"parks": in_scope_functions}
|
||||
|
||||
|
||||
def test_greenspace_count_frame_collapses_to_one_row_per_site() -> None:
    """Check that access-point rows are collapsed to one row per site.

    The fixture holds three gates of one park (with a site centroid), one
    gate of another park without a centroid, and one centroid-fallback row
    with a null site_id.
    """
    centroid_lat = 51.505
    centroid_lng = -0.105
    park = "Public Park Or Garden"

    greenspace = pl.DataFrame(
        {
            "lat": [51.50, 51.51, 51.52, 53.0, 54.0],
            "lng": [-0.10, -0.11, -0.12, -2.0, -3.0],
            "category": [park] * 3 + ["Playing Field", park],
            "site_id": ["site-a"] * 3 + ["site-b", None],
            "site_lat": [centroid_lat] * 3 + [None, None],
            "site_lng": [centroid_lng] * 3 + [None, None],
        }
    )

    collapsed = _greenspace_count_frame(greenspace).sort("lat")

    # One row per site (site-a collapses 3 -> 1), null-site rows preserved.
    assert collapsed.height == 3

    # The representative point is the site centroid...
    site_a_rows = collapsed.filter(pl.col("site_id") == "site-a")
    assert site_a_rows["lat"].to_list() == [centroid_lat]
    assert site_a_rows["lng"].to_list() == [centroid_lng]

    # ...or the first access point when no centroid is available.
    site_b_rows = collapsed.filter(pl.col("site_id") == "site-b")
    assert site_b_rows["lat"].to_list() == [53.0]
|
||||
|
||||
|
||||
def test_greenspace_count_frame_passes_legacy_parquet_through() -> None:
    """Check that a frame without a site_id column is returned unchanged.

    The shipped parquet predates the site_id column; counting must not crash
    (it keeps the old access-point grain until regenerated).
    """
    legacy_columns = {
        "lat": [51.50, 51.51],
        "lng": [-0.10, -0.11],
        "category": ["Public Park Or Garden", "Play Space"],
    }
    legacy = pl.DataFrame(legacy_columns)

    counted = _greenspace_count_frame(legacy)

    assert counted.equals(legacy)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue