import polars as pl

from pipeline.transform.poi_proximity import (
    POI_GROUPS_2KM,
    _build_poi_category_groups,
    _dynamic_poi_metric_renames,
    _groceries_categories,
)
from pipeline.utils.poi_counts import count_pois_per_postcode


def test_groceries_2km_counts_geolytix_brand_categories() -> None:
    """Check that the groceries 2km count picks up GEOLYTIX brand POIs.

    GEOLYTIX puts the brand name (e.g. "Tesco") in `category` under the
    group "Groceries" and never emits the literal "Supermarket". A matcher
    that only knows the OSM strings therefore counts the supermarket row
    and silently loses the brand store.
    """
    # One postcode with two grocery POIs right next to it: one
    # GEOLYTIX-style brand row and one OSM-style "Supermarket" row.
    postcode_frame = pl.DataFrame(
        {
            "postcode": ["SW1A 1AA"],
            "lat": [51.5010],
            "lon": [-0.1416],
        }
    )
    poi_frame = pl.DataFrame(
        {
            "category": ["Tesco", "Supermarket"],
            "group": ["Groceries", "Groceries"],
            "lat": [51.5011, 51.5012],
            "lng": [-0.1417, -0.1418],
        }
    )

    # Start from the static 2km groups and swap in the groceries
    # categories derived from the POI data itself.
    groups_2km = dict(POI_GROUPS_2KM)
    groups_2km["groceries"] = _groceries_categories(poi_frame)

    counts = count_pois_per_postcode(
        postcode_frame,
        poi_frame,
        groups=groups_2km,
        radius_km=2,
    )

    # Both rows must be counted. Before the fix the static list was
    # ["Greengrocer", "Supermarket", "Convenience Store"], which dropped
    # "Tesco" and produced a count of 1.
    groceries_within_2km = counts["groceries_2km"][0]
    assert groceries_within_2km == 2


def test_dynamic_poi_groups_include_requested_categories_only() -> None:
    """Check which categories end up in the dynamically built POI groups.

    The fixture mixes categories of very different sizes; the test pins
    the exact set of display names produced and confirms that Waitrose,
    Park and School get no group of their own.
    """
    # (group, category, number of rows), in fixture row order.
    fixture_spec = [
        ("Public Transport", "Rail station", 1),
        ("Public Transport", "Bus stop", 1),
        ("Leisure", "Café", 1),
        ("Leisure", "Restaurant", 1),
        ("Groceries", "Tesco", 101),
        ("Groceries", "Waitrose", 100),
        ("Leisure", "Park", 10),
        ("Education", "School", 200),
        ("Health", "Pharmacy", 200),
    ]

    group_column: list[str] = []
    category_column: list[str] = []
    for group_name, category_name, row_count in fixture_spec:
        group_column.extend([group_name] * row_count)
        category_column.extend([category_name] * row_count)
    total_rows = len(category_column)

    # Every POI shares one coordinate; only the category mix matters here.
    poi_frame = pl.DataFrame(
        {
            "group": group_column,
            "category": category_column,
            "lat": [51.5] * total_rows,
            "lng": [-0.1] * total_rows,
        }
    )

    groups, display_names = _build_poi_category_groups(poi_frame)

    expected_display_names = {
        "Bus stop",
        "Café",
        "Pharmacy",
        "Rail station",
        "Restaurant",
        "Tesco",
    }
    assert set(display_names.values()) == expected_display_names

    for excluded_key in ("poi_waitrose", "poi_park", "poi_school"):
        assert excluded_key not in groups


def test_dynamic_poi_metric_renames_support_park_count_options() -> None:
    """Check the rename map produced for a single "parks" -> "Park" entry.

    The nearest-distance, 2km-count and 5km-count columns must each map
    to their human-readable label.
    """
    expected_renames = {
        "parks_nearest_km": "Distance to nearest amenity (Park) (km)",
        "parks_2km": "Number of amenities (Park) within 2km",
        "parks_5km": "Number of amenities (Park) within 5km",
    }

    actual_renames = _dynamic_poi_metric_renames({"parks": "Park"})

    assert actual_renames == expected_renames