idk

2026-06-02 13:46:18 +01:00 · 2026-06-02 13:46:18 +01:00 · d43da9708c
commit d43da9708c
parent a04ac2d857
47 changed files with 4120 additions and 573 deletions
--- a/pipeline/transform/poi_proximity.py
+++ b/pipeline/transform/poi_proximity.py
@@ -12,11 +12,19 @@ from pipeline.utils.poi_counts import count_pois_per_postcode, min_distance_per_

 # POI category groups for proximity counting (2km radius).
 # Names must match the friendly names produced by transform_poi.py / naptan.py.
+# "groceries" is filled in dynamically by _groceries_categories() because the
+# GEOLYTIX dataset stores the brand (e.g. "Tesco", "Aldi") in `category` rather
+# than the literal "Supermarket"; counting only the OSM strings here severely
+# understates the metric. See _groceries_categories below.
 POI_GROUPS_2KM = {
    "restaurants": ["Restaurant", "Fast Food"],
-    "groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
 }

+# Value of the POI `group` column whose categories feed the static "groceries"
+# 2km metric. Covers both the OSM grocery categories (Supermarket, Convenience
+# Store, Greengrocer, ...) and the GEOLYTIX brand categories (Tesco, Aldi, ...).
+GROCERIES_GROUP = "Groceries"
+
 # OS Open Greenspace function types used for park counts and distance calculation.
 # Uses the authoritative OS dataset instead of OSM point POIs for better coverage
 # of green spaces that are only mapped as polygons in OSM.
@@ -41,6 +49,26 @@ def _poi_category_slug(category: str) -> str:
    return slug or "poi"


+def _groceries_categories(pois: pl.DataFrame) -> list[str]:
+    """List the sorted, de-duplicated `category` values of the Groceries group.
+
+    Rows are picked by `group` rather than from a fixed category list: the
+    GEOLYTIX grocery dataset puts the brand name (e.g. "Tesco", "Aldi") in
+    `category` under group "Groceries" and never emits the literal "Supermarket".
+
+    Raises:
+        ValueError: If `pois` has no `group` column.
+    """
+    if "group" not in pois.columns:
+        raise ValueError("POI dataframe must include a 'group' column")
+
+    grocery_rows = pois.filter(pl.col("group") == GROCERIES_GROUP)
+    distinct_categories = grocery_rows.select("category").unique()
+    # unique() does not promise an order by default, so sort for a stable list.
+    ordered = distinct_categories.sort("category")
+    return ordered.to_series().to_list()
+
+
 def _build_poi_category_groups(
    pois: pl.DataFrame,
 ) -> tuple[dict[str, list[str]], dict[str, str]]:
@@ -122,9 +150,15 @@ def main():
    pois = pl.read_parquet(args.pois)
    poi_category_groups, poi_display_names = _build_poi_category_groups(pois)

-    # Count static amenity groups within 2km.
+    # Count static amenity groups within 2km. "groceries" is matched against
+    # every Groceries category (OSM strings + GEOLYTIX brand names) so that
+    # postcodes ringed by GEOLYTIX-only chains (Tesco, Aldi, ...) are counted.
+    groups_2km = {
+        **POI_GROUPS_2KM,
+        "groceries": _groceries_categories(pois),
+    }
    counts_2km = count_pois_per_postcode(
-        postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2
+        postcodes, pois, groups=groups_2km, radius_km=2
    )

    # Dynamic amenity filters: nearest distance plus counts within 2km and 5km for