This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@ -12,11 +12,19 @@ from pipeline.utils.poi_counts import count_pois_per_postcode, min_distance_per_
# POI category groups for proximity counting (2km radius).
# Names must match the friendly names produced by transform_poi.py / naptan.py.
# "groceries" is filled in dynamically by _groceries_categories() because the
# GEOLYTIX dataset stores the brand (e.g. "Tesco", "Aldi") in `category` rather
# than the literal "Supermarket"; counting only the OSM strings here severely
# understates the metric. See _groceries_categories below.
POI_GROUPS_2KM = {
    # Eating-out venues.
    "restaurants": [
        "Restaurant",
        "Fast Food",
    ],
    # Static baseline of literal grocery category names. The 2km counting
    # step in main() replaces this entry with _groceries_categories(pois).
    "groceries": [
        "Greengrocer",
        "Supermarket",
        "Convenience Store",
    ],
}

# Value of the POI `group` column that _groceries_categories() filters on to
# collect the categories counted for the static "groceries" 2km metric: the
# OSM grocery categories (Supermarket, Convenience Store, Greengrocer, ...)
# as well as the GEOLYTIX brand categories (Tesco, Aldi, ...).
GROCERIES_GROUP = "Groceries"
# OS Open Greenspace function types used for park counts and distance calculation.
# Uses the authoritative OS dataset instead of OSM point POIs for better coverage
# of green spaces that are only mapped as polygons in OSM.
@ -41,6 +49,26 @@ def _poi_category_slug(category: str) -> str:
return slug or "poi"
def _groceries_categories(pois: pl.DataFrame) -> list[str]:
    """List every distinct `category` carried by rows in the Groceries group.

    POI counting matches on `category`, and the GEOLYTIX grocery data puts the
    brand name (e.g. "Tesco", "Aldi") in that column under group "Groceries"
    instead of the literal "Supermarket". Selecting by group rather than by a
    fixed list of names therefore picks up the OSM strings and the brand names
    alike.

    Args:
        pois: POI dataframe; must have a `group` column and is read for its
            `category` column.

    Returns:
        The unique `category` values of rows whose `group` equals
        GROCERIES_GROUP, sorted by category.

    Raises:
        ValueError: If `pois` has no `group` column.
    """
    if "group" not in pois.columns:
        raise ValueError("POI dataframe must include a 'group' column")

    grocery_rows = pois.filter(pl.col("group") == GROCERIES_GROUP)
    # De-duplicate, then sort by category before flattening to a plain list.
    distinct_categories = grocery_rows.select("category").unique().sort("category")
    return distinct_categories.to_series().to_list()
def _build_poi_category_groups(
pois: pl.DataFrame,
) -> tuple[dict[str, list[str]], dict[str, str]]:
@ -122,9 +150,15 @@ def main():
pois = pl.read_parquet(args.pois)
poi_category_groups, poi_display_names = _build_poi_category_groups(pois)
# Count static amenity groups within 2km.
# Count static amenity groups within 2km. "groceries" is matched against
# every Groceries category (OSM strings + GEOLYTIX brand names) so that
# postcodes ringed by GEOLYTIX-only chains (Tesco, Aldi, ...) are counted.
groups_2km = {
**POI_GROUPS_2KM,
"groceries": _groceries_categories(pois),
}
counts_2km = count_pois_per_postcode(
postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2
postcodes, pois, groups=groups_2km, radius_km=2
)
# Dynamic amenity filters: nearest distance plus counts within 2km and 5km for