"""Compute POI proximity counts and distances per postcode from ArcGIS + filtered POIs.""" import argparse import re import unicodedata from pathlib import Path import polars as pl from pipeline.utils.poi_counts import count_pois_per_postcode, min_distance_per_postcode # POI category groups for proximity counting (2km radius). # Names must match the friendly names produced by transform_poi.py / naptan.py. # "groceries" is filled in dynamically by _groceries_categories() because the # GEOLYTIX dataset stores the brand (e.g. "Tesco", "Aldi") in `category` rather # than the literal "Supermarket"; counting only the OSM strings here severely # understates the metric. See _groceries_categories below. POI_GROUPS_2KM = { "restaurants": ["Restaurant", "Fast Food"], } # POI group whose members are counted for the static "groceries" 2km metric. # Covers both the OSM grocery categories (Supermarket, Convenience Store, # Greengrocer, ...) and the GEOLYTIX brand categories (Tesco, Aldi, ...). GROCERIES_GROUP = "Groceries" # Groceries categories EXCLUDED from the static "Number of grocery shops and # supermarkets within 2km" metric. Bakeries, butchers, delis and off-licences # are speciality food retail, not somewhere you do a grocery shop; together # they were ~a third of the group and inflated the headline count. The metric # keeps Supermarket, Convenience Store, Greengrocer and every GEOLYTIX brand. GROCERY_STATIC_EXCLUDED_CATEGORIES = { "Bakery", "Butcher & Fishmonger", "Deli & Specialty", "Off-Licence", } # OS Open Greenspace function types used for park counts and distance calculation. # Uses the authoritative OS dataset instead of OSM point POIs for better coverage # of green spaces that are only mapped as polygons in OSM. # Scope: "Public Park Or Garden" is the core park function. "Playing Field" # (open public recreation grounds) is borderline but kept: outside big cities # the local rec ground is the de facto park. "Play Space" (playgrounds) is # excluded — a playground is not a park, and "Playground" is already its own # OSM-derived category. The remaining functions (Religious Grounds, Golf # Course, Cemetery, Allotments, Bowling Green, Tennis Court, Other Sports # Facility) are clearly not parks. GREENSPACE_PARK_FUNCTIONS = { "parks": ["Public Park Or Garden", "Playing Field"], } GROCERY_DYNAMIC_FILTER_MIN_POIS = 100 DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure", "Health"} DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS = {"Groceries"} DYNAMIC_FILTER_EXCLUDED_CATEGORIES = {"Park"} def _poi_category_slug(category: str) -> str: ascii_text = ( unicodedata.normalize("NFKD", category) .encode("ascii", "ignore") .decode("ascii") .lower() ) slug = re.sub(r"[^a-z0-9]+", "_", ascii_text).strip("_") return slug or "poi" def _groceries_categories(pois: pl.DataFrame) -> list[str]: """Return the distinct `category` values for the static groceries metric. `count_pois_per_postcode` matches POIs on `category`, but the authoritative GEOLYTIX grocery dataset stores the brand name there (e.g. "Tesco", "Aldi") with group "Groceries"; it never emits the literal "Supermarket". Collecting every Groceries category captures both the OSM strings and the brand names. Speciality food retail (bakeries, butchers, delis, off-licences) is excluded — see GROCERY_STATIC_EXCLUDED_CATEGORIES. """ if "group" not in pois.columns: raise ValueError("POI dataframe must include a 'group' column") return ( pois.filter( (pl.col("group") == GROCERIES_GROUP) & ~pl.col("category").is_in(list(GROCERY_STATIC_EXCLUDED_CATEGORIES)) ) .select("category") .unique() .sort("category") .to_series() .to_list() ) def _build_poi_category_groups( pois: pl.DataFrame, ) -> tuple[dict[str, list[str]], dict[str, str]]: """Build one proximity group for each POI category selected for filters.""" if "group" not in pois.columns: raise ValueError("POI dataframe must include a 'group' column") categories = ( pois.group_by("group", "category") .len() .filter( pl.col("group").is_in(list(DYNAMIC_FILTER_ALL_GROUPS)) | ( pl.col("group").is_in(list(DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS)) & (pl.col("len") > GROCERY_DYNAMIC_FILTER_MIN_POIS) ) ) .filter(~pl.col("category").is_in(list(DYNAMIC_FILTER_EXCLUDED_CATEGORIES))) .select("category") .sort("category") .to_series() .to_list() ) used_slugs: dict[str, int] = {} groups: dict[str, list[str]] = {} display_names: dict[str, str] = {} for category in categories: if not isinstance(category, str) or not category: continue base_slug = f"poi_{_poi_category_slug(category)}" slug_count = used_slugs.get(base_slug, 0) used_slugs[base_slug] = slug_count + 1 group_key = base_slug if slug_count == 0 else f"{base_slug}_{slug_count + 1}" groups[group_key] = [category] display_names[group_key] = category return groups, display_names def _greenspace_count_frame(greenspace: pl.DataFrame) -> pl.DataFrame: """Collapse the greenspace frame to ONE representative row per site. os_greenspace.parquet is one row per ACCESS POINT (park gate), which is the right grain for nearest-distance (the nearest gate is what matters) but wildly over-counts "Number of amenities (Park) within Xkm" — a large park with 30 gates counted as 30 parks. Counting uses one row per site at the site centroid (falling back to the first access point when no centroid is available). Degrades gracefully: a legacy parquet without `site_id` is returned unchanged (gate-grain counts) rather than crashing. """ if "site_id" not in greenspace.columns: print( "WARNING: greenspace parquet has no site_id column; park counts " "will count access points, not sites (regenerate os_greenspace)" ) return greenspace keyed = greenspace.filter(pl.col("site_id").is_not_null()) unkeyed = greenspace.filter(pl.col("site_id").is_null()) representatives = keyed.unique(subset=["site_id"], keep="first") if {"site_lat", "site_lng"}.issubset(greenspace.columns): representatives = representatives.with_columns( pl.coalesce([pl.col("site_lat"), pl.col("lat")]).alias("lat"), pl.coalesce([pl.col("site_lng"), pl.col("lng")]).alias("lng"), ) frames = [representatives.select(greenspace.columns)] if len(unkeyed) > 0: frames.append(unkeyed) return pl.concat(frames) def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]: renames: dict[str, str] = {} for group_key, category in display_names.items(): renames[f"{group_key}_nearest_km"] = ( f"Distance to nearest amenity ({category}) (km)" ) renames[f"{group_key}_2km"] = f"Number of amenities ({category}) within 2km" renames[f"{group_key}_5km"] = f"Number of amenities ({category}) within 5km" return renames def main(): parser = argparse.ArgumentParser( description="Count POIs within radius per postcode" ) parser.add_argument( "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet" ) parser.add_argument( "--pois", type=Path, required=True, help="Filtered POIs parquet" ) parser.add_argument( "--greenspace", type=Path, required=True, help="OS Open Greenspace centroids parquet", ) parser.add_argument( "--output", type=Path, required=True, help="Output parquet path" ) args = parser.parse_args() postcodes = pl.read_parquet(args.arcgis).select( pl.col("pcds").alias("postcode"), "lat", pl.col("long").alias("lon"), ) pois = pl.read_parquet(args.pois) poi_category_groups, poi_display_names = _build_poi_category_groups(pois) # Count static amenity groups within 2km. "groceries" is matched against # every Groceries category (OSM strings + GEOLYTIX brand names) so that # postcodes ringed by GEOLYTIX-only chains (Tesco, Aldi, ...) are counted. groups_2km = { **POI_GROUPS_2KM, "groceries": _groceries_categories(pois), } counts_2km = count_pois_per_postcode( postcodes, pois, groups=groups_2km, radius_km=2 ) # Dynamic amenity filters: nearest distance plus counts within 2km and 5km for # the selected public transport, grocery, and leisure categories. dynamic_counts_2km = count_pois_per_postcode( postcodes, pois, groups=poi_category_groups, radius_km=2 ) dynamic_counts_5km = count_pois_per_postcode( postcodes, pois, groups=poi_category_groups, radius_km=5 ) dynamic_distances = min_distance_per_postcode( postcodes, pois, groups=poi_category_groups ) dynamic_renames = _dynamic_poi_metric_renames(poi_display_names) dynamic_counts_2km = dynamic_counts_2km.rename( {k: v for k, v in dynamic_renames.items() if k in dynamic_counts_2km.columns} ) dynamic_counts_5km = dynamic_counts_5km.rename( {k: v for k, v in dynamic_renames.items() if k in dynamic_counts_5km.columns} ) dynamic_distances = dynamic_distances.rename( {k: v for k, v in dynamic_renames.items() if k in dynamic_distances.columns} ) # Park counts and distances from OS Open Greenspace. They use the dynamic # amenity metric names so filters read through the same side-table path as # OSM-derived amenity metrics. Distances use the access-point grain (the # nearest park GATE is the right semantics); counts use one row per SITE so # a park with many gates counts once. greenspace = pl.read_parquet(args.greenspace) greenspace_sites = _greenspace_count_frame(greenspace) park_counts_2km = count_pois_per_postcode( postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2 ) park_counts_5km = count_pois_per_postcode( postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5 ) park_distances = min_distance_per_postcode( postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS ) park_renames = _dynamic_poi_metric_renames({"parks": "Park"}) park_counts_2km = park_counts_2km.rename( {k: v for k, v in park_renames.items() if k in park_counts_2km.columns} ) park_counts_5km = park_counts_5km.rename( {k: v for k, v in park_renames.items() if k in park_counts_5km.columns} ) park_distances = park_distances.rename( {k: v for k, v in park_renames.items() if k in park_distances.columns} ) # Join all results on postcode result = ( counts_2km.join(dynamic_counts_2km, on="postcode") .join(dynamic_counts_5km, on="postcode") .join(dynamic_distances, on="postcode") .join(park_counts_2km, on="postcode") .join(park_counts_5km, on="postcode") .join(park_distances, on="postcode") ) result.write_parquet(args.output) size_mb = args.output.stat().st_size / (1024 * 1024) print(f"Wrote {args.output} ({size_mb:.1f} MB)") if __name__ == "__main__": main()