"""Compute POI proximity counts and distances per postcode from ArcGIS + filtered POIs.""" import argparse import re import unicodedata from pathlib import Path import polars as pl from pipeline.utils.poi_counts import count_pois_per_postcode, min_distance_per_postcode # POI category groups for proximity counting (2km radius). # Names must match the friendly names produced by transform_poi.py / naptan.py. POI_GROUPS_2KM = { "restaurants": ["Restaurant", "Fast Food"], "groceries": ["Greengrocer", "Supermarket", "Convenience Store"], } # OS Open Greenspace function types used for park counts and distance calculation. # Uses the authoritative OS dataset instead of OSM point POIs for better coverage # of green spaces that are only mapped as polygons in OSM. GREENSPACE_PARK_FUNCTIONS = { "parks": ["Public Park Or Garden", "Playing Field", "Play Space"], } GROCERY_DYNAMIC_FILTER_MIN_POIS = 100 DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure"} DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS = {"Groceries"} DYNAMIC_FILTER_EXCLUDED_CATEGORIES = {"Park"} def _poi_category_slug(category: str) -> str: ascii_text = ( unicodedata.normalize("NFKD", category) .encode("ascii", "ignore") .decode("ascii") .lower() ) slug = re.sub(r"[^a-z0-9]+", "_", ascii_text).strip("_") return slug or "poi" def _build_poi_category_groups( pois: pl.DataFrame, ) -> tuple[dict[str, list[str]], dict[str, str]]: """Build one proximity group for each POI category selected for filters.""" if "group" not in pois.columns: raise ValueError("POI dataframe must include a 'group' column") categories = ( pois.group_by("group", "category") .len() .filter( pl.col("group").is_in(list(DYNAMIC_FILTER_ALL_GROUPS)) | ( pl.col("group").is_in(list(DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS)) & (pl.col("len") > GROCERY_DYNAMIC_FILTER_MIN_POIS) ) ) .filter(~pl.col("category").is_in(list(DYNAMIC_FILTER_EXCLUDED_CATEGORIES))) .select("category") .sort("category") .to_series() .to_list() ) used_slugs: dict[str, int] = {} groups: dict[str, list[str]] = {} display_names: dict[str, str] = {} for category in categories: if not isinstance(category, str) or not category: continue base_slug = f"poi_{_poi_category_slug(category)}" slug_count = used_slugs.get(base_slug, 0) used_slugs[base_slug] = slug_count + 1 group_key = base_slug if slug_count == 0 else f"{base_slug}_{slug_count + 1}" groups[group_key] = [category] display_names[group_key] = category return groups, display_names def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]: renames: dict[str, str] = {} for group_key, category in display_names.items(): renames[f"{group_key}_nearest_km"] = ( f"Distance to nearest amenity ({category}) (km)" ) renames[f"{group_key}_2km"] = f"Number of amenities ({category}) within 2km" renames[f"{group_key}_5km"] = f"Number of amenities ({category}) within 5km" return renames def main(): parser = argparse.ArgumentParser( description="Count POIs within radius per postcode" ) parser.add_argument( "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet" ) parser.add_argument( "--pois", type=Path, required=True, help="Filtered POIs parquet" ) parser.add_argument( "--greenspace", type=Path, required=True, help="OS Open Greenspace centroids parquet", ) parser.add_argument( "--output", type=Path, required=True, help="Output parquet path" ) args = parser.parse_args() postcodes = pl.read_parquet(args.arcgis).select( pl.col("pcds").alias("postcode"), "lat", pl.col("long").alias("lon"), ) pois = pl.read_parquet(args.pois) poi_category_groups, poi_display_names = _build_poi_category_groups(pois) # Count static amenity groups within 2km. counts_2km = count_pois_per_postcode( postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2 ) # Dynamic amenity filters: nearest distance plus counts within 2km and 5km for # the selected public transport, grocery, and leisure categories. dynamic_counts_2km = count_pois_per_postcode( postcodes, pois, groups=poi_category_groups, radius_km=2 ) dynamic_counts_5km = count_pois_per_postcode( postcodes, pois, groups=poi_category_groups, radius_km=5 ) dynamic_distances = min_distance_per_postcode( postcodes, pois, groups=poi_category_groups ) dynamic_renames = _dynamic_poi_metric_renames(poi_display_names) dynamic_counts_2km = dynamic_counts_2km.rename( {k: v for k, v in dynamic_renames.items() if k in dynamic_counts_2km.columns} ) dynamic_counts_5km = dynamic_counts_5km.rename( {k: v for k, v in dynamic_renames.items() if k in dynamic_counts_5km.columns} ) dynamic_distances = dynamic_distances.rename( {k: v for k, v in dynamic_renames.items() if k in dynamic_distances.columns} ) # Park counts and distances from OS Open Greenspace. They use the dynamic # amenity metric names so filters read through the same side-table path as # OSM-derived amenity metrics. greenspace = pl.read_parquet(args.greenspace) park_counts_2km = count_pois_per_postcode( postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2 ) park_counts_5km = count_pois_per_postcode( postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5 ) park_distances = min_distance_per_postcode( postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS ) park_renames = _dynamic_poi_metric_renames({"parks": "Park"}) park_counts_2km = park_counts_2km.rename( {k: v for k, v in park_renames.items() if k in park_counts_2km.columns} ) park_counts_5km = park_counts_5km.rename( {k: v for k, v in park_renames.items() if k in park_counts_5km.columns} ) park_distances = park_distances.rename( {k: v for k, v in park_renames.items() if k in park_distances.columns} ) # Join all results on postcode result = ( counts_2km.join(dynamic_counts_2km, on="postcode") .join(dynamic_counts_5km, on="postcode") .join(dynamic_distances, on="postcode") .join(park_counts_2km, on="postcode") .join(park_counts_5km, on="postcode") .join(park_distances, on="postcode") ) result.write_parquet(args.output) size_mb = args.output.stat().st_size / (1024 * 1024) print(f"Wrote {args.output} ({size_mb:.1f} MB)") if __name__ == "__main__": main()