perfect-postcode/pipeline/transform/poi_proximity.py
Andras Schmelczer be02fc16bb
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 8m20s
CI / Check (push) Failing after 10m40s
Test changes
2026-05-09 11:35:38 +01:00

197 lines
6.6 KiB
Python

"""Compute POI proximity counts and distances per postcode from ArcGIS + filtered POIs."""
import argparse
import re
import unicodedata
from pathlib import Path
import polars as pl
from pipeline.utils.poi_counts import count_pois_per_postcode, min_distance_per_postcode
# POI category groups for proximity counting (2km radius).
# Names must match the friendly names produced by transform_poi.py / naptan.py.
# Each key is a group name; each value lists the POI categories that are
# counted together for that group.
POI_GROUPS_2KM = {
    "restaurants": ["Restaurant", "Fast Food"],
    "groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
}
# Groups for which to compute distance to nearest POI (from filtered POIs).
# Keep `train_tube` for the existing backend feature; the individual POI
# distance filters below power the frontend dropdown.
DISTANCE_GROUPS = {
    "train_tube": ["Tube station", "Rail station"],
    "grocery_store": [
        "Greengrocer",
        "Supermarket",
        "Convenience Store",
        "Waitrose",
        "Tesco",
    ],
    "tube_station": ["Tube station"],
    "rail_station": ["Rail station"],
    "waitrose": ["Waitrose"],
    "tesco": ["Tesco"],
    "cafe": ["Café"],
    "pub": ["Pub"],
    "restaurant": ["Restaurant"],
}
# OS Open Greenspace function types used for park counts and distance calculation.
# Uses the authoritative OS dataset instead of OSM point POIs for better coverage
# of green spaces that are only mapped as polygons in OSM.
GREENSPACE_PARK_FUNCTIONS = {
    "parks": ["Public Park Or Garden", "Playing Field", "Play Space"],
}
# Dynamic per-category filters (see _build_poi_category_groups):
# a category belonging to one of DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS is only
# selected when it has strictly more than this many POIs in the input.
GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
# POI groups whose categories are all selected, regardless of how many POIs
# each category has.
DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure"}
# POI groups whose categories must pass the GROCERY_DYNAMIC_FILTER_MIN_POIS
# count threshold to be selected.
DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS = {"Groceries"}
def _poi_category_slug(category: str) -> str:
ascii_text = (
unicodedata.normalize("NFKD", category)
.encode("ascii", "ignore")
.decode("ascii")
.lower()
)
slug = re.sub(r"[^a-z0-9]+", "_", ascii_text).strip("_")
return slug or "poi"
def _build_poi_category_groups(
    pois: pl.DataFrame,
) -> tuple[dict[str, list[str]], dict[str, str]]:
    """Build one proximity group for each POI category selected for filters.

    A category is selected when its ``group`` is in
    ``DYNAMIC_FILTER_ALL_GROUPS``, or when its ``group`` is in
    ``DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS`` and the category has strictly
    more than ``GROCERY_DYNAMIC_FILTER_MIN_POIS`` rows.

    Args:
        pois: POI dataframe with at least ``group`` and ``category`` columns.

    Returns:
        A ``(groups, display_names)`` pair keyed by the same group keys.
        ``groups`` maps each key (``poi_<slug>``, with a numeric suffix when
        slugs collide) to a single-element list holding the category name;
        ``display_names`` maps the key to that original category name.

    Raises:
        ValueError: If ``pois`` has no ``group`` column.
    """
    if "group" not in pois.columns:
        raise ValueError("POI dataframe must include a 'group' column")

    categories = (
        pois.group_by("group", "category")
        .len()
        .filter(
            pl.col("group").is_in(list(DYNAMIC_FILTER_ALL_GROUPS))
            | (
                pl.col("group").is_in(list(DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS))
                & (pl.col("len") > GROCERY_DYNAMIC_FILTER_MIN_POIS)
            )
        )
        .select("category")
        .sort("category")
        .to_series()
        .to_list()
    )

    groups: dict[str, list[str]] = {}
    display_names: dict[str, str] = {}
    seen_categories: set[str] = set()
    for category in categories:
        # Skip null / empty category values.
        if not isinstance(category, str) or not category:
            continue
        # The same category name can be selected via more than one POI group;
        # emitting it twice would create two groups with identical display
        # names (and therefore duplicate friendly column names downstream).
        if category in seen_categories:
            continue
        seen_categories.add(category)

        base_key = f"poi_{_poi_category_slug(category)}"
        # Distinct categories can share a slug (e.g. case or accent variants).
        # Probe for the first free key so a generated "<base>_2" can never
        # overwrite a category whose own slug already is "<base>_2".
        group_key = base_key
        suffix = 2
        while group_key in groups:
            group_key = f"{base_key}_{suffix}"
            suffix += 1

        groups[group_key] = [category]
        display_names[group_key] = category
    return groups, display_names
def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
renames: dict[str, str] = {}
for group_key, category in display_names.items():
renames[f"{group_key}_nearest_km"] = f"Distance to nearest {category} POI (km)"
renames[f"{group_key}_2km"] = f"Number of {category} POIs within 2km"
renames[f"{group_key}_5km"] = f"Number of {category} POIs within 5km"
return renames
def main():
    """Command-line entry point: compute per-postcode POI proximity metrics.

    Reads the ArcGIS postcode parquet, the filtered POIs parquet and the
    OS Open Greenspace parquet, computes count and nearest-distance metrics
    with ``count_pois_per_postcode`` / ``min_distance_per_postcode``, joins
    every result on ``postcode`` and writes a single parquet file.
    """
    cli = argparse.ArgumentParser(
        description="Count POIs within radius per postcode"
    )
    # All options are required filesystem paths; registered in a fixed order.
    path_options = (
        ("--arcgis", "ArcGIS postcode parquet"),
        ("--pois", "Filtered POIs parquet"),
        ("--greenspace", "OS Open Greenspace centroids parquet"),
        ("--output", "Output parquet path"),
    )
    for flag, help_text in path_options:
        cli.add_argument(flag, type=Path, required=True, help=help_text)
    args = cli.parse_args()

    # Normalise the ArcGIS column names to postcode / lat / lon.
    arcgis = pl.read_parquet(args.arcgis)
    postcodes = arcgis.select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lon"),
    )

    pois = pl.read_parquet(args.pois)
    category_groups, category_labels = _build_poi_category_groups(pois)

    # Count amenity POIs within 2km
    counts_2km = count_pois_per_postcode(
        postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2
    )

    # Dynamic POI filters: nearest distance plus counts within 2km and 5km for
    # the selected public transport, grocery, and leisure categories.
    per_category_2km = count_pois_per_postcode(
        postcodes, pois, groups=category_groups, radius_km=2
    )
    per_category_5km = count_pois_per_postcode(
        postcodes, pois, groups=category_groups, radius_km=5
    )
    per_category_nearest = min_distance_per_postcode(
        postcodes, pois, groups=category_groups
    )

    friendly_names = _dynamic_poi_metric_renames(category_labels)

    def _with_friendly_names(frame):
        # Only rename the raw metric columns this particular frame contains.
        applicable = {
            raw: label
            for raw, label in friendly_names.items()
            if raw in frame.columns
        }
        return frame.rename(applicable)

    per_category_2km = _with_friendly_names(per_category_2km)
    per_category_5km = _with_friendly_names(per_category_5km)
    per_category_nearest = _with_friendly_names(per_category_nearest)

    # Distance to the nearest POI for each fixed DISTANCE_GROUPS entry
    # (stations, grocery stores, cafés, pubs, restaurants).
    distances = min_distance_per_postcode(postcodes, pois, groups=DISTANCE_GROUPS)

    # Park counts and distances from OS Open Greenspace
    greenspace = pl.read_parquet(args.greenspace)
    park_counts_1km = count_pois_per_postcode(
        postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=1
    )
    park_distances = min_distance_per_postcode(
        postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS
    )

    # Join all results on postcode, starting from the 2km amenity counts.
    result = counts_2km
    for frame in (
        distances,
        per_category_2km,
        per_category_5km,
        per_category_nearest,
        park_counts_1km,
        park_distances,
    ):
        result = result.join(frame, on="postcode")

    result.write_parquet(args.output)

    bytes_per_mb = 1024 * 1024
    size_mb = args.output.stat().st_size / bytes_per_mb
    print(f"Wrote {args.output} ({size_mb:.1f} MB)")
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()