287 lines
11 KiB
Python
287 lines
11 KiB
Python
"""Compute POI proximity counts and distances per postcode from ArcGIS + filtered POIs."""
|
|
|
|
import argparse
|
|
import re
|
|
import unicodedata
|
|
from pathlib import Path
|
|
|
|
import polars as pl
|
|
|
|
from pipeline.utils.poi_counts import count_pois_per_postcode, min_distance_per_postcode
|
|
|
|
|
|
# POI category groups for proximity counting (2km radius).
# Names must match the friendly names produced by transform_poi.py / naptan.py.
# "groceries" is filled in dynamically by _groceries_categories() because the
# GEOLYTIX dataset stores the brand (e.g. "Tesco", "Aldi") in `category` rather
# than the literal "Supermarket"; counting only the OSM strings here severely
# understates the metric. See _groceries_categories below.
POI_GROUPS_2KM: dict[str, list[str]] = {
    "restaurants": ["Restaurant", "Fast Food"],
}

# POI group whose members are counted for the static "groceries" 2km metric.
# Covers both the OSM grocery categories (Supermarket, Convenience Store,
# Greengrocer, ...) and the GEOLYTIX brand categories (Tesco, Aldi, ...).
GROCERIES_GROUP = "Groceries"

# Groceries categories EXCLUDED from the static "Number of grocery shops and
# supermarkets within 2km" metric. Bakeries, butchers, delis and off-licences
# are speciality food retail, not somewhere you do a grocery shop; together
# they were ~a third of the group and inflated the headline count. The metric
# keeps Supermarket, Convenience Store, Greengrocer and every GEOLYTIX brand.
GROCERY_STATIC_EXCLUDED_CATEGORIES = {
    "Bakery",
    "Butcher & Fishmonger",
    "Deli & Specialty",
    "Off-Licence",
}

# OS Open Greenspace function types used for park counts and distance calculation.
# Uses the authoritative OS dataset instead of OSM point POIs for better coverage
# of green spaces that are only mapped as polygons in OSM.
# Scope: "Public Park Or Garden" is the core park function. "Playing Field"
# (open public recreation grounds) is borderline but kept: outside big cities
# the local rec ground is the de facto park. "Play Space" (playgrounds) is
# excluded — a playground is not a park, and "Playground" is already its own
# OSM-derived category. The remaining functions (Religious Grounds, Golf
# Course, Cemetery, Allotments, Bowling Green, Tennis Court, Other Sports
# Facility) are clearly not parks.
GREENSPACE_PARK_FUNCTIONS: dict[str, list[str]] = {
    "parks": ["Public Park Or Garden", "Playing Field"],
}

# Selection rules for the per-category ("dynamic") amenity metrics built by
# _build_poi_category_groups.
#
# A category in a count-threshold group only gets its own metrics when it has
# strictly more than this many POIs (the comparison is `>`, not `>=`).
GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
# Groups whose categories all get per-category metrics, regardless of size.
DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure", "Health"}
# Groups whose categories are subject to GROCERY_DYNAMIC_FILTER_MIN_POIS.
# Reuses GROCERIES_GROUP so the group name is spelled in exactly one place.
DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS = {GROCERIES_GROUP}
# Categories never turned into per-category metrics. "Park" is left out because
# main() publishes park metrics from OS Open Greenspace under the same "Park"
# display name.
DYNAMIC_FILTER_EXCLUDED_CATEGORIES = {"Park"}
|
|
|
|
|
|
def _poi_category_slug(category: str) -> str:
|
|
ascii_text = (
|
|
unicodedata.normalize("NFKD", category)
|
|
.encode("ascii", "ignore")
|
|
.decode("ascii")
|
|
.lower()
|
|
)
|
|
slug = re.sub(r"[^a-z0-9]+", "_", ascii_text).strip("_")
|
|
return slug or "poi"
|
|
|
|
|
|
def _groceries_categories(pois: pl.DataFrame) -> list[str]:
    """List the distinct `category` values behind the static groceries metric.

    `count_pois_per_postcode` matches POIs on `category`, but the authoritative
    GEOLYTIX grocery dataset puts the brand name there (e.g. "Tesco", "Aldi")
    under group "Groceries" and never emits the literal "Supermarket". Taking
    every category of the Groceries group therefore picks up both the OSM
    strings and the brand names. Speciality food retail (bakeries, butchers,
    delis, off-licences) is dropped — see GROCERY_STATIC_EXCLUDED_CATEGORIES.

    Returns:
        The surviving category names, de-duplicated and sorted.

    Raises:
        ValueError: if `pois` has no ``group`` column.
    """
    if "group" not in pois.columns:
        raise ValueError("POI dataframe must include a 'group' column")

    in_groceries_group = pl.col("group") == GROCERIES_GROUP
    is_speciality = pl.col("category").is_in(list(GROCERY_STATIC_EXCLUDED_CATEGORIES))

    grocery_rows = pois.filter(in_groceries_group & ~is_speciality)
    distinct_sorted = grocery_rows.select("category").unique().sort("category")
    return distinct_sorted.to_series().to_list()
|
|
|
|
|
|
def _build_poi_category_groups(
    pois: pl.DataFrame,
) -> tuple[dict[str, list[str]], dict[str, str]]:
    """Build one proximity group for each POI category selected for filters.

    A category is selected when it belongs to a group in
    DYNAMIC_FILTER_ALL_GROUPS, or to a group in
    DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS with strictly more than
    GROCERY_DYNAMIC_FILTER_MIN_POIS rows for that (group, category) pair.
    Categories in DYNAMIC_FILTER_EXCLUDED_CATEGORIES are always dropped.

    Returns:
        A ``(groups, display_names)`` pair keyed by a ``poi_<slug>`` group key:
        ``groups[key]`` is the single-element list ``[category]`` and
        ``display_names[key]`` is the original category string.

    Raises:
        ValueError: if `pois` has no ``group`` column.
    """
    if "group" not in pois.columns:
        raise ValueError("POI dataframe must include a 'group' column")

    in_all_groups = pl.col("group").is_in(list(DYNAMIC_FILTER_ALL_GROUPS))
    in_large_threshold_group = pl.col("group").is_in(
        list(DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS)
    ) & (pl.col("len") > GROCERY_DYNAMIC_FILTER_MIN_POIS)

    categories = (
        pois.group_by("group", "category")
        .len()
        .filter(in_all_groups | in_large_threshold_group)
        .filter(~pl.col("category").is_in(list(DYNAMIC_FILTER_EXCLUDED_CATEGORIES)))
        .select("category")
        # The grouping is per (group, category), so a category that appears
        # under two selected groups would otherwise be listed twice and end up
        # as two group keys sharing one display name — which makes the later
        # column rename produce duplicate column names.
        .unique()
        .sort("category")
        .to_series()
        .to_list()
    )

    used_slugs: dict[str, int] = {}
    groups: dict[str, list[str]] = {}
    display_names: dict[str, str] = {}

    for category in categories:
        # Skip null / empty category values; they cannot be matched or named.
        if not isinstance(category, str) or not category:
            continue
        base_slug = f"poi_{_poi_category_slug(category)}"
        # First use of a slug keeps the bare key; repeats get _2, _3, ...
        suffix = used_slugs.get(base_slug, 0) + 1
        group_key = base_slug if suffix == 1 else f"{base_slug}_{suffix}"
        # A suffixed key can coincide with another category's natural slug
        # (e.g. "foo" -> poi_foo_2 vs "Foo 2" -> poi_foo_2). Keep bumping the
        # suffix instead of silently overwriting the earlier entry.
        while group_key in groups:
            suffix += 1
            group_key = f"{base_slug}_{suffix}"
        used_slugs[base_slug] = suffix
        groups[group_key] = [category]
        display_names[group_key] = category

    return groups, display_names
|
|
|
|
|
|
def _greenspace_count_frame(greenspace: pl.DataFrame) -> pl.DataFrame:
    """Reduce the greenspace frame to ONE representative row per site.

    os_greenspace.parquet holds one row per ACCESS POINT (park gate). That is
    the right grain for nearest-distance (the nearest gate is what matters)
    but badly over-counts "Number of amenities (Park) within Xkm" — a large
    park with 30 gates would count as 30 parks. For counting, each non-null
    `site_id` keeps a single row, positioned at the site centroid
    (`site_lat` / `site_lng`) when those columns exist, falling back to the
    kept access point's own `lat` / `lng` where the centroid is null. Rows
    with a null `site_id` are passed through untouched.

    Degrades gracefully: a legacy parquet without a `site_id` column is
    returned unchanged (gate-grain counts) with a printed warning rather than
    crashing.
    """
    original_columns = greenspace.columns

    if "site_id" not in original_columns:
        print(
            "WARNING: greenspace parquet has no site_id column; park counts "
            "will count access points, not sites (regenerate os_greenspace)"
        )
        return greenspace

    with_site = greenspace.filter(pl.col("site_id").is_not_null())
    without_site = greenspace.filter(pl.col("site_id").is_null())

    per_site = with_site.unique(subset=["site_id"], keep="first")
    if "site_lat" in original_columns and "site_lng" in original_columns:
        # Move the representative row onto the site centroid where one exists.
        per_site = per_site.with_columns(
            pl.coalesce([pl.col("site_lat"), pl.col("lat")]).alias("lat"),
            pl.coalesce([pl.col("site_lng"), pl.col("lng")]).alias("lng"),
        )

    # Restore the input column order so both pieces concatenate cleanly.
    per_site = per_site.select(original_columns)
    pieces = [per_site, without_site] if len(without_site) > 0 else [per_site]
    return pl.concat(pieces)
|
|
|
|
|
|
def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
|
|
renames: dict[str, str] = {}
|
|
for group_key, category in display_names.items():
|
|
renames[f"{group_key}_nearest_km"] = (
|
|
f"Distance to nearest amenity ({category}) (km)"
|
|
)
|
|
renames[f"{group_key}_2km"] = f"Number of amenities ({category}) within 2km"
|
|
renames[f"{group_key}_5km"] = f"Number of amenities ({category}) within 5km"
|
|
return renames
|
|
|
|
|
|
def _rename_present_columns(
    frame: pl.DataFrame, renames: dict[str, str]
) -> pl.DataFrame:
    """Rename only those columns of `frame` that appear as keys in `renames`."""
    present = set(frame.columns)
    return frame.rename(
        {old: new for old, new in renames.items() if old in present}
    )


def main():
    """Compute per-postcode POI counts and distances and write them to parquet.

    Reads the ArcGIS postcode parquet, the filtered POI parquet and the OS Open
    Greenspace parquet named on the command line, computes:

    * static 2km counts for restaurants and groceries,
    * per-category 2km / 5km counts and nearest distances for the categories
      chosen by `_build_poi_category_groups`,
    * park 2km / 5km counts (one row per site) and nearest park distance
      (access-point grain),

    joins everything on ``postcode`` and writes the result to ``--output``.
    """
    parser = argparse.ArgumentParser(
        description="Count POIs within radius per postcode"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
    )
    parser.add_argument(
        "--pois", type=Path, required=True, help="Filtered POIs parquet"
    )
    parser.add_argument(
        "--greenspace",
        type=Path,
        required=True,
        help="OS Open Greenspace centroids parquet",
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet path"
    )
    args = parser.parse_args()

    postcodes = pl.read_parquet(args.arcgis).select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lon"),
    )

    pois = pl.read_parquet(args.pois)
    poi_category_groups, poi_display_names = _build_poi_category_groups(pois)

    # Count static amenity groups within 2km. "groceries" is matched against
    # every Groceries category (OSM strings + GEOLYTIX brand names) so that
    # postcodes ringed by GEOLYTIX-only chains (Tesco, Aldi, ...) are counted.
    groups_2km = {
        **POI_GROUPS_2KM,
        "groceries": _groceries_categories(pois),
    }
    counts_2km = count_pois_per_postcode(
        postcodes, pois, groups=groups_2km, radius_km=2
    )

    # Dynamic amenity filters: nearest distance plus counts within 2km and 5km
    # for the categories selected by _build_poi_category_groups (public
    # transport, leisure, health, and the larger grocery categories).
    dynamic_renames = _dynamic_poi_metric_renames(poi_display_names)
    dynamic_counts_2km = _rename_present_columns(
        count_pois_per_postcode(
            postcodes, pois, groups=poi_category_groups, radius_km=2
        ),
        dynamic_renames,
    )
    dynamic_counts_5km = _rename_present_columns(
        count_pois_per_postcode(
            postcodes, pois, groups=poi_category_groups, radius_km=5
        ),
        dynamic_renames,
    )
    dynamic_distances = _rename_present_columns(
        min_distance_per_postcode(postcodes, pois, groups=poi_category_groups),
        dynamic_renames,
    )

    # Park counts and distances from OS Open Greenspace. They use the dynamic
    # amenity metric names so filters read through the same side-table path as
    # OSM-derived amenity metrics. Distances use the access-point grain (the
    # nearest park GATE is the right semantics); counts use one row per SITE so
    # a park with many gates counts once.
    greenspace = pl.read_parquet(args.greenspace)
    greenspace_sites = _greenspace_count_frame(greenspace)
    park_renames = _dynamic_poi_metric_renames({"parks": "Park"})
    park_counts_2km = _rename_present_columns(
        count_pois_per_postcode(
            postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
        ),
        park_renames,
    )
    park_counts_5km = _rename_present_columns(
        count_pois_per_postcode(
            postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
        ),
        park_renames,
    )
    park_distances = _rename_present_columns(
        min_distance_per_postcode(
            postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS
        ),
        park_renames,
    )

    # Join all results on postcode
    result = (
        counts_2km.join(dynamic_counts_2km, on="postcode")
        .join(dynamic_counts_5km, on="postcode")
        .join(dynamic_distances, on="postcode")
        .join(park_counts_2km, on="postcode")
        .join(park_counts_5km, on="postcode")
        .join(park_distances, on="postcode")
    )

    # Make sure the destination directory exists so a long run does not fail
    # at the very last step because of a missing output folder.
    args.output.parent.mkdir(parents=True, exist_ok=True)
    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB)")


if __name__ == "__main__":
    main()
|