Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@ -25,11 +25,30 @@ POI_GROUPS_2KM = {
# Greengrocer, ...) and the GEOLYTIX brand categories (Tesco, Aldi, ...).
GROCERIES_GROUP = "Groceries"
# Groceries categories EXCLUDED from the static "Number of grocery shops and
# supermarkets within 2km" metric. Bakeries, butchers, delis and off-licences
# are speciality food retail, not somewhere you do a grocery shop; together
# they were ~a third of the group and inflated the headline count. The metric
# keeps Supermarket, Convenience Store, Greengrocer and every GEOLYTIX brand.
GROCERY_STATIC_EXCLUDED_CATEGORIES = {
"Bakery",
"Butcher & Fishmonger",
"Deli & Specialty",
"Off-Licence",
}
# OS Open Greenspace function types used for park counts and distance calculation.
# Uses the authoritative OS dataset instead of OSM point POIs for better coverage
# of green spaces that are only mapped as polygons in OSM.
# Scope: "Public Park Or Garden" is the core park function. "Playing Field"
# (open public recreation grounds) is borderline but kept: outside big cities
# the local rec ground is the de facto park. "Play Space" (playgrounds) is
# excluded — a playground is not a park, and "Playground" is already its own
# OSM-derived category. The remaining functions (Religious Grounds, Golf
# Course, Cemetery, Allotments, Bowling Green, Tennis Court, Other Sports
# Facility) are clearly not parks.
GREENSPACE_PARK_FUNCTIONS = {
"parks": ["Public Park Or Garden", "Playing Field", "Play Space"],
"parks": ["Public Park Or Garden", "Playing Field"],
}
GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
@ -50,17 +69,22 @@ def _poi_category_slug(category: str) -> str:
def _groceries_categories(pois: pl.DataFrame) -> list[str]:
"""Return the distinct `category` values for the Groceries group.
"""Return the distinct `category` values for the static groceries metric.
`count_pois_per_postcode` matches POIs on `category`, but the authoritative
GEOLYTIX grocery dataset stores the brand name there (e.g. "Tesco", "Aldi")
with group "Groceries"; it never emits the literal "Supermarket". Collecting
every Groceries category captures both the OSM strings and the brand names.
Speciality food retail (bakeries, butchers, delis, off-licences) is
excluded — see GROCERY_STATIC_EXCLUDED_CATEGORIES.
"""
if "group" not in pois.columns:
raise ValueError("POI dataframe must include a 'group' column")
return (
pois.filter(pl.col("group") == GROCERIES_GROUP)
pois.filter(
(pl.col("group") == GROCERIES_GROUP)
& ~pl.col("category").is_in(list(GROCERY_STATIC_EXCLUDED_CATEGORIES))
)
.select("category")
.unique()
.sort("category")
@ -109,6 +133,40 @@ def _build_poi_category_groups(
return groups, display_names
def _greenspace_count_frame(greenspace: pl.DataFrame) -> pl.DataFrame:
    """Collapse the greenspace frame to ONE representative row per site.

    os_greenspace.parquet is one row per ACCESS POINT (park gate), which is the
    right grain for nearest-distance (the nearest gate is what matters) but
    wildly over-counts "Number of amenities (Park) within Xkm" — a large park
    with 30 gates counted as 30 parks. Counting uses one row per site at the
    site centroid (falling back to the first access point when no complete
    centroid is available). Degrades gracefully: a legacy parquet without
    `site_id` is returned unchanged (gate-grain counts) rather than crashing.

    Args:
        greenspace: Access-point-grain frame. When it has `site_lat` and
            `site_lng` columns, they replace `lat`/`lng` on the
            representative row of every site whose centroid is complete.

    Returns:
        A frame with the same columns as the input: one row per non-null
        `site_id`, followed by every row whose `site_id` is null (those rows
        cannot be grouped, so they are passed through untouched).
    """
    if "site_id" not in greenspace.columns:
        print(
            "WARNING: greenspace parquet has no site_id column; park counts "
            "will count access points, not sites (regenerate os_greenspace)"
        )
        return greenspace

    keyed = greenspace.filter(pl.col("site_id").is_not_null())
    unkeyed = greenspace.filter(pl.col("site_id").is_null())

    # keep="first" retains the first access point seen for each site.
    representatives = keyed.unique(subset=["site_id"], keep="first")

    if {"site_lat", "site_lng"}.issubset(greenspace.columns):
        # Use the centroid only when BOTH coordinates are present. Coalescing
        # each axis independently could pair a centroid latitude with a gate
        # longitude (or vice versa), placing the site at a point that is
        # neither the centroid nor a real access point.
        has_centroid = (
            pl.col("site_lat").is_not_null() & pl.col("site_lng").is_not_null()
        )
        representatives = representatives.with_columns(
            pl.when(has_centroid)
            .then(pl.col("site_lat"))
            .otherwise(pl.col("lat"))
            .alias("lat"),
            pl.when(has_centroid)
            .then(pl.col("site_lng"))
            .otherwise(pl.col("lng"))
            .alias("lng"),
        )

    # Re-select the input columns so both frames line up for the concat.
    frames = [representatives.select(greenspace.columns)]
    if len(unkeyed) > 0:
        frames.append(unkeyed)
    return pl.concat(frames)
def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
renames: dict[str, str] = {}
for group_key, category in display_names.items():
@ -185,13 +243,16 @@ def main():
# Park counts and distances from OS Open Greenspace. They use the dynamic
# amenity metric names so filters read through the same side-table path as
# OSM-derived amenity metrics.
# OSM-derived amenity metrics. Distances use the access-point grain (the
# nearest park GATE is the right semantics); counts use one row per SITE so
# a park with many gates counts once.
greenspace = pl.read_parquet(args.greenspace)
greenspace_sites = _greenspace_count_frame(greenspace)
park_counts_2km = count_pois_per_postcode(
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
)
park_counts_5km = count_pois_per_postcode(
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
)
park_distances = min_distance_per_postcode(
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS