Fix data pipelines once and for all
This commit is contained in:
parent
08560476c5
commit
4012e4e047
46 changed files with 4508 additions and 855 deletions
|
|
@ -25,11 +25,30 @@ POI_GROUPS_2KM = {
|
|||
# Greengrocer, ...) and the GEOLYTIX brand categories (Tesco, Aldi, ...).
|
||||
GROCERIES_GROUP = "Groceries"
|
||||
|
||||
# Groceries categories EXCLUDED from the static "Number of grocery shops and
|
||||
# supermarkets within 2km" metric. Bakeries, butchers, delis and off-licences
|
||||
# are speciality food retail, not somewhere you do a grocery shop; together
|
||||
# they were ~a third of the group and inflated the headline count. The metric
|
||||
# keeps Supermarket, Convenience Store, Greengrocer and every GEOLYTIX brand.
|
||||
# Matched against the POI `category` column by `_groceries_categories`, which
# drops these values when collecting the categories for the static metric.
GROCERY_STATIC_EXCLUDED_CATEGORIES = {
    "Bakery",
    "Butcher & Fishmonger",
    "Deli & Specialty",
    "Off-Licence",
}
|
||||
|
||||
# OS Open Greenspace function types used for park counts and distance calculation.
|
||||
# Uses the authoritative OS dataset instead of OSM point POIs for better coverage
|
||||
# of green spaces that are only mapped as polygons in OSM.
|
||||
# Scope: "Public Park Or Garden" is the core park function. "Playing Field"
|
||||
# (open public recreation grounds) is borderline but kept: outside big cities
|
||||
# the local rec ground is the de facto park. "Play Space" (playgrounds) is
|
||||
# excluded — a playground is not a park, and "Playground" is already its own
|
||||
# OSM-derived category. The remaining functions (Religious Grounds, Golf
|
||||
# Course, Cemetery, Allotments, Bowling Green, Tennis Court, Other Sports
|
||||
# Facility) are clearly not parks.
|
||||
GREENSPACE_PARK_FUNCTIONS = {
|
||||
"parks": ["Public Park Or Garden", "Playing Field", "Play Space"],
|
||||
"parks": ["Public Park Or Garden", "Playing Field"],
|
||||
}
|
||||
|
||||
GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
|
||||
|
|
@ -50,17 +69,22 @@ def _poi_category_slug(category: str) -> str:
|
|||
|
||||
|
||||
def _groceries_categories(pois: pl.DataFrame) -> list[str]:
|
||||
"""Return the distinct `category` values for the Groceries group.
|
||||
"""Return the distinct `category` values for the static groceries metric.
|
||||
|
||||
`count_pois_per_postcode` matches POIs on `category`, but the authoritative
|
||||
GEOLYTIX grocery dataset stores the brand name there (e.g. "Tesco", "Aldi")
|
||||
with group "Groceries"; it never emits the literal "Supermarket". Collecting
|
||||
every Groceries category captures both the OSM strings and the brand names.
|
||||
Speciality food retail (bakeries, butchers, delis, off-licences) is
|
||||
excluded — see GROCERY_STATIC_EXCLUDED_CATEGORIES.
|
||||
"""
|
||||
if "group" not in pois.columns:
|
||||
raise ValueError("POI dataframe must include a 'group' column")
|
||||
return (
|
||||
pois.filter(pl.col("group") == GROCERIES_GROUP)
|
||||
pois.filter(
|
||||
(pl.col("group") == GROCERIES_GROUP)
|
||||
& ~pl.col("category").is_in(list(GROCERY_STATIC_EXCLUDED_CATEGORIES))
|
||||
)
|
||||
.select("category")
|
||||
.unique()
|
||||
.sort("category")
|
||||
|
|
@ -109,6 +133,40 @@ def _build_poi_category_groups(
|
|||
return groups, display_names
|
||||
|
||||
|
||||
def _greenspace_count_frame(greenspace: pl.DataFrame) -> pl.DataFrame:
    """Reduce the greenspace frame to a single row per site for counting.

    The input is at access-point grain (one row per park gate). That grain
    suits nearest-distance lookups, but counting it directly reports a park
    with many gates as many parks. This helper keeps one row per `site_id`
    and, when the `site_lat` / `site_lng` columns exist, moves that row to
    the site centroid (any null centroid value falls back to the retained
    row's own `lat` / `lng`).

    Rows whose `site_id` is null cannot be grouped and are appended as-is.
    A frame with no `site_id` column at all is returned untouched after a
    warning is printed, so legacy parquet files keep working with gate-grain
    counts instead of failing.

    The returned frame has the same columns, in the same order, as the input.
    """
    columns = greenspace.columns

    # Legacy data: nothing to group on, so warn and hand the frame back.
    if "site_id" not in columns:
        print(
            "WARNING: greenspace parquet has no site_id column; park counts "
            "will count access points, not sites (regenerate os_greenspace)"
        )
        return greenspace

    # One retained row per site.
    sites = greenspace.filter(pl.col("site_id").is_not_null()).unique(
        subset=["site_id"], keep="first"
    )

    # Prefer the site centroid over the retained gate's coordinates.
    if "site_lat" in columns and "site_lng" in columns:
        sites = sites.with_columns(
            pl.coalesce([pl.col("site_lat"), pl.col("lat")]).alias("lat"),
            pl.coalesce([pl.col("site_lng"), pl.col("lng")]).alias("lng"),
        )

    # Restore the input column order so the pieces can be stacked.
    pieces = [sites.select(columns)]

    orphans = greenspace.filter(pl.col("site_id").is_null())
    if orphans.height > 0:
        pieces.append(orphans)

    return pl.concat(pieces)
|
||||
|
||||
|
||||
def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
|
||||
renames: dict[str, str] = {}
|
||||
for group_key, category in display_names.items():
|
||||
|
|
@ -185,13 +243,16 @@ def main():
|
|||
|
||||
# Park counts and distances from OS Open Greenspace. They use the dynamic
|
||||
# amenity metric names so filters read through the same side-table path as
|
||||
# OSM-derived amenity metrics.
|
||||
# OSM-derived amenity metrics. Distances use the access-point grain (the
|
||||
# nearest park GATE is the right semantics); counts use one row per SITE so
|
||||
# a park with many gates counts once.
|
||||
greenspace = pl.read_parquet(args.greenspace)
|
||||
greenspace_sites = _greenspace_count_frame(greenspace)
|
||||
park_counts_2km = count_pois_per_postcode(
|
||||
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
|
||||
postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
|
||||
)
|
||||
park_counts_5km = count_pois_per_postcode(
|
||||
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
|
||||
postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
|
||||
)
|
||||
park_distances = min_distance_per_postcode(
|
||||
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue