Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@ -2,12 +2,15 @@
Downloads the OS Open Greenspace dataset as ESRI Shapefile and extracts
access point locations (park entrances). Each access point is tagged with
its parent site's function type (e.g. Public Park Or Garden). Sites without
access points fall back to polygon centroids.
its parent site's function type (e.g. Public Park Or Garden), the parent
site id and the site's polygon centroid. Sites without access points fall
back to polygon centroids.
Using access points rather than polygon centroids gives much more accurate
distance calculations — a property next to Hyde Park won't show 400m just
because the centroid is in the middle of the park.
because the centroid is in the middle of the park. The site id / centroid
columns let downstream consumers (poi_proximity) collapse the frame back to
one row per SITE for counting, so a park with 30 gates counts as one park.
Source: https://osdatahub.os.uk/downloads/open/OpenGreenspace
License: Open Government Licence v3.0
@ -65,8 +68,8 @@ def _read_site_functions(shp_path: Path) -> dict[str, str]:
def _read_access_points(
shp_path: Path, site_funcs: dict[str, str]
) -> tuple[list[float], list[float], list[str]]:
"""Read access points, tagging each with its parent site's function."""
) -> tuple[list[float], list[float], list[str], list[str]]:
"""Read access points, tagging each with its parent site's function and id."""
reader = shp.Reader(str(shp_path), encoding="latin-1")
field_names = [f[0] for f in reader.fields[1:]]
@ -80,6 +83,7 @@ def _read_access_points(
lats: list[float] = []
lngs: list[float] = []
categories: list[str] = []
site_ids: list[str] = []
skipped = 0
error_skipped = 0
@ -107,6 +111,7 @@ def _read_access_points(
lats.append(lat)
lngs.append(lng)
categories.append(func)
site_ids.append(str(site_id))
if skipped:
print(f" Skipped {skipped:,} access points with unknown site ID")
@ -116,31 +121,26 @@ def _read_access_points(
error_skipped,
)
return lats, lngs, categories
return lats, lngs, categories, site_ids
def _read_site_centroids(
shp_path: Path, site_funcs: dict[str, str], covered_ids: set[str]
) -> tuple[list[float], list[float], list[str]]:
"""Read polygon centroids for sites that have no access points (fallback)."""
def _read_site_centroids(shp_path: Path) -> dict[str, tuple[float, float]]:
"""Compute the WGS84 polygon centroid of every greenspace site.
Used both as the representative point for site-level counting and as the
location fallback for sites that have no access points.
"""
reader = shp.Reader(str(shp_path), encoding="latin-1")
field_names = [f[0] for f in reader.fields[1:]]
id_idx = _find_field(field_names, "id")
func_idx = _find_field(field_names, "funct")
if id_idx is None or func_idx is None:
return [], [], []
if id_idx is None:
return {}
lats: list[float] = []
lngs: list[float] = []
categories: list[str] = []
centroids: dict[str, tuple[float, float]] = {}
error_skipped = 0
for sr in reader.shapeRecords():
site_id = sr.record[id_idx]
if site_id in covered_ids:
continue
func = sr.record[func_idx]
try:
geom = to_shapely(sr.shape.__geo_interface__)
if geom.is_empty or not geom.is_valid:
@ -156,9 +156,7 @@ def _read_site_centroids(
)
continue
lats.append(lat)
lngs.append(lng)
categories.append(func)
centroids[str(site_id)] = (lat, lng)
if error_skipped:
logger.warning(
@ -166,7 +164,7 @@ def _read_site_centroids(
error_skipped,
)
return lats, lngs, categories
return centroids
def download_greenspace(output: Path) -> None:
@ -194,33 +192,53 @@ def download_greenspace(output: Path) -> None:
# Step 2: Read access points (primary — park entrances)
print(f"Reading {access_shps[0].name}...")
ap_lats, ap_lngs, ap_cats = _read_access_points(access_shps[0], site_funcs)
ap_lats, ap_lngs, ap_cats, ap_site_ids = _read_access_points(
access_shps[0], site_funcs
)
print(f" {len(ap_lats):,} access points loaded")
# Step 3: Fall back to centroids for sites without any access points
covered_ids = set()
reader = shp.Reader(str(access_shps[0]), encoding="latin-1")
field_names = [f[0] for f in reader.fields[1:]]
ref_idx = _find_field(field_names, "refto", "ref_to", "greensp")
if ref_idx is not None:
for rec in reader.iterRecords():
covered_ids.add(rec[ref_idx])
# Step 3: Compute every site's centroid: the representative point for
# site-level counting, and the location fallback for sites without any
# access points.
print("Computing site centroids...")
centroids = _read_site_centroids(site_shps[0])
print(f" {len(centroids):,} site centroids computed")
print("Adding centroids for sites without access points...")
fb_lats, fb_lngs, fb_cats = _read_site_centroids(
site_shps[0], site_funcs, covered_ids
)
covered_ids = set(ap_site_ids)
fb_lats: list[float] = []
fb_lngs: list[float] = []
fb_cats: list[str] = []
fb_site_ids: list[str] = []
for site_id, (lat, lng) in centroids.items():
if site_id in covered_ids:
continue
func = site_funcs.get(site_id)
if func is None:
continue
fb_lats.append(lat)
fb_lngs.append(lng)
fb_cats.append(func)
fb_site_ids.append(site_id)
print(f" {len(fb_lats):,} centroid fallbacks added")
lats = ap_lats + fb_lats
lngs = ap_lngs + fb_lngs
categories = ap_cats + fb_cats
site_ids = ap_site_ids + fb_site_ids
site_lats = [centroids.get(site_id, (None, None))[0] for site_id in site_ids]
site_lngs = [centroids.get(site_id, (None, None))[1] for site_id in site_ids]
df = pl.DataFrame(
{
"lat": np.array(lats, dtype=np.float64),
"lng": np.array(lngs, dtype=np.float64),
"category": categories,
"site_id": site_ids,
# Site polygon centroid (null when the centroid could not be
# computed): the representative point when collapsing to one row
# per site for counting.
"site_lat": pl.Series(site_lats, dtype=pl.Float64),
"site_lng": pl.Series(site_lngs, dtype=pl.Float64),
}
)