Fix data pipelines once and for all
This commit is contained in:
parent
08560476c5
commit
4012e4e047
46 changed files with 4508 additions and 855 deletions
|
|
@ -2,12 +2,15 @@
|
|||
|
||||
Downloads the OS Open Greenspace dataset as an ESRI Shapefile and extracts
|
||||
access point locations (park entrances). Each access point is tagged with
|
||||
its parent site's function type (e.g. Public Park Or Garden). Sites without
|
||||
access points fall back to polygon centroids.
|
||||
its parent site's function type (e.g. Public Park Or Garden), the parent
|
||||
site id and the site's polygon centroid. Sites without access points fall
|
||||
back to polygon centroids.
|
||||
|
||||
Using access points rather than polygon centroids gives much more accurate
|
||||
distance calculations — a property next to Hyde Park won't show 400m just
|
||||
because the centroid is in the middle of the park.
|
||||
because the centroid is in the middle of the park. The site id / centroid
|
||||
columns let downstream consumers (poi_proximity) collapse the frame back to
|
||||
one row per SITE for counting, so a park with 30 gates counts as one park.
|
||||
|
||||
Source: https://osdatahub.os.uk/downloads/open/OpenGreenspace
|
||||
License: Open Government Licence v3.0
|
||||
|
|
@ -65,8 +68,8 @@ def _read_site_functions(shp_path: Path) -> dict[str, str]:
|
|||
|
||||
def _read_access_points(
|
||||
shp_path: Path, site_funcs: dict[str, str]
|
||||
) -> tuple[list[float], list[float], list[str]]:
|
||||
"""Read access points, tagging each with its parent site's function."""
|
||||
) -> tuple[list[float], list[float], list[str], list[str]]:
|
||||
"""Read access points, tagging each with its parent site's function and id."""
|
||||
reader = shp.Reader(str(shp_path), encoding="latin-1")
|
||||
field_names = [f[0] for f in reader.fields[1:]]
|
||||
|
||||
|
|
@ -80,6 +83,7 @@ def _read_access_points(
|
|||
lats: list[float] = []
|
||||
lngs: list[float] = []
|
||||
categories: list[str] = []
|
||||
site_ids: list[str] = []
|
||||
skipped = 0
|
||||
error_skipped = 0
|
||||
|
||||
|
|
@ -107,6 +111,7 @@ def _read_access_points(
|
|||
lats.append(lat)
|
||||
lngs.append(lng)
|
||||
categories.append(func)
|
||||
site_ids.append(str(site_id))
|
||||
|
||||
if skipped:
|
||||
print(f" Skipped {skipped:,} access points with unknown site ID")
|
||||
|
|
@ -116,31 +121,26 @@ def _read_access_points(
|
|||
error_skipped,
|
||||
)
|
||||
|
||||
return lats, lngs, categories
|
||||
return lats, lngs, categories, site_ids
|
||||
|
||||
|
||||
def _read_site_centroids(
|
||||
shp_path: Path, site_funcs: dict[str, str], covered_ids: set[str]
|
||||
) -> tuple[list[float], list[float], list[str]]:
|
||||
"""Read polygon centroids for sites that have no access points (fallback)."""
|
||||
def _read_site_centroids(shp_path: Path) -> dict[str, tuple[float, float]]:
|
||||
"""Compute the WGS84 polygon centroid of every greenspace site.
|
||||
|
||||
Used both as the representative point for site-level counting and as the
|
||||
location fallback for sites that have no access points.
|
||||
"""
|
||||
reader = shp.Reader(str(shp_path), encoding="latin-1")
|
||||
field_names = [f[0] for f in reader.fields[1:]]
|
||||
id_idx = _find_field(field_names, "id")
|
||||
func_idx = _find_field(field_names, "funct")
|
||||
if id_idx is None or func_idx is None:
|
||||
return [], [], []
|
||||
if id_idx is None:
|
||||
return {}
|
||||
|
||||
lats: list[float] = []
|
||||
lngs: list[float] = []
|
||||
categories: list[str] = []
|
||||
centroids: dict[str, tuple[float, float]] = {}
|
||||
error_skipped = 0
|
||||
|
||||
for sr in reader.shapeRecords():
|
||||
site_id = sr.record[id_idx]
|
||||
if site_id in covered_ids:
|
||||
continue
|
||||
|
||||
func = sr.record[func_idx]
|
||||
try:
|
||||
geom = to_shapely(sr.shape.__geo_interface__)
|
||||
if geom.is_empty or not geom.is_valid:
|
||||
|
|
@ -156,9 +156,7 @@ def _read_site_centroids(
|
|||
)
|
||||
continue
|
||||
|
||||
lats.append(lat)
|
||||
lngs.append(lng)
|
||||
categories.append(func)
|
||||
centroids[str(site_id)] = (lat, lng)
|
||||
|
||||
if error_skipped:
|
||||
logger.warning(
|
||||
|
|
@ -166,7 +164,7 @@ def _read_site_centroids(
|
|||
error_skipped,
|
||||
)
|
||||
|
||||
return lats, lngs, categories
|
||||
return centroids
|
||||
|
||||
|
||||
def download_greenspace(output: Path) -> None:
|
||||
|
|
@ -194,33 +192,53 @@ def download_greenspace(output: Path) -> None:
|
|||
|
||||
# Step 2: Read access points (primary — park entrances)
|
||||
print(f"Reading {access_shps[0].name}...")
|
||||
ap_lats, ap_lngs, ap_cats = _read_access_points(access_shps[0], site_funcs)
|
||||
ap_lats, ap_lngs, ap_cats, ap_site_ids = _read_access_points(
|
||||
access_shps[0], site_funcs
|
||||
)
|
||||
print(f" {len(ap_lats):,} access points loaded")
|
||||
|
||||
# Step 3: Fall back to centroids for sites without any access points
|
||||
covered_ids = set()
|
||||
reader = shp.Reader(str(access_shps[0]), encoding="latin-1")
|
||||
field_names = [f[0] for f in reader.fields[1:]]
|
||||
ref_idx = _find_field(field_names, "refto", "ref_to", "greensp")
|
||||
if ref_idx is not None:
|
||||
for rec in reader.iterRecords():
|
||||
covered_ids.add(rec[ref_idx])
|
||||
# Step 3: Compute every site's centroid: the representative point for
|
||||
# site-level counting, and the location fallback for sites without any
|
||||
# access points.
|
||||
print("Computing site centroids...")
|
||||
centroids = _read_site_centroids(site_shps[0])
|
||||
print(f" {len(centroids):,} site centroids computed")
|
||||
|
||||
print("Adding centroids for sites without access points...")
|
||||
fb_lats, fb_lngs, fb_cats = _read_site_centroids(
|
||||
site_shps[0], site_funcs, covered_ids
|
||||
)
|
||||
covered_ids = set(ap_site_ids)
|
||||
fb_lats: list[float] = []
|
||||
fb_lngs: list[float] = []
|
||||
fb_cats: list[str] = []
|
||||
fb_site_ids: list[str] = []
|
||||
for site_id, (lat, lng) in centroids.items():
|
||||
if site_id in covered_ids:
|
||||
continue
|
||||
func = site_funcs.get(site_id)
|
||||
if func is None:
|
||||
continue
|
||||
fb_lats.append(lat)
|
||||
fb_lngs.append(lng)
|
||||
fb_cats.append(func)
|
||||
fb_site_ids.append(site_id)
|
||||
print(f" {len(fb_lats):,} centroid fallbacks added")
|
||||
|
||||
lats = ap_lats + fb_lats
|
||||
lngs = ap_lngs + fb_lngs
|
||||
categories = ap_cats + fb_cats
|
||||
site_ids = ap_site_ids + fb_site_ids
|
||||
site_lats = [centroids.get(site_id, (None, None))[0] for site_id in site_ids]
|
||||
site_lngs = [centroids.get(site_id, (None, None))[1] for site_id in site_ids]
|
||||
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
"lat": np.array(lats, dtype=np.float64),
|
||||
"lng": np.array(lngs, dtype=np.float64),
|
||||
"category": categories,
|
||||
"site_id": site_ids,
|
||||
# Site polygon centroid (null when the centroid could not be
|
||||
# computed): the representative point when collapsing to one row
|
||||
# per site for counting.
|
||||
"site_lat": pl.Series(site_lats, dtype=pl.Float64),
|
||||
"site_lng": pl.Series(site_lngs, dtype=pl.Float64),
|
||||
}
|
||||
)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue