perfect-postcode/pipeline/download/os_greenspace.py

266 lines
9.2 KiB
Python

"""Download OS Open Greenspace and extract access points.
Downloads the OS Open Greenspace dataset as ESRI Shapefile and extracts
access point locations (park entrances). Each access point is tagged with
its parent site's function type (e.g. Public Park Or Garden), the parent
site id and the site's polygon centroid. Sites without access points fall
back to polygon centroids.
Using access points rather than polygon centroids gives much more accurate
distance calculations — a property next to Hyde Park won't show 400m just
because the centroid is in the middle of the park. The site id / centroid
columns let downstream consumers (poi_proximity) collapse the frame back to
one row per SITE for counting, so a park with 30 gates counts as one park.
Source: https://osdatahub.os.uk/downloads/open/OpenGreenspace
License: Open Government Licence v3.0
"""
import argparse
import logging
import tempfile
from pathlib import Path
import numpy as np
import polars as pl
import shapefile as shp
from pyproj import Transformer
from shapely.errors import GEOSException
from shapely.geometry import shape as to_shapely
from pipeline.local_temp import local_tmp_dir
from pipeline.utils.download import download, extract_zip
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# OS download endpoint for the OpenGreenspace product: area=GB, format
# "ESRI® Shapefile" ("%C2%AE" is the URL-encoded "®").
URL = "https://api.os.uk/downloads/v1/products/OpenGreenspace/downloads?area=GB&format=ESRI%C2%AE+Shapefile&redirect"
# Shared transformer from British National Grid (EPSG:27700) to WGS84
# (EPSG:4326). always_xy=True fixes the axis order to (x, y), so callers pass
# (easting, northing) and unpack the result as (lng, lat).
_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
def _find_field(field_names: list[str], *needles: str) -> int | None:
"""Find the index of the first field whose lowercased name contains any needle."""
for i, name in enumerate(field_names):
lower = name.lower()
for needle in needles:
if needle in lower:
return i
return None
def _read_site_functions(shp_path: Path) -> dict[str, str]:
    """Build a mapping from site ID → function type from the GreenspaceSite shapefile.

    Only the attribute records are iterated; shapes are not requested.

    Args:
        shp_path: path to the GreenspaceSite ``.shp`` file.

    Returns:
        ``{site id: function}`` with one entry per distinct id. If an id
        appears more than once, the last record wins.

    Raises:
        ValueError: if the id or the function field cannot be located.
    """
    # Use the reader as a context manager so the underlying shapefile handles
    # are closed deterministically rather than whenever the object is collected.
    with shp.Reader(str(shp_path), encoding="latin-1") as reader:
        # Skip the first entry of `fields` so the indices line up with the
        # positions inside each record.
        field_names = [f[0] for f in reader.fields[1:]]
        id_idx = _find_field(field_names, "id")
        func_idx = _find_field(field_names, "funct")
        if id_idx is None or func_idx is None:
            raise ValueError(f"Missing id/function fields. Available: {field_names}")

        site_funcs: dict[str, str] = {
            rec[id_idx]: rec[func_idx] for rec in reader.iterRecords()
        }

    print(f" Loaded {len(site_funcs):,} site function mappings")
    return site_funcs
def _read_access_points(
    shp_path: Path, site_funcs: dict[str, str]
) -> tuple[list[float], list[float], list[str], list[str]]:
    """Read access points, tagging each with its parent site's function and id.

    Args:
        shp_path: path to the AccessPoint ``.shp`` file.
        site_funcs: mapping of site id → function type; access points whose
            parent site is not in this mapping are dropped.

    Returns:
        Four parallel lists ``(lats, lngs, categories, site_ids)``, one entry
        per kept access point. Coordinates are WGS84; ``site_ids`` are
        stringified.

    Raises:
        ValueError: if no field referencing the parent site can be found.

    A record is dropped when:
      * its site id is unknown (counted and reported on stdout);
      * its geometry is empty (silently);
      * its geometry cannot be parsed/transformed, or the transform yields a
        non-finite coordinate (logged and counted).
    """
    lats: list[float] = []
    lngs: list[float] = []
    categories: list[str] = []
    site_ids: list[str] = []
    skipped = 0
    error_skipped = 0

    # Context manager: close the shapefile handles as soon as reading is done.
    with shp.Reader(str(shp_path), encoding="latin-1") as reader:
        field_names = [f[0] for f in reader.fields[1:]]
        # The access point shapefile has a reference field linking to the parent site
        ref_idx = _find_field(field_names, "refto", "ref_to", "greensp")
        if ref_idx is None:
            raise ValueError(
                f"No site reference field found in access points. Available: {field_names}"
            )

        # iterShapeRecords() streams records one at a time instead of loading
        # the whole layer into a list like shapeRecords() does.
        for sr in reader.iterShapeRecords():
            site_id = sr.record[ref_idx]
            func = site_funcs.get(site_id)
            if func is None:
                skipped += 1
                continue

            try:
                geom = to_shapely(sr.shape.__geo_interface__)
                if geom.is_empty:
                    continue
                lng, lat = _to_wgs84.transform(geom.x, geom.y)
            except (GEOSException, ValueError, AttributeError, TypeError):
                error_skipped += 1
                logger.warning(
                    "Failed to process access point geometry for site_id=%s",
                    site_id,
                    exc_info=True,
                )
                continue

            # pyproj reports a failed transform as inf rather than raising
            # (errcheck defaults to False), so reject those explicitly.
            if not (np.isfinite(lat) and np.isfinite(lng)):
                error_skipped += 1
                logger.warning(
                    "Non-finite coordinates for access point of site_id=%s",
                    site_id,
                )
                continue

            lats.append(lat)
            lngs.append(lng)
            categories.append(func)
            site_ids.append(str(site_id))

    if skipped:
        print(f" Skipped {skipped:,} access points with unknown site ID")
    if error_skipped:
        logger.warning(
            "Skipped %d access point records due to geometry/transform errors",
            error_skipped,
        )
    return lats, lngs, categories, site_ids
def _read_site_centroids(shp_path: Path) -> dict[str, tuple[float, float]]:
    """Compute the WGS84 polygon centroid of every greenspace site.

    Used both as the representative point for site-level counting and as the
    location fallback for sites that have no access points.

    Args:
        shp_path: path to the GreenspaceSite ``.shp`` file.

    Returns:
        ``{str(site id): (lat, lng)}``. Sites whose geometry is empty, invalid,
        unparseable, or transforms to a non-finite coordinate are omitted.
        An empty dict is returned when no id field can be located.
    """
    centroids: dict[str, tuple[float, float]] = {}
    unusable = 0
    error_skipped = 0

    # Context manager: close the shapefile handles as soon as reading is done.
    with shp.Reader(str(shp_path), encoding="latin-1") as reader:
        field_names = [f[0] for f in reader.fields[1:]]
        id_idx = _find_field(field_names, "id")
        if id_idx is None:
            # Kept as a soft failure (empty result) for backward compatibility,
            # but no longer silent.
            logger.warning(
                "No id field found in %s; no site centroids computed. Available: %s",
                shp_path,
                field_names,
            )
            return {}

        # iterShapeRecords() streams records one at a time instead of loading
        # every site polygon into a list like shapeRecords() does.
        for sr in reader.iterShapeRecords():
            site_id = sr.record[id_idx]
            try:
                geom = to_shapely(sr.shape.__geo_interface__)
                if geom.is_empty or not geom.is_valid:
                    unusable += 1
                    continue
                centroid = geom.centroid
                lng, lat = _to_wgs84.transform(centroid.x, centroid.y)
            except (GEOSException, ValueError, AttributeError, TypeError):
                error_skipped += 1
                logger.warning(
                    "Failed to compute centroid for site_id=%s",
                    site_id,
                    exc_info=True,
                )
                continue

            # pyproj reports a failed transform as inf rather than raising
            # (errcheck defaults to False), so reject those explicitly.
            if not (np.isfinite(lat) and np.isfinite(lng)):
                error_skipped += 1
                logger.warning("Non-finite centroid for site_id=%s", site_id)
                continue

            centroids[str(site_id)] = (lat, lng)

    if unusable:
        logger.info(
            "Skipped %d sites with empty or invalid geometry", unusable
        )
    if error_skipped:
        logger.warning(
            "Skipped %d site centroid records due to geometry/transform errors",
            error_skipped,
        )
    return centroids
def download_greenspace(output: Path) -> None:
    """Download OS Open Greenspace and write its access points to *output*.

    The archive is fetched and unpacked into a temporary directory, then:

    1. site id → function is read from the GreenspaceSite shapefile;
    2. access points are read and tagged with their site's function and id;
    3. every site's centroid is computed, and sites that have no access
       point contribute one fallback row located at that centroid.

    The parquet file has the columns ``lat``, ``lng``, ``category``,
    ``site_id``, ``site_lat`` and ``site_lng``; the last two are null when
    the site's centroid could not be computed.

    Raises:
        FileNotFoundError: if either expected shapefile is missing from the
            extracted archive.
    """
    output.parent.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
        workdir = Path(cache_dir)
        archive = workdir / "greenspace.zip"
        unpacked = workdir / "extracted"
        download(URL, archive, timeout=300)
        extract_zip(archive, unpacked)

        # Locate both layers before validating either one.
        site_matches = list(unpacked.rglob("*GreenspaceSite*.shp"))
        access_matches = list(unpacked.rglob("*AccessPoint*.shp"))
        if not site_matches:
            raise FileNotFoundError("No GreenspaceSite shapefile found")
        if not access_matches:
            raise FileNotFoundError("No AccessPoint shapefile found")
        site_shp = site_matches[0]
        access_shp = access_matches[0]

        # Site id -> function lookup.
        print(f"Reading {site_shp.name} for function types...")
        site_funcs = _read_site_functions(site_shp)

        # Access points are the primary rows (park entrances).
        print(f"Reading {access_shp.name}...")
        ap_lats, ap_lngs, ap_cats, ap_site_ids = _read_access_points(
            access_shp, site_funcs
        )
        print(f" {len(ap_lats):,} access points loaded")

        # Centroids serve two purposes: the per-site representative point and
        # the location of sites that have no access point at all.
        print("Computing site centroids...")
        centroids = _read_site_centroids(site_shp)
        print(f" {len(centroids):,} site centroids computed")

        covered_ids = set(ap_site_ids)
        fallback_rows = [
            (lat, lng, func, site_id)
            for site_id, (lat, lng) in centroids.items()
            if site_id not in covered_ids
            and (func := site_funcs.get(site_id)) is not None
        ]
        print(f" {len(fallback_rows):,} centroid fallbacks added")

        # Access-point rows first, centroid fallbacks appended after them.
        lats = ap_lats + [row[0] for row in fallback_rows]
        lngs = ap_lngs + [row[1] for row in fallback_rows]
        categories = ap_cats + [row[2] for row in fallback_rows]
        site_ids = ap_site_ids + [row[3] for row in fallback_rows]

        # One lookup per row; a missing centroid becomes a (None, None) pair.
        no_centroid = (None, None)
        site_points = [centroids.get(site_id, no_centroid) for site_id in site_ids]
        site_lats = [point[0] for point in site_points]
        site_lngs = [point[1] for point in site_points]

        columns = {
            "lat": np.array(lats, dtype=np.float64),
            "lng": np.array(lngs, dtype=np.float64),
            "category": categories,
            "site_id": site_ids,
            # Site polygon centroid (null when it could not be computed): the
            # representative point when collapsing to one row per site.
            "site_lat": pl.Series(site_lats, dtype=pl.Float64),
            "site_lng": pl.Series(site_lngs, dtype=pl.Float64),
        }
        df = pl.DataFrame(columns)
        df.write_parquet(output)

        size_mb = output.stat().st_size / (1024 * 1024)
        print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} points)")

        # Per-category breakdown, most frequent first.
        per_category = df.group_by("category").len()
        per_category = per_category.sort("len", descending=True)
        for entry in per_category.iter_rows(named=True):
            print(f" {entry['category']}: {entry['len']:,}")
def main() -> None:
    """Command-line entry point: parse ``--output`` and run the download."""
    cli = argparse.ArgumentParser(
        description="Download OS Open Greenspace access points"
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    options = cli.parse_args()
    download_greenspace(options.output)


if __name__ == "__main__":
    main()