perfect-postcode/pipeline/download/os_greenspace.py
2026-03-25 08:04:48 +00:00

127 lines
4 KiB
Python

"""Download OS Open Greenspace and extract site centroids.
Downloads the OS Open Greenspace dataset as ESRI Shapefile, computes
WGS84 centroids for each greenspace site polygon, and outputs a parquet
with lat/lng/category columns compatible with the POI proximity pipeline.
Source: https://osdatahub.os.uk/downloads/open/OpenGreenspace
License: Open Government Licence v3.0
"""
import argparse
import tempfile
from pathlib import Path
import numpy as np
import polars as pl
import shapefile as shp
from pyproj import Transformer
from shapely.geometry import shape as to_shapely
from pipeline.utils.download import download, extract_zip
URL = "https://api.os.uk/downloads/v1/products/OpenGreenspace/downloads?area=GB&format=ESRI%C2%AE+Shapefile&redirect"
_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
def download_greenspace(output: Path) -> None:
output.parent.mkdir(parents=True, exist_ok=True)
with tempfile.TemporaryDirectory() as cache_dir:
zip_path = Path(cache_dir) / "greenspace.zip"
extract_dir = Path(cache_dir) / "extracted"
download(URL, zip_path, timeout=300)
extract_zip(zip_path, extract_dir)
# Find the GreenspaceSite shapefile (not the AccessPoint one)
shp_files = list(extract_dir.rglob("*GreenspaceSite*.shp"))
if not shp_files:
shp_files = [
f
for f in extract_dir.rglob("*.shp")
if "AccessPoint" not in f.name
]
if not shp_files:
raise FileNotFoundError(
"No GreenspaceSite shapefile found in download"
)
print(f"Reading {shp_files[0].name}...")
reader = shp.Reader(str(shp_files[0]), encoding="latin-1")
# Find the "function" field (greenspace type)
field_names = [f[0] for f in reader.fields[1:]] # skip deletion flag
func_field = None
for name in field_names:
if "funct" in name.lower():
func_field = name
break
if func_field is None:
raise ValueError(
f"No 'function' field found. Available: {field_names}"
)
func_idx = field_names.index(func_field)
# Find a name field if available
name_idx = None
for name in field_names:
if "distname" in name.lower():
name_idx = field_names.index(name)
break
lats = []
lngs = []
categories = []
names = []
for sr in reader.shapeRecords():
func = sr.record[func_idx]
site_name = sr.record[name_idx] if name_idx is not None else ""
try:
geom = to_shapely(sr.shape.__geo_interface__)
if geom.is_empty or not geom.is_valid:
continue
centroid = geom.centroid
lng, lat = _to_wgs84.transform(centroid.x, centroid.y)
except Exception:
continue
lats.append(lat)
lngs.append(lng)
categories.append(func)
names.append(site_name or "")
df = pl.DataFrame(
{
"lat": np.array(lats, dtype=np.float64),
"lng": np.array(lngs, dtype=np.float64),
"category": categories,
"name": names,
}
)
df.write_parquet(output)
size_mb = output.stat().st_size / (1024 * 1024)
print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} greenspace sites)")
counts = df.group_by("category").len().sort("len", descending=True)
for row in counts.iter_rows(named=True):
print(f" {row['category']}: {row['len']:,}")
def main() -> None:
parser = argparse.ArgumentParser(
description="Download OS Open Greenspace site centroids"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
args = parser.parse_args()
download_greenspace(args.output)
if __name__ == "__main__":
main()