"""Download OS Open Greenspace and extract site centroids. Downloads the OS Open Greenspace dataset as ESRI Shapefile, computes WGS84 centroids for each greenspace site polygon, and outputs a parquet with lat/lng/category columns compatible with the POI proximity pipeline. Source: https://osdatahub.os.uk/downloads/open/OpenGreenspace License: Open Government Licence v3.0 """ import argparse import tempfile from pathlib import Path import numpy as np import polars as pl import shapefile as shp from pyproj import Transformer from shapely.geometry import shape as to_shapely from pipeline.utils.download import download, extract_zip URL = "https://api.os.uk/downloads/v1/products/OpenGreenspace/downloads?area=GB&format=ESRI%C2%AE+Shapefile&redirect" _to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True) def download_greenspace(output: Path) -> None: output.parent.mkdir(parents=True, exist_ok=True) with tempfile.TemporaryDirectory() as cache_dir: zip_path = Path(cache_dir) / "greenspace.zip" extract_dir = Path(cache_dir) / "extracted" download(URL, zip_path, timeout=300) extract_zip(zip_path, extract_dir) # Find the GreenspaceSite shapefile (not the AccessPoint one) shp_files = list(extract_dir.rglob("*GreenspaceSite*.shp")) if not shp_files: shp_files = [ f for f in extract_dir.rglob("*.shp") if "AccessPoint" not in f.name ] if not shp_files: raise FileNotFoundError( "No GreenspaceSite shapefile found in download" ) print(f"Reading {shp_files[0].name}...") reader = shp.Reader(str(shp_files[0]), encoding="latin-1") # Find the "function" field (greenspace type) field_names = [f[0] for f in reader.fields[1:]] # skip deletion flag func_field = None for name in field_names: if "funct" in name.lower(): func_field = name break if func_field is None: raise ValueError( f"No 'function' field found. Available: {field_names}" ) func_idx = field_names.index(func_field) # Find a name field if available name_idx = None for name in field_names: if "distname" in name.lower(): name_idx = field_names.index(name) break lats = [] lngs = [] categories = [] names = [] for sr in reader.shapeRecords(): func = sr.record[func_idx] site_name = sr.record[name_idx] if name_idx is not None else "" try: geom = to_shapely(sr.shape.__geo_interface__) if geom.is_empty or not geom.is_valid: continue centroid = geom.centroid lng, lat = _to_wgs84.transform(centroid.x, centroid.y) except Exception: continue lats.append(lat) lngs.append(lng) categories.append(func) names.append(site_name or "") df = pl.DataFrame( { "lat": np.array(lats, dtype=np.float64), "lng": np.array(lngs, dtype=np.float64), "category": categories, "name": names, } ) df.write_parquet(output) size_mb = output.stat().st_size / (1024 * 1024) print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} greenspace sites)") counts = df.group_by("category").len().sort("len", descending=True) for row in counts.iter_rows(named=True): print(f" {row['category']}: {row['len']:,}") def main() -> None: parser = argparse.ArgumentParser( description="Download OS Open Greenspace site centroids" ) parser.add_argument( "--output", type=Path, required=True, help="Output parquet file path" ) args = parser.parse_args() download_greenspace(args.output) if __name__ == "__main__": main()