127 lines
4 KiB
Python
127 lines
4 KiB
Python
"""Download OS Open Greenspace and extract site centroids.
|
|
|
|
Downloads the OS Open Greenspace dataset as ESRI Shapefile, computes
|
|
WGS84 centroids for each greenspace site polygon, and outputs a parquet
|
|
with lat/lng/category columns compatible with the POI proximity pipeline.
|
|
|
|
Source: https://osdatahub.os.uk/downloads/open/OpenGreenspace
|
|
License: Open Government Licence v3.0
|
|
"""
|
|
|
|
import argparse
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import polars as pl
|
|
import shapefile as shp
|
|
from pyproj import Transformer
|
|
from shapely.geometry import shape as to_shapely
|
|
|
|
from pipeline.utils.download import download, extract_zip
|
|
|
|
URL = "https://api.os.uk/downloads/v1/products/OpenGreenspace/downloads?area=GB&format=ESRI%C2%AE+Shapefile&redirect"
|
|
|
|
_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
|
|
|
|
|
|
def download_greenspace(output: Path) -> None:
|
|
output.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
with tempfile.TemporaryDirectory() as cache_dir:
|
|
zip_path = Path(cache_dir) / "greenspace.zip"
|
|
extract_dir = Path(cache_dir) / "extracted"
|
|
|
|
download(URL, zip_path, timeout=300)
|
|
extract_zip(zip_path, extract_dir)
|
|
|
|
# Find the GreenspaceSite shapefile (not the AccessPoint one)
|
|
shp_files = list(extract_dir.rglob("*GreenspaceSite*.shp"))
|
|
if not shp_files:
|
|
shp_files = [
|
|
f
|
|
for f in extract_dir.rglob("*.shp")
|
|
if "AccessPoint" not in f.name
|
|
]
|
|
if not shp_files:
|
|
raise FileNotFoundError(
|
|
"No GreenspaceSite shapefile found in download"
|
|
)
|
|
|
|
print(f"Reading {shp_files[0].name}...")
|
|
reader = shp.Reader(str(shp_files[0]), encoding="latin-1")
|
|
|
|
# Find the "function" field (greenspace type)
|
|
field_names = [f[0] for f in reader.fields[1:]] # skip deletion flag
|
|
func_field = None
|
|
for name in field_names:
|
|
if "funct" in name.lower():
|
|
func_field = name
|
|
break
|
|
if func_field is None:
|
|
raise ValueError(
|
|
f"No 'function' field found. Available: {field_names}"
|
|
)
|
|
func_idx = field_names.index(func_field)
|
|
|
|
# Find a name field if available
|
|
name_idx = None
|
|
for name in field_names:
|
|
if "distname" in name.lower():
|
|
name_idx = field_names.index(name)
|
|
break
|
|
|
|
lats = []
|
|
lngs = []
|
|
categories = []
|
|
names = []
|
|
|
|
for sr in reader.shapeRecords():
|
|
func = sr.record[func_idx]
|
|
site_name = sr.record[name_idx] if name_idx is not None else ""
|
|
|
|
try:
|
|
geom = to_shapely(sr.shape.__geo_interface__)
|
|
if geom.is_empty or not geom.is_valid:
|
|
continue
|
|
centroid = geom.centroid
|
|
lng, lat = _to_wgs84.transform(centroid.x, centroid.y)
|
|
except Exception:
|
|
continue
|
|
|
|
lats.append(lat)
|
|
lngs.append(lng)
|
|
categories.append(func)
|
|
names.append(site_name or "")
|
|
|
|
df = pl.DataFrame(
|
|
{
|
|
"lat": np.array(lats, dtype=np.float64),
|
|
"lng": np.array(lngs, dtype=np.float64),
|
|
"category": categories,
|
|
"name": names,
|
|
}
|
|
)
|
|
|
|
df.write_parquet(output)
|
|
size_mb = output.stat().st_size / (1024 * 1024)
|
|
print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} greenspace sites)")
|
|
|
|
counts = df.group_by("category").len().sort("len", descending=True)
|
|
for row in counts.iter_rows(named=True):
|
|
print(f" {row['category']}: {row['len']:,}")
|
|
|
|
|
|
def main() -> None:
|
|
parser = argparse.ArgumentParser(
|
|
description="Download OS Open Greenspace site centroids"
|
|
)
|
|
parser.add_argument(
|
|
"--output", type=Path, required=True, help="Output parquet file path"
|
|
)
|
|
args = parser.parse_args()
|
|
download_greenspace(args.output)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|