fix greenspace and add more schools

This commit is contained in:
Andras Schmelczer 2026-03-25 08:06:16 +00:00
parent 3a3e249bdd
commit 9cd2b8849c
4 changed files with 161 additions and 61 deletions

View file

@ -1,8 +1,13 @@
"""Download OS Open Greenspace and extract site centroids. """Download OS Open Greenspace and extract access points.
Downloads the OS Open Greenspace dataset as ESRI Shapefile, computes Downloads the OS Open Greenspace dataset as ESRI Shapefile and extracts
WGS84 centroids for each greenspace site polygon, and outputs a parquet access point locations (park entrances). Each access point is tagged with
with lat/lng/category columns compatible with the POI proximity pipeline. its parent site's function type (e.g. Public Park Or Garden). Sites without
access points fall back to polygon centroids.
Using access points rather than polygon centroids gives much more accurate
distance calculations a property next to Hyde Park won't show 400m just
because the centroid is in the middle of the park.
Source: https://osdatahub.os.uk/downloads/open/OpenGreenspace Source: https://osdatahub.os.uk/downloads/open/OpenGreenspace
License: Open Government Licence v3.0 License: Open Government Licence v3.0
@ -25,6 +30,115 @@ URL = "https://api.os.uk/downloads/v1/products/OpenGreenspace/downloads?area=GB&
_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True) _to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
def _find_field(field_names: list[str], *needles: str) -> int | None:
"""Find the index of the first field whose lowercased name contains any needle."""
for i, name in enumerate(field_names):
lower = name.lower()
for needle in needles:
if needle in lower:
return i
return None
def _read_site_functions(shp_path: Path) -> dict[str, str]:
"""Build a mapping from site ID → function type from the GreenspaceSite shapefile."""
reader = shp.Reader(str(shp_path), encoding="latin-1")
field_names = [f[0] for f in reader.fields[1:]]
id_idx = _find_field(field_names, "id")
func_idx = _find_field(field_names, "funct")
if id_idx is None or func_idx is None:
raise ValueError(f"Missing id/function fields. Available: {field_names}")
site_funcs = {}
for rec in reader.iterRecords():
site_funcs[rec[id_idx]] = rec[func_idx]
print(f" Loaded {len(site_funcs):,} site function mappings")
return site_funcs
def _read_access_points(
shp_path: Path, site_funcs: dict[str, str]
) -> tuple[list[float], list[float], list[str]]:
"""Read access points, tagging each with its parent site's function."""
reader = shp.Reader(str(shp_path), encoding="latin-1")
field_names = [f[0] for f in reader.fields[1:]]
# The access point shapefile has a reference field linking to the parent site
ref_idx = _find_field(field_names, "refto", "ref_to", "greensp")
if ref_idx is None:
raise ValueError(
f"No site reference field found in access points. Available: {field_names}"
)
lats: list[float] = []
lngs: list[float] = []
categories: list[str] = []
skipped = 0
for sr in reader.shapeRecords():
site_id = sr.record[ref_idx]
func = site_funcs.get(site_id)
if func is None:
skipped += 1
continue
try:
geom = to_shapely(sr.shape.__geo_interface__)
if geom.is_empty:
continue
lng, lat = _to_wgs84.transform(geom.x, geom.y)
except Exception:
continue
lats.append(lat)
lngs.append(lng)
categories.append(func)
if skipped:
print(f" Skipped {skipped:,} access points with unknown site ID")
return lats, lngs, categories
def _read_site_centroids(
shp_path: Path, site_funcs: dict[str, str], covered_ids: set[str]
) -> tuple[list[float], list[float], list[str]]:
"""Read polygon centroids for sites that have no access points (fallback)."""
reader = shp.Reader(str(shp_path), encoding="latin-1")
field_names = [f[0] for f in reader.fields[1:]]
id_idx = _find_field(field_names, "id")
func_idx = _find_field(field_names, "funct")
if id_idx is None or func_idx is None:
return [], [], []
lats: list[float] = []
lngs: list[float] = []
categories: list[str] = []
for sr in reader.shapeRecords():
site_id = sr.record[id_idx]
if site_id in covered_ids:
continue
func = sr.record[func_idx]
try:
geom = to_shapely(sr.shape.__geo_interface__)
if geom.is_empty or not geom.is_valid:
continue
centroid = geom.centroid
lng, lat = _to_wgs84.transform(centroid.x, centroid.y)
except Exception:
continue
lats.append(lat)
lngs.append(lng)
categories.append(func)
return lats, lngs, categories
def download_greenspace(output: Path) -> None: def download_greenspace(output: Path) -> None:
output.parent.mkdir(parents=True, exist_ok=True) output.parent.mkdir(parents=True, exist_ok=True)
@ -35,77 +149,54 @@ def download_greenspace(output: Path) -> None:
download(URL, zip_path, timeout=300) download(URL, zip_path, timeout=300)
extract_zip(zip_path, extract_dir) extract_zip(zip_path, extract_dir)
# Find the GreenspaceSite shapefile (not the AccessPoint one) # Find both shapefiles
shp_files = list(extract_dir.rglob("*GreenspaceSite*.shp")) site_shps = list(extract_dir.rglob("*GreenspaceSite*.shp"))
if not shp_files: access_shps = list(extract_dir.rglob("*AccessPoint*.shp"))
shp_files = [
f
for f in extract_dir.rglob("*.shp")
if "AccessPoint" not in f.name
]
if not shp_files:
raise FileNotFoundError(
"No GreenspaceSite shapefile found in download"
)
print(f"Reading {shp_files[0].name}...") if not site_shps:
reader = shp.Reader(str(shp_files[0]), encoding="latin-1") raise FileNotFoundError("No GreenspaceSite shapefile found")
if not access_shps:
raise FileNotFoundError("No AccessPoint shapefile found")
# Find the "function" field (greenspace type) # Step 1: Build site ID → function mapping
field_names = [f[0] for f in reader.fields[1:]] # skip deletion flag print(f"Reading {site_shps[0].name} for function types...")
func_field = None site_funcs = _read_site_functions(site_shps[0])
for name in field_names:
if "funct" in name.lower():
func_field = name
break
if func_field is None:
raise ValueError(
f"No 'function' field found. Available: {field_names}"
)
func_idx = field_names.index(func_field)
# Find a name field if available # Step 2: Read access points (primary — park entrances)
name_idx = None print(f"Reading {access_shps[0].name}...")
for name in field_names: ap_lats, ap_lngs, ap_cats = _read_access_points(access_shps[0], site_funcs)
if "distname" in name.lower(): print(f" {len(ap_lats):,} access points loaded")
name_idx = field_names.index(name)
break
lats = [] # Step 3: Fall back to centroids for sites without any access points
lngs = [] covered_ids = set()
categories = [] reader = shp.Reader(str(access_shps[0]), encoding="latin-1")
names = [] field_names = [f[0] for f in reader.fields[1:]]
ref_idx = _find_field(field_names, "refto", "ref_to", "greensp")
if ref_idx is not None:
for rec in reader.iterRecords():
covered_ids.add(rec[ref_idx])
for sr in reader.shapeRecords(): print("Adding centroids for sites without access points...")
func = sr.record[func_idx] fb_lats, fb_lngs, fb_cats = _read_site_centroids(
site_name = sr.record[name_idx] if name_idx is not None else "" site_shps[0], site_funcs, covered_ids
)
print(f" {len(fb_lats):,} centroid fallbacks added")
try: lats = ap_lats + fb_lats
geom = to_shapely(sr.shape.__geo_interface__) lngs = ap_lngs + fb_lngs
if geom.is_empty or not geom.is_valid: categories = ap_cats + fb_cats
continue
centroid = geom.centroid
lng, lat = _to_wgs84.transform(centroid.x, centroid.y)
except Exception:
continue
lats.append(lat)
lngs.append(lng)
categories.append(func)
names.append(site_name or "")
df = pl.DataFrame( df = pl.DataFrame(
{ {
"lat": np.array(lats, dtype=np.float64), "lat": np.array(lats, dtype=np.float64),
"lng": np.array(lngs, dtype=np.float64), "lng": np.array(lngs, dtype=np.float64),
"category": categories, "category": categories,
"name": names,
} }
) )
df.write_parquet(output) df.write_parquet(output)
size_mb = output.stat().st_size / (1024 * 1024) size_mb = output.stat().st_size / (1024 * 1024)
print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} greenspace sites)") print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} points)")
counts = df.group_by("category").len().sort("len", descending=True) counts = df.group_by("category").len().sort("len", descending=True)
for row in counts.iter_rows(named=True): for row in counts.iter_rows(named=True):
@ -114,7 +205,7 @@ def download_greenspace(output: Path) -> None:
def main() -> None: def main() -> None:
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Download OS Open Greenspace site centroids" description="Download OS Open Greenspace access points"
) )
parser.add_argument( parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path" "--output", type=Path, required=True, help="Output parquet file path"

View file

@ -12,7 +12,7 @@ from io import BytesIO
from pathlib import Path from pathlib import Path
PROTOMAPS_BASE = "https://build.protomaps.com" PROTOMAPS_BASE = "https://build.protomaps.com"
UK_BBOX = "-10.5,49.5,2.5,61" UK_BBOX = "-10.5,49,5,61"
MAX_AGE_DAYS = 14 MAX_AGE_DAYS = 14

View file

@ -59,6 +59,8 @@ _AREA_COLUMNS = [
# Schools # Schools
"Good+ primary schools within 5km", "Good+ primary schools within 5km",
"Good+ secondary schools within 5km", "Good+ secondary schools within 5km",
"Good+ primary schools within 2km",
"Good+ secondary schools within 2km",
# Demographics # Demographics
"Median age", "Median age",
] ]
@ -331,6 +333,8 @@ def _build(
"noise_lden_db": "Noise (dB)", "noise_lden_db": "Noise (dB)",
"good_primary_5km": "Good+ primary schools within 5km", "good_primary_5km": "Good+ primary schools within 5km",
"good_secondary_5km": "Good+ secondary schools within 5km", "good_secondary_5km": "Good+ secondary schools within 5km",
"good_primary_2km": "Good+ primary schools within 2km",
"good_secondary_2km": "Good+ secondary schools within 2km",
"max_download_speed": "Max available download speed (Mbps)", "max_download_speed": "Max available download speed (Mbps)",
"serious_crime_avg_yr": "Serious crime (avg/yr)", "serious_crime_avg_yr": "Serious crime (avg/yr)",
"minor_crime_avg_yr": "Minor crime (avg/yr)", "minor_crime_avg_yr": "Minor crime (avg/yr)",

View file

@ -60,9 +60,14 @@ def main():
# Load all postcodes for proximity counting # Load all postcodes for proximity counting
postcodes = arcgis.rename({"lng": "lon"}) postcodes = arcgis.rename({"lng": "lon"})
result = count_pois_per_postcode( counts_5km = count_pois_per_postcode(
postcodes, schools, radius_km=5, groups=SCHOOL_GROUPS postcodes, schools, radius_km=5, groups=SCHOOL_GROUPS
) )
counts_2km = count_pois_per_postcode(
postcodes, schools, radius_km=2, groups=SCHOOL_GROUPS
)
result = counts_5km.join(counts_2km, on="postcode")
result.write_parquet(args.output) result.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024) size_mb = args.output.stat().st_size / (1024 * 1024)