perfect-postcode/pipeline/download/greenspace_water.py
Andras Schmelczer f59d01227b
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 15s
CI / Check (push) Failing after 1m58s
SPlit up
2026-06-12 21:51:37 +01:00

168 lines
5.4 KiB
Python

"""Extract park and water body polygons from OSM PBF for postcode boundary trimming.
Uses pyosmium's FileProcessor with area assembly to convert OSM ways/relations
into Shapely polygons, reprojects to BNG (EPSG:27700), and saves as parquet.
Reuses the same england-latest.osm.pbf as pois.py.
"""
import argparse
import logging
from pathlib import Path
import osmium
import polars as pl
from pyproj import Transformer
from shapely import make_valid, wkb
from shapely.errors import GEOSException
from shapely.geometry import MultiPolygon, Polygon
from tqdm import tqdm
logger = logging.getLogger(__name__)
MIN_AREA_SQM = 5_000 # ~70m x 70m — skip pocket parks and small ponds
# OSM tags that indicate non-residential green/water areas
GREENSPACE_TAGS = {
"leisure": {"park", "nature_reserve", "garden", "common"},
"landuse": {"forest"},
"natural": {"wood", "water", "wetland"},
"waterway": {"riverbank"},
}
_to_bng = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
def _to_bng_polygon(geom):
"""Reproject a WGS84 Shapely geometry to BNG (EPSG:27700)."""
def transform_ring(coords):
xs, ys = zip(*coords)
bng_x, bng_y = _to_bng.transform(list(xs), list(ys))
return list(zip(bng_x, bng_y))
if geom.geom_type == "Polygon":
exterior = transform_ring(geom.exterior.coords)
holes = [transform_ring(h.coords) for h in geom.interiors]
return Polygon(exterior, holes)
elif geom.geom_type == "MultiPolygon":
parts = []
for p in geom.geoms:
exterior = transform_ring(p.exterior.coords)
holes = [transform_ring(h.coords) for h in p.interiors]
parts.append(Polygon(exterior, holes))
return MultiPolygon(parts)
return geom
def _polygonal_part(geom):
"""The Polygon/MultiPolygon content of a geometry, or None if there is none."""
if geom.geom_type in ("Polygon", "MultiPolygon"):
return geom
if geom.geom_type == "GeometryCollection":
polygons = []
for part in geom.geoms:
if part.geom_type == "Polygon":
polygons.append(part)
elif part.geom_type == "MultiPolygon":
polygons.extend(part.geoms)
if polygons:
return MultiPolygon(polygons)
return None
def _matches_tags(tags):
"""Check if an OSM element's tags match our greenspace/water criteria."""
for key, values in GREENSPACE_TAGS.items():
if key in tags and tags[key] in values:
return True
return False
class GreenspaceHandler(osmium.SimpleHandler):
"""Collects area geometries matching greenspace/water tags."""
def __init__(self, progress):
super().__init__()
self._wkb_factory = osmium.geom.WKBFactory()
self._progress = progress
self.geometries = []
self.skipped_areas = 0
def area(self, a):
self._progress.update(1)
if not _matches_tags(a.tags):
return
try:
wkb_data = self._wkb_factory.create_multipolygon(a)
geom = wkb.loads(wkb_data, hex=True)
except (RuntimeError, GEOSException, ValueError) as exc:
self.skipped_areas += 1
logger.warning(
"Failed to assemble multipolygon for area orig_id=%s (%s)",
getattr(a, "orig_id", lambda: "?")(),
type(exc).__name__,
exc_info=True,
)
return
# Invalid geometries are often the largest, most complex park/water
# multipolygons (self-touching rings from OSM) — repair like pois.py
# rather than silently dropping them. make_valid may return a
# GeometryCollection with stray lines/points; keep only the polygons.
if not geom.is_valid:
geom = _polygonal_part(make_valid(geom))
if geom is None or geom.is_empty:
return
# Reproject to BNG for area calculation
bng_geom = _to_bng_polygon(geom)
if bng_geom.area < MIN_AREA_SQM:
return
self.geometries.append(bng_geom.wkb)
def main():
parser = argparse.ArgumentParser(
description="Extract park/water polygons from OSM PBF"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
parser.add_argument(
"--pbf", type=Path, required=True, help="Path to existing PBF file"
)
args = parser.parse_args()
pbf_file = args.pbf
print("Extracting greenspace/water areas from PBF (two-pass area assembly)...")
with tqdm(
unit=" areas", unit_scale=True, desc="Processing", smoothing=0.05
) as progress:
handler = GreenspaceHandler(progress)
handler.apply_file(str(pbf_file), locations=True)
print(
f"Found {len(handler.geometries)} greenspace/water polygons >= {MIN_AREA_SQM} sqm"
)
if handler.skipped_areas:
logger.warning(
"Skipped %d areas due to geometry assembly errors",
handler.skipped_areas,
)
# Merge overlapping geometries per 10km grid cell for efficiency
if handler.geometries:
# Save WKB geometries directly
df = pl.DataFrame({"geometry": handler.geometries})
args.output.parent.mkdir(parents=True, exist_ok=True)
df.write_parquet(args.output)
print(f"Saved to {args.output}")
else:
print("No geometries found — skipping output")
if __name__ == "__main__":
main()