perfect-postcode/pipeline/download/greenspace_water.py

129 lines
3.9 KiB
Python

"""Extract park and water body polygons from OSM PBF for postcode boundary trimming.
Uses pyosmium's FileProcessor with area assembly to convert OSM ways/relations
into Shapely polygons, reprojects to BNG (EPSG:27700), and saves as parquet.
Reuses the same england-latest.osm.pbf as pois.py.
"""
import argparse
from pathlib import Path
import osmium
import polars as pl
from pyproj import Transformer
from shapely import wkb
from shapely.geometry import MultiPolygon, Polygon
from tqdm import tqdm
MIN_AREA_SQM = 5_000 # ~70m x 70m — skip pocket parks and small ponds
# OSM tags that indicate non-residential green/water areas
GREENSPACE_TAGS = {
"leisure": {"park", "nature_reserve", "garden", "common"},
"landuse": {"forest"},
"natural": {"wood", "water", "wetland"},
"waterway": {"riverbank"},
}
_to_bng = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
def _to_bng_polygon(geom):
"""Reproject a WGS84 Shapely geometry to BNG (EPSG:27700)."""
def transform_ring(coords):
xs, ys = zip(*coords)
bng_x, bng_y = _to_bng.transform(list(xs), list(ys))
return list(zip(bng_x, bng_y))
if geom.geom_type == "Polygon":
exterior = transform_ring(geom.exterior.coords)
holes = [transform_ring(h.coords) for h in geom.interiors]
return Polygon(exterior, holes)
elif geom.geom_type == "MultiPolygon":
parts = []
for p in geom.geoms:
exterior = transform_ring(p.exterior.coords)
holes = [transform_ring(h.coords) for h in p.interiors]
parts.append(Polygon(exterior, holes))
return MultiPolygon(parts)
return geom
def _matches_tags(tags):
"""Check if an OSM element's tags match our greenspace/water criteria."""
for key, values in GREENSPACE_TAGS.items():
if key in tags and tags[key] in values:
return True
return False
class GreenspaceHandler(osmium.SimpleHandler):
"""Collects area geometries matching greenspace/water tags."""
def __init__(self, progress):
super().__init__()
self._wkb_factory = osmium.geom.WKBFactory()
self._progress = progress
self.geometries = []
def area(self, a):
self._progress.update(1)
if not _matches_tags(a.tags):
return
try:
wkb_data = self._wkb_factory.create_multipolygon(a)
geom = wkb.loads(wkb_data, hex=True)
except Exception:
return
if geom.is_empty or not geom.is_valid:
return
# Reproject to BNG for area calculation
bng_geom = _to_bng_polygon(geom)
if bng_geom.area < MIN_AREA_SQM:
return
self.geometries.append(bng_geom.wkb)
def main():
parser = argparse.ArgumentParser(
description="Extract park/water polygons from OSM PBF"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
parser.add_argument(
"--pbf", type=Path, required=True, help="Path to existing PBF file"
)
args = parser.parse_args()
pbf_file = args.pbf
print("Extracting greenspace/water areas from PBF (two-pass area assembly)...")
with tqdm(
unit=" areas", unit_scale=True, desc="Processing", smoothing=0.05
) as progress:
handler = GreenspaceHandler(progress)
handler.apply_file(str(pbf_file), locations=True)
print(
f"Found {len(handler.geometries)} greenspace/water polygons >= {MIN_AREA_SQM} sqm"
)
# Merge overlapping geometries per 10km grid cell for efficiency
if handler.geometries:
# Save WKB geometries directly
df = pl.DataFrame({"geometry": handler.geometries})
args.output.parent.mkdir(parents=True, exist_ok=True)
df.write_parquet(args.output)
print(f"Saved to {args.output}")
else:
print("No geometries found — skipping output")
if __name__ == "__main__":
main()