"""Extract park and water body polygons from OSM PBF for postcode boundary trimming. Uses pyosmium's FileProcessor with area assembly to convert OSM ways/relations into Shapely polygons, reprojects to BNG (EPSG:27700), and saves as parquet. Reuses the same england-latest.osm.pbf as pois.py. """ import argparse from pathlib import Path import osmium import polars as pl from pyproj import Transformer from shapely import wkb from shapely.geometry import MultiPolygon, Polygon from tqdm import tqdm MIN_AREA_SQM = 5_000 # ~70m x 70m — skip pocket parks and small ponds # OSM tags that indicate non-residential green/water areas GREENSPACE_TAGS = { "leisure": {"park", "nature_reserve", "garden", "common"}, "landuse": {"forest"}, "natural": {"wood", "water", "wetland"}, "waterway": {"riverbank"}, } _to_bng = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True) def _to_bng_polygon(geom): """Reproject a WGS84 Shapely geometry to BNG (EPSG:27700).""" def transform_ring(coords): xs, ys = zip(*coords) bng_x, bng_y = _to_bng.transform(list(xs), list(ys)) return list(zip(bng_x, bng_y)) if geom.geom_type == "Polygon": exterior = transform_ring(geom.exterior.coords) holes = [transform_ring(h.coords) for h in geom.interiors] return Polygon(exterior, holes) elif geom.geom_type == "MultiPolygon": parts = [] for p in geom.geoms: exterior = transform_ring(p.exterior.coords) holes = [transform_ring(h.coords) for h in p.interiors] parts.append(Polygon(exterior, holes)) return MultiPolygon(parts) return geom def _matches_tags(tags): """Check if an OSM element's tags match our greenspace/water criteria.""" for key, values in GREENSPACE_TAGS.items(): if key in tags and tags[key] in values: return True return False class GreenspaceHandler(osmium.SimpleHandler): """Collects area geometries matching greenspace/water tags.""" def __init__(self, progress): super().__init__() self._wkb_factory = osmium.geom.WKBFactory() self._progress = progress self.geometries = [] def area(self, a): self._progress.update(1) if not _matches_tags(a.tags): return try: wkb_data = self._wkb_factory.create_multipolygon(a) geom = wkb.loads(wkb_data, hex=True) except Exception: return if geom.is_empty or not geom.is_valid: return # Reproject to BNG for area calculation bng_geom = _to_bng_polygon(geom) if bng_geom.area < MIN_AREA_SQM: return self.geometries.append(bng_geom.wkb) def main(): parser = argparse.ArgumentParser( description="Extract park/water polygons from OSM PBF" ) parser.add_argument( "--output", type=Path, required=True, help="Output parquet file path" ) parser.add_argument( "--pbf", type=Path, required=True, help="Path to existing PBF file" ) args = parser.parse_args() pbf_file = args.pbf print("Extracting greenspace/water areas from PBF (two-pass area assembly)...") with tqdm( unit=" areas", unit_scale=True, desc="Processing", smoothing=0.05 ) as progress: handler = GreenspaceHandler(progress) handler.apply_file(str(pbf_file), locations=True) print( f"Found {len(handler.geometries)} greenspace/water polygons >= {MIN_AREA_SQM} sqm" ) # Merge overlapping geometries per 10km grid cell for efficiency if handler.geometries: # Save WKB geometries directly df = pl.DataFrame({"geometry": handler.geometries}) args.output.parent.mkdir(parents=True, exist_ok=True) df.write_parquet(args.output) print(f"Saved to {args.output}") else: print("No geometries found — skipping output") if __name__ == "__main__": main()