seems alright

This commit is contained in:
Andras Schmelczer 2026-05-17 13:52:11 +01:00
parent ebe7bbb51d
commit eac1bd0d13
58 changed files with 23125 additions and 153505 deletions

View file

@ -7,15 +7,19 @@ Reuses the same england-latest.osm.pbf as pois.py.
"""
import argparse
import logging
from pathlib import Path
import osmium
import polars as pl
from pyproj import Transformer
from shapely import wkb
from shapely.errors import GEOSException
from shapely.geometry import MultiPolygon, Polygon
from tqdm import tqdm
logger = logging.getLogger(__name__)
MIN_AREA_SQM = 5_000 # ~70m x 70m — skip pocket parks and small ponds
@ -68,6 +72,7 @@ class GreenspaceHandler(osmium.SimpleHandler):
self._wkb_factory = osmium.geom.WKBFactory()
self._progress = progress
self.geometries = []
self.skipped_areas = 0
def area(self, a):
self._progress.update(1)
@ -76,7 +81,14 @@ class GreenspaceHandler(osmium.SimpleHandler):
try:
wkb_data = self._wkb_factory.create_multipolygon(a)
geom = wkb.loads(wkb_data, hex=True)
except Exception:
except (RuntimeError, GEOSException, ValueError) as exc:
self.skipped_areas += 1
logger.warning(
"Failed to assemble multipolygon for area orig_id=%s (%s)",
getattr(a, "orig_id", lambda: "?")(),
type(exc).__name__,
exc_info=True,
)
return
if geom.is_empty or not geom.is_valid:
@ -113,6 +125,11 @@ def main():
print(
f"Found {len(handler.geometries)} greenspace/water polygons >= {MIN_AREA_SQM} sqm"
)
if handler.skipped_areas:
logger.warning(
"Skipped %d areas due to geometry assembly errors",
handler.skipped_areas,
)
# Merge overlapping geometries per 10km grid cell for efficiency
if handler.geometries:

View file

@ -14,6 +14,7 @@ License: Open Government Licence v3.0
"""
import argparse
import logging
import tempfile
from pathlib import Path
@ -21,10 +22,13 @@ import numpy as np
import polars as pl
import shapefile as shp
from pyproj import Transformer
from shapely.errors import GEOSException
from shapely.geometry import shape as to_shapely
from pipeline.utils.download import download, extract_zip
logger = logging.getLogger(__name__)
URL = "https://api.os.uk/downloads/v1/products/OpenGreenspace/downloads?area=GB&format=ESRI%C2%AE+Shapefile&redirect"
_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
@ -76,6 +80,7 @@ def _read_access_points(
lngs: list[float] = []
categories: list[str] = []
skipped = 0
error_skipped = 0
for sr in reader.shapeRecords():
site_id = sr.record[ref_idx]
@ -89,7 +94,13 @@ def _read_access_points(
if geom.is_empty:
continue
lng, lat = _to_wgs84.transform(geom.x, geom.y)
except Exception:
except (GEOSException, ValueError, AttributeError, TypeError):
error_skipped += 1
logger.warning(
"Failed to process access point geometry for site_id=%s",
site_id,
exc_info=True,
)
continue
lats.append(lat)
@ -98,6 +109,11 @@ def _read_access_points(
if skipped:
print(f" Skipped {skipped:,} access points with unknown site ID")
if error_skipped:
logger.warning(
"Skipped %d access point records due to geometry/transform errors",
error_skipped,
)
return lats, lngs, categories
@ -116,6 +132,7 @@ def _read_site_centroids(
lats: list[float] = []
lngs: list[float] = []
categories: list[str] = []
error_skipped = 0
for sr in reader.shapeRecords():
site_id = sr.record[id_idx]
@ -129,13 +146,25 @@ def _read_site_centroids(
continue
centroid = geom.centroid
lng, lat = _to_wgs84.transform(centroid.x, centroid.y)
except Exception:
except (GEOSException, ValueError, AttributeError, TypeError):
error_skipped += 1
logger.warning(
"Failed to compute centroid for site_id=%s",
site_id,
exc_info=True,
)
continue
lats.append(lat)
lngs.append(lng)
categories.append(func)
if error_skipped:
logger.warning(
"Skipped %d site centroid records due to geometry/transform errors",
error_skipped,
)
return lats, lngs, categories

View file

@ -1,10 +1,12 @@
import argparse
import logging
from pathlib import Path
from tempfile import mkdtemp
import osmium
import polars as pl
from shapely import make_valid
from shapely.errors import GEOSException
from shapely.geometry import Point
from shapely.wkb import loads as load_wkb
from tqdm import tqdm
@ -17,6 +19,8 @@ from pipeline.utils.england_geometry import (
load_england_polygon,
)
logger = logging.getLogger(__name__)
BATCH_SIZE = 50_000
MIN_OCCURENCE_COUNT = 20
@ -57,6 +61,7 @@ class POIHandler(osmium.SimpleHandler):
self._tmp_dir = tmp_dir
self._batch_num = 0
self.poi_count = 0
self.skipped_areas = 0
self._progress = progress
self._england = england_polygon
self._wkb_factory = osmium.geom.WKBFactory()
@ -120,7 +125,14 @@ class POIHandler(osmium.SimpleHandler):
def _point_from_area(self, area: osmium.osm.Area) -> tuple[float, float] | None:
try:
geom = load_wkb(self._wkb_factory.create_multipolygon(area), hex=True)
except Exception:
except (RuntimeError, GEOSException, ValueError) as exc:
self.skipped_areas += 1
logger.warning(
"Failed to build multipolygon WKB for area orig_id=%s (%s)",
getattr(area, "orig_id", lambda: "?")(),
type(exc).__name__,
exc_info=True,
)
return None
return _representative_lat_lon(geom, self._england)
@ -185,6 +197,11 @@ def main() -> None:
handler._flush_batch() # write any remaining POIs
print(f"Extracted {handler.poi_count:,} POIs")
if handler.skipped_areas:
logger.warning(
"Skipped %d areas due to geometry assembly errors",
handler.skipped_areas,
)
batch_files = sorted(tmp_dir.glob("batch_*.parquet"))
df = pl.concat([pl.scan_parquet(f) for f in batch_files])