seems alright
This commit is contained in:
parent
ebe7bbb51d
commit
eac1bd0d13
58 changed files with 23125 additions and 153505 deletions
|
|
@ -7,15 +7,19 @@ Reuses the same england-latest.osm.pbf as pois.py.
|
|||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import osmium
|
||||
import polars as pl
|
||||
from pyproj import Transformer
|
||||
from shapely import wkb
|
||||
from shapely.errors import GEOSException
|
||||
from shapely.geometry import MultiPolygon, Polygon
|
||||
from tqdm import tqdm
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
MIN_AREA_SQM = 5_000 # ~70m x 70m — skip pocket parks and small ponds
|
||||
|
||||
|
|
@ -68,6 +72,7 @@ class GreenspaceHandler(osmium.SimpleHandler):
|
|||
self._wkb_factory = osmium.geom.WKBFactory()
|
||||
self._progress = progress
|
||||
self.geometries = []
|
||||
self.skipped_areas = 0
|
||||
|
||||
def area(self, a):
|
||||
self._progress.update(1)
|
||||
|
|
@ -76,7 +81,14 @@ class GreenspaceHandler(osmium.SimpleHandler):
|
|||
try:
|
||||
wkb_data = self._wkb_factory.create_multipolygon(a)
|
||||
geom = wkb.loads(wkb_data, hex=True)
|
||||
except Exception:
|
||||
except (RuntimeError, GEOSException, ValueError) as exc:
|
||||
self.skipped_areas += 1
|
||||
logger.warning(
|
||||
"Failed to assemble multipolygon for area orig_id=%s (%s)",
|
||||
getattr(a, "orig_id", lambda: "?")(),
|
||||
type(exc).__name__,
|
||||
exc_info=True,
|
||||
)
|
||||
return
|
||||
|
||||
if geom.is_empty or not geom.is_valid:
|
||||
|
|
@ -113,6 +125,11 @@ def main():
|
|||
print(
|
||||
f"Found {len(handler.geometries)} greenspace/water polygons >= {MIN_AREA_SQM} sqm"
|
||||
)
|
||||
if handler.skipped_areas:
|
||||
logger.warning(
|
||||
"Skipped %d areas due to geometry assembly errors",
|
||||
handler.skipped_areas,
|
||||
)
|
||||
|
||||
# Merge overlapping geometries per 10km grid cell for efficiency
|
||||
if handler.geometries:
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ License: Open Government Licence v3.0
|
|||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
|
|
@ -21,10 +22,13 @@ import numpy as np
|
|||
import polars as pl
|
||||
import shapefile as shp
|
||||
from pyproj import Transformer
|
||||
from shapely.errors import GEOSException
|
||||
from shapely.geometry import shape as to_shapely
|
||||
|
||||
from pipeline.utils.download import download, extract_zip
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
URL = "https://api.os.uk/downloads/v1/products/OpenGreenspace/downloads?area=GB&format=ESRI%C2%AE+Shapefile&redirect"
|
||||
|
||||
_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
|
||||
|
|
@ -76,6 +80,7 @@ def _read_access_points(
|
|||
lngs: list[float] = []
|
||||
categories: list[str] = []
|
||||
skipped = 0
|
||||
error_skipped = 0
|
||||
|
||||
for sr in reader.shapeRecords():
|
||||
site_id = sr.record[ref_idx]
|
||||
|
|
@ -89,7 +94,13 @@ def _read_access_points(
|
|||
if geom.is_empty:
|
||||
continue
|
||||
lng, lat = _to_wgs84.transform(geom.x, geom.y)
|
||||
except Exception:
|
||||
except (GEOSException, ValueError, AttributeError, TypeError):
|
||||
error_skipped += 1
|
||||
logger.warning(
|
||||
"Failed to process access point geometry for site_id=%s",
|
||||
site_id,
|
||||
exc_info=True,
|
||||
)
|
||||
continue
|
||||
|
||||
lats.append(lat)
|
||||
|
|
@ -98,6 +109,11 @@ def _read_access_points(
|
|||
|
||||
if skipped:
|
||||
print(f" Skipped {skipped:,} access points with unknown site ID")
|
||||
if error_skipped:
|
||||
logger.warning(
|
||||
"Skipped %d access point records due to geometry/transform errors",
|
||||
error_skipped,
|
||||
)
|
||||
|
||||
return lats, lngs, categories
|
||||
|
||||
|
|
@ -116,6 +132,7 @@ def _read_site_centroids(
|
|||
lats: list[float] = []
|
||||
lngs: list[float] = []
|
||||
categories: list[str] = []
|
||||
error_skipped = 0
|
||||
|
||||
for sr in reader.shapeRecords():
|
||||
site_id = sr.record[id_idx]
|
||||
|
|
@ -129,13 +146,25 @@ def _read_site_centroids(
|
|||
continue
|
||||
centroid = geom.centroid
|
||||
lng, lat = _to_wgs84.transform(centroid.x, centroid.y)
|
||||
except Exception:
|
||||
except (GEOSException, ValueError, AttributeError, TypeError):
|
||||
error_skipped += 1
|
||||
logger.warning(
|
||||
"Failed to compute centroid for site_id=%s",
|
||||
site_id,
|
||||
exc_info=True,
|
||||
)
|
||||
continue
|
||||
|
||||
lats.append(lat)
|
||||
lngs.append(lng)
|
||||
categories.append(func)
|
||||
|
||||
if error_skipped:
|
||||
logger.warning(
|
||||
"Skipped %d site centroid records due to geometry/transform errors",
|
||||
error_skipped,
|
||||
)
|
||||
|
||||
return lats, lngs, categories
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,10 +1,12 @@
|
|||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from tempfile import mkdtemp
|
||||
|
||||
import osmium
|
||||
import polars as pl
|
||||
from shapely import make_valid
|
||||
from shapely.errors import GEOSException
|
||||
from shapely.geometry import Point
|
||||
from shapely.wkb import loads as load_wkb
|
||||
from tqdm import tqdm
|
||||
|
|
@ -17,6 +19,8 @@ from pipeline.utils.england_geometry import (
|
|||
load_england_polygon,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
BATCH_SIZE = 50_000
|
||||
|
||||
MIN_OCCURENCE_COUNT = 20
|
||||
|
|
@ -57,6 +61,7 @@ class POIHandler(osmium.SimpleHandler):
|
|||
self._tmp_dir = tmp_dir
|
||||
self._batch_num = 0
|
||||
self.poi_count = 0
|
||||
self.skipped_areas = 0
|
||||
self._progress = progress
|
||||
self._england = england_polygon
|
||||
self._wkb_factory = osmium.geom.WKBFactory()
|
||||
|
|
@ -120,7 +125,14 @@ class POIHandler(osmium.SimpleHandler):
|
|||
def _point_from_area(self, area: osmium.osm.Area) -> tuple[float, float] | None:
|
||||
try:
|
||||
geom = load_wkb(self._wkb_factory.create_multipolygon(area), hex=True)
|
||||
except Exception:
|
||||
except (RuntimeError, GEOSException, ValueError) as exc:
|
||||
self.skipped_areas += 1
|
||||
logger.warning(
|
||||
"Failed to build multipolygon WKB for area orig_id=%s (%s)",
|
||||
getattr(area, "orig_id", lambda: "?")(),
|
||||
type(exc).__name__,
|
||||
exc_info=True,
|
||||
)
|
||||
return None
|
||||
return _representative_lat_lon(geom, self._england)
|
||||
|
||||
|
|
@ -185,6 +197,11 @@ def main() -> None:
|
|||
handler._flush_batch() # write any remaining POIs
|
||||
|
||||
print(f"Extracted {handler.poi_count:,} POIs")
|
||||
if handler.skipped_areas:
|
||||
logger.warning(
|
||||
"Skipped %d areas due to geometry assembly errors",
|
||||
handler.skipped_areas,
|
||||
)
|
||||
|
||||
batch_files = sorted(tmp_dir.glob("batch_*.parquet"))
|
||||
df = pl.concat([pl.scan_parquet(f) for f in batch_files])
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue