Improve data pipeline

This commit is contained in:
Andras Schmelczer 2026-06-01 20:10:03 +01:00
parent e8345cbdc1
commit f99bd4e5c9
36 changed files with 966 additions and 129 deletions

View file

@ -5,6 +5,7 @@ from pathlib import Path
from pyproj import Transformer
from shapely import make_valid, set_precision
from shapely.errors import GEOSException
from shapely.geometry import MultiPolygon, Polygon, mapping, shape
from shapely.ops import transform as transform_geometry
from shapely.ops import unary_union
@ -43,7 +44,14 @@ def _largest_polygonal(geom) -> Polygon | None:
def to_wgs84_geojson(
geom: Polygon | MultiPolygon, tolerance: float = 1.0
) -> dict | None:
"""Simplify geometry in BNG, convert to WGS84, return GeoJSON dict."""
"""Simplify geometry in BNG, convert to WGS84, return a valid GeoJSON dict.

Validates the *serialized* GeoJSON dict (via a ``shape()`` round-trip), not
just the intermediate Shapely object: coordinate snapping during
serialization can otherwise leave a self-intersecting ring that only shows up
once the feature is read back from disk. Any such geometry is repaired with
``make_valid`` before returning so written features are always valid.
"""
geom = _largest_polygonal(geom)
if geom is None:
return None
@ -55,12 +63,28 @@ def to_wgs84_geojson(
transformer = _get_to_wgs84()
wgs84 = transform_geometry(transformer.transform, simplified)
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
try:
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
except GEOSException:
# Precision snapping can fail on pathological geometries; fall back to a
# plain validity repair without coordinate snapping.
wgs84 = make_valid(wgs84)
wgs84 = _largest_polygonal(wgs84)
if wgs84 is None:
return None
return mapping(wgs84)
geojson_dict = mapping(wgs84)
# The geometry that actually reaches disk is the GeoJSON dict, so validate
# *that* (not the pre-serialization object) and repair if needed.
round_trip = shape(geojson_dict)
if round_trip.is_empty or not round_trip.is_valid:
round_trip = _largest_polygonal(make_valid(round_trip))
if round_trip is None or round_trip.is_empty:
return None
geojson_dict = mapping(round_trip)
return geojson_dict
def _fill_holes(geom):
@ -119,7 +143,11 @@ def merge_fragments(
pre_green = combined
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
combined = _largest_polygon(combined)
combined = _fill_holes(combined)
# Do NOT call _fill_holes here: interior holes carved by the greenspace
# subtraction (lakes, enclosed parks) are intentional, not artifacts.
# Filling them would re-add the removed area and negate the
# subtraction. Artifact holes from the INSPIRE+Voronoi+make_valid
# chain were already removed by the _fill_holes call above (pre-subtraction).
# Revert if subtraction + fragment selection lost >90% of area
if pre_green.area > 0 and combined.area / pre_green.area < 0.1:
combined = pre_green