Improve data pipeline
This commit is contained in:
parent
e8345cbdc1
commit
f99bd4e5c9
36 changed files with 966 additions and 129 deletions
|
|
@ -5,6 +5,7 @@ from pathlib import Path
|
|||
|
||||
from pyproj import Transformer
|
||||
from shapely import make_valid, set_precision
|
||||
from shapely.errors import GEOSException
|
||||
from shapely.geometry import MultiPolygon, Polygon, mapping, shape
|
||||
from shapely.ops import transform as transform_geometry
|
||||
from shapely.ops import unary_union
|
||||
|
|
@ -43,7 +44,14 @@ def _largest_polygonal(geom) -> Polygon | None:
|
|||
def to_wgs84_geojson(
|
||||
geom: Polygon | MultiPolygon, tolerance: float = 1.0
|
||||
) -> dict | None:
|
||||
"""Simplify geometry in BNG, convert to WGS84, return GeoJSON dict."""
|
||||
"""Simplify geometry in BNG, convert to WGS84, return a valid GeoJSON dict.
|
||||
|
||||
Validates the *serialized* GeoJSON dict (via a ``shape()`` round-trip), not
|
||||
just the intermediate Shapely object: coordinate snapping during
|
||||
serialization can otherwise leave a self-intersecting ring that only shows up
|
||||
once the feature is read back from disk. Any such geometry is repaired with
|
||||
``make_valid`` before returning so written features are always valid.
|
||||
"""
|
||||
geom = _largest_polygonal(geom)
|
||||
if geom is None:
|
||||
return None
|
||||
|
|
@ -55,12 +63,28 @@ def to_wgs84_geojson(
|
|||
|
||||
transformer = _get_to_wgs84()
|
||||
wgs84 = transform_geometry(transformer.transform, simplified)
|
||||
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
|
||||
try:
|
||||
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
|
||||
except GEOSException:
|
||||
# Precision snapping can fail on pathological geometries; fall back to a
|
||||
# plain validity repair without coordinate snapping.
|
||||
wgs84 = make_valid(wgs84)
|
||||
wgs84 = _largest_polygonal(wgs84)
|
||||
if wgs84 is None:
|
||||
return None
|
||||
|
||||
return mapping(wgs84)
|
||||
geojson_dict = mapping(wgs84)
|
||||
|
||||
# The geometry that actually reaches disk is the GeoJSON dict, so validate
|
||||
# *that* (not the pre-serialization object) and repair if needed.
|
||||
round_trip = shape(geojson_dict)
|
||||
if round_trip.is_empty or not round_trip.is_valid:
|
||||
round_trip = _largest_polygonal(make_valid(round_trip))
|
||||
if round_trip is None or round_trip.is_empty:
|
||||
return None
|
||||
geojson_dict = mapping(round_trip)
|
||||
|
||||
return geojson_dict
|
||||
|
||||
|
||||
def _fill_holes(geom):
|
||||
|
|
@ -119,7 +143,11 @@ def merge_fragments(
|
|||
pre_green = combined
|
||||
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
|
||||
combined = _largest_polygon(combined)
|
||||
combined = _fill_holes(combined)
|
||||
# Do NOT _fill_holes here: interior holes carved by the greenspace
|
||||
# subtraction (lakes, enclosed parks) are intentional, not artifacts.
|
||||
# Filling them would re-add the removed area and negate the
|
||||
# subtraction. Artifact holes from the INSPIRE+Voronoi+make_valid
|
||||
# chain were already removed by the _fill_holes above (pre-subtraction).
|
||||
# Revert if subtraction + fragment selection lost >90% of area
|
||||
if pre_green.area > 0 and combined.area / pre_green.area < 0.1:
|
||||
combined = pre_green
|
||||
|
|
|
|||
|
|
@ -893,3 +893,54 @@ class TestSubtractGreenspace:
|
|||
result = subtract_greenspace(postcode, tree, geoms)
|
||||
# 80% < 90% cap, so subtraction should happen
|
||||
assert result.area == pytest.approx(2000, rel=0.01)
|
||||
|
||||
|
||||
class TestToWgs84GeojsonValidity:
    """to_wgs84_geojson must emit GeoJSON that round-trips to a valid geometry."""

    def test_geojson_round_trips_to_valid_geometry(self):
        """The returned GeoJSON dict reloads via ``shape()`` as valid and non-empty."""
        from shapely.geometry import shape

        geojson = to_wgs84_geojson(box(530000, 180000, 530100, 180100))
        assert geojson is not None

        # Validate what would be serialized, not the intermediate Shapely object.
        rt = shape(geojson)
        assert not rt.is_empty
        assert rt.is_valid

    def test_written_district_features_are_all_valid(self, tmp_path):
        """Every feature in the written district file reloads as valid geometry."""
        from shapely.geometry import shape

        postcodes = {
            "AA1 1AA": box(530000, 180000, 530100, 180100),
            "AA1 1AB": MultiPolygon(
                [
                    box(530200, 180000, 530250, 180050),
                    box(530200, 180060, 530250, 180110),
                ]
            ),
        }
        assert write_district_geojson(postcodes, tmp_path) == 1

        collection = json.loads((tmp_path / "units" / "AA1.geojson").read_text())
        features = collection["features"]

        # Guard against a vacuous pass: with an empty feature list the loop
        # below would assert nothing and the test would succeed regardless.
        assert features, "district file contains no features"

        for feature in features:
            geom = shape(feature["geometry"])
            assert geom.is_valid
            assert not geom.is_empty
|
||||
|
||||
|
||||
class TestGreenspaceHolePreserved:
    """Holes cut by greenspace subtraction must still exist after merge_fragments.

    Regression guard for the removal of the post-subtraction ``_fill_holes``
    call, which used to fill those holes back in.
    """

    def test_interior_lake_hole_survives_merge_fragments(self):
        """A lake fully inside the postcode leaves exactly one interior ring."""
        from shapely.strtree import STRtree

        outline = box(0, 0, 100, 100)  # 10000 sqm
        lake = box(30, 30, 70, 70)  # 1600 sqm fully-interior hole (16% removal)
        greenspace = [lake]

        merged = merge_fragments(
            [("TEST1", outline)],
            greenspace_tree=STRtree(greenspace),
            greenspace_geoms=greenspace,
        )["TEST1"]

        interior_rings = list(merged.interiors)
        assert len(interior_rings) == 1

        expected_area = 10000 - 1600
        assert merged.area == pytest.approx(expected_area, rel=0.05)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue