Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@ -921,6 +921,49 @@ class TestToWgs84Geojson:
area_m2 = transform_geometry(to_bng.transform, shape(result)).area
assert area_m2 > 100, f"point footprint only {area_m2:.1f} m^2"
def test_snappable_pointlike_polygon_still_gets_building_scale_footprint(self):
    """Pointlike footprints are rescued even when they survive snapping.

    A collapsed-but-snappable footprint (e.g. EC2A 2FJ: 181 properties on
    0.86 m²) must NOT ship as-is just because it survives precision
    snapping; this test expects a building-scale (~201 m²) disc instead.
    """
    import pyproj
    from shapely.geometry import shape
    from shapely.ops import transform as transform_geometry

    from .output import _snap_to_wgs84_geojson

    # 0.9 m x 0.9 m square: area 0.81 m², perimeter 3.6 m — pointlike, yet
    # large enough (~8 output-grid cells) to survive the 1e-6 deg snap.
    pointlike_square = box(530000, 180000, 530000.9, 180000.9)

    # Guard the premise first: if the square could not be snapped, the test
    # would silently cover a different code path than the one it targets.
    assert _snap_to_wgs84_geojson(pointlike_square) is not None, (
        "precondition: this polygon must be snappable, otherwise the test "
        "exercises the old snap-fails path instead of the new one"
    )

    geojson = to_wgs84_geojson(pointlike_square)
    assert geojson is not None

    # Project the WGS84 output back to EPSG:27700 so the area is in m².
    wgs84_to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    )
    area_m2 = transform_geometry(wgs84_to_bng.transform, shape(geojson)).area
    assert 150 < area_m2 < 300, (
        f"pointlike snappable footprint shipped at {area_m2:.2f} m^2 "
        "instead of a building-scale (~201 m^2) disc"
    )
def test_normal_polygon_area_unchanged(self):
    """An ordinary polygon must come back at its own size, not inflated."""
    import pyproj
    from shapely.geometry import shape
    from shapely.ops import transform as transform_geometry

    # 100 m x 100 m square in EPSG:27700 coordinates: 10,000 m².
    square = box(530000, 180000, 530100, 180100)

    geojson = to_wgs84_geojson(square)
    assert geojson is not None

    # Reproject the WGS84 output back to EPSG:27700 to compare areas in m².
    wgs84_to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    )
    reprojected = transform_geometry(wgs84_to_bng.transform, shape(geojson))
    assert reprojected.area == pytest.approx(10_000, rel=0.01)
def test_thin_sliver_keeps_minimal_buffer(self):
"""A genuine elongated sliver still carries length, so it is NOT inflated
to building scale — only truly pointlike inputs are."""
@ -1132,6 +1175,26 @@ class TestSubtractGreenspace:
# 80% < 90% cap, so subtraction should happen
assert result.area == pytest.approx(2000, rel=0.01)
def test_load_greenspace_repairs_invalid_and_drops_empty(self, tmp_path):
    """Invalid park polygons in the parquet must be repaired on load.

    A self-intersecting (bow-tie) polygon left unrepaired would make the
    per-postcode intersects/difference liable to raise hours into a merge.
    """
    from .greenspace import load_greenspace

    # Visiting the corners in this order makes the two diagonals cross,
    # so the ring self-intersects.
    bowtie = Polygon([(0, 0), (10, 10), (10, 0), (0, 10)])
    assert not bowtie.is_valid
    valid = box(20, 20, 30, 30)

    # NOTE(review): no empty geometry is written here, so the "drops empty"
    # half of the test name is not exercised — consider adding one.
    parquet_path = tmp_path / "greenspace.parquet"
    frame = pl.DataFrame({"geometry": [bowtie.wkb, valid.wkb]})
    frame.write_parquet(parquet_path)

    tree, geoms = load_greenspace(parquet_path)

    assert len(geoms) == 2
    for geom in geoms:
        assert geom.is_valid
        assert not geom.is_empty

    # The repaired bow-tie must still subtract cleanly.
    remainder = subtract_greenspace(box(0, 0, 100, 100), tree, geoms)
    assert remainder.is_valid
    assert remainder.area < 10_000
class TestToWgs84GeojsonValidity:
"""to_wgs84_geojson must emit GeoJSON that round-trips to a valid geometry."""