Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@ -260,6 +260,12 @@ def main() -> None:
)
args = parser.parse_args()
if args.greenspace and not args.greenspace.exists():
# Fail loudly and EARLY (before the ~10h Phases 1-3): silently skipping
# the subtraction is exactly how parks/lakes shipped inside postcode
# boundaries unnoticed.
raise SystemExit(f"--greenspace file not found: {args.greenspace}")
fragments_cache = args.output / "fragments_cache.parquet"
# Phase 3 depends only on these inputs; greenspace is applied later (Phase 4),
# so a greenspace change must not invalidate the fragment cache.
@ -294,7 +300,7 @@ def main() -> None:
greenspace_tree = None
greenspace_geoms = None
if args.greenspace and args.greenspace.exists():
if args.greenspace:
from .greenspace import load_greenspace
print(f" Loading greenspace/water from {args.greenspace}...")

View file

@ -3,7 +3,7 @@
from pathlib import Path
import polars as pl
from shapely import wkb
from shapely import make_valid, wkb
from shapely.geometry import MultiPolygon, Polygon
from shapely.strtree import STRtree
@ -13,12 +13,23 @@ from .geometry import safe_difference, safe_union
def load_greenspace(path: Path) -> tuple[STRtree, list]:
    """Load greenspace parquet and build an STRtree spatial index.

    Every value of the ``geometry`` column is decoded from WKB exactly once.
    Geometries are repaired with ``make_valid`` on load: an invalid park/lake
    polygon would make the per-postcode ``intersects`` predicate (and the exact
    difference path) liable to raise mid-merge, hours into a build. Empty
    geometries are dropped.

    Args:
        path: Parquet file with a ``geometry`` column holding WKB values.

    Returns:
        (tree, geoms) where tree is a Shapely STRtree and geoms is
        the list of geometries indexed by the tree.
    """
    df = pl.read_parquet(path)
    geoms = []
    for raw in df["geometry"].to_list():
        geom = wkb.loads(raw)
        # Repair only what needs it; valid geometries pass through untouched.
        if not geom.is_valid:
            geom = make_valid(geom)
        # An empty geometry cannot intersect anything, so it is not indexed.
        if not geom.is_empty:
            geoms.append(geom)
    # The tree is built from the filtered list, so tree indices map into geoms.
    tree = STRtree(geoms)
    return tree, geoms

View file

@ -101,6 +101,21 @@ def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
return geojson_dict
def _is_pointlike(geom_bng) -> bool:
    """Report whether a BNG geometry has collapsed to (almost) a point.

    Both conditions must hold: area below ``_POINTLIKE_AREA_M2`` and perimeter
    below ``_POINTLIKE_PERIMETER_M``. A thin sliver fails the perimeter check
    because it still carries length, so it is not classed as pointlike.

    Returns:
        True when both thresholds are undercut; False otherwise, including
        when GEOS raises while measuring the geometry.
    """
    try:
        if not geom_bng.area < _POINTLIKE_AREA_M2:
            return False
        return geom_bng.length < _POINTLIKE_PERIMETER_M
    except GEOSException:
        # A geometry that cannot be measured is treated as not pointlike.
        return False
def _rescue_footprint(geom_bng) -> dict | None:
"""Fatten a degenerate BNG geometry into a representable footprint and snap.
@ -109,15 +124,9 @@ def _rescue_footprint(geom_bng) -> dict | None:
gets a building-scale buffer so it is not reduced to an invisible sub-metre
dot; thin slivers that still carry length keep the minimal buffer.
"""
buffer_m = _MIN_FOOTPRINT_BUFFER_M
try:
if (
geom_bng.area < _POINTLIKE_AREA_M2
and geom_bng.length < _POINTLIKE_PERIMETER_M
):
buffer_m = _POINT_RESCUE_BUFFER_M
except GEOSException:
pass
buffer_m = (
_POINT_RESCUE_BUFFER_M if _is_pointlike(geom_bng) else _MIN_FOOTPRINT_BUFFER_M
)
footprint = _largest_polygonal(geom_bng.buffer(buffer_m))
if footprint is None:
return None
@ -147,10 +156,16 @@ def to_wgs84_geojson(
)
if simplified is None:
simplified = cleaned
# Normal path; if snapping erases a thin sliver, fatten its real shape.
result = _snap_to_wgs84_geojson(simplified)
if result is None:
if _is_pointlike(simplified):
# A POINTLIKE footprint is rescued to building scale even when it
# would survive snapping: a 0.1-1 m² polygon serializes fine but
# ships as an invisible dot covering a whole tower block.
result = _rescue_footprint(simplified)
else:
# Normal path; if snapping erases a thin sliver, fatten its real shape.
result = _snap_to_wgs84_geojson(simplified)
if result is None:
result = _rescue_footprint(simplified)
if result is not None:
return result
@ -229,6 +244,10 @@ def merge_fragments(
greenspace_tree: Optional STRtree of park/water polygons.
greenspace_geoms: Optional list of park/water geometries (indexed by tree).
"""
subtract = greenspace_tree is not None and greenspace_geoms is not None
if subtract:
from .greenspace import subtract_greenspace
by_postcode: dict[str, list] = defaultdict(list)
for pc, geom in all_fragments:
by_postcode[pc].append(geom)
@ -256,9 +275,7 @@ def merge_fragments(
# Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
combined = _fill_holes(combined)
# Subtract parks/water if provided
if greenspace_tree is not None and greenspace_geoms is not None:
from .greenspace import subtract_greenspace
if subtract:
pre_green = combined
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
combined = _keep_polygon_parts(combined)

View file

@ -921,6 +921,49 @@ class TestToWgs84Geojson:
area_m2 = transform_geometry(to_bng.transform, shape(result)).area
assert area_m2 > 100, f"point footprint only {area_m2:.1f} m^2"
def test_snappable_pointlike_polygon_still_gets_building_scale_footprint(self):
    """Pointlike footprints are rescued even when they survive snapping.

    A collapsed-but-snappable footprint (e.g. EC2A 2FJ: 181 properties on
    0.86 m²) must NOT ship as-is just because precision snapping keeps it;
    pointlike inputs are expected back as a ~201 m² disc unconditionally.
    """
    import pyproj
    from shapely.geometry import shape
    from shapely.ops import transform as transform_geometry

    from .output import _snap_to_wgs84_geojson

    # 0.9m x 0.9m square: area 0.81 m², perimeter 3.6 m — pointlike, yet
    # large enough (~8 output-grid cells) to survive the 1e-6 deg snap.
    tiny = box(530000, 180000, 530000.9, 180000.9)

    snapped = _snap_to_wgs84_geojson(tiny)
    assert snapped is not None, (
        "precondition: this polygon must be snappable, otherwise the test "
        "exercises the old snap-fails path instead of the new one"
    )

    result = to_wgs84_geojson(tiny)
    assert result is not None

    # Measure the emitted footprint back in metres (EPSG:27700).
    wgs84_to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    )
    footprint_bng = transform_geometry(wgs84_to_bng.transform, shape(result))
    area_m2 = footprint_bng.area
    assert 150 < area_m2 < 300, (
        f"pointlike snappable footprint shipped at {area_m2:.2f} m^2 "
        "instead of a building-scale (~201 m^2) disc"
    )
def test_normal_polygon_area_unchanged(self):
    """An ordinary polygon comes back with its area intact (no rescue inflation)."""
    import pyproj
    from shapely.geometry import shape
    from shapely.ops import transform as transform_geometry

    # 100 m x 100 m square: 10,000 m²
    square = box(530000, 180000, 530100, 180100)
    result = to_wgs84_geojson(square)
    assert result is not None

    # Re-project the emitted GeoJSON to EPSG:27700 so the area is in metres.
    wgs84_to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    )
    round_tripped = transform_geometry(wgs84_to_bng.transform, shape(result))
    assert round_tripped.area == pytest.approx(10_000, rel=0.01)
def test_thin_sliver_keeps_minimal_buffer(self):
"""A genuine elongated sliver still carries length, so it is NOT inflated
to building scale — only truly pointlike inputs are."""
@ -1132,6 +1175,26 @@ class TestSubtractGreenspace:
# 80% < 90% cap, so subtraction should happen
assert result.area == pytest.approx(2000, rel=0.01)
def test_load_greenspace_repairs_invalid_and_drops_empty(self, tmp_path):
    """Invalid geometries are repaired and empty ones dropped on load.

    An invalid (bow-tie) park polygon in the parquet must be repaired on
    load: it would otherwise make the per-postcode intersects/difference
    liable to raise hours into a merge. An empty polygon row is included so
    the drop-empty branch is exercised rather than only named.
    """
    from .greenspace import load_greenspace

    bowtie = Polygon([(0, 0), (10, 10), (10, 0), (0, 10)])  # self-intersects
    assert not bowtie.is_valid
    valid = box(20, 20, 30, 30)
    empty = Polygon()
    assert empty.is_empty

    path = tmp_path / "greenspace.parquet"
    pl.DataFrame(
        {"geometry": [bowtie.wkb, valid.wkb, empty.wkb]}
    ).write_parquet(path)

    tree, geoms = load_greenspace(path)
    # Three rows in, two geometries out: the empty polygon must be dropped.
    assert len(geoms) == 2
    assert all(g.is_valid and not g.is_empty for g in geoms)

    # The repaired bow-tie must still subtract cleanly.
    result = subtract_greenspace(box(0, 0, 100, 100), tree, geoms)
    assert result.is_valid
    assert result.area < 10_000
class TestToWgs84GeojsonValidity:
"""to_wgs84_geojson must emit GeoJSON that round-trips to a valid geometry."""