Fix data pipelines once and for all
This commit is contained in:
parent
08560476c5
commit
4012e4e047
46 changed files with 4508 additions and 855 deletions
|
|
@ -260,6 +260,12 @@ def main() -> None:
|
|||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.greenspace and not args.greenspace.exists():
|
||||
# Fail loudly and EARLY (before the ~10h Phases 1-3): silently skipping
|
||||
# the subtraction is exactly how parks/lakes shipped inside postcode
|
||||
# boundaries unnoticed.
|
||||
raise SystemExit(f"--greenspace file not found: {args.greenspace}")
|
||||
|
||||
fragments_cache = args.output / "fragments_cache.parquet"
|
||||
# Phase 3 depends only on these inputs; greenspace is applied later (Phase 4),
|
||||
# so a greenspace change must not invalidate the fragment cache.
|
||||
|
|
@ -294,7 +300,7 @@ def main() -> None:
|
|||
|
||||
greenspace_tree = None
|
||||
greenspace_geoms = None
|
||||
if args.greenspace and args.greenspace.exists():
|
||||
if args.greenspace:
|
||||
from .greenspace import load_greenspace
|
||||
|
||||
print(f" Loading greenspace/water from {args.greenspace}...")
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
from shapely import wkb
|
||||
from shapely import make_valid, wkb
|
||||
from shapely.geometry import MultiPolygon, Polygon
|
||||
from shapely.strtree import STRtree
|
||||
|
||||
|
|
@ -13,12 +13,23 @@ from .geometry import safe_difference, safe_union
|
|||
def load_greenspace(path: Path) -> tuple[STRtree, list]:
    """Load greenspace parquet and build an STRtree spatial index.

    Each value of the ``geometry`` column is decoded from WKB exactly once.
    Geometries are repaired with ``make_valid`` on load: an invalid park/lake
    polygon would make the per-postcode ``intersects`` predicate (and the exact
    difference path) liable to raise mid-merge, hours into a build. Empty
    geometries are dropped.

    Args:
        path: Parquet file containing a ``geometry`` column of WKB values.

    Returns:
        (tree, geoms) where tree is a Shapely STRtree and geoms is
        the list of geometries indexed by the tree.
    """
    df = pl.read_parquet(path)
    geoms = []
    for raw in df["geometry"].to_list():
        geom = wkb.loads(raw)
        if not geom.is_valid:
            geom = make_valid(geom)
        # Emptiness is checked after the repair, so a geometry that repairs
        # to nothing is dropped as well.
        if not geom.is_empty:
            geoms.append(geom)
    # The tree is built from the filtered list so that it and ``geoms`` stay
    # in step with each other.
    tree = STRtree(geoms)
    return tree, geoms
|
||||
|
||||
|
|
|
|||
|
|
@ -101,6 +101,21 @@ def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
|
|||
return geojson_dict
|
||||
|
||||
|
||||
def _is_pointlike(geom_bng) -> bool:
    """Report whether a BNG geometry has no real extent (tower-block signature).

    Two conditions must hold together: area below ``_POINTLIKE_AREA_M2`` and
    length below ``_POINTLIKE_PERIMETER_M``. A genuine thin sliver also has
    near-zero area but still carries length, so it is not classed as pointlike.
    A ``GEOSException`` raised while measuring is treated as "not pointlike".
    """
    try:
        # Area first: the length is only measured once the area test passes.
        if not geom_bng.area < _POINTLIKE_AREA_M2:
            return False
        return geom_bng.length < _POINTLIKE_PERIMETER_M
    except GEOSException:
        return False
|
||||
|
||||
|
||||
def _rescue_footprint(geom_bng) -> dict | None:
|
||||
"""Fatten a degenerate BNG geometry into a representable footprint and snap.
|
||||
|
||||
|
|
@ -109,15 +124,9 @@ def _rescue_footprint(geom_bng) -> dict | None:
|
|||
gets a building-scale buffer so it is not reduced to an invisible sub-metre
|
||||
dot; thin slivers that still carry length keep the minimal buffer.
|
||||
"""
|
||||
buffer_m = _MIN_FOOTPRINT_BUFFER_M
|
||||
try:
|
||||
if (
|
||||
geom_bng.area < _POINTLIKE_AREA_M2
|
||||
and geom_bng.length < _POINTLIKE_PERIMETER_M
|
||||
):
|
||||
buffer_m = _POINT_RESCUE_BUFFER_M
|
||||
except GEOSException:
|
||||
pass
|
||||
buffer_m = (
|
||||
_POINT_RESCUE_BUFFER_M if _is_pointlike(geom_bng) else _MIN_FOOTPRINT_BUFFER_M
|
||||
)
|
||||
footprint = _largest_polygonal(geom_bng.buffer(buffer_m))
|
||||
if footprint is None:
|
||||
return None
|
||||
|
|
@ -147,10 +156,16 @@ def to_wgs84_geojson(
|
|||
)
|
||||
if simplified is None:
|
||||
simplified = cleaned
|
||||
# Normal path; if snapping erases a thin sliver, fatten its real shape.
|
||||
result = _snap_to_wgs84_geojson(simplified)
|
||||
if result is None:
|
||||
if _is_pointlike(simplified):
|
||||
# A POINTLIKE footprint is rescued to building scale even when it
|
||||
# would survive snapping: a 0.1-1 m² polygon serializes fine but
|
||||
# ships as an invisible dot covering a whole tower block.
|
||||
result = _rescue_footprint(simplified)
|
||||
else:
|
||||
# Normal path; if snapping erases a thin sliver, fatten its real shape.
|
||||
result = _snap_to_wgs84_geojson(simplified)
|
||||
if result is None:
|
||||
result = _rescue_footprint(simplified)
|
||||
if result is not None:
|
||||
return result
|
||||
|
||||
|
|
@ -229,6 +244,10 @@ def merge_fragments(
|
|||
greenspace_tree: Optional STRtree of park/water polygons.
|
||||
greenspace_geoms: Optional list of park/water geometries (indexed by tree).
|
||||
"""
|
||||
subtract = greenspace_tree is not None and greenspace_geoms is not None
|
||||
if subtract:
|
||||
from .greenspace import subtract_greenspace
|
||||
|
||||
by_postcode: dict[str, list] = defaultdict(list)
|
||||
for pc, geom in all_fragments:
|
||||
by_postcode[pc].append(geom)
|
||||
|
|
@ -256,9 +275,7 @@ def merge_fragments(
|
|||
# Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
|
||||
combined = _fill_holes(combined)
|
||||
# Subtract parks/water if provided
|
||||
if greenspace_tree is not None and greenspace_geoms is not None:
|
||||
from .greenspace import subtract_greenspace
|
||||
|
||||
if subtract:
|
||||
pre_green = combined
|
||||
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
|
||||
combined = _keep_polygon_parts(combined)
|
||||
|
|
|
|||
|
|
@ -921,6 +921,49 @@ class TestToWgs84Geojson:
|
|||
area_m2 = transform_geometry(to_bng.transform, shape(result)).area
|
||||
assert area_m2 > 100, f"point footprint only {area_m2:.1f} m^2"
|
||||
|
||||
def test_snappable_pointlike_polygon_still_gets_building_scale_footprint(self):
    """Pointlike footprints are rescued even when precision snapping keeps them.

    A collapsed-but-snappable footprint (e.g. EC2A 2FJ: 181 properties on
    0.86 m²) must not ship as-is merely because it survives snapping; the
    expected output is a building-scale (~201 m²) disc.
    """
    import pyproj
    from shapely.geometry import shape
    from shapely.ops import transform as transform_geometry

    from .output import _snap_to_wgs84_geojson

    # 0.9m x 0.9m square: area 0.81 m², perimeter 3.6 m — pointlike, yet
    # large enough (~8 output-grid cells) to survive the 1e-6 deg snap.
    tiny = box(530000, 180000, 530000.9, 180000.9)

    snapped = _snap_to_wgs84_geojson(tiny)
    assert snapped is not None, (
        "precondition: this polygon must be snappable, otherwise the test "
        "exercises the old snap-fails path instead of the new one"
    )

    result = to_wgs84_geojson(tiny)
    assert result is not None

    # Measure the emitted footprint back in BNG metres.
    wgs84_to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    )
    footprint_bng = transform_geometry(wgs84_to_bng.transform, shape(result))
    area_m2 = footprint_bng.area
    assert 150 < area_m2 < 300, (
        f"pointlike snappable footprint shipped at {area_m2:.2f} m^2 "
        "instead of a building-scale (~201 m^2) disc"
    )
|
||||
|
||||
def test_normal_polygon_area_unchanged(self):
    """An ordinary polygon's area must survive conversion without rescue inflation."""
    import pyproj
    from shapely.geometry import shape
    from shapely.ops import transform as transform_geometry

    square_bng = box(530000, 180000, 530100, 180100)  # 100 m x 100 m = 10,000 m²
    geojson = to_wgs84_geojson(square_bng)
    assert geojson is not None

    # Project the emitted WGS84 shape back to BNG so the area is in m².
    wgs84_to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    )
    round_tripped = transform_geometry(wgs84_to_bng.transform, shape(geojson))
    assert round_tripped.area == pytest.approx(10_000, rel=0.01)
|
||||
|
||||
def test_thin_sliver_keeps_minimal_buffer(self):
|
||||
"""A genuine elongated sliver still carries length, so it is NOT inflated
|
||||
to building scale — only truly pointlike inputs are."""
|
||||
|
|
@ -1132,6 +1175,26 @@ class TestSubtractGreenspace:
|
|||
# 80% < 90% cap, so subtraction should happen
|
||||
assert result.area == pytest.approx(2000, rel=0.01)
|
||||
|
||||
def test_load_greenspace_repairs_invalid_and_drops_empty(self, tmp_path):
    """load_greenspace must repair invalid geometries and drop empty ones.

    An invalid (bow-tie) park polygon in the parquet must be repaired on
    load: it would otherwise make the per-postcode intersects/difference
    liable to raise hours into a merge. An empty polygon row is included so
    that the "drops empty" half of the contract is exercised as well.
    """
    from .greenspace import load_greenspace

    bowtie = Polygon([(0, 0), (10, 10), (10, 0), (0, 10)])  # self-intersects
    assert not bowtie.is_valid
    empty = Polygon()
    assert empty.is_empty
    valid = box(20, 20, 30, 30)
    path = tmp_path / "greenspace.parquet"
    pl.DataFrame(
        {"geometry": [bowtie.wkb, empty.wkb, valid.wkb]}
    ).write_parquet(path)

    tree, geoms = load_greenspace(path)
    # Three rows in, two geometries out: the empty polygon must be dropped.
    assert len(geoms) == 2
    assert all(g.is_valid and not g.is_empty for g in geoms)
    # The repaired bow-tie must still subtract cleanly.
    result = subtract_greenspace(box(0, 0, 100, 100), tree, geoms)
    assert result.is_valid
    assert result.area < 10_000
|
||||
|
||||
|
||||
class TestToWgs84GeojsonValidity:
|
||||
"""to_wgs84_geojson must emit GeoJSON that round-trips to a valid geometry."""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue