Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@@ -3,7 +3,7 @@
from pathlib import Path
import polars as pl
from shapely import wkb
from shapely import make_valid, wkb
from shapely.geometry import MultiPolygon, Polygon
from shapely.strtree import STRtree
@@ -13,12 +13,23 @@ from .geometry import safe_difference, safe_union
def load_greenspace(path: Path) -> tuple[STRtree, list]:
    """Load greenspace parquet and build an STRtree spatial index.

    Geometries are repaired with ``make_valid`` on load: an invalid park/lake
    polygon would make the per-postcode ``intersects`` predicate (and the exact
    difference path) liable to raise mid-merge, hours into a build. Empty
    geometries are dropped.

    Args:
        path: Location of the parquet file. It must contain a ``geometry``
            column whose values are WKB-encoded geometries.

    Returns:
        (tree, geoms) where tree is a Shapely STRtree and geoms is
        the list of geometries indexed by the tree. Because empty
        geometries are skipped, positions in ``geoms`` need not match row
        positions in the parquet file.
    """
    df = pl.read_parquet(path)
    geoms = []
    for raw in df["geometry"].to_list():
        geom = wkb.loads(raw)
        # Repair only when needed; already-valid geometries pass through
        # untouched.
        if not geom.is_valid:
            geom = make_valid(geom)
        # Skip empties (whether stored that way or produced by the repair)
        # so the tree and the returned list hold only usable geometries.
        if not geom.is_empty:
            geoms.append(geom)
    # Build the index from the filtered list so tree indices line up with
    # positions in ``geoms``.
    tree = STRtree(geoms)
    return tree, geoms