Fix data pipelines once and for all
This commit is contained in:
parent
08560476c5
commit
4012e4e047
46 changed files with 4508 additions and 855 deletions
|
|
@ -3,7 +3,7 @@
|
|||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
from shapely import wkb
|
||||
from shapely import make_valid, wkb
|
||||
from shapely.geometry import MultiPolygon, Polygon
|
||||
from shapely.strtree import STRtree
|
||||
|
||||
|
|
@ -13,12 +13,23 @@ from .geometry import safe_difference, safe_union
|
|||
def load_greenspace(path: Path) -> tuple[STRtree, list]:
    """Load greenspace parquet and build an STRtree spatial index.

    Geometries are repaired with ``make_valid`` on load: an invalid park/lake
    polygon would make the per-postcode ``intersects`` predicate (and the exact
    difference path) liable to raise mid-merge, hours into a build. Empty
    geometries are dropped.

    Args:
        path: Parquet file to read; its ``geometry`` column is decoded
            with ``wkb.loads``.

    Returns:
        (tree, geoms) where tree is a Shapely STRtree and geoms is
        the list of geometries indexed by the tree.
    """
    df = pl.read_parquet(path)

    geoms = []
    for raw in df["geometry"].to_list():
        geom = wkb.loads(raw)
        if not geom.is_valid:
            # Repair up front instead of letting a bad polygon fail later.
            geom = make_valid(geom)
        if not geom.is_empty:
            geoms.append(geom)

    # Build the tree from the filtered list so that it indexes exactly the
    # geometries returned alongside it.
    tree = STRtree(geoms)
    return tree, geoms
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue