idk2
This commit is contained in:
parent
d43da9708c
commit
fbfebc651c
5 changed files with 295 additions and 0 deletions
79
pipeline/transform/postcode_boundaries/fragments_cache.py
Normal file
79
pipeline/transform/postcode_boundaries/fragments_cache.py
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
"""Persist Phase-3 output (the per-postcode fragments) so a crash in the later
|
||||
merge/write phases can resume in seconds instead of re-running the ~10-hour OA
|
||||
loop.
|
||||
|
||||
Phase 3 turns OA boundaries + INSPIRE parcels + UPRN points into ~1.8M
|
||||
``(postcode, geometry)`` fragments held only in memory. Everything after it
|
||||
(merge, simplify, GeoJSON write) is cheap but failure-prone -- a single
|
||||
degenerate postcode used to abort the whole run *after* those 10 hours. Caching
|
||||
the fragments to disk decouples the expensive computation from the fragile
|
||||
output stage.
|
||||
|
||||
Fragments are stored as one parquet file with two columns: ``postcode``
|
||||
(string) and ``wkb`` (binary Shapely WKB). Writes are atomic (temp file +
|
||||
``os.replace``) so an interrupted write never leaves a cache that passes the
|
||||
freshness check. The cache is validated against its upstream inputs by mtime: if
|
||||
any input is newer than the cache it is treated as stale and ignored, mirroring
|
||||
make's own freshness logic.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
import shapely
|
||||
from shapely.geometry.base import BaseGeometry
|
||||
|
||||
Fragment = tuple[str, BaseGeometry]
|
||||
|
||||
|
||||
def _tmp_path(cache_path: Path) -> Path:
|
||||
return cache_path.parent / (cache_path.name + ".tmp")
|
||||
|
||||
|
||||
def fragments_cache_is_fresh(
|
||||
cache_path: Path, inputs: list[Path | None]
|
||||
) -> bool:
|
||||
"""True if ``cache_path`` exists and is newer than every input that exists.
|
||||
|
||||
A missing input is ignored (it cannot have changed the cache); a ``None``
|
||||
input is skipped. Any existing input newer than the cache marks it stale.
|
||||
"""
|
||||
if not cache_path.exists():
|
||||
return False
|
||||
cache_mtime = cache_path.stat().st_mtime
|
||||
for inp in inputs:
|
||||
if inp is None:
|
||||
continue
|
||||
path = Path(inp)
|
||||
if path.exists() and path.stat().st_mtime > cache_mtime:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def save_fragments(cache_path: Path, fragments: list[Fragment]) -> None:
|
||||
"""Atomically write ``(postcode, geometry)`` fragments to a parquet cache."""
|
||||
postcodes = [pc for pc, _ in fragments]
|
||||
geoms = np.array([geom for _, geom in fragments], dtype=object)
|
||||
wkb = shapely.to_wkb(geoms)
|
||||
|
||||
frame = pl.DataFrame(
|
||||
{"postcode": postcodes, "wkb": list(wkb)},
|
||||
schema={"postcode": pl.Utf8, "wkb": pl.Binary},
|
||||
)
|
||||
|
||||
cache_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
tmp = _tmp_path(cache_path)
|
||||
frame.write_parquet(tmp, compression="zstd")
|
||||
os.replace(tmp, cache_path)
|
||||
|
||||
|
||||
def load_fragments(cache_path: Path) -> list[Fragment]:
|
||||
"""Read fragments written by :func:`save_fragments` back into memory."""
|
||||
frame = pl.read_parquet(cache_path)
|
||||
postcodes = frame["postcode"].to_list()
|
||||
geoms = shapely.from_wkb(frame["wkb"].to_list())
|
||||
return list(zip(postcodes, geoms))
|
||||
Loading…
Add table
Add a link
Reference in a new issue