79 lines
2.8 KiB
Python
79 lines
2.8 KiB
Python
"""Persist Phase-3 output (the per-postcode fragments) so a crash in the later
|
|
merge/write phases can resume in seconds instead of re-running the ~10-hour OA
|
|
loop.
|
|
|
|
Phase 3 turns OA boundaries + INSPIRE parcels + UPRN points into ~1.8M
|
|
``(postcode, geometry)`` fragments held only in memory. Everything after it
|
|
(merge, simplify, GeoJSON write) is cheap but failure-prone -- a single
|
|
degenerate postcode used to abort the whole run *after* those 10 hours. Caching
|
|
the fragments to disk decouples the expensive computation from the fragile
|
|
output stage.
|
|
|
|
Fragments are stored as one parquet file with two columns: ``postcode``
|
|
(string) and ``wkb`` (binary Shapely WKB). Writes are atomic (temp file +
|
|
``os.replace``) so an interrupted write never leaves a cache that passes the
|
|
freshness check. The cache is validated against its upstream inputs by mtime: if
|
|
any input is newer than the cache it is treated as stale and ignored, mirroring
|
|
make's own freshness logic.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import polars as pl
|
|
import shapely
|
|
from shapely.geometry.base import BaseGeometry
|
|
|
|
# One cached record: a postcode string paired with its Shapely geometry.
Fragment = tuple[str, BaseGeometry]
|
|
|
|
|
|
def _tmp_path(cache_path: Path) -> Path:
    """Return the staging path used while ``cache_path`` is being written.

    The staging file sits in the same directory as the cache and uses the
    cache's full file name with ``.tmp`` appended.
    """
    staging_name = f"{cache_path.name}.tmp"
    return cache_path.parent.joinpath(staging_name)
|
|
|
|
|
|
def fragments_cache_is_fresh(
    cache_path: Path, inputs: list[Path | None]
) -> bool:
    """Report whether the cache at ``cache_path`` is still usable.

    The cache is fresh when it exists and no existing input has a strictly
    later modification time. ``None`` entries are skipped, and inputs that do
    not exist on disk are ignored rather than treated as stale.
    """
    # No cache file at all: nothing to reuse.
    if not cache_path.exists():
        return False

    built_at = cache_path.stat().st_mtime
    candidates = (Path(src) for src in inputs if src is not None)
    # Stale as soon as one existing input is newer than the cache.
    return not any(
        src.exists() and src.stat().st_mtime > built_at for src in candidates
    )
|
|
|
|
|
|
def save_fragments(cache_path: Path, fragments: list[Fragment]) -> None:
    """Atomically write ``(postcode, geometry)`` fragments to a parquet cache.

    The frame is first written to a sibling temp file and then moved over
    ``cache_path`` with ``os.replace``, so ``cache_path`` only ever holds a
    completely written file. If the write or the rename fails, the partial
    temp file is removed (best effort) and the original exception propagates.

    Args:
        cache_path: Destination parquet file; missing parent directories are
            created.
        fragments: ``(postcode, geometry)`` pairs to persist. Geometries are
            stored as WKB in a binary ``wkb`` column next to a string
            ``postcode`` column.
    """
    postcodes = [pc for pc, _ in fragments]
    geoms = np.array([geom for _, geom in fragments], dtype=object)
    wkb = shapely.to_wkb(geoms)

    frame = pl.DataFrame(
        {"postcode": postcodes, "wkb": list(wkb)},
        schema={"postcode": pl.Utf8, "wkb": pl.Binary},
    )

    cache_path.parent.mkdir(parents=True, exist_ok=True)
    tmp = _tmp_path(cache_path)
    try:
        frame.write_parquet(tmp, compression="zstd")
        os.replace(tmp, cache_path)
    except BaseException:
        # Covers KeyboardInterrupt too: do not leave a partial temp file
        # sitting next to the cache after a failed or interrupted write.
        try:
            tmp.unlink(missing_ok=True)
        except OSError:
            pass  # cleanup is best effort; surface the original failure
        raise
|
|
|
|
|
|
def load_fragments(cache_path: Path) -> list[Fragment]:
    """Load the ``(postcode, geometry)`` pairs stored by :func:`save_fragments`.

    Reads the parquet cache, decodes the ``wkb`` column back into Shapely
    geometries and pairs each geometry with the ``postcode`` of the same row.
    """
    table = pl.read_parquet(cache_path)
    wkb_blobs = table["wkb"].to_list()
    codes = table["postcode"].to_list()
    decoded = shapely.from_wkb(wkb_blobs)
    return [(code, geom) for code, geom in zip(codes, decoded)]
|