perfect-postcode/pipeline/transform/postcode_boundaries/fragments_cache.py
2026-06-02 13:46:22 +01:00

79 lines
2.8 KiB
Python

"""Persist Phase-3 output (the per-postcode fragments) so a crash in the later
merge/write phases can resume in seconds instead of re-running the ~10-hour OA
loop.

Phase 3 turns OA boundaries + INSPIRE parcels + UPRN points into ~1.8M
``(postcode, geometry)`` fragments held only in memory. Everything after it
(merge, simplify, GeoJSON write) is cheap but failure-prone -- a single
degenerate postcode used to abort the whole run *after* those 10 hours. Caching
the fragments to disk decouples the expensive computation from the fragile
output stage.

Fragments are stored as one parquet file with two columns: ``postcode``
(string) and ``wkb`` (binary Shapely WKB). Writes are atomic (temp file +
``os.replace``) so an interrupted write never leaves a cache that passes the
freshness check. The cache is validated against its upstream inputs by mtime: if
any input is newer than the cache, the cache is treated as stale and ignored,
mirroring make's own freshness logic.
"""
from __future__ import annotations
import os
from pathlib import Path
import numpy as np
import polars as pl
import shapely
from shapely.geometry.base import BaseGeometry
# One cached unit of Phase-3 output: a postcode string paired with a Shapely
# geometry, i.e. the ``(postcode, geometry)`` pair described in the module
# docstring.
Fragment = tuple[str, BaseGeometry]
def _tmp_path(cache_path: Path) -> Path:
return cache_path.parent / (cache_path.name + ".tmp")
def fragments_cache_is_fresh(
    cache_path: Path, inputs: list[Path | None]
) -> bool:
    """Report whether the cache at ``cache_path`` is still usable.

    The cache is fresh when it exists and no existing input has a modification
    time strictly greater than the cache's own. ``None`` entries are skipped,
    and inputs that do not exist on disk are ignored. An input whose mtime
    equals the cache's mtime does not invalidate the cache.

    Args:
        cache_path: Location of the cache file.
        inputs: Upstream files the cache was derived from; entries may be
            ``None``.

    Returns:
        ``True`` if the cache exists and is at least as new as every existing
        input, ``False`` otherwise.
    """
    if not cache_path.exists():
        return False
    built_at = cache_path.stat().st_mtime

    def _outdates_cache(candidate: Path) -> bool:
        # Only an input that is present on disk can invalidate the cache.
        return candidate.exists() and candidate.stat().st_mtime > built_at

    return not any(
        _outdates_cache(Path(entry)) for entry in inputs if entry is not None
    )
def save_fragments(cache_path: Path, fragments: list[Fragment]) -> None:
    """Atomically write ``(postcode, geometry)`` fragments to a parquet cache.

    Geometries are serialised to WKB and stored next to their postcode in a
    two-column frame (``postcode``: string, ``wkb``: binary). The frame is
    written to a sibling ``<name>.tmp`` file and then moved over
    ``cache_path`` with ``os.replace``, so ``cache_path`` itself is only ever
    replaced by a completely written file.

    If the write or the rename fails, the temporary file is removed
    (best-effort) and the original exception is re-raised; ``cache_path``
    itself is not modified in that case.

    Args:
        cache_path: Destination parquet file. Missing parent directories are
            created.
        fragments: ``(postcode, geometry)`` pairs to persist.
    """
    postcodes = [postcode for postcode, _ in fragments]
    # Explicit object dtype so numpy keeps each geometry as a single element.
    geoms = np.array([geom for _, geom in fragments], dtype=object)
    wkb = shapely.to_wkb(geoms)
    frame = pl.DataFrame(
        {"postcode": postcodes, "wkb": list(wkb)},
        schema={"postcode": pl.Utf8, "wkb": pl.Binary},
    )
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    tmp = _tmp_path(cache_path)
    try:
        frame.write_parquet(tmp, compression="zstd")
        os.replace(tmp, cache_path)
    except BaseException:
        # A failed or interrupted write would otherwise leave a partial
        # temp file next to the cache; remove it before propagating.
        try:
            tmp.unlink(missing_ok=True)
        except OSError:
            pass  # Cleanup is best-effort; the original error matters more.
        raise
def load_fragments(cache_path: Path) -> list[Fragment]:
    """Load the fragments stored by :func:`save_fragments`.

    Reads the parquet file at ``cache_path``, decodes the ``wkb`` column back
    into Shapely geometries and pairs each one with the value from the
    ``postcode`` column, keeping row order.

    Args:
        cache_path: Parquet cache file to read.

    Returns:
        A list of ``(postcode, geometry)`` tuples, one per row.
    """
    table = pl.read_parquet(cache_path)
    codes = table["postcode"].to_list()
    wkb_blobs = table["wkb"].to_list()
    shapes = shapely.from_wkb(wkb_blobs)
    return [(code, shape) for code, shape in zip(codes, shapes)]