This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:22 +01:00
parent d43da9708c
commit fbfebc651c
5 changed files with 295 additions and 0 deletions

View file

@ -0,0 +1,79 @@
"""Persist Phase-3 output (the per-postcode fragments) so a crash in the later
merge/write phases can resume in seconds instead of re-running the ~10-hour OA
loop.
Phase 3 turns OA boundaries + INSPIRE parcels + UPRN points into ~1.8M
``(postcode, geometry)`` fragments held only in memory. Everything after it
(merge, simplify, GeoJSON write) is cheap but failure-prone -- a single
degenerate postcode used to abort the whole run *after* those 10 hours. Caching
the fragments to disk decouples the expensive computation from the fragile
output stage.
Fragments are stored as one parquet file with two columns: ``postcode``
(string) and ``wkb`` (binary Shapely WKB). Writes are atomic (temp file +
``os.replace``) so an interrupted write never leaves a cache that passes the
freshness check. The cache is validated against its upstream inputs by mtime: if
any input is newer than the cache, the cache is treated as stale and ignored,
mirroring make's own freshness logic.
"""
from __future__ import annotations
import os
from pathlib import Path
import numpy as np
import polars as pl
import shapely
from shapely.geometry.base import BaseGeometry
# One cached unit: a postcode string paired with its Shapely geometry.
Fragment = tuple[str, BaseGeometry]
def _tmp_path(cache_path: Path) -> Path:
    """Return the sibling path used for staging a write of ``cache_path``.

    The staging file lives in the same directory as the cache and carries the
    cache's full file name with ``.tmp`` appended.
    """
    staging_name = f"{cache_path.name}.tmp"
    return cache_path.parent.joinpath(staging_name)
def fragments_cache_is_fresh(
    cache_path: Path, inputs: list[Path | None]
) -> bool:
    """Report whether the cache on disk is at least as recent as its inputs.

    Returns ``False`` when ``cache_path`` does not exist, or when any input
    that exists on disk has a strictly later mtime than the cache. ``None``
    entries and inputs that do not exist are skipped. An input whose mtime
    equals the cache's does not invalidate it.
    """
    if not cache_path.exists():
        return False

    written_at = cache_path.stat().st_mtime
    candidates = (Path(entry) for entry in inputs if entry is not None)
    # Lazy evaluation: stop at the first input that outdates the cache.
    return not any(
        candidate.exists() and candidate.stat().st_mtime > written_at
        for candidate in candidates
    )
def save_fragments(cache_path: Path, fragments: list[Fragment]) -> None:
    """Atomically write ``(postcode, geometry)`` fragments to a parquet cache.

    The geometries are encoded to WKB and stored next to their postcodes in a
    two-column frame (``postcode``: string, ``wkb``: binary). The frame is
    written to a sibling temp file with zstd compression and then moved over
    ``cache_path`` with ``os.replace``. The parent directory is created if
    needed.

    If writing or replacing fails, the temp file is removed (best effort) and
    the original exception is re-raised, so a failed save does not leave a
    stray partial file next to the cache.
    """
    postcodes = [pc for pc, _ in fragments]
    # Build an object array explicitly so the geometries are passed through
    # as-is to the vectorised WKB encoder.
    geoms = np.array([geom for _, geom in fragments], dtype=object)
    wkb = shapely.to_wkb(geoms)
    frame = pl.DataFrame(
        {"postcode": postcodes, "wkb": list(wkb)},
        schema={"postcode": pl.Utf8, "wkb": pl.Binary},
    )
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    tmp = _tmp_path(cache_path)
    try:
        frame.write_parquet(tmp, compression="zstd")
        os.replace(tmp, cache_path)
    except BaseException:
        # Do not leak a partially written temp file; cleanup must never mask
        # the failure that triggered it.
        try:
            tmp.unlink(missing_ok=True)
        except OSError:
            pass
        raise
def load_fragments(cache_path: Path) -> list[Fragment]:
    """Load the parquet cache at ``cache_path`` as ``(postcode, geometry)`` pairs.

    Reads the ``postcode`` and ``wkb`` columns, decodes the WKB values with
    Shapely, and pairs each postcode with its geometry in row order.
    """
    table = pl.read_parquet(cache_path)
    encoded = table["wkb"].to_list()
    codes = table["postcode"].to_list()
    shapes = shapely.from_wkb(encoded)
    return [(code, shape) for code, shape in zip(codes, shapes)]

View file

@ -0,0 +1,99 @@
"""Regression tests for common-base-year re-anchoring before blending.
Each repeat-sales index dict is anchored to log-index 0 at its OWN earliest
year. shrink_dicts / blend_dicts combine dicts key-by-key, so dicts anchored to
different base years must be re-anchored to a single common base first, or the
blend averages level-incompatible numbers (fix5-index-base-year).
"""
from pipeline.transform.price_estimation.shrinkage import (
blend_dicts,
reanchor_dict,
reanchor_dicts,
shrink_dicts,
)
def test_reanchor_is_pure_constant_shift_preserving_differences():
    """Re-anchoring moves the origin only; consecutive-year changes stay put."""
    # Index anchored at its own earliest year, 2008.
    series = {2008: 0.0, 2009: 0.10, 2010: 0.25, 2011: 0.40}

    # Base year 1996 precedes the dict's history: the earliest value (0.0) is
    # expected to be back-filled, so the 2008 entry stays at zero.
    early_base = reanchor_dict(series, 1996)
    assert early_base[2008] == 0.0

    # Base year 2010 is present in the dict: its own value becomes the origin.
    mid_base = reanchor_dict(series, 2010)
    assert mid_base[2010] == 0.0

    # A constant shift leaves every consecutive-year difference untouched.
    ordered = sorted(series)
    for prev, nxt in zip(ordered[:-1], ordered[1:]):
        before = series[nxt] - series[prev]
        after = mid_base[nxt] - mid_base[prev]
        assert abs(after - before) < 1e-12
def test_blend_different_base_years_needs_reanchoring():
    """A blend of dicts on different bases is biased until they share one base.

    Two cells follow the SAME true trajectory and both observe 1996, but they
    are expressed relative to different origins: sector A relative to 1996 and
    sector B relative to 2008. A 50/50 blend ought to reproduce the shared
    trajectory. Blending the raw dicts mixes 2008-relative with 1996-relative
    numbers and shifts the level; re-anchoring both to 1996 first removes the
    bias.
    """
    base_year = 1996
    # Shared true log-levels, measured from 1996.
    truth = {1996: 0.0, 2008: 0.80, 2012: 1.00}

    # Sector A is already on the 1996 origin, so it equals the truth.
    sector_a = truth.copy()
    # Sector B is the same curve expressed relative to 2008.
    offset = truth[2008]
    sector_b = {year: level - offset for year, level in truth.items()}

    # Without re-anchoring, the blend sits well away from the truth
    # (half of the 0.80 offset).
    naive = blend_dicts(sector_a, [sector_b], 0.5, [0.5])
    for year in (2012, 1996):
        assert abs(naive[year] - truth[year]) > 0.3

    # With both dicts moved to the common base, the blend matches the truth.
    anchored = reanchor_dicts({"A": sector_a, "B": sector_b}, base_year)
    blended = blend_dicts(anchored["A"], [anchored["B"]], 0.5, [0.5])
    for year, level in truth.items():
        assert abs(blended[year] - level) < 1e-9
def test_shrink_dicts_after_reanchoring_is_consistent():
    """Shrinking a cell toward its parent must use a common origin.

    The sector follows the parent exactly but is expressed relative to 2010.
    After re-anchoring to 2000 it must coincide with the parent year by year,
    and shrinking it toward the parent must then return the parent's levels.
    """
    base_year = 2000
    # Parent (national) anchored at 2000.
    parent = {2000: 0.0, 2010: 0.50, 2020: 1.20}
    # Sector tracking the parent exactly but anchored at 2010 (subtract 0.50 from
    # every year). It still observes the 2000 base year (value -0.50).
    sector = {2000: -0.50, 2010: 0.0, 2020: 0.70}
    n = 0  # no own data weight -> result should equal parent after anchoring
    reanchored_sector = reanchor_dict(sector, base_year)
    # Exact hit on 2000 subtracts -0.50, putting the sector back on the parent's
    # origin: 0.0 at 2000, 0.50 at 2010, 1.20 at 2020. Check this directly:
    # with n = 0 the shrunk result is expected to equal the parent whatever the
    # sector holds, so the assertions on `shrunk` alone would not notice a
    # broken re-anchoring step.
    for year, level in parent.items():
        assert abs(reanchored_sector[year] - level) < 1e-9
    shrunk = shrink_dicts(reanchored_sector, parent, n)
    assert abs(shrunk[2000] - 0.0) < 1e-9
    assert abs(shrunk[2010] - 0.50) < 1e-9
    assert abs(shrunk[2020] - 1.20) < 1e-9
def test_reanchor_exact_hit_shifts_all_years():
    """A base year present in the dict has its value removed from every year."""
    source = {1996: 0.0, 2005: 0.30, 2015: 0.90}
    shifted = reanchor_dict(source, 2005)

    # The base year itself lands exactly on zero.
    assert shifted[2005] == 0.0
    # Every other year moves by the same 0.30.
    for year, expected in ((1996, -0.30), (2015, 0.60)):
        assert abs(shifted[year] - expected) < 1e-12