idgf

2026-06-02 20:14:32 +01:00 · 2026-06-02 20:14:32 +01:00 · aab85fe32e
commit aab85fe32e
parent fbfebc651c
33 changed files with 2016 additions and 283 deletions
--- a/pipeline/transform/price_estimation/index.py
+++ b/pipeline/transform/price_estimation/index.py
@ -19,8 +19,7 @@ from tqdm import tqdm
 from pipeline.transform.price_estimation.shrinkage import (
    blend_dicts,
    hierarchical_shrinkage,
-    reanchor_dict,
-    reanchor_dicts,
+    lift_onto_parent,
    shrink_dicts,
    spatial_smooth,
 )
@ -169,33 +168,47 @@ def solve_robust_index(
    signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])

    # Temporal smoothness prior: penalise curvature in the year betas with a
-    # second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
-    # encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
-    # zero target). This damps single-year index spikes without flattening
-    # genuine multi-year trends. Betas are ordered by calendar year; the baseline
-    # year (min_year, implicit beta=0) has no column, so the penalty spans the
-    # non-baseline years only. For cells with <3 betas there is no curvature to
-    # penalise and the solve is unchanged.
+    # second-difference penalty lambda * (d2 beta / dt2)^2, encoded as extra
+    # least-squares rows (sqrt(lambda) * [w0, w1, w2] against a zero target).
+    # The weights are the CALENDAR-SPACING-AWARE second-derivative coefficients
+    # for the consecutive triple (y0, y1, y2), so gap years are not treated as
+    # adjacent: a multi-year gap relaxes the penalty (correctly preserving a
+    # genuine level jump) instead of forcing a smooth ramp. For unit spacing
+    # (1, 1) these reduce to [1, -2, 1], leaving contiguous cells unchanged.
+    # This damps single-year index spikes without flattening genuine trends.
+    # Betas are ordered by calendar year; the baseline year (min_year, implicit
+    # beta=0) has no column, so the penalty spans the non-baseline years only.
+    # For cells with <3 betas there is no curvature to penalise and the solve is
+    # unchanged.
    n_pen = 0
    pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
    pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
    if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
        sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
-        cols_by_year = [c for _, c in sorted(year_to_col.items())]
+        years_sorted = sorted(year_to_col)
+        cols_by_year = [year_to_col[y] for y in years_sorted]
        n_pen = n_cols - 2
        pen_rows = np.repeat(n + np.arange(n_pen), 3)
        pen_cols = np.empty(n_pen * 3, dtype=np.int64)
+        pen_vals = np.empty(n_pen * 3, dtype=np.float64)
        for k in range(n_pen):
            pen_cols[3 * k : 3 * k + 3] = (
                cols_by_year[k],
                cols_by_year[k + 1],
                cols_by_year[k + 2],
            )
+            y0, y1, y2 = years_sorted[k], years_sorted[k + 1], years_sorted[k + 2]
+            w0 = 2.0 / ((y1 - y0) * (y2 - y0))
+            w1 = -2.0 / ((y1 - y0) * (y2 - y1))
+            w2 = 2.0 / ((y2 - y1) * (y2 - y0))
+            pen_vals[3 * k : 3 * k + 3] = (
+                sqrt_lambda * w0,
+                sqrt_lambda * w1,
+                sqrt_lambda * w2,
+            )
        pen_rows_arr = pen_rows.astype(np.int64)
        pen_cols_arr = pen_cols
-        pen_vals_arr = np.tile(
-            [sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
-        ).astype(np.float64)
+        pen_vals_arr = pen_vals
        pen_b = np.zeros(n_pen, dtype=np.float64)
    n_total_rows = n + n_pen

@ -252,7 +265,11 @@ def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
        idx = solve_robust_index(y1, y2, lr, w)
        if idx:
            indices[key] = idx
-            n_pairs[key] = len(y1)
+            # Count only information-bearing pairs: same-year pairs (year1==year2,
+            # baseline-baseline included) cancel in the sparse solve and add zero
+            # information to the annual index, so counting them would inflate the
+            # shrinkage weight n/(n+k) and under-shrink noisy cells.
+            n_pairs[key] = int(np.count_nonzero(y2 != y1))
    return indices, n_pairs


@ -433,20 +450,17 @@ def build_index(
            f"  {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
        )

-        # Re-anchor every repeat-sales dict to the global base year before any
-        # shrinkage/smoothing/blending. solve_robust_index anchors each cell to
-        # log-index 0 at its OWN earliest year, so cells with shorter histories
-        # are measured from a later origin; combining them key-by-key would
-        # otherwise average level-incompatible numbers. The hedonic fallback is
-        # already anchored at min_year, so we align everything to min_year.
-        national_idx = reanchor_dict(national_idx, min_year)
-        area_idx = reanchor_dicts(area_idx, min_year)
-        district_idx = reanchor_dicts(district_idx, min_year)
-        sector_idx = reanchor_dicts(sector_idx, min_year)
-
-        # Shrinkage: national -> hedonic first, then hierarchical
+        # Shrinkage: national -> hedonic first, then hierarchical. Each cell is
+        # anchored to log-index 0 at its OWN earliest year (solve_robust_index),
+        # so cells with shorter histories sit on a later origin than their wider
+        # parents. Before each blend we lift the child onto its parent's base at
+        # the child's first year (lift_onto_parent) -- otherwise combining them
+        # key-by-key averages level-incompatible numbers. The hedonic fallback is
+        # anchored at the global min_year, so it serves as the base for national.
        print("  Applying shrinkage...")
-        national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
+        national_shrunk = shrink_dicts(
+            lift_onto_parent(national_idx, hedonic_idx), hedonic_idx, national_n
+        )
        sector_shrunk = hierarchical_shrinkage(
            sector_idx,
            sector_n,
@ -459,6 +473,7 @@ def build_index(
            sector_to_dist,
            dist_to_area,
            shrink_dicts,
+            lift_onto_parent,
        )

        # Spatial smoothing
--- a/pipeline/transform/price_estimation/knn.py
+++ b/pipeline/transform/price_estimation/knn.py
@ -142,6 +142,20 @@ def _sale_identity_matches(
    target_price: float,
    target_sale_date: int,
 ) -> np.ndarray:
+    """Mark pool comparables that are (almost certainly) the target's own sale.
+
+    properties.parquet has no per-property id, so a sale is identified by the
+    proxy tuple (postcode, price within 0.5, sale_date) to keep a target's own
+    prior sale out of its comparable set (leakage prevention).
+
+    Limitation: new-build / bulk blocks sell many DISTINCT properties in one
+    postcode on the same day at the same price, so all such siblings collide on
+    this proxy and are excluded together. This is intentional conservative
+    over-exclusion: it guarantees no leakage at the cost of occasionally
+    dropping legitimate same-(postcode, price, date) siblings. The effect is
+    bounded (~1.8% of the pool) and a precise fix would require a per-property
+    id that the data does not carry.
+    """
    if not target_postcode or not np.isfinite(target_price) or target_sale_date < 0:
        return np.zeros(len(pool_postcodes), dtype=bool)
    return (
@ -166,6 +180,16 @@ def knn_median_psm(

    PSM is at the reference date used when building the pool.
    NaN where not computable (missing coords, unknown type, too few neighbors).
+
+    Coordinate limitation: lat/lon come from postcode.parquet (one centroid per
+    postcode), so every property within a postcode is co-located. For a dense
+    postcode the "k nearest" therefore degenerates into an arbitrary
+    same-postcode subset whose membership is decided by KDTree index order
+    rather than true proximity. No property-level coordinates exist to fix this,
+    so the kNN signal is treated as a weak, noisy prior: the downstream guarded
+    blend (guarded_blend_estimates) only blends kNN when it is close to the
+    index estimate and otherwise discards it, bounding the impact of this
+    degeneracy. The result is deterministic for a fixed pool order.
    """
    n = len(lat)
    result = np.full(n, np.nan)
--- a/pipeline/transform/price_estimation/shrinkage.py
+++ b/pipeline/transform/price_estimation/shrinkage.py
@ -36,26 +36,43 @@ def _base_value(index: dict[int, float], base_year: int) -> float:
    return index[prior[-1]]


-def reanchor_dict(index: dict[int, float], base_year: int) -> dict[int, float]:
-    """Re-anchor an index dict so index[base_year] == 0 (pure constant shift).
+def lift_onto_parent(
+    child: dict[int, float], parent: dict[int, float]
+) -> dict[int, float]:
+    """Lift a child index onto its parent's base before blending the two.

-    Subtracting the same constant from every year preserves all within-dict
-    year-to-year differences, so estimate.py's (current - sale) semantics are
-    unchanged; it only fixes the cross-dict level mismatch before blending.
+    solve_robust_index anchors every cell to log-index 0 at its OWN earliest
+    year, so a cell with a shorter history sits on a later origin than its
+    (wider) parent. Combining them key-by-key would average level-incompatible
+    numbers (a sector measured from 2008 blended with a district measured from
+    1996). We add the parent's accumulated level at the child's first year, so
+    ``child[start] == parent[start]``: the child's own year-to-year moves are
+    layered on top of the parent's growth up to that point -- the same
+    assumption shrinkage already makes for years the child lacks.
+
+    Re-basing at the child's OWN earliest year, not the global base (which a
+    late-starting child never observes), is what makes this effective: the
+    child's back-filled value at the global base is 0, so shifting by it is a no-op.
+
+    The shift is a single constant added to every year of the child, so the
+    child's own year-to-year differences are preserved. PRECONDITION for the
+    downstream estimate to be unaffected within the child's range: the parent's
+    year coverage must be a superset of the child's. This holds throughout
+    build_index, where each parent aggregates a superset of its children's sale
+    pairs, so shrink_dicts blends every child year against a present parent year
+    and the constant shift cancels in a within-range (current - sale) difference;
+    only comparisons that span the child's start year (e.g. a sale predating the
+    cell's own data) change. If a caller violates the precondition (a child year
+    the parent lacks), shrink_dicts passes that year through unshrunk and the
+    cancellation no longer holds.
    """
-    if not index:
-        return index
-    shift = _base_value(index, base_year)
-    if shift == 0.0:
-        return index
-    return {y: v - shift for y, v in index.items()}
-
-
-def reanchor_dicts(
-    indices: dict[str, dict[int, float]], base_year: int
-) -> dict[str, dict[int, float]]:
-    """Re-anchor every index dict in a mapping to the common `base_year`."""
-    return {key: reanchor_dict(idx, base_year) for key, idx in indices.items()}
+    if not child or not parent:
+        return child
+    child_start = min(child)
+    offset = _base_value(parent, child_start) - child[child_start]
+    if offset == 0.0:
+        return child
+    return {y: v + offset for y, v in child.items()}


 def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
@ -84,30 +101,40 @@ def hierarchical_shrinkage(
    sector_to_dist: dict[str, str],
    dist_to_area: dict[str, str],
    shrink_fn: Callable[[V, V, int], V],
+    lift_fn: Callable[[V, V], V] | None = None,
 ) -> dict[str, V]:
    """Top-down hierarchical shrinkage: area->top, district->area, sector->district.

    `top_level` is the ultimate fallback value (e.g. national shrunk toward hedonic,
    or just national). `shrink_fn(raw, parent, n)` blends raw toward parent.
+    `lift_fn(raw, parent)`, if given, re-bases raw onto its parent before blending
+    (see lift_onto_parent); pass None for category-keyed dicts where re-basing is
+    meaningless.
    """
+
+    def combine(raw: V, parent: V, n: int) -> V:
+        if lift_fn is not None:
+            raw = lift_fn(raw, parent)
+        return shrink_fn(raw, parent, n)
+
    # Area -> top level
    area_shrunk = {}
    for area, val in area_vals.items():
-        area_shrunk[area] = shrink_fn(val, top_level, area_n[area])
+        area_shrunk[area] = combine(val, top_level, area_n[area])

    # District -> area
    district_shrunk = {}
    for dist, val in district_vals.items():
        a = dist_to_area.get(dist, "")
        parent = area_shrunk.get(a, top_level)
-        district_shrunk[dist] = shrink_fn(val, parent, district_n[dist])
+        district_shrunk[dist] = combine(val, parent, district_n[dist])

    # Sector -> district
    sector_shrunk = {}
    for sec, val in sector_vals.items():
        d = sector_to_dist.get(sec, "")
        parent = district_shrunk.get(d, top_level)
-        sector_shrunk[sec] = shrink_fn(val, parent, sector_n[sec])
+        sector_shrunk[sec] = combine(val, parent, sector_n[sec])

    # Fill sectors without their own values
    for sec in all_sectors:
--- a/pipeline/transform/price_estimation/test_index.py
+++ b/pipeline/transform/price_estimation/test_index.py
@ -0,0 +1,135 @@
+import numpy as np
+import polars as pl
+
+from pipeline.transform.price_estimation import index as index_mod
+from pipeline.transform.price_estimation.index import (
+    compute_indices_for_level,
+    solve_robust_index,
+)
+
+
+def _pairs_from_path(true_levels: dict[int, float]):
+    """Return (year1, year2, log_ratio, weight) arrays tracing a known path.
+
+    One unit-weight pair is emitted per consecutive pair of sorted years; its
+    log_ratio is the difference of the two true log-levels, so the solver
+    should recover the levels exactly (relative to the min year).
+    """
+    ordered = sorted(true_levels)
+    starts, ends = ordered[:-1], ordered[1:]
+    # Adjacent-year differences of the true log-levels, one per pair.
+    ratios = [true_levels[b] - true_levels[a] for a, b in zip(starts, ends)]
+    # Every synthetic pair carries the same unit weight.
+    weights = [1.0] * len(starts)
+    return (
+        np.array(starts, dtype=np.int32),
+        np.array(ends, dtype=np.int32),
+        np.array(ratios, dtype=np.float64),
+        np.array(weights, dtype=np.float64),
+    )
+
+
+def test_solver_recovers_contiguous_path():
+    """Check that a contiguous price path comes back as levels relative to 2010.
+
+    The path is a zero-curvature ramp over unit-spaced years, the case in which
+    the spacing-aware penalty weights equal the standard [1, -2, 1] stencil, so
+    the solver should reproduce the true log-levels.
+    """
+    first_year, last_year = 2010, 2020
+    true = {y: 0.04 * (y - first_year) for y in range(first_year, last_year + 1)}
+    # Three copies of every adjacent pair so MIN_PAIRS is comfortably met.
+    pairs = _pairs_from_path(true)
+    y1, y2, lr, w = (np.tile(arr, 3) for arr in pairs)
+
+    idx = solve_robust_index(y1, y2, lr, w)
+
+    assert idx[first_year] == 0.0  # baseline anchor
+    # Every level must match the truth, measured from the baseline year.
+    for year, level in true.items():
+        expected = level - true[first_year]
+        assert abs(idx[year] - expected) < 1e-3
+
+
+def test_gap_spanning_level_jump_is_not_smoothed_into_a_ramp():
+    """FIX #5: a sharp true level jump across a multi-year gap is preserved.
+
+    Observed years are 2000-2002 and 2015-2016, linked by cross-gap pairs that
+    encode a sharp +1.0 jump. A uniform [1,-2,1] curvature penalty would treat
+    (beta_2002, beta_2015, beta_2016) as adjacent years and drag beta_2015 down
+    toward a smooth ramp; the spacing-aware second difference relaxes the
+    penalty across the gap so the jump survives.
+    """
+    # True log-levels relative to min_year (2000 anchored at 0).
+    true = {
+        2000: 0.0,
+        2001: 0.05,
+        2002: 0.10,
+        2015: 1.10,  # sharp +1.0 jump across the gap
+        2016: 1.15,
+    }
+
+    copies = 4  # identical pairs emitted per (year1, year2) span
+    spans = [
+        # In-segment adjacent pairs.
+        (2000, 2001),
+        (2001, 2002),
+        (2015, 2016),
+        # Cross-gap pairs consistent with the sharp jump.
+        (2002, 2015),
+        (2002, 2016),
+    ]
+
+    starts = [a for a, _ in spans]
+    ends = [b for _, b in spans]
+    moves = [true[b] - true[a] for a, b in spans]
+
+    # np.repeat keeps the copies of each span contiguous, in span order.
+    y1 = np.repeat(np.array(starts, dtype=np.int32), copies)
+    y2 = np.repeat(np.array(ends, dtype=np.int32), copies)
+    lr = np.repeat(np.array(moves, dtype=np.float64), copies)
+    w = np.ones(len(spans) * copies, dtype=np.float64)
+
+    # Use a strong penalty to make the smoothing bias obvious; the module-level
+    # setting is restored even if the solve raises.
+    saved_lambda = index_mod.TEMPORAL_SMOOTHNESS_LAMBDA
+    index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = 1.0
+    try:
+        idx = solve_robust_index(y1, y2, lr, w)
+    finally:
+        index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = saved_lambda
+
+    assert idx[2000] == 0.0  # baseline anchor
+    # beta_2015 must stay near its true post-gap level, not get dragged down by a
+    # spurious curvature penalty that treats the gap as a single-year step.
+    assert abs(idx[2015] - true[2015]) < 0.05
+
+
+def test_n_pairs_counts_only_cross_year_pairs():
+    """FIX #12: n_pairs counts only cross-year (year2 != year1) pairs.
+
+    Same-year pairs carry zero index information, so they must not inflate the
+    shrinkage weight.
+    """
+    # (year1, year2, copies): 8 cross-year pairs spanning enough years for a
+    # valid solve, plus 3 zero-information same-year pairs that must not count.
+    spec = [
+        (2010, 2011, 4),
+        (2011, 2012, 4),
+        (2012, 2012, 3),  # same-year, zero info
+    ]
+    rows = [
+        {
+            "grp": "g",
+            "year1": year1,
+            "year2": year2,
+            "log_ratio": 0.03 * (year2 - year1),
+            "weight": 1.0,
+        }
+        for year1, year2, copies in spec
+        for _ in range(copies)
+    ]
+
+    indices, n_pairs = compute_indices_for_level(pl.DataFrame(rows), "grp")
+    assert "g" in indices
+    assert n_pairs["g"] == 8  # not 11
--- a/pipeline/transform/price_estimation/test_knn.py
+++ b/pipeline/transform/price_estimation/test_knn.py
@ -71,9 +71,49 @@ def test_knn_excludes_same_sale_and_uses_stable_comparables():
        ),
    )

+    # The five 900k same-postcode siblings share the target's (postcode, price,
+    # date) identity proxy, so they are all excluded as comparables, leaving the
+    # 200k/80sqm = 2_500 PSM neighbours. Removing same-identity siblings is an
+    # INTENTIONAL conservative leakage-prevention tradeoff (no per-property id
+    # exists to distinguish a target's own resale from a distinct bulk-block
+    # sibling sold same-day at the same price), not ideal behaviour -- see the
+    # _sale_identity_matches docstring.
    assert psm[0] == 2_500.0


+def test_knn_median_psm_is_deterministic():
+    """Reproducibility guard (BUG #6): identical kNN calls give identical output.
+
+    Within-postcode neighbours are co-located (one centroid per postcode), so a
+    dense postcode's result depends on an arbitrary same-postcode subset. That
+    is acceptable only if stable, so refactors cannot add run-to-run noise.
+    """
+    sale_date = date(2026, 1, 1)
+    rows = [
+        {
+            "Postcode": "AA1 1AA",
+            "Property type": "Detached",
+            "lat": 51.5000 + i * 0.00001,
+            "lon": -0.1000,
+            "Total floor area (sqm)": 80.0,
+            "Last known price": 200_000.0 + i * 1_000.0,
+            "Date of last transaction": sale_date,
+        }
+        for i in range(40)
+    ]
+    trees = build_knn_pool(pl.DataFrame(rows).lazy(), _flat_index(), 2026.0)
+
+    query = {
+        "lat": np.array([51.5000, 51.5002]),
+        "lon": np.array([-0.1000, -0.1000]),
+        "type_groups": np.array(["Detached", "Detached"]),
+    }
+    first = knn_median_psm(trees, **query)
+    second = knn_median_psm(trees, **query)
+    # equal_nan=True: NaN marks non-computable rows, and NaN != NaN otherwise.
+    assert np.array_equal(first, second, equal_nan=True)
+
+
 def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
    blended = guarded_blend_estimates(
        index_est=np.array([120_000.0, 1_000_000.0]),
--- a/pipeline/transform/price_estimation/test_shrinkage.py
+++ b/pipeline/transform/price_estimation/test_shrinkage.py
@ -1,99 +1,117 @@
-"""Regression tests for common-base-year re-anchoring before blending.
+"""Regression tests for parent-base lifting before hierarchical blending.

-Each repeat-sales index dict is anchored to log-index 0 at its OWN earliest
-year. shrink_dicts / blend_dicts combine dicts key-by-key, so dicts anchored to
-different base years must be re-anchored to a single common base first, or the
+solve_robust_index anchors every repeat-sales cell to log-index 0 at its OWN
+earliest year, so a cell with a shorter history sits on a later origin than its
+(wider) parent. shrink_dicts / blend_dicts combine dicts key-by-key, so a child
+must first be lifted onto its parent's base at the child's first year, or the
 blend averages level-incompatible numbers (fix5-index-base-year).
+
+Note: re-anchoring each cell to the *global* base year is a no-op on real data
+(a cell anchored to 0 at its own earliest year already reads 0 there, and the
+global base is never later), which is why the fix lifts onto the *parent* at the
+child's own start year instead.
 """

 from pipeline.transform.price_estimation.shrinkage import (
-    blend_dicts,
-    reanchor_dict,
-    reanchor_dicts,
+    hierarchical_shrinkage,
+    lift_onto_parent,
    shrink_dicts,
 )
+from pipeline.transform.price_estimation.utils import SHRINKAGE_K


-def test_reanchor_is_pure_constant_shift_preserving_differences():
-    """Re-anchoring only shifts the origin; year-to-year moves are unchanged."""
-    # Anchored at its own earliest year 2008.
-    idx = {2008: 0.0, 2009: 0.10, 2010: 0.25, 2011: 0.40}
+def test_lift_rebases_late_starting_child_onto_parent():
+    """A child anchored at its own later start year is lifted to the parent's level there."""
+    parent = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
+    # Sector with its own repeat-sales data only from 2016, anchored at 2016 = 0.
+    sector = {2016: 0.0, 2024: 0.20}

-    reanchored = reanchor_dict(idx, 1996)
-    # 1996 is before this dict's history -> back-fill earliest value (0.0),
-    # so the shift is 0 and the dict is unchanged.
-    assert reanchored[2008] == 0.0
+    lifted = lift_onto_parent(sector, parent)

-    # Same shape, different exact-hit base year: anchoring at 2010 subtracts 0.25.
-    reanchored_2010 = reanchor_dict(idx, 2010)
-    assert reanchored_2010[2010] == 0.0
-    # All within-dict differences are preserved under the constant shift.
-    years = sorted(idx)
-    for a, b in zip(years, years[1:]):
-        assert abs((reanchored_2010[b] - reanchored_2010[a]) - (idx[b] - idx[a])) < 1e-12
+    # child[start] now equals the parent's accumulated level at that year.
+    assert abs(lifted[2016] - parent[2016]) < 1e-12  # 1.20
+    assert abs(lifted[2024] - (parent[2016] + 0.20)) < 1e-12  # 1.40
+    # Pure constant shift: the child's own year-to-year move is preserved.
+    assert abs((lifted[2024] - lifted[2016]) - (sector[2024] - sector[2016])) < 1e-12


-def test_blend_different_base_years_needs_reanchoring():
-    """Blending two dicts on different bases is biased unless re-anchored first.
+def test_lift_is_noop_when_child_starts_at_parent_base():
+    """Lifting is a no-op when child and parent share the same base year and level."""
+    wide = {1996: 0.0, 2008: 0.80, 2016: 1.20}
+    narrow = {1996: 0.0, 2008: 0.75, 2016: 1.10}
+    assert narrow == lift_onto_parent(narrow, wide)

-    Both cells observe the common base year 1996 but were anchored to DIFFERENT
-    origins (sectorA at 1996, sectorB at 2008, as solve_robust_index would do for
-    cells whose pair history starts at different years). They describe the SAME
-    true trajectory measured from 1996, so a 50/50 blend should reproduce that
-    common level. Pre-fix, blend_dicts mixes sectorB's 2008-relative numbers with
-    sectorA's 1996-relative numbers, level-shifting the smoothed result.
+
+def test_lift_handles_empty_inputs():
+    for child, parent in (({}, {2000: 0.0}), ({2000: 0.0}, {})):
+        assert lift_onto_parent(child, parent) == child
+
+
+def test_lift_fixes_estimate_spanning_child_start_but_not_within_range():
+    """The lift corrects comparisons that span the cell's start year, and ONLY those.
+
+    A property sold in 2008 (before the sector's own data begins in 2016) and
+    valued in 2024: pre-lift the shrunk index mixes a 2016-based sector level
+    with 1996-based parent levels and badly understates the move. Comparisons
+    wholly inside the sector's own range (2016->2024) are unchanged, because the
+    lift is a pure constant shift that cancels in a within-cell difference.
    """
-    base_year = 1996
+    parent = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
+    sector = {2016: 0.0, 2024: 0.20}  # own data starts 2016
+    n = 30
+    w = n / (n + SHRINKAGE_K)

-    # True log-levels relative to 1996 (identical trajectory for both cells).
-    truth = {1996: 0.0, 2008: 0.80, 2012: 1.00}
+    raw = shrink_dicts(sector, parent, n)  # pre-fix: blend without lifting
+    fixed = shrink_dicts(lift_onto_parent(sector, parent), parent, n)

-    # sectorA: anchored at 1996 (its earliest year) -> equals truth.
-    sector_a = dict(truth)
-    # sectorB: same trajectory but anchored at 2008 (subtract truth[2008] from
-    # every year), exactly how solve_robust_index would express a cell whose
-    # earliest year happened to be picked as 2008.
-    shift_b = truth[2008]
-    sector_b = {y: v - shift_b for y, v in truth.items()}
+    # Within the sector's own range the lift changes nothing.
+    assert abs((fixed[2024] - fixed[2016]) - (raw[2024] - raw[2016])) < 1e-12

-    # --- Pre-fix behaviour: blend the raw dicts directly. ---
-    raw_blend = blend_dicts(sector_a, [sector_b], 0.5, [0.5])
-    # Every year is pulled by half of shift_b (0.4) away from the truth.
-    assert abs(raw_blend[2012] - truth[2012]) > 0.3
-    assert abs(raw_blend[1996] - truth[1996]) > 0.3
+    # 2008 is parent-only in both (sector absent), so both read parent[2008].
+    assert abs(raw[2008] - parent[2008]) < 1e-12
+    assert abs(fixed[2008] - parent[2008]) < 1e-12

-    # --- Post-fix behaviour: re-anchor to the common base, THEN blend. ---
-    reanchored = reanchor_dicts({"A": sector_a, "B": sector_b}, base_year)
-    fixed_blend = blend_dicts(reanchored["A"], [reanchored["B"]], 0.5, [0.5])
-    # Both cells now read 0 at 1996 and the true level at every shared year.
-    for y in truth:
-        assert abs(fixed_blend[y] - truth[y]) < 1e-9
+    raw_move = raw[2024] - raw[2008]
+    fixed_move = fixed[2024] - fixed[2008]
+    # Hand-computed: raw[2024] = w*0.20 + (1-w)*1.50; fixed[2024] = w*1.40 + (1-w)*1.50.
+    assert abs(raw_move - ((w * 0.20 + (1 - w) * 1.50) - 0.80)) < 1e-12
+    assert abs(fixed_move - ((w * 1.40 + (1 - w) * 1.50) - 0.80)) < 1e-12
+    # The fix raises the spanning move by exactly the parent growth to the
+    # sector's start year that the raw blend dropped (weighted by w).
+    assert abs((fixed_move - raw_move) - w * parent[2016]) < 1e-12
+    # Fixed move is close to the true area-level move (0.70); raw badly understates it.
+    assert abs(fixed_move - 0.70) < 0.2
+    assert raw_move < 0.4 * fixed_move


-def test_shrink_dicts_after_reanchoring_is_consistent():
-    """Shrinking a cell toward its parent must use a common origin."""
-    base_year = 2000
-    # Parent (national) anchored at 2000.
-    parent = {2000: 0.0, 2010: 0.50, 2020: 1.20}
-    # Sector tracking the parent exactly but anchored at 2010 (subtract 0.50 from
-    # every year), as solve_robust_index would express a cell whose earliest year
-    # is later. It still observes the 2000 base year (value -0.50).
-    sector = {2000: -0.50, 2010: 0.0, 2020: 0.70}
-    n = 0  # no own data weight -> result should equal parent after anchoring
+def test_hierarchical_shrinkage_lift_fn_only_changes_spanning_comparisons():
+    """Integration: passing lift_fn re-bases a late-starting sector via its parent chain."""
+    top = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
+    sector = {"AB1 1": {2016: 0.0, 2024: 0.20}}
+    sector_n = {"AB1 1": 300}
+    # No own area/district indices -> the sector shrinks straight toward `top`.
+    base_args = (
+        sector,
+        sector_n,
+        {},
+        {},
+        {},
+        {},
+        top,
+        ["AB1 1"],
+        {"AB1 1": "AB1"},
+        {"AB1": "AB"},
+        shrink_dicts,
+    )

-    reanchored_sector = reanchor_dict(sector, base_year)
-    # Exact hit on 2000 subtracts -0.50, putting the sector back on the parent's
-    # origin: 0.0 at 2000, 0.50 at 2010, 1.20 at 2020.
-    shrunk = shrink_dicts(reanchored_sector, parent, n)
-    assert abs(shrunk[2000] - 0.0) < 1e-9
-    assert abs(shrunk[2010] - 0.50) < 1e-9
-    assert abs(shrunk[2020] - 1.20) < 1e-9
+    without_lift = hierarchical_shrinkage(*base_args)["AB1 1"]
+    with_lift = hierarchical_shrinkage(*base_args, lift_onto_parent)["AB1 1"]

-
-def test_reanchor_exact_hit_shifts_all_years():
-    """When the base year is present, subtract its value from every year."""
-    idx = {1996: 0.0, 2005: 0.30, 2015: 0.90}
-    reanchored = reanchor_dict(idx, 2005)
-    assert reanchored[2005] == 0.0
-    assert abs(reanchored[1996] - (-0.30)) < 1e-12
-    assert abs(reanchored[2015] - 0.60) < 1e-12
+    # Within the sector's own range: identical (pure constant shift cancels).
+    assert abs(
+        (with_lift[2024] - with_lift[2016]) - (without_lift[2024] - without_lift[2016])
+    ) < 1e-12
+    # Spanning the sector's start year: the lift raises the 2008->2024 move.
+    assert (with_lift[2024] - with_lift[2008]) > (
+        without_lift[2024] - without_lift[2008]
+    ) + 0.1