This commit is contained in:
Andras Schmelczer 2026-06-02 20:14:32 +01:00
parent fbfebc651c
commit aab85fe32e
33 changed files with 2016 additions and 283 deletions

View file

@ -19,8 +19,7 @@ from tqdm import tqdm
from pipeline.transform.price_estimation.shrinkage import (
blend_dicts,
hierarchical_shrinkage,
reanchor_dict,
reanchor_dicts,
lift_onto_parent,
shrink_dicts,
spatial_smooth,
)
@ -169,33 +168,47 @@ def solve_robust_index(
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
# Temporal smoothness prior: penalise curvature in the year betas with a
# second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
# encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
# zero target). This damps single-year index spikes without flattening
# genuine multi-year trends. Betas are ordered by calendar year; the baseline
# year (min_year, implicit beta=0) has no column, so the penalty spans the
# non-baseline years only. For cells with <3 betas there is no curvature to
# penalise and the solve is unchanged.
# second-difference penalty lambda * (d2 beta / dt2)^2, encoded as extra
# least-squares rows (sqrt(lambda) * [w0, w1, w2] against a zero target).
# The weights are the CALENDAR-SPACING-AWARE second-derivative coefficients
# for the consecutive triple (y0, y1, y2), so gap years are not treated as
# adjacent: a multi-year gap relaxes the penalty (correctly preserving a
# genuine level jump) instead of forcing a smooth ramp. For unit spacing
# (1, 1) these reduce to [1, -2, 1], leaving contiguous cells unchanged.
# This damps single-year index spikes without flattening genuine trends.
# Betas are ordered by calendar year; the baseline year (min_year, implicit
# beta=0) has no column, so the penalty spans the non-baseline years only.
# For cells with <3 betas there is no curvature to penalise and the solve is
# unchanged.
n_pen = 0
pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
cols_by_year = [c for _, c in sorted(year_to_col.items())]
years_sorted = sorted(year_to_col)
cols_by_year = [year_to_col[y] for y in years_sorted]
n_pen = n_cols - 2
pen_rows = np.repeat(n + np.arange(n_pen), 3)
pen_cols = np.empty(n_pen * 3, dtype=np.int64)
pen_vals = np.empty(n_pen * 3, dtype=np.float64)
for k in range(n_pen):
pen_cols[3 * k : 3 * k + 3] = (
cols_by_year[k],
cols_by_year[k + 1],
cols_by_year[k + 2],
)
y0, y1, y2 = years_sorted[k], years_sorted[k + 1], years_sorted[k + 2]
w0 = 2.0 / ((y1 - y0) * (y2 - y0))
w1 = -2.0 / ((y1 - y0) * (y2 - y1))
w2 = 2.0 / ((y2 - y1) * (y2 - y0))
pen_vals[3 * k : 3 * k + 3] = (
sqrt_lambda * w0,
sqrt_lambda * w1,
sqrt_lambda * w2,
)
pen_rows_arr = pen_rows.astype(np.int64)
pen_cols_arr = pen_cols
pen_vals_arr = np.tile(
[sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
).astype(np.float64)
pen_vals_arr = pen_vals
pen_b = np.zeros(n_pen, dtype=np.float64)
n_total_rows = n + n_pen
@ -252,7 +265,11 @@ def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
idx = solve_robust_index(y1, y2, lr, w)
if idx:
indices[key] = idx
n_pairs[key] = len(y1)
# Count only information-bearing pairs: same-year (year1==year2) and
# baseline-baseline pairs cancel in the sparse solve and contribute
# zero information to the annual index, so including them would
# inflate the shrinkage weight n/(n+k) and under-shrink noisy sectors.
n_pairs[key] = int(np.count_nonzero(y2 != y1))
return indices, n_pairs
@ -433,20 +450,17 @@ def build_index(
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
)
# Re-anchor every repeat-sales dict to the global base year before any
# shrinkage/smoothing/blending. solve_robust_index anchors each cell to
# log-index 0 at its OWN earliest year, so cells with shorter histories
# are measured from a later origin; combining them key-by-key would
# otherwise average level-incompatible numbers. The hedonic fallback is
# already anchored at min_year, so we align everything to min_year.
national_idx = reanchor_dict(national_idx, min_year)
area_idx = reanchor_dicts(area_idx, min_year)
district_idx = reanchor_dicts(district_idx, min_year)
sector_idx = reanchor_dicts(sector_idx, min_year)
# Shrinkage: national -> hedonic first, then hierarchical
# Shrinkage: national -> hedonic first, then hierarchical. Each cell is
# anchored to log-index 0 at its OWN earliest year (solve_robust_index),
# so cells with shorter histories sit on a later origin than their wider
# parents. Before each blend we lift the child onto its parent's base at
# the child's first year (lift_onto_parent) -- otherwise combining them
# key-by-key averages level-incompatible numbers. The hedonic fallback is
# anchored at the global min_year, so it serves as the base for national.
print(" Applying shrinkage...")
national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
national_shrunk = shrink_dicts(
lift_onto_parent(national_idx, hedonic_idx), hedonic_idx, national_n
)
sector_shrunk = hierarchical_shrinkage(
sector_idx,
sector_n,
@ -459,6 +473,7 @@ def build_index(
sector_to_dist,
dist_to_area,
shrink_dicts,
lift_onto_parent,
)
# Spatial smoothing

View file

@ -142,6 +142,20 @@ def _sale_identity_matches(
target_price: float,
target_sale_date: int,
) -> np.ndarray:
"""Mark pool comparables that are (almost certainly) the target's own sale.
properties.parquet has no per-property id, so a sale is identified by the
proxy tuple (postcode, price within 0.5, sale_date) to keep a target's own
prior sale out of its comparable set (leakage prevention).
Limitation: new-build / bulk blocks sell many DISTINCT properties in one
postcode on the same day at the same price, so all such siblings collide on
this proxy and are excluded together. This is intentional conservative
over-exclusion: it guarantees no leakage at the cost of occasionally
dropping legitimate same-(postcode, price, date) siblings. The effect is
bounded (~1.8% of the pool) and a precise fix would require a per-property
id that the data does not carry.
"""
if not target_postcode or not np.isfinite(target_price) or target_sale_date < 0:
return np.zeros(len(pool_postcodes), dtype=bool)
return (
@ -166,6 +180,16 @@ def knn_median_psm(
PSM is at the reference date used when building the pool.
NaN where not computable (missing coords, unknown type, too few neighbors).
Coordinate limitation: lat/lon come from postcode.parquet (one centroid per
postcode), so every property within a postcode is co-located. For a dense
postcode the "k nearest" therefore degenerates into an arbitrary
same-postcode subset whose membership is decided by KDTree index order
rather than true proximity. No property-level coordinates exist to fix this,
so the kNN signal is treated as a weak, noisy prior: the downstream guarded
blend (guarded_blend_estimates) only blends kNN when it is close to the
index estimate and otherwise discards it, bounding the impact of this
degeneracy. The result is deterministic for a fixed pool order.
"""
n = len(lat)
result = np.full(n, np.nan)

View file

@ -36,26 +36,43 @@ def _base_value(index: dict[int, float], base_year: int) -> float:
return index[prior[-1]]
def reanchor_dict(index: dict[int, float], base_year: int) -> dict[int, float]:
    """Re-anchor an index dict so index[base_year] == 0 (pure constant shift).

    The same constant is subtracted from every year, so all within-dict
    year-to-year differences are preserved and estimate.py's
    (current - sale) semantics are unchanged; only the cross-dict level
    mismatch is removed before blending.

    Returns the input object itself when it is empty or already anchored
    (shift of exactly 0.0); otherwise returns a new dict.
    """
    # Empty dicts have no base value to look up; treat them as already anchored.
    anchor_level = _base_value(index, base_year) if index else 0.0
    if anchor_level == 0.0:
        return index
    return {year: level - anchor_level for year, level in index.items()}


def reanchor_dicts(
    indices: dict[str, dict[int, float]], base_year: int
) -> dict[str, dict[int, float]]:
    """Re-anchor every index dict in a mapping to the common `base_year`."""
    rebased: dict[str, dict[int, float]] = {}
    for key, idx in indices.items():
        rebased[key] = reanchor_dict(idx, base_year)
    return rebased


def lift_onto_parent(
    child: dict[int, float], parent: dict[int, float]
) -> dict[int, float]:
    """Lift a child index onto its parent's base before blending the two.

    solve_robust_index anchors every cell to log-index 0 at its OWN earliest
    year, so a cell with a shorter history sits on a later origin than its
    (wider) parent. Blending them key-by-key would average level-incompatible
    numbers (e.g. a sector measured from 2008 against a district measured
    from 1996). This function adds the parent's level at the child's first
    year (as resolved by ``_base_value``) so that
    ``child[start] == parent[start]``: the child's own year-to-year moves are
    layered on top of the parent's growth up to that point -- the same
    assumption shrinkage already makes for years the child lacks.

    Re-basing at the child's OWN earliest year (rather than the global base,
    which the child cannot observe) is what makes this effective: the child's
    value at the global base is always 0, so subtracting it changes nothing.

    The shift is one constant added to every child year, so the child's own
    year-to-year differences are preserved.

    PRECONDITION for the downstream estimate to be unaffected within the
    child's range: the parent's year coverage must be a superset of the
    child's. This holds throughout build_index, where each parent aggregates
    a superset of its children's sale pairs, so shrink_dicts blends every
    child year against a present parent year and the constant shift cancels
    in a within-range (current - sale) difference; only comparisons that span
    the child's start year (e.g. a sale predating the cell's own data)
    change. If a caller violates the precondition (a child year the parent
    lacks), shrink_dicts passes that year through unshrunk and the
    cancellation no longer holds.

    Returns the input ``child`` object itself when either dict is empty or
    the required offset is exactly 0.0; otherwise returns a new dict.
    """
    if not (child and parent):
        return child
    first_year = min(child)
    level_gap = _base_value(parent, first_year) - child[first_year]
    if level_gap == 0.0:
        return child
    return {year: level + level_gap for year, level in child.items()}
def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
@ -84,30 +101,40 @@ def hierarchical_shrinkage(
sector_to_dist: dict[str, str],
dist_to_area: dict[str, str],
shrink_fn: Callable[[V, V, int], V],
lift_fn: Callable[[V, V], V] | None = None,
) -> dict[str, V]:
"""Top-down hierarchical shrinkage: area->top, district->area, sector->district.
`top_level` is the ultimate fallback value (e.g. national shrunk toward hedonic,
or just national). `shrink_fn(raw, parent, n)` blends raw toward parent.
`lift_fn(raw, parent)`, if given, re-bases raw onto its parent before blending
(see lift_onto_parent); pass None for category-keyed dicts where re-basing is
meaningless.
"""
def combine(raw: V, parent: V, n: int) -> V:
if lift_fn is not None:
raw = lift_fn(raw, parent)
return shrink_fn(raw, parent, n)
# Area -> top level
area_shrunk = {}
for area, val in area_vals.items():
area_shrunk[area] = shrink_fn(val, top_level, area_n[area])
area_shrunk[area] = combine(val, top_level, area_n[area])
# District -> area
district_shrunk = {}
for dist, val in district_vals.items():
a = dist_to_area.get(dist, "")
parent = area_shrunk.get(a, top_level)
district_shrunk[dist] = shrink_fn(val, parent, district_n[dist])
district_shrunk[dist] = combine(val, parent, district_n[dist])
# Sector -> district
sector_shrunk = {}
for sec, val in sector_vals.items():
d = sector_to_dist.get(sec, "")
parent = district_shrunk.get(d, top_level)
sector_shrunk[sec] = shrink_fn(val, parent, sector_n[sec])
sector_shrunk[sec] = combine(val, parent, sector_n[sec])
# Fill sectors without their own values
for sec in all_sectors:

View file

@ -0,0 +1,135 @@
import numpy as np
import polars as pl
from pipeline.transform.price_estimation import index as index_mod
from pipeline.transform.price_estimation.index import (
compute_indices_for_level,
solve_robust_index,
)
def _pairs_from_path(true_levels: dict[int, float]):
"""Build adjacent-year repeat-sale pairs that exactly trace a known path.
Each consecutive pair's log_ratio is the difference of the true log-levels,
so the solver should recover the levels exactly (relative to the min year).
"""
years = sorted(true_levels)
y1, y2, lr, w = [], [], [], []
for a, b in zip(years[:-1], years[1:]):
y1.append(a)
y2.append(b)
lr.append(true_levels[b] - true_levels[a])
w.append(1.0)
return (
np.array(y1, dtype=np.int32),
np.array(y2, dtype=np.int32),
np.array(lr, dtype=np.float64),
np.array(w, dtype=np.float64),
)
def test_solver_recovers_contiguous_path():
    """A contiguous price path is recovered as log-levels relative to min_year.

    Guards the solver on contiguous data: with unit year spacing the
    spacing-aware curvature penalty reduces to the standard [1, -2, 1], so a
    zero-curvature ramp must come back unchanged (within tolerance).
    """
    base_year = 2010
    # Smooth (zero curvature) ramp of 0.04 log-points per year, 2010..2020.
    true = {year: 0.04 * (year - base_year) for year in range(base_year, 2021)}

    # Replicate each adjacent pair three times so MIN_PAIRS is comfortably met.
    y1, y2, lr, w = (np.tile(arr, 3) for arr in _pairs_from_path(true))

    idx = solve_robust_index(y1, y2, lr, w)

    assert idx[2010] == 0.0  # baseline anchor
    for year, level in true.items():
        assert abs(idx[year] - (level - true[2010])) < 1e-3
def test_gap_spanning_level_jump_is_not_smoothed_into_a_ramp():
    """FIX #5: a sharp true level jump across a multi-year gap is preserved.

    Coverage is 2000, 2001, 2002 and then 2015, 2016, with cross-gap pairs
    that encode a sharp jump at the gap. A uniform [1, -2, 1] curvature
    penalty would treat (beta_2002, beta_2015, beta_2016) as three adjacent
    years and over-penalise the genuine level jump, biasing beta_2015 down
    toward a smooth ramp. The spacing-aware second difference relaxes the
    penalty across the gap.
    """
    # True log-levels relative to min_year (2000 anchored at 0).
    true = {
        2000: 0.0,
        2001: 0.05,
        2002: 0.10,
        2015: 1.10,  # sharp +1.0 jump across the gap
        2016: 1.15,
    }

    # In-segment adjacent pairs first, then cross-gap pairs consistent with
    # the sharp jump. Each (year1, year2) pair is observed `repeats` times.
    segments = [
        (2000, 2001),
        (2001, 2002),
        (2015, 2016),
        (2002, 2015),
        (2002, 2016),
    ]
    repeats = 4
    y1 = np.repeat(np.array([a for a, _ in segments], dtype=np.int32), repeats)
    y2 = np.repeat(np.array([b for _, b in segments], dtype=np.int32), repeats)
    lr = np.repeat(
        np.array([true[b] - true[a] for a, b in segments], dtype=np.float64),
        repeats,
    )
    w = np.ones(len(segments) * repeats, dtype=np.float64)

    # Use a strong penalty to make the smoothing bias obvious; always restore
    # the module-level setting so other tests are unaffected.
    saved_lambda = index_mod.TEMPORAL_SMOOTHNESS_LAMBDA
    index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = 1.0
    try:
        idx = solve_robust_index(y1, y2, lr, w)
    finally:
        index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = saved_lambda

    assert idx[2000] == 0.0  # baseline anchor
    # beta_2015 must stay near its true post-gap level, not get dragged down
    # by a spurious curvature penalty that treats the gap as a one-year step.
    assert abs(idx[2015] - true[2015]) < 0.05
def test_n_pairs_counts_only_cross_year_pairs():
    """FIX #12: n_pairs counts only cross-year (year2 != year1) pairs.

    Same-year pairs carry zero index information and must not inflate the
    shrinkage weight.
    """

    def make_pairs(group, year1, year2, count):
        # `count` identical repeat-sale rows growing 0.03 log-points per year.
        return [
            {
                "grp": group,
                "year1": year1,
                "year2": year2,
                "log_ratio": 0.03 * (year2 - year1),
                "weight": 1.0,
            }
            for _ in range(count)
        ]

    # 8 genuine cross-year pairs spanning enough years for a valid solve, plus
    # 3 zero-information same-year pairs that must not be counted.
    rows = [
        *make_pairs("g", 2010, 2011, 4),
        *make_pairs("g", 2011, 2012, 4),
        *make_pairs("g", 2012, 2012, 3),  # same-year, zero info
    ]

    indices, n_pairs = compute_indices_for_level(pl.DataFrame(rows), "grp")

    assert "g" in indices
    assert n_pairs["g"] == 8  # not 11

View file

@ -71,9 +71,49 @@ def test_knn_excludes_same_sale_and_uses_stable_comparables():
),
)
# The five 900k same-postcode siblings share the target's (postcode, price,
# date) identity proxy, so they are all excluded as comparables, leaving the
# 200k/80sqm = 2_500 PSM neighbours. Removing same-identity siblings is an
# INTENTIONAL conservative leakage-prevention tradeoff (no per-property id
# exists to distinguish a target's own resale from a distinct bulk-block
# sibling sold same-day at the same price), not ideal behaviour -- see the
# _sale_identity_matches docstring.
assert psm[0] == 2_500.0
def test_knn_median_psm_is_deterministic():
    """Reproducibility guard (BUG #6) for the kNN price-per-sqm signal.

    Within-postcode neighbours are co-located (one centroid per postcode), so
    the kNN result for dense postcodes depends on an arbitrary same-postcode
    subset. That is acceptable, but it MUST be stable: two identical calls
    against the same trees/inputs return identical output, so future
    refactors cannot silently introduce run-to-run nondeterminism.
    """
    sale_date = date(2026, 1, 1)

    # 40 near-coincident detached sales in one postcode with distinct prices.
    pool_rows = []
    for i in range(40):
        pool_rows.append(
            {
                "Postcode": "AA1 1AA",
                "Property type": "Detached",
                "lat": 51.5000 + i * 0.00001,
                "lon": -0.1000,
                "Total floor area (sqm)": 80.0,
                "Last known price": 200_000.0 + i * 1_000.0,
                "Date of last transaction": sale_date,
            }
        )
    pool = pl.DataFrame(pool_rows)
    trees = build_knn_pool(pool.lazy(), _flat_index(), 2026.0)

    query = {
        "lat": np.array([51.5000, 51.5002]),
        "lon": np.array([-0.1000, -0.1000]),
        "type_groups": np.array(["Detached", "Detached"]),
    }
    first = knn_median_psm(trees, **query)
    second = knn_median_psm(trees, **query)

    # Strict equality (NaN != NaN) also guards against a vacuous all-NaN pass.
    assert np.array_equal(first, second)
def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
blended = guarded_blend_estimates(
index_est=np.array([120_000.0, 1_000_000.0]),

View file

@ -1,99 +1,117 @@
"""Regression tests for common-base-year re-anchoring before blending.
"""Regression tests for parent-base lifting before hierarchical blending.
Each repeat-sales index dict is anchored to log-index 0 at its OWN earliest
year. shrink_dicts / blend_dicts combine dicts key-by-key, so dicts anchored to
different base years must be re-anchored to a single common base first, or the
solve_robust_index anchors every repeat-sales cell to log-index 0 at its OWN
earliest year, so a cell with a shorter history sits on a later origin than its
(wider) parent. shrink_dicts / blend_dicts combine dicts key-by-key, so a child
must first be lifted onto its parent's base at the child's first year, or the
blend averages level-incompatible numbers (fix5-index-base-year).
Note: re-anchoring each cell to the *global* base year is a no-op on real data
(a cell anchored to 0 at its own earliest year already reads 0 there, and the
global base is never later), which is why the fix lifts onto the *parent* at the
child's own start year instead.
"""
from pipeline.transform.price_estimation.shrinkage import (
blend_dicts,
reanchor_dict,
reanchor_dicts,
hierarchical_shrinkage,
lift_onto_parent,
shrink_dicts,
)
from pipeline.transform.price_estimation.utils import SHRINKAGE_K
def test_reanchor_is_pure_constant_shift_preserving_differences():
"""Re-anchoring only shifts the origin; year-to-year moves are unchanged."""
# Anchored at its own earliest year 2008.
idx = {2008: 0.0, 2009: 0.10, 2010: 0.25, 2011: 0.40}
def test_lift_rebases_late_starting_child_onto_parent():
"""A child anchored at its own later start year is lifted to the parent's level there."""
parent = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
# Sector with its own repeat-sales data only from 2016, anchored at 2016 = 0.
sector = {2016: 0.0, 2024: 0.20}
reanchored = reanchor_dict(idx, 1996)
# 1996 is before this dict's history -> back-fill earliest value (0.0),
# so the shift is 0 and the dict is unchanged.
assert reanchored[2008] == 0.0
lifted = lift_onto_parent(sector, parent)
# Same shape, different exact-hit base year: anchoring at 2010 subtracts 0.25.
reanchored_2010 = reanchor_dict(idx, 2010)
assert reanchored_2010[2010] == 0.0
# All within-dict differences are preserved under the constant shift.
years = sorted(idx)
for a, b in zip(years, years[1:]):
assert abs((reanchored_2010[b] - reanchored_2010[a]) - (idx[b] - idx[a])) < 1e-12
# child[start] now equals the parent's accumulated level at that year.
assert abs(lifted[2016] - parent[2016]) < 1e-12 # 1.20
assert abs(lifted[2024] - (parent[2016] + 0.20)) < 1e-12 # 1.40
# Pure constant shift: the child's own year-to-year move is preserved.
assert abs((lifted[2024] - lifted[2016]) - (sector[2024] - sector[2016])) < 1e-12
def test_blend_different_base_years_needs_reanchoring():
"""Blending two dicts on different bases is biased unless re-anchored first.
def test_lift_is_noop_when_child_starts_at_parent_base():
"""A child whose earliest year is the parent's base (value 0) is unchanged."""
parent = {1996: 0.0, 2008: 0.80, 2016: 1.20}
child = {1996: 0.0, 2008: 0.75, 2016: 1.10}
assert lift_onto_parent(child, parent) == child
Both cells observe the common base year 1996 but were anchored to DIFFERENT
origins (sectorA at 1996, sectorB at 2008, as solve_robust_index would do for
cells whose pair history starts at different years). They describe the SAME
true trajectory measured from 1996, so a 50/50 blend should reproduce that
common level. Pre-fix, blend_dicts mixes sectorB's 2008-relative numbers with
sectorA's 1996-relative numbers, level-shifting the smoothed result.
def test_lift_handles_empty_inputs():
assert lift_onto_parent({}, {2000: 0.0}) == {}
assert lift_onto_parent({2000: 0.0}, {}) == {2000: 0.0}
def test_lift_fixes_estimate_spanning_child_start_but_not_within_range():
"""The lift corrects comparisons that span the cell's start year, and ONLY those.
A property sold in 2008 (before the sector's own data begins in 2016) and
valued in 2024: pre-lift the shrunk index mixes a 2016-based sector level
with 1996-based parent levels and badly understates the move. Comparisons
wholly inside the sector's own range (2016->2024) are unchanged, because the
lift is a pure constant shift that cancels in a within-cell difference.
"""
base_year = 1996
parent = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
sector = {2016: 0.0, 2024: 0.20} # own data starts 2016
n = 30
w = n / (n + SHRINKAGE_K)
# True log-levels relative to 1996 (identical trajectory for both cells).
truth = {1996: 0.0, 2008: 0.80, 2012: 1.00}
raw = shrink_dicts(sector, parent, n) # pre-fix: blend without lifting
fixed = shrink_dicts(lift_onto_parent(sector, parent), parent, n)
# sectorA: anchored at 1996 (its earliest year) -> equals truth.
sector_a = dict(truth)
# sectorB: same trajectory but anchored at 2008 (subtract truth[2008] from
# every year), exactly how solve_robust_index would express a cell whose
# earliest year happened to be picked as 2008.
shift_b = truth[2008]
sector_b = {y: v - shift_b for y, v in truth.items()}
# Within the sector's own range the lift changes nothing.
assert abs((fixed[2024] - fixed[2016]) - (raw[2024] - raw[2016])) < 1e-12
# --- Pre-fix behaviour: blend the raw dicts directly. ---
raw_blend = blend_dicts(sector_a, [sector_b], 0.5, [0.5])
# Every year is pulled by half of shift_b (0.4) away from the truth.
assert abs(raw_blend[2012] - truth[2012]) > 0.3
assert abs(raw_blend[1996] - truth[1996]) > 0.3
# 2008 is parent-only in both (sector absent), so both read parent[2008].
assert abs(raw[2008] - parent[2008]) < 1e-12
assert abs(fixed[2008] - parent[2008]) < 1e-12
# --- Post-fix behaviour: re-anchor to the common base, THEN blend. ---
reanchored = reanchor_dicts({"A": sector_a, "B": sector_b}, base_year)
fixed_blend = blend_dicts(reanchored["A"], [reanchored["B"]], 0.5, [0.5])
# Both cells now read 0 at 1996 and the true level at every shared year.
for y in truth:
assert abs(fixed_blend[y] - truth[y]) < 1e-9
raw_move = raw[2024] - raw[2008]
fixed_move = fixed[2024] - fixed[2008]
# Hand-computed: raw[2024] = w*0.20 + (1-w)*1.50; fixed[2024] = w*1.40 + (1-w)*1.50.
assert abs(raw_move - ((w * 0.20 + (1 - w) * 1.50) - 0.80)) < 1e-12
assert abs(fixed_move - ((w * 1.40 + (1 - w) * 1.50) - 0.80)) < 1e-12
# The fix raises the spanning move by exactly the parent growth to the
# sector's start year that the raw blend dropped (weighted by w).
assert abs((fixed_move - raw_move) - w * parent[2016]) < 1e-12
# Fixed move is close to the true area-level move (0.70); raw badly understates it.
assert abs(fixed_move - 0.70) < 0.2
assert raw_move < 0.4 * fixed_move
def test_shrink_dicts_after_reanchoring_is_consistent():
"""Shrinking a cell toward its parent must use a common origin."""
base_year = 2000
# Parent (national) anchored at 2000.
parent = {2000: 0.0, 2010: 0.50, 2020: 1.20}
# Sector tracking the parent exactly but anchored at 2010 (subtract 0.50 from
# every year), as solve_robust_index would express a cell whose earliest year
# is later. It still observes the 2000 base year (value -0.50).
sector = {2000: -0.50, 2010: 0.0, 2020: 0.70}
n = 0 # no own data weight -> result should equal parent after anchoring
def test_hierarchical_shrinkage_lift_fn_only_changes_spanning_comparisons():
"""Integration: passing lift_fn re-bases a late-starting sector via its parent chain."""
top = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
sector = {"AB1 1": {2016: 0.0, 2024: 0.20}}
sector_n = {"AB1 1": 300}
# No own area/district indices -> the sector shrinks straight toward `top`.
base_args = (
sector,
sector_n,
{},
{},
{},
{},
top,
["AB1 1"],
{"AB1 1": "AB1"},
{"AB1": "AB"},
shrink_dicts,
)
reanchored_sector = reanchor_dict(sector, base_year)
# Exact hit on 2000 subtracts -0.50, putting the sector back on the parent's
# origin: 0.0 at 2000, 0.50 at 2010, 1.20 at 2020.
shrunk = shrink_dicts(reanchored_sector, parent, n)
assert abs(shrunk[2000] - 0.0) < 1e-9
assert abs(shrunk[2010] - 0.50) < 1e-9
assert abs(shrunk[2020] - 1.20) < 1e-9
without_lift = hierarchical_shrinkage(*base_args)["AB1 1"]
with_lift = hierarchical_shrinkage(*base_args, lift_onto_parent)["AB1 1"]
def test_reanchor_exact_hit_shifts_all_years():
"""When the base year is present, subtract its value from every year."""
idx = {1996: 0.0, 2005: 0.30, 2015: 0.90}
reanchored = reanchor_dict(idx, 2005)
assert reanchored[2005] == 0.0
assert abs(reanchored[1996] - (-0.30)) < 1e-12
assert abs(reanchored[2015] - 0.60) < 1e-12
# Within the sector's own range: identical (pure constant shift cancels).
assert abs(
(with_lift[2024] - with_lift[2016]) - (without_lift[2024] - without_lift[2016])
) < 1e-12
# Spanning the sector's start year: the lift raises the 2008->2024 move.
assert (with_lift[2024] - with_lift[2008]) > (
without_lift[2024] - without_lift[2008]
) + 0.1