"""Regression tests for parent-base lifting before hierarchical blending. solve_robust_index anchors every repeat-sales cell to log-index 0 at its OWN earliest year, so a cell with a shorter history sits on a later origin than its (wider) parent. shrink_dicts / blend_dicts combine dicts key-by-key, so a child must first be lifted onto its parent's base at the child's first year, or the blend averages level-incompatible numbers (fix5-index-base-year). Note: re-anchoring each cell to the *global* base year is a no-op on real data (a cell anchored to 0 at its own earliest year already reads 0 there, and the global base is never later), which is why the fix lifts onto the *parent* at the child's own start year instead. Also covers winsorize_steps, the post-smoothing per-year step clamp against the national index (fix: violent single-year index jumps in thin early years). """ from pipeline.transform.price_estimation.shrinkage import ( MAX_STEP_DEVIATION_PER_YEAR, hierarchical_shrinkage, lift_onto_parent, shrink_dicts, winsorize_steps, ) from pipeline.transform.price_estimation.utils import SHRINKAGE_K def test_lift_rebases_late_starting_child_onto_parent(): """A child anchored at its own later start year is lifted to the parent's level there.""" parent = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50} # Sector with its own repeat-sales data only from 2016, anchored at 2016 = 0. sector = {2016: 0.0, 2024: 0.20} lifted = lift_onto_parent(sector, parent) # child[start] now equals the parent's accumulated level at that year. assert abs(lifted[2016] - parent[2016]) < 1e-12 # 1.20 assert abs(lifted[2024] - (parent[2016] + 0.20)) < 1e-12 # 1.40 # Pure constant shift: the child's own year-to-year move is preserved. assert abs((lifted[2024] - lifted[2016]) - (sector[2024] - sector[2016])) < 1e-12 def test_lift_is_noop_when_child_starts_at_parent_base(): """A child whose earliest year is the parent's base (value 0) is unchanged.""" parent = {1996: 0.0, 2008: 0.80, 2016: 1.20} child = {1996: 0.0, 2008: 0.75, 2016: 1.10} assert lift_onto_parent(child, parent) == child def test_lift_handles_empty_inputs(): assert lift_onto_parent({}, {2000: 0.0}) == {} assert lift_onto_parent({2000: 0.0}, {}) == {2000: 0.0} def test_lift_fixes_estimate_spanning_child_start_but_not_within_range(): """The lift corrects comparisons that span the cell's start year, and ONLY those. A property sold in 2008 (before the sector's own data begins in 2016) and valued in 2024: pre-lift the shrunk index mixes a 2016-based sector level with 1996-based parent levels and badly understates the move. Comparisons wholly inside the sector's own range (2016->2024) are unchanged, because the lift is a pure constant shift that cancels in a within-cell difference. """ parent = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50} sector = {2016: 0.0, 2024: 0.20} # own data starts 2016 n = 30 w = n / (n + SHRINKAGE_K) raw = shrink_dicts(sector, parent, n) # pre-fix: blend without lifting fixed = shrink_dicts(lift_onto_parent(sector, parent), parent, n) # Within the sector's own range the lift changes nothing. assert abs((fixed[2024] - fixed[2016]) - (raw[2024] - raw[2016])) < 1e-12 # 2008 is parent-only in both (sector absent), so both read parent[2008]. assert abs(raw[2008] - parent[2008]) < 1e-12 assert abs(fixed[2008] - parent[2008]) < 1e-12 raw_move = raw[2024] - raw[2008] fixed_move = fixed[2024] - fixed[2008] # Hand-computed: raw[2024] = w*0.20 + (1-w)*1.50; fixed[2024] = w*1.40 + (1-w)*1.50. assert abs(raw_move - ((w * 0.20 + (1 - w) * 1.50) - 0.80)) < 1e-12 assert abs(fixed_move - ((w * 1.40 + (1 - w) * 1.50) - 0.80)) < 1e-12 # The fix raises the spanning move by exactly the parent growth to the # sector's start year that the raw blend dropped (weighted by w). assert abs((fixed_move - raw_move) - w * parent[2016]) < 1e-12 # Fixed move is close to the true area-level move (0.70); raw badly understates it. assert abs(fixed_move - 0.70) < 0.2 assert raw_move < 0.4 * fixed_move def test_hierarchical_shrinkage_lift_fn_only_changes_spanning_comparisons(): """Integration: passing lift_fn re-bases a late-starting sector via its parent chain.""" top = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50} sector = {"AB1 1": {2016: 0.0, 2024: 0.20}} sector_n = {"AB1 1": 300} # No own area/district indices -> the sector shrinks straight toward `top`. base_args = ( sector, sector_n, {}, {}, {}, {}, top, ["AB1 1"], {"AB1 1": "AB1"}, {"AB1": "AB"}, shrink_dicts, ) without_lift = hierarchical_shrinkage(*base_args)["AB1 1"] with_lift = hierarchical_shrinkage(*base_args, lift_onto_parent)["AB1 1"] # Within the sector's own range: identical (pure constant shift cancels). assert abs( (with_lift[2024] - with_lift[2016]) - (without_lift[2024] - without_lift[2016]) ) < 1e-12 # Spanning the sector's start year: the lift raises the 2008->2024 move. assert (with_lift[2024] - with_lift[2008]) > ( without_lift[2024] - without_lift[2008] ) + 0.1 def test_winsorize_clamps_thin_year_spike_and_shifts_later_years(): """A "M3 1"-style single-year spike (x9.7, log +2.27) is clamped to parent_rate + max_dev; the first year's level is preserved, and later years keep their OWN steps (the tail shifts down rigidly by whatever the clamped step removed).""" child = {1995: 0.0, 1998: 0.2, 1999: 2.47, 2000: 2.5} parent = {y: 0.1 * (y - 1995) for y in range(1995, 2001)} # flat-ish 0.1/yr out = winsorize_steps(child, parent, MAX_STEP_DEVIATION_PER_YEAR) assert out[1995] == child[1995] # first year preserved # 1995->1998: 0.0667/yr, well within 0.1 +/- 0.40 -> untouched. assert abs(out[1998] - child[1998]) < 1e-12 # 1998->1999: 2.27/yr clamped to parent_rate + max_dev = 0.1 + 0.40. assert abs((out[1999] - out[1998]) - (0.1 + MAX_STEP_DEVIATION_PER_YEAR)) < 1e-12 # 1999->2000: the in-band +0.03 step survives; the level shifts down with # the clamped 1999. assert abs((out[2000] - out[1999]) - (child[2000] - child[1999])) < 1e-12 assert abs(out[2000] - 0.73) < 1e-12 def test_winsorize_preserves_genuine_moves(): """Steps within parent_rate +/- max_dev pass through (numerically) unchanged.""" child = {2000: 0.0, 2001: 0.35, 2002: 0.40, 2003: 0.20} parent = {y: 0.05 * (y - 2000) for y in range(2000, 2004)} out = winsorize_steps(child, parent, MAX_STEP_DEVIATION_PER_YEAR) assert set(out) == set(child) assert max(abs(out[y] - child[y]) for y in child) < 1e-12 def test_winsorize_judges_gap_steps_on_per_year_rate(): """A step across a multi-year gap is judged on its PER-YEAR rate (with gap-tolerant parent lookup via _base_value), not as one giant single-year move: +1.0 over 5 years (0.2/yr) is in-band even though +1.0 in one year would be clamped.""" child = {1995: 0.0, 2000: 1.0} # Parent lacks both endpoint years: 1995 back-fills to its earliest value # (0.0), 2000 forward-fills from 1999 (0.3) -> parent rate 0.06/yr. parent = {1996: 0.0, 1999: 0.3} out = winsorize_steps(child, parent, MAX_STEP_DEVIATION_PER_YEAR) assert out == child def test_winsorize_degenerate_inputs_unchanged(): """<2 child years -> no steps to clamp; an empty parent only occurs in degenerate paths (build_index always passes the national index) -> child is returned unchanged, never clamped against an arbitrary rate.""" assert winsorize_steps({}, {2000: 0.0, 2001: 0.1}, 0.4) == {} assert winsorize_steps({2000: 0.5}, {2000: 0.0, 2001: 0.1}, 0.4) == {2000: 0.5} spiky = {2000: 0.0, 2001: 5.0} assert winsorize_steps(spiky, {}, 0.4) == spiky