# perfect-postcode/pipeline/transform/price_estimation/test_shrinkage.py
# Source listing metadata: 179 lines, 7.8 KiB, Python

"""Regression tests for parent-base lifting before hierarchical blending.

solve_robust_index anchors every repeat-sales cell to log-index 0 at its OWN
earliest year, so a cell with a shorter history sits on a later origin than its
(wider) parent. shrink_dicts / blend_dicts combine dicts key-by-key, so a child
must first be lifted onto its parent's base at the child's first year;
otherwise the blend averages level-incompatible numbers (fix5-index-base-year).

Note: re-anchoring each cell to the *global* base year is a no-op on real data
(a cell anchored to 0 at its own earliest year already reads 0 there, and the
global base is never later), which is why the fix lifts onto the *parent* at the
child's own start year instead.

Also covers winsorize_steps, the post-smoothing per-year step clamp against the
national index (fix: violent single-year index jumps in thin early years).
"""
from pipeline.transform.price_estimation.shrinkage import (
MAX_STEP_DEVIATION_PER_YEAR,
hierarchical_shrinkage,
lift_onto_parent,
shrink_dicts,
winsorize_steps,
)
from pipeline.transform.price_estimation.utils import SHRINKAGE_K
def test_lift_rebases_late_starting_child_onto_parent():
    """Lifting puts a late-starting child on the parent's level at the child's first year."""
    tol = 1e-12
    wide_index = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
    # The sector's own repeat-sales history begins in 2016, so it reads 0 there.
    narrow_index = {2016: 0.0, 2024: 0.20}

    rebased = lift_onto_parent(narrow_index, wide_index)

    # At the child's start year the lifted value equals the parent's level (1.20),
    # and the later year carries the child's own +0.20 on top of it (1.40).
    assert abs(rebased[2016] - wide_index[2016]) < tol
    assert abs(rebased[2024] - (wide_index[2016] + 0.20)) < tol

    # The lift is a constant offset, so the child's own 2016->2024 move survives.
    rebased_move = rebased[2024] - rebased[2016]
    own_move = narrow_index[2024] - narrow_index[2016]
    assert abs(rebased_move - own_move) < tol
def test_lift_is_noop_when_child_starts_at_parent_base():
    """Lifting leaves a child untouched when it starts where the parent reads 0."""
    base_index = {1996: 0.0, 2008: 0.80, 2016: 1.20}
    # Same first year as the parent, where the parent's value is 0.
    same_origin_child = {1996: 0.0, 2008: 0.75, 2016: 1.10}

    result = lift_onto_parent(same_origin_child, base_index)

    assert result == same_origin_child
def test_lift_handles_empty_inputs():
    """An empty child lifts to an empty dict; an empty parent returns an equal child."""
    # No child years at all: nothing to lift.
    from_empty_child = lift_onto_parent({}, {2000: 0.0})
    assert from_empty_child == {}

    # No parent years: the result must still equal the child's own values.
    from_empty_parent = lift_onto_parent({2000: 0.0}, {})
    assert from_empty_parent == {2000: 0.0}
def test_lift_fixes_estimate_spanning_child_start_but_not_within_range():
    """Only comparisons that straddle the child's start year are changed by the lift.

    Scenario: a property sold in 2008, before the sector's own data begins in
    2016, and valued in 2024. Without the lift, the blended 2024 value mixes a
    2016-based sector level with 1996-based parent levels, so the 2008->2024
    move is badly understated. A comparison that stays inside the sector's own
    range (2016->2024) is identical either way, because the lift is a constant
    shift that cancels in a within-cell difference.
    """
    tol = 1e-12
    parent = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
    sector = {2016: 0.0, 2024: 0.20}  # own data starts in 2016
    sample_count = 30
    child_weight = sample_count / (sample_count + SHRINKAGE_K)

    # Blend first without lifting (pre-fix behaviour), then with the lift applied.
    unlifted = shrink_dicts(sector, parent, sample_count)
    lifted = shrink_dicts(lift_onto_parent(sector, parent), parent, sample_count)

    # Inside the sector's own range the two blends move identically.
    lifted_inner = lifted[2024] - lifted[2016]
    unlifted_inner = unlifted[2024] - unlifted[2016]
    assert abs(lifted_inner - unlifted_inner) < tol

    # The sector has no 2008 entry, so both blends read the parent's value there.
    for blended in (unlifted, lifted):
        assert abs(blended[2008] - parent[2008]) < tol

    raw_move = unlifted[2024] - unlifted[2008]
    fixed_move = lifted[2024] - lifted[2008]

    # Hand-computed 2024 blends: the child contributes 0.20 unlifted and 1.40
    # lifted, while the parent contributes 1.50 in both cases.
    expected_raw_2024 = child_weight * 0.20 + (1 - child_weight) * 1.50
    expected_fixed_2024 = child_weight * 1.40 + (1 - child_weight) * 1.50
    assert abs(raw_move - (expected_raw_2024 - 0.80)) < tol
    assert abs(fixed_move - (expected_fixed_2024 - 0.80)) < tol

    # The gain from lifting is exactly the parent's growth up to the sector's
    # start year, scaled by the child's blend weight.
    gain = fixed_move - raw_move
    assert abs(gain - child_weight * parent[2016]) < tol

    # The lifted move lands near the parent's own 2008->2024 move (0.70),
    # whereas the unlifted move falls well short of it.
    assert abs(fixed_move - 0.70) < 0.2
    assert raw_move < 0.4 * fixed_move
def test_hierarchical_shrinkage_lift_fn_only_changes_spanning_comparisons():
    """Integration check: supplying lift_fn re-bases a late-starting sector.

    hierarchical_shrinkage is run twice on identical positional arguments, once
    without and once with lift_onto_parent appended as the final argument, and
    the two results for the single sector are compared.
    """
    top = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
    sector_indices = {"AB1 1": {2016: 0.0, 2024: 0.20}}
    sector_counts = {"AB1 1": 300}
    # The four empty dicts mean there are no own area/district indices, so the
    # sector shrinks straight toward `top`.
    shared_args = (
        sector_indices,
        sector_counts,
        {},
        {},
        {},
        {},
        top,
        ["AB1 1"],
        {"AB1 1": "AB1"},
        {"AB1": "AB"},
        shrink_dicts,
    )

    plain = hierarchical_shrinkage(*shared_args)["AB1 1"]
    lifted = hierarchical_shrinkage(*shared_args, lift_onto_parent)["AB1 1"]

    # Inside the sector's own range the constant shift cancels out.
    lifted_inner = lifted[2024] - lifted[2016]
    plain_inner = plain[2024] - plain[2016]
    assert abs(lifted_inner - plain_inner) < 1e-12

    # Across the sector's start year the lifted 2008->2024 move is clearly larger.
    lifted_span = lifted[2024] - lifted[2008]
    plain_span = plain[2024] - plain[2008]
    assert lifted_span > plain_span + 0.1
def test_winsorize_clamps_thin_year_spike_and_shifts_later_years():
    """A one-year spike is clamped and the years after it shift down rigidly.

    Models a "M3 1"-style jump of log +2.27 (about x9.7) in a single year. The
    spike step must come out as parent_rate + max_dev, the first year's level
    must be untouched, and each later year must keep its OWN step, so the tail
    drops by exactly what the clamp removed.
    """
    tol = 1e-12
    child = {1995: 0.0, 1998: 0.2, 1999: 2.47, 2000: 2.5}
    # Parent climbs a steady 0.1 per year across 1995..2000.
    parent = {}
    for year in range(1995, 2001):
        parent[year] = 0.1 * (year - 1995)

    out = winsorize_steps(child, parent, MAX_STEP_DEVIATION_PER_YEAR)

    # The anchor year is returned exactly as given.
    assert out[1995] == child[1995]

    # 1995->1998 is about 0.0667/yr, inside the band around 0.1/yr, so it is kept.
    assert abs(out[1998] - child[1998]) < tol

    # 1998->1999 is 2.27/yr and gets cut back to parent_rate + max_dev.
    clamped_step = out[1999] - out[1998]
    assert abs(clamped_step - (0.1 + MAX_STEP_DEVIATION_PER_YEAR)) < tol

    # 1999->2000 keeps its in-band +0.03 step; only the level moves down.
    tail_step = out[2000] - out[1999]
    assert abs(tail_step - (child[2000] - child[1999])) < tol

    # Absolute level check: 0.2 + (0.1 + 0.40) + 0.03. The literal presumes
    # MAX_STEP_DEVIATION_PER_YEAR == 0.40.
    assert abs(out[2000] - 0.73) < tol
def test_winsorize_preserves_genuine_moves():
    """Steps inside parent_rate +/- max_dev come back unchanged up to float noise."""
    child = {2000: 0.0, 2001: 0.35, 2002: 0.40, 2003: 0.20}
    # Parent climbs a steady 0.05 per year across 2000..2003.
    parent = {}
    for year in range(2000, 2004):
        parent[year] = 0.05 * (year - 2000)

    out = winsorize_steps(child, parent, MAX_STEP_DEVIATION_PER_YEAR)

    # Same set of years in and out.
    assert set(out) == set(child)

    # The largest per-year deviation from the input must be negligible.
    deviations = [abs(out[year] - child[year]) for year in child]
    assert max(deviations) < 1e-12
def test_winsorize_judges_gap_steps_on_per_year_rate():
    """A step over a multi-year gap is assessed by its PER-YEAR rate.

    +1.0 spread over 5 years is 0.2/yr and therefore in-band, even though +1.0
    in a single year would be clamped. The parent deliberately lacks both of
    the child's endpoint years, so the test relies on a gap-tolerant parent
    lookup (via _base_value, per the original note).
    """
    sparse_child = {1995: 0.0, 2000: 1.0}
    # Expected parent lookups: 1995 back-fills to the earliest value (0.0) and
    # 2000 forward-fills from 1999 (0.3), giving a parent rate of 0.06/yr.
    sparse_parent = {1996: 0.0, 1999: 0.3}

    out = winsorize_steps(sparse_child, sparse_parent, MAX_STEP_DEVIATION_PER_YEAR)

    assert out == sparse_child
def test_winsorize_degenerate_inputs_unchanged():
    """Degenerate inputs come back equal to the child that was passed in.

    With fewer than two child years there is no step to clamp. An empty parent
    only occurs in degenerate paths (build_index always passes the national
    index, per the original note); the child must then be returned unchanged
    rather than clamped against an arbitrary rate.
    """
    max_dev = 0.4

    # Zero child years.
    no_years = winsorize_steps({}, {2000: 0.0, 2001: 0.1}, max_dev)
    assert no_years == {}

    # A single child year: still no step to examine.
    one_year = winsorize_steps({2000: 0.5}, {2000: 0.0, 2001: 0.1}, max_dev)
    assert one_year == {2000: 0.5}

    # A violent step with nothing to compare against must be left alone.
    spiky = {2000: 0.0, 2001: 5.0}
    without_parent = winsorize_steps(spiky, {}, max_dev)
    assert without_parent == spiky