"""Tests for the price-estimation index: solver, forward fill and pair extraction."""

import numpy as np
import polars as pl

from pipeline.transform.price_estimation import index as index_mod
from pipeline.transform.price_estimation.index import (
    MAX_EXTRAPOLATION_SLOPE,
    compute_indices_for_level,
    extract_pairs,
    forward_fill,
    solve_robust_index,
)


def _pairs_from_path(
    true_levels: dict[int, float],
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Build adjacent-year repeat-sale pairs that exactly trace a known path.

    Each consecutive pair's log_ratio is the difference of the true log-levels,
    so the solver should recover the levels exactly (relative to the min year).

    Returns:
        ``(year1, year2, log_ratio, weight)`` arrays, one entry per pair of
        consecutive years in ``true_levels``. Years are int32; log ratios and
        weights are float64, and every weight is 1.0.
    """
    years = sorted(true_levels)
    y1, y2, lr, w = [], [], [], []
    for a, b in zip(years[:-1], years[1:]):
        y1.append(a)
        y2.append(b)
        lr.append(true_levels[b] - true_levels[a])
        w.append(1.0)
    return (
        np.array(y1, dtype=np.int32),
        np.array(y2, dtype=np.int32),
        np.array(lr, dtype=np.float64),
        np.array(w, dtype=np.float64),
    )


def test_solver_recovers_contiguous_path():
    """A contiguous price path is recovered as log-levels relative to min_year.

    Proves the IRLS solver is correct (and unchanged) for contiguous data: the
    spacing-aware penalty reduces to the standard [1,-2,1] for unit spacing.
    """
    years = range(2010, 2021)
    true = {y: 0.04 * (y - 2010) for y in years}  # smooth (zero curvature) ramp

    # Replicate each adjacent pair so MIN_PAIRS is comfortably met.
    y1, y2, lr, w = _pairs_from_path(true)
    y1 = np.tile(y1, 3)
    y2 = np.tile(y2, 3)
    lr = np.tile(lr, 3)
    w = np.tile(w, 3)

    idx = solve_robust_index(y1, y2, lr, w)

    assert idx[2010] == 0.0  # baseline anchor
    for y in years:
        assert abs(idx[y] - (true[y] - true[2010])) < 1e-3


def test_gap_spanning_level_jump_is_not_smoothed_into_a_ramp():
    """FIX #5: a sharp true level jump across a multi-year gap is preserved.

    Coverage is 2000,2001,2002 then 2015,2016 with cross-gap pairs encoding a
    sharp jump at the gap. The uniform [1,-2,1] curvature penalty treats
    (beta_2002, beta_2015, beta_2016) as three adjacent years and
    over-penalizes the genuine level jump, biasing beta_2015 down toward a
    smooth ramp. The spacing-aware second difference relaxes the penalty
    across the gap.
    """
    # True log-levels relative to min_year (2000 anchored at 0).
    true = {
        2000: 0.0,
        2001: 0.05,
        2002: 0.10,
        2015: 1.10,  # sharp +1.0 jump across the gap
        2016: 1.15,
    }

    y1, y2, lr, w = [], [], [], []

    def add(a, b, n=4):
        # Append n identical unit-weight pairs a -> b consistent with `true`.
        for _ in range(n):
            y1.append(a)
            y2.append(b)
            lr.append(true[b] - true[a])
            w.append(1.0)

    # In-segment adjacent pairs.
    add(2000, 2001)
    add(2001, 2002)
    add(2015, 2016)
    # Cross-gap pairs consistent with the sharp jump.
    add(2002, 2015)
    add(2002, 2016)

    y1 = np.array(y1, dtype=np.int32)
    y2 = np.array(y2, dtype=np.int32)
    lr = np.array(lr, dtype=np.float64)
    w = np.array(w, dtype=np.float64)

    # Use a strong penalty to make the smoothing bias obvious. The module
    # constant is restored in `finally` so other tests see the original value.
    original = index_mod.TEMPORAL_SMOOTHNESS_LAMBDA
    index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = 1.0
    try:
        idx = solve_robust_index(y1, y2, lr, w)
    finally:
        index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = original

    assert idx[2000] == 0.0  # baseline anchor
    # beta_2015 must stay near its true post-gap level, not get dragged down by a
    # spurious curvature penalty that treats the gap as a single-year step.
    assert abs(idx[2015] - true[2015]) < 0.05


def _ramp_pairs_with_thin_tail(
    tail_ratio: float, tail_n: int, ramp_reps: int
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Build a smooth ramp of pairs followed by a separately sized tail step.

    The ramp is 0.04/yr over 2010-2020 with `ramp_reps` copies of each adjacent
    pair, plus `tail_n` pair(s) 2020->2021 asserting a `tail_ratio` jump.

    Returns:
        ``(year1, year2, log_ratio, weight)`` arrays (int32, int32, float64,
        float64); every weight is 1.0. Ramp pairs come first, tail pairs last.
    """
    years = range(2010, 2021)
    true = {y: 0.04 * (y - 2010) for y in years}

    y1, y2, lr, w = [], [], [], []
    for a in range(2010, 2020):
        for _ in range(ramp_reps):
            y1.append(a)
            y2.append(a + 1)
            lr.append(true[a + 1] - true[a])
            w.append(1.0)
    for _ in range(tail_n):
        y1.append(2020)
        y2.append(2021)
        lr.append(tail_ratio)
        w.append(1.0)

    return (
        np.array(y1, dtype=np.int32),
        np.array(y2, dtype=np.int32),
        np.array(lr, dtype=np.float64),
        np.array(w, dtype=np.float64),
    )


def test_support_scaled_penalty_suppresses_thin_year_spike(monkeypatch):
    """A thinly supported final-year spike is suppressed by support scaling.

    A final year identified by a SINGLE pair claiming a +1.5 log jump is pulled
    strongly toward the local trend; with the flat baseline penalty (support
    scaling off) the jump survives almost entirely.

    The thin year is the LAST year of the range (only ever at a penalty
    triple's edge), proving the min-over-triple support rule covers range
    edges -- the last solved year feeds the CURRENT_YEAR trend extrapolation.
    """
    y1, y2, lr, w = _ramp_pairs_with_thin_tail(tail_ratio=1.5, tail_n=1, ramp_reps=10)

    # Scope the override with context() instead of calling undo(): undo() also
    # reverts patches registered on this fixture by anything else sharing it.
    with monkeypatch.context() as scoped:
        scoped.setattr(index_mod, "SMOOTHNESS_SUPPORT_PAIRS", 0)
        flat = solve_robust_index(y1, y2, lr, w)

    scaled = solve_robust_index(y1, y2, lr, w)

    flat_step = flat[2021] - flat[2020]
    scaled_step = scaled[2021] - scaled[2020]
    assert flat_step > 1.2  # flat lambda barely resists the spike
    assert scaled_step < 0.65  # support-scaled lambda suppresses it

    # The well-supported ramp stays close to truth: the strong penalty row
    # spanning the thin year drags its immediate neighbour slightly (<0.1)
    # toward collinearity -- the price of suppressing a x4.5 one-year spike.
    for y in range(2010, 2021):
        assert abs(scaled[y] - 0.04 * (y - 2010)) < 0.1


def test_support_scaling_leaves_well_supported_years_unchanged(monkeypatch):
    """Support scaling is a no-op when every year is well supported.

    With ample pairs everywhere (support 50-100 per year), lambda_eff ~ lambda0
    and the solution matches the flat-penalty solve to <1e-3.
    """
    y1, y2, lr, w = _ramp_pairs_with_thin_tail(tail_ratio=0.04, tail_n=50, ramp_reps=50)

    # Scoped override; see pytest's guidance to prefer context() over undo().
    with monkeypatch.context() as scoped:
        scoped.setattr(index_mod, "SMOOTHNESS_SUPPORT_PAIRS", 0)
        flat = solve_robust_index(y1, y2, lr, w)

    scaled = solve_robust_index(y1, y2, lr, w)

    assert set(flat) == set(scaled)
    assert max(abs(flat[y] - scaled[y]) for y in flat) < 1e-3


def test_forward_fill_extrapolation_uses_robust_median_slope():
    """A residual spike in ONE recent year must not corrupt the extrapolated step.

    The median of consecutive per-year slopes ignores it (a least-squares fit
    through the same points would extrapolate a large positive slope).
    """
    index = {2022: 1.00, 2023: 1.05, 2024: 1.60, 2025: 1.10}
    filled = forward_fill(index, 2022, 2026)
    # slopes: [+0.05, +0.55, -0.50] -> median +0.05
    assert abs(filled[2026] - (1.10 + 0.05)) < 1e-9


def test_forward_fill_extrapolated_slope_is_clamped():
    """A consistent (but absurd) recent trend is clamped to MAX_EXTRAPOLATION_SLOPE."""
    index = {2022: 0.0, 2023: 0.4, 2024: 0.8, 2025: 1.2}
    filled = forward_fill(index, 2022, 2026)
    assert abs(filled[2026] - (1.2 + MAX_EXTRAPOLATION_SLOPE)) < 1e-9

    # The clamp is checked in the downward direction as well.
    index_down = {2022: 1.2, 2023: 0.8, 2024: 0.4, 2025: 0.0}
    filled_down = forward_fill(index_down, 2022, 2026)
    assert abs(filled_down[2026] - (0.0 - MAX_EXTRAPOLATION_SLOPE)) < 1e-9


def test_forward_fill_preserves_sane_trend_and_flat_fallback():
    """Genuine moderate trends still extrapolate; sparse history fills flat.

    It stays a forward-FILL-with-trend; with <2 recent points the fill is flat.
    """
    index = {2022: 1.00, 2023: 1.05, 2024: 1.10, 2025: 1.15}
    filled = forward_fill(index, 2022, 2026)
    assert abs(filled[2026] - 1.20) < 1e-9

    assert forward_fill({2025: 0.7}, 2024, 2026)[2026] == 0.7


def test_extract_pairs_drops_extreme_annualised_returns(tmp_path):
    """Pairs with an extreme annualised return are dropped; long holds are kept.

    A +-3.0 log cap alone admits e.g. a 10x 'gain' in six months -- a data
    error or non-market transfer with huge leverage (weight = 1/sqrt(gap)).
    Such pairs are dropped via the annualised cap; large ratios over long
    holding periods (genuine appreciation) are kept.
    """
    df = pl.DataFrame(
        {
            "Postcode": ["AB1 2CD", "AB1 2CE", "AB1 2CF"],
            "Property type": ["Detached", "Detached", "Detached"],
            "historical_prices": [
                # +2.30 log in 6 months -> dropped (cap 0.7 for gap <= 1yr)
                [
                    {"year": 2020, "month": 1, "price": 100_000},
                    {"year": 2020, "month": 7, "price": 1_000_000},
                ],
                # +2.20 log over 24 years -> kept (flat 3.0 cap governs)
                [
                    {"year": 2000, "month": 1, "price": 100_000},
                    {"year": 2024, "month": 1, "price": 900_000},
                ],
                # +0.41 log in 1 year -> kept (within the 0.7/yr band)
                [
                    {"year": 2020, "month": 1, "price": 100_000},
                    {"year": 2021, "month": 1, "price": 150_000},
                ],
            ],
        }
    )
    path = tmp_path / "props.parquet"
    df.write_parquet(path)

    pairs = extract_pairs(path)

    assert len(pairs) == 2
    ratios = sorted(round(r, 2) for r in pairs["log_ratio"].to_list())
    assert ratios == [0.41, 2.2]


def test_n_pairs_counts_only_cross_year_pairs():
    """FIX #12: n_pairs counts only cross-year (year2 != year1) pairs.

    Same-year pairs carry zero index information and must not inflate the
    shrinkage weight.
    """
    rows = []

    def add_pairs(group, year1, year2, n):
        # Append n unit-weight pairs for `group` on a 0.03/yr log trend.
        for _ in range(n):
            rows.append(
                {
                    "grp": group,
                    "year1": year1,
                    "year2": year2,
                    "log_ratio": 0.03 * (year2 - year1),
                    "weight": 1.0,
                }
            )

    # 8 genuine cross-year pairs spanning enough years for a valid solve, plus 3
    # zero-information same-year pairs that must not be counted.
    add_pairs("g", 2010, 2011, 4)
    add_pairs("g", 2011, 2012, 4)
    add_pairs("g", 2012, 2012, 3)  # same-year, zero info

    pairs = pl.DataFrame(rows)
    indices, n_pairs = compute_indices_for_level(pairs, "grp")

    assert "g" in indices
    assert n_pairs["g"] == 8  # not 11