# perfect-postcode/pipeline/transform/price_estimation/test_index.py

from datetime import date

import numpy as np
import polars as pl

from pipeline.transform.price_estimation import index as index_mod
from pipeline.transform.price_estimation.index import (
    MAX_EXTRAPOLATION_SLOPE,
    build_index,
    compute_indices_for_level,
    extract_pairs,
    forward_fill,
    solve_robust_index,
)
from pipeline.transform.price_estimation.utils import CURRENT_YEAR, TYPE_GROUPS


def _pairs_from_path(true_levels: dict[int, float]):
    """Turn a known log-level path into adjacent-year repeat-sale pairs.

    The years of ``true_levels`` are sorted and every consecutive (earlier,
    later) couple becomes one pair whose log_ratio is the difference of the
    two true log-levels, with unit weight. A solver fed these pairs should
    therefore recover the levels exactly (relative to the min year).

    Returns a tuple ``(year1, year2, log_ratio, weight)`` of numpy arrays:
    int32 for the two year columns, float64 for the ratio and weight.
    """
    ordered = sorted(true_levels)
    earlier = ordered[:-1]
    later = ordered[1:]
    # One log-ratio per consecutive couple of years.
    deltas = [true_levels[hi] - true_levels[lo] for lo, hi in zip(earlier, later)]
    return (
        np.array(earlier, dtype=np.int32),
        np.array(later, dtype=np.int32),
        np.array(deltas, dtype=np.float64),
        np.ones(len(deltas), dtype=np.float64),
    )


def test_solver_recovers_contiguous_path():
    """The solver returns a contiguous path as log-levels relative to min_year.

    Guards the IRLS solver on contiguous data: with unit year spacing the
    spacing-aware penalty reduces to the standard [1,-2,1] second difference,
    so a zero-curvature ramp must come back (almost) untouched.
    """
    span = range(2010, 2021)
    # Smooth (zero curvature) ramp of 0.04 log-points per year.
    expected = {yr: 0.04 * (yr - 2010) for yr in span}

    # Triplicate every adjacent pair so MIN_PAIRS is comfortably met.
    first, second, ratios, weights = (
        np.tile(column, 3) for column in _pairs_from_path(expected)
    )

    solved = solve_robust_index(first, second, ratios, weights)

    # The earliest year is the baseline anchor.
    assert solved[2010] == 0.0
    for yr in span:
        relative_truth = expected[yr] - expected[2010]
        assert abs(solved[yr] - relative_truth) < 1e-3


def test_gap_spanning_level_jump_is_not_smoothed_into_a_ramp():
    """FIX #5: a sharp true level jump across a multi-year gap survives.

    Coverage is 2000, 2001, 2002 and then 2015, 2016, with cross-gap pairs that
    encode a sharp jump at the gap. A uniform [1,-2,1] curvature penalty would
    treat (beta_2002, beta_2015, beta_2016) as three adjacent years and
    over-penalize the genuine jump, dragging beta_2015 down toward a smooth
    ramp. The spacing-aware second difference relaxes the penalty across the
    gap, so beta_2015 must stay near its true level.
    """
    # True log-levels relative to min_year (2000 anchored at 0).
    levels = {
        2000: 0.0,
        2001: 0.05,
        2002: 0.10,
        2015: 1.10,  # sharp +1.0 jump across the gap
        2016: 1.15,
    }

    # (year1, year2) segments, each replicated `copies` times in this order.
    segments = [
        # In-segment adjacent pairs.
        (2000, 2001),
        (2001, 2002),
        (2015, 2016),
        # Cross-gap pairs consistent with the sharp jump.
        (2002, 2015),
        (2002, 2016),
    ]
    copies = 4

    starts = np.repeat(np.array([a for a, _ in segments], dtype=np.int32), copies)
    ends = np.repeat(np.array([b for _, b in segments], dtype=np.int32), copies)
    ratios = np.repeat(
        np.array([levels[b] - levels[a] for a, b in segments], dtype=np.float64),
        copies,
    )
    weights = np.ones(len(segments) * copies, dtype=np.float64)

    # Use a strong penalty to make the smoothing bias obvious; always restore
    # the module constant afterwards so other tests see the real value.
    saved_lambda = index_mod.TEMPORAL_SMOOTHNESS_LAMBDA
    index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = 1.0
    try:
        solved = solve_robust_index(starts, ends, ratios, weights)
    finally:
        index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = saved_lambda

    # Baseline anchor.
    assert solved[2000] == 0.0
    # beta_2015 must stay near its true post-gap level, not get dragged down by
    # a spurious curvature penalty that treats the gap as a single-year step.
    post_gap_error = abs(solved[2015] - levels[2015])
    assert post_gap_error < 0.05


def _ramp_pairs_with_thin_tail(tail_ratio: float, tail_n: int, ramp_reps: int):
    """Build pairs for a smooth ramp followed by a separately controlled tail.

    The ramp rises 0.04 log-points per year over 2010-2020 and contributes
    ``ramp_reps`` copies of every adjacent-year pair. After it come ``tail_n``
    pair(s) 2020->2021, each asserting a ``tail_ratio`` log jump. All weights
    are 1.0.

    Returns ``(year1, year2, log_ratio, weight)`` as numpy arrays (int32 year
    columns, float64 ratio and weight columns), ramp rows first, tail rows last.
    """
    level = {yr: 0.04 * (yr - 2010) for yr in range(2010, 2021)}

    # Ramp rows: each start year repeated ramp_reps times, in ascending order.
    ramp_from = [yr for yr in range(2010, 2020) for _ in range(ramp_reps)]
    ramp_steps = [level[yr + 1] - level[yr] for yr in ramp_from]

    # Tail rows are appended after the ramp; every pair spans exactly one year.
    from_years = ramp_from + [2020] * tail_n
    to_years = [yr + 1 for yr in from_years]
    ratios = ramp_steps + [tail_ratio] * tail_n

    return (
        np.array(from_years, dtype=np.int32),
        np.array(to_years, dtype=np.int32),
        np.array(ratios, dtype=np.float64),
        np.ones(len(from_years), dtype=np.float64),
    )


def test_support_scaled_penalty_suppresses_thin_year_spike(monkeypatch):
    """A thinly supported final-year spike is suppressed by support scaling.

    The final year is identified by a SINGLE pair claiming a +1.5 log jump.
    With support scaling on, that step is pulled strongly toward the local
    trend; with the flat baseline penalty (SMOOTHNESS_SUPPORT_PAIRS patched to
    0) the jump survives almost entirely. The thin year is the LAST year of the
    range (only ever at a penalty triple's edge), proving the min-over-triple
    support rule covers range edges -- the last solved year feeds the
    CURRENT_YEAR trend extrapolation.
    """
    y1, y2, lr, w = _ramp_pairs_with_thin_tail(tail_ratio=1.5, tail_n=1, ramp_reps=10)

    # Scope the override with context() instead of setattr + undo(): undo()
    # reverts every patch made through the shared fixture, whereas the context
    # manager restores only what was patched inside this block.
    with monkeypatch.context() as scoped:
        scoped.setattr(index_mod, "SMOOTHNESS_SUPPORT_PAIRS", 0)
        flat = solve_robust_index(y1, y2, lr, w)
    # Back to the module's real setting: support scaling active.
    scaled = solve_robust_index(y1, y2, lr, w)

    flat_step = flat[2021] - flat[2020]
    scaled_step = scaled[2021] - scaled[2020]
    assert flat_step > 1.2  # flat lambda barely resists the spike
    assert scaled_step < 0.65  # support-scaled lambda suppresses it
    # The well-supported ramp stays close to truth: the strong penalty row
    # spanning the thin year drags its immediate neighbour slightly (<0.1)
    # toward collinearity -- the price of suppressing a x4.5 one-year spike.
    for y in range(2010, 2021):
        assert abs(scaled[y] - 0.04 * (y - 2010)) < 0.1


def test_support_scaling_leaves_well_supported_years_unchanged(monkeypatch):
    """Support scaling is a no-op where every year is well supported.

    With ample pairs everywhere (support 50-100 per year), lambda_eff ~
    lambda0, so the support-scaled solve must match the flat-penalty solve
    (SMOOTHNESS_SUPPORT_PAIRS patched to 0) to within 1e-3 on every year.
    """
    y1, y2, lr, w = _ramp_pairs_with_thin_tail(tail_ratio=0.04, tail_n=50, ramp_reps=50)

    # Scope the override with context() instead of setattr + undo(): undo()
    # reverts every patch made through the shared fixture, whereas the context
    # manager restores only what was patched inside this block.
    with monkeypatch.context() as scoped:
        scoped.setattr(index_mod, "SMOOTHNESS_SUPPORT_PAIRS", 0)
        flat = solve_robust_index(y1, y2, lr, w)
    # Back to the module's real setting: support scaling active.
    scaled = solve_robust_index(y1, y2, lr, w)

    # Same set of solved years, and near-identical values on each.
    assert set(flat) == set(scaled)
    assert max(abs(flat[y] - scaled[y]) for y in flat) < 1e-3


def test_forward_fill_extrapolation_uses_robust_median_slope():
    """One spiky recent year must not corrupt the extrapolated step.

    The extrapolation uses the median of consecutive per-year slopes, which
    ignores a residual spike in a single year (a least-squares fit through the
    same points would extrapolate a large positive slope).
    """
    recent = {2022: 1.00, 2023: 1.05, 2024: 1.60, 2025: 1.10}

    result = forward_fill(recent, 2022, 2026)

    # Per-year slopes are [+0.05, +0.55, -0.50]; their median is +0.05.
    expected_2026 = 1.10 + 0.05
    assert abs(result[2026] - expected_2026) < 1e-9


def test_forward_fill_extrapolated_slope_is_clamped():
    """A consistent (but absurd) recent trend is clamped to MAX_EXTRAPOLATION_SLOPE.

    Checked in both directions: a +0.4/yr climb and a -0.4/yr decline.
    """
    cases = (
        # (solved index, expected extrapolated 2026 value)
        ({2022: 0.0, 2023: 0.4, 2024: 0.8, 2025: 1.2}, 1.2 + MAX_EXTRAPOLATION_SLOPE),
        ({2022: 1.2, 2023: 0.8, 2024: 0.4, 2025: 0.0}, 0.0 - MAX_EXTRAPOLATION_SLOPE),
    )
    for series, expected_2026 in cases:
        filled = forward_fill(series, 2022, 2026)
        assert abs(filled[2026] - expected_2026) < 1e-9


def test_forward_fill_preserves_sane_trend_and_flat_fallback():
    """Moderate trends still extrapolate; too few points fall back to flat.

    A genuine +0.05/yr trend is carried forward (it stays a forward-FILL-with-
    trend), while with fewer than two recent points the fill is flat.
    """
    steady = {2022: 1.00, 2023: 1.05, 2024: 1.10, 2025: 1.15}
    trended = forward_fill(steady, 2022, 2026)
    assert abs(trended[2026] - 1.20) < 1e-9

    # A single known year gives no slope to extrapolate: the value is carried.
    lone_point = forward_fill({2025: 0.7}, 2024, 2026)
    assert lone_point[2026] == 0.7


def test_extract_pairs_drops_extreme_annualised_returns(tmp_path):
    """Pairs with an implausible annualised return are dropped.

    A +-3.0 log cap alone admits e.g. a 10x 'gain' in six months -- a data
    error or non-market transfer with huge leverage (weight = 1/sqrt(gap)).
    Such pairs are dropped via the annualised cap, while large ratios over
    long holding periods (genuine appreciation) are kept.
    """

    def sale(year, month, price):
        # One historical transaction record, in the stored field order.
        return {"year": year, "month": month, "price": price}

    histories = [
        # +2.30 log in 6 months -> dropped (cap 0.7 for gap <= 1yr)
        [sale(2020, 1, 100_000), sale(2020, 7, 1_000_000)],
        # +2.20 log over 24 years -> kept (flat 3.0 cap governs)
        [sale(2000, 1, 100_000), sale(2024, 1, 900_000)],
        # +0.41 log in 1 year -> kept (within the 0.7/yr band)
        [sale(2020, 1, 100_000), sale(2021, 1, 150_000)],
    ]
    frame = pl.DataFrame(
        {
            "Postcode": ["AB1 2CD", "AB1 2CE", "AB1 2CF"],
            "Property type": ["Detached"] * 3,
            "historical_prices": histories,
        }
    )
    parquet_path = tmp_path / "props.parquet"
    frame.write_parquet(parquet_path)

    kept = extract_pairs(parquet_path)

    # Only the two plausible pairs survive.
    assert len(kept) == 2
    rounded = [round(value, 2) for value in kept["log_ratio"].to_list()]
    rounded.sort()
    assert rounded == [0.41, 2.2]


def test_n_pairs_counts_only_cross_year_pairs():
    """FIX #12: n_pairs counts only cross-year (year2 != year1) pairs.

    Same-year pairs carry zero index information and must not inflate the
    shrinkage weight.
    """
    # (year1, year2, copies): 8 genuine cross-year pairs spanning enough years
    # for a valid solve, plus 3 zero-information same-year pairs that must not
    # be counted.
    spec = [
        (2010, 2011, 4),
        (2011, 2012, 4),
        (2012, 2012, 3),  # same-year, zero info
    ]
    rows = [
        {
            "grp": "g",
            "year1": start,
            "year2": end,
            "log_ratio": 0.03 * (end - start),
            "weight": 1.0,
        }
        for start, end, copies in spec
        for _ in range(copies)
    ]

    indices, n_pairs = compute_indices_for_level(pl.DataFrame(rows), "grp")

    assert "g" in indices
    assert n_pairs["g"] == 8  # not 11


def _write_universe_fixtures(tmp_path):
    """Write the properties and postcode-universe parquet fixtures.

    The properties file has repeat sales only in sector 'AB1 2'; the postcode
    universe additionally contains the pairless sector 'AB1 3'.

    Returns ``(props_path, pc_path)``, both located under ``tmp_path``.
    """
    n_props = 6
    # One repeat-sale pair 2018 -> 2021 per property, log_ratio ~0.26 (well
    # within the flat and annualised outlier caps); six of them is comfortably
    # >= MIN_PAIRS.
    history = [
        {"year": 2018, "month": 1, "price": 100_000},
        {"year": 2021, "month": 6, "price": 130_000},
    ]
    props_path = tmp_path / "props.parquet"
    pl.DataFrame(
        {
            "Postcode": [f"AB1 2A{c}" for c in "ABCDEF"],
            "Property type": ["Detached"] * n_props,
            "Total floor area (sqm)": [80.0] * n_props,
            "Last known price": [130_000] * n_props,
            "Date of last transaction": [date(2021, 6, 1)] * n_props,
            "historical_prices": [history for _ in range(n_props)],
        }
    ).write_parquet(props_path)

    # Two postcodes in the sector with pairs, one in the pairless sector.
    pc_path = tmp_path / "postcodes.parquet"
    pl.DataFrame(
        {
            "Postcode": ["AB1 2AA", "AB1 2AB", "AB1 3AA"],
            "lat": [57.10, 57.10, 57.20],
            "lon": [-2.10, -2.10, -2.20],
        }
    ).write_parquet(pc_path)

    return props_path, pc_path


def test_build_index_covers_pairless_sectors_from_postcode_universe(tmp_path):
    """FIX: pairless sectors from the postcode universe still get index rows.

    The sector universe is pairs-sectors UNION postcode-universe sectors, not
    just sectors that happened to have a repeat sale (which silently dropped
    ~15% of live sectors from the output). A pairless sector present in
    postcode.parquet must get index rows via the hierarchy fallback:
    n_pairs == 0 marks the synthesised cells, with full year coverage after
    forward fill.
    """
    props_path, pc_path = _write_universe_fixtures(tmp_path)

    result = build_index(props_path, postcodes_path=pc_path)

    is_all_group = pl.col("type_group") == "All"

    # --- the sector that has no repeat sales of its own ---
    pairless = result.filter(pl.col("sector") == "AB1 3")
    row_count = len(pairless)
    assert row_count > 0
    assert set(pairless["type_group"]) == {"All", *TYPE_GROUPS}
    # Every synthesised cell is flagged with zero pairs and carries a value.
    assert pairless["n_pairs"].to_list() == [0] * row_count
    assert pairless["log_index"].is_not_null().all()

    # Full year coverage (min pair year .. CURRENT_YEAR) for the solved type
    # groups. (Type groups with <MIN_PAIRS pairs take the hedonic-fallback
    # skip branch, which only emits hedonic years -- unchanged behaviour.)
    expected_years = set(range(2018, CURRENT_YEAR + 1))
    for group in ("All", "Detached"):
        covered = set(pairless.filter(pl.col("type_group") == group)["year"])
        assert covered == expected_years

    # --- the sector that owns the six pairs ---
    # It reports its real pair count on every year of the 'All' group, in
    # contrast to the n_pairs == 0 cells synthesised for the pairless sector.
    # NOTE(review): the pairless sector is expected to inherit the district
    # index (same values as this sector), but only n_pairs is asserted here.
    with_pairs = result.filter(pl.col("sector") == "AB1 2")
    n_years = CURRENT_YEAR - 2018 + 1
    observed_counts = with_pairs.filter(is_all_group)["n_pairs"].to_list()
    assert observed_counts == [6] * n_years


def test_build_index_sectors_scoping_restricts_universe(tmp_path):
    """Scoping with sectors=[...] narrows the output universe as well.

    Debug scoping must restrict the emitted sectors, not just the pairs, so a
    scoped run does not emit every centroid sector from the postcode universe.
    """
    props_path, pc_path = _write_universe_fixtures(tmp_path)

    scoped = build_index(props_path, postcodes_path=pc_path, sectors=["AB1 2"])

    # The pairless 'AB1 3' sector from the postcode universe must be absent.
    emitted_sectors = set(scoped["sector"])
    assert emitted_sectors == {"AB1 2"}