Fix data pipelines once and for all
This commit is contained in:
parent
08560476c5
commit
4012e4e047
46 changed files with 4508 additions and 855 deletions
|
|
@ -3,7 +3,10 @@ import polars as pl
|
|||
|
||||
from pipeline.transform.price_estimation import index as index_mod
|
||||
from pipeline.transform.price_estimation.index import (
|
||||
MAX_EXTRAPOLATION_SLOPE,
|
||||
compute_indices_for_level,
|
||||
extract_pairs,
|
||||
forward_fill,
|
||||
solve_robust_index,
|
||||
)
|
||||
|
||||
|
|
@ -105,6 +108,139 @@ def test_gap_spanning_level_jump_is_not_smoothed_into_a_ramp():
|
|||
assert abs(idx[2015] - true[2015]) < 0.05
|
||||
|
||||
|
||||
def _ramp_pairs_with_thin_tail(tail_ratio: float, tail_n: int, ramp_reps: int):
    """Build synthetic pair arrays: a smooth ramp plus a thinly supported tail.

    The ramp rises by 0.04 (log units) per year from 2010 to 2020. Every
    adjacent year pair ``(a, a + 1)`` on the ramp is emitted ``ramp_reps``
    times; after those come ``tail_n`` pairs ``2020 -> 2021`` that each assert
    a log ratio of ``tail_ratio``. All pairs carry weight 1.0.

    Args:
        tail_ratio: Log ratio claimed by each 2020 -> 2021 pair.
        tail_n: Number of 2020 -> 2021 pairs (non-positive means none).
        ramp_reps: Copies of each adjacent ramp pair (non-positive means none).

    Returns:
        A 4-tuple ``(y1, y2, lr, w)`` of equal-length 1-D arrays: start years
        (int32), end years (int32), log ratios (float64), weights (float64).
    """
    first_year, last_ramp_year = 2010, 2020
    annual_step = 0.04

    # A loop over range(count) silently emits nothing for a non-positive count,
    # whereas np.repeat / np.full raise on negatives; clamp to keep that
    # lenient behaviour.
    ramp_reps = max(ramp_reps, 0)
    tail_n = max(tail_n, 0)

    ramp_years = np.arange(first_year, last_ramp_year + 1)
    # Difference the true levels rather than hard-coding 0.04 so each ramp
    # ratio is exactly level[a + 1] - level[a] in floating point.
    true_levels = annual_step * (ramp_years - first_year)
    ramp_steps = np.diff(true_levels)

    ramp_start = np.repeat(ramp_years[:-1], ramp_reps)
    ramp_end = np.repeat(ramp_years[1:], ramp_reps)
    ramp_ratio = np.repeat(ramp_steps, ramp_reps)

    y1 = np.concatenate([ramp_start, np.full(tail_n, last_ramp_year)])
    y2 = np.concatenate([ramp_end, np.full(tail_n, last_ramp_year + 1)])
    lr = np.concatenate([ramp_ratio, np.full(tail_n, tail_ratio, dtype=np.float64)])

    return (
        y1.astype(np.int32),
        y2.astype(np.int32),
        lr.astype(np.float64),
        np.ones(len(y1), dtype=np.float64),
    )
|
||||
|
||||
|
||||
def test_support_scaled_penalty_suppresses_thin_year_spike(monkeypatch):
    """A final year backed by a SINGLE pair must not keep a +1.5 log spike.

    The fixture is a well-supported 0.04/yr ramp (10 pairs per adjacent year)
    followed by one 2020 -> 2021 pair claiming a +1.5 log jump. With
    ``SMOOTHNESS_SUPPORT_PAIRS`` patched to 0 (the flat baseline penalty,
    support scaling off) the jump survives almost entirely; with the module's
    real setting it is pulled strongly toward the local trend.

    The thin year is the LAST year of the range, so it only ever sits at the
    edge of a penalty triple. Passing therefore shows the min-over-triple
    support rule covers range edges -- which matters because the last solved
    year feeds the CURRENT_YEAR trend extrapolation.
    """
    y1, y2, lr, w = _ramp_pairs_with_thin_tail(tail_ratio=1.5, tail_n=1, ramp_reps=10)

    # Scope the override with monkeypatch.context() rather than calling
    # monkeypatch.undo(): undo() reverts *every* patch made through the shared
    # fixture (including ones set up by other fixtures), while the context
    # manager restores only what was patched inside the block.
    with monkeypatch.context() as scoped:
        scoped.setattr(index_mod, "SMOOTHNESS_SUPPORT_PAIRS", 0)
        flat = solve_robust_index(y1, y2, lr, w)
    scaled = solve_robust_index(y1, y2, lr, w)

    flat_step = flat[2021] - flat[2020]
    scaled_step = scaled[2021] - scaled[2020]
    assert flat_step > 1.2  # flat lambda barely resists the spike
    assert scaled_step < 0.65  # support-scaled lambda suppresses it

    # The well-supported ramp stays close to truth: the strong penalty row
    # spanning the thin year drags its immediate neighbour slightly (<0.1)
    # toward collinearity -- the price of suppressing a x4.5 one-year spike.
    for y in range(2010, 2021):
        assert abs(scaled[y] - 0.04 * (y - 2010)) < 0.1
|
||||
|
||||
|
||||
def test_support_scaling_leaves_well_supported_years_unchanged(monkeypatch):
    """With ample support everywhere, scaling must be a near no-op.

    Every year is backed by 50-100 pairs (50 copies of each ramp pair plus 50
    tail pairs that continue the same 0.04 step), so lambda_eff ~ lambda0 and
    the support-scaled solution must match the flat-penalty solve to <1e-3
    for every solved year.
    """
    y1, y2, lr, w = _ramp_pairs_with_thin_tail(tail_ratio=0.04, tail_n=50, ramp_reps=50)

    # Scope the override with monkeypatch.context() rather than calling
    # monkeypatch.undo(): undo() reverts *every* patch made through the shared
    # fixture, while the context manager restores only this one.
    with monkeypatch.context() as scoped:
        scoped.setattr(index_mod, "SMOOTHNESS_SUPPORT_PAIRS", 0)
        flat = solve_robust_index(y1, y2, lr, w)
    scaled = solve_robust_index(y1, y2, lr, w)

    # Both solves must cover the same years before comparing values.
    assert set(flat) == set(scaled)
    assert max(abs(flat[y] - scaled[y]) for y in flat) < 1e-3
|
||||
|
||||
|
||||
def test_forward_fill_extrapolation_uses_robust_median_slope():
    """One spiky recent year must not distort the extrapolated step.

    The extrapolation slope is the median of the consecutive per-year slopes,
    so a single residual spike is ignored. (A least-squares fit through the
    same points would extrapolate a large positive slope instead.)
    """
    solved = {2022: 1.00, 2023: 1.05, 2024: 1.60, 2025: 1.10}

    # Per-year slopes are [+0.05, +0.55, -0.50]; their median is +0.05.
    expected_2026 = 1.10 + 0.05

    result = forward_fill(solved, 2022, 2026)
    assert abs(result[2026] - expected_2026) < 1e-9
|
||||
|
||||
|
||||
def test_forward_fill_extrapolated_slope_is_clamped():
    """A steady but absurd recent trend is capped at MAX_EXTRAPOLATION_SLOPE.

    Checked in both directions: a +0.4/yr climb and a -0.4/yr decline each
    extrapolate by exactly one clamped step from the last known value.
    """
    rising = {2022: 0.0, 2023: 0.4, 2024: 0.8, 2025: 1.2}
    rising_filled = forward_fill(rising, 2022, 2026)
    capped_up = 1.2 + MAX_EXTRAPOLATION_SLOPE
    assert abs(rising_filled[2026] - capped_up) < 1e-9

    falling = {2022: 1.2, 2023: 0.8, 2024: 0.4, 2025: 0.0}
    falling_filled = forward_fill(falling, 2022, 2026)
    capped_down = 0.0 - MAX_EXTRAPOLATION_SLOPE
    assert abs(falling_filled[2026] - capped_down) < 1e-9
|
||||
|
||||
|
||||
def test_forward_fill_preserves_sane_trend_and_flat_fallback():
    """Moderate trends are still extrapolated; too little history fills flat.

    A genuine +0.05/yr trend carries forward one more step (it remains a
    forward-FILL-with-trend), while fewer than two recent points leaves
    nothing to fit, so the last value is repeated unchanged.
    """
    steady = {2022: 1.00, 2023: 1.05, 2024: 1.10, 2025: 1.15}
    steady_filled = forward_fill(steady, 2022, 2026)
    assert abs(steady_filled[2026] - 1.20) < 1e-9

    # Single known point: the fill must be exactly flat.
    lone_filled = forward_fill({2025: 0.7}, 2024, 2026)
    assert lone_filled[2026] == 0.7
|
||||
|
||||
|
||||
def test_extract_pairs_drops_extreme_annualised_returns(tmp_path):
    """Pairs with an implausible annualised return are dropped.

    A +-3.0 log cap alone admits e.g. a 10x 'gain' in six months -- a data
    error or non-market transfer with huge leverage (weight = 1/sqrt(gap)).
    The annualised cap removes such pairs, while large ratios over long
    holding periods (genuine appreciation) are kept.
    """

    def sale(year, month, price):
        # One entry of a property's price history.
        return {"year": year, "month": month, "price": price}

    histories = [
        # +2.30 log in 6 months -> dropped (cap 0.7 for gap <= 1yr)
        [sale(2020, 1, 100_000), sale(2020, 7, 1_000_000)],
        # +2.20 log over 24 years -> kept (flat 3.0 cap governs)
        [sale(2000, 1, 100_000), sale(2024, 1, 900_000)],
        # +0.41 log in 1 year -> kept (within the 0.7/yr band)
        [sale(2020, 1, 100_000), sale(2021, 1, 150_000)],
    ]
    frame = pl.DataFrame(
        {
            "Postcode": ["AB1 2CD", "AB1 2CE", "AB1 2CF"],
            "Property type": ["Detached", "Detached", "Detached"],
            "historical_prices": histories,
        }
    )
    parquet_path = tmp_path / "props.parquet"
    frame.write_parquet(parquet_path)

    kept = extract_pairs(parquet_path)

    # Only the long-hold and the one-year pair survive the annualised cap.
    assert len(kept) == 2
    observed = sorted(round(value, 2) for value in kept["log_ratio"].to_list())
    assert observed == [0.41, 2.2]
|
||||
|
||||
|
||||
def test_n_pairs_counts_only_cross_year_pairs():
|
||||
"""FIX #12: same-year pairs carry zero index information and must not inflate
|
||||
the shrinkage weight; n_pairs counts only cross-year (year2 != year1) pairs."""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue