idk

2026-06-02 13:46:18 +01:00 · 2026-06-02 13:46:18 +01:00 · d43da9708c
commit d43da9708c
parent a04ac2d857
47 changed files with 4120 additions and 573 deletions
--- a/pipeline/transform/price_estimation/backtest.py
+++ b/pipeline/transform/price_estimation/backtest.py
@@ -11,9 +11,9 @@ from pathlib import Path
 import numpy as np
 import polars as pl

+from pipeline.transform.price_estimation.estimate import guarded_blend_estimates
 from pipeline.transform.price_estimation.index import build_index
 from pipeline.transform.price_estimation.knn import (
-    KNN_BLEND_WEIGHT,
     build_knn_pool,
     knn_median_psm,
 )
@@ -115,7 +115,10 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
             .clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
             .exp()
         )
-        .fill_null(pl.col("input_price").cast(pl.Float64))
+        # Keep null when the index can't be interpolated, matching production
+        # (estimate.py ships null there). compute_metrics filters to finite
+        # positive predictions, so these rows correctly drop from the Index n
+        # rather than silently degrading to the Naive prediction.
         .alias("predicted"),
     )
     return test
@@ -265,13 +268,12 @@ def main():
         f"  kNN estimates: {n_knn:,} of {len(test):,} ({n_knn / len(test) * 100:.1f}%)"
     )

-    # Blend: (1-w)*index + w*kNN where both available
+    # Blend with the exact shipped estimator (stability gate + last-price cap +
+    # null-when-no-index) so the "Blended" stage reflects production accuracy.
+    # input_price is the backtest equivalent of production's "Last known price".
     index_est = test["predicted"].to_numpy().astype(np.float64)
-    knn_valid = np.isfinite(knn_est) & (knn_est > 0)
-    blended = np.where(
-        knn_valid & np.isfinite(index_est),
-        (1 - KNN_BLEND_WEIGHT) * index_est + KNN_BLEND_WEIGHT * knn_est,
-        np.where(np.isfinite(index_est), index_est, knn_est),
+    blended = guarded_blend_estimates(
+        index_est, knn_est, test["input_price"].cast(pl.Float64).to_numpy()
     )

     actual = test["actual_price"].to_numpy().astype(np.float64)
--- a/pipeline/transform/price_estimation/index.py
+++ b/pipeline/transform/price_estimation/index.py
@@ -19,6 +19,8 @@ from tqdm import tqdm
 from pipeline.transform.price_estimation.shrinkage import (
     blend_dicts,
     hierarchical_shrinkage,
+    reanchor_dict,
+    reanchor_dicts,
     shrink_dicts,
     spatial_smooth,
 )
@@ -431,6 +433,17 @@ def build_index(
             f"  {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
         )

+        # Re-anchor every repeat-sales dict to the global base year before any
+        # shrinkage/smoothing/blending. solve_robust_index anchors each cell to
+        # log-index 0 at its OWN earliest year, so cells with shorter histories
+        # are measured from a later origin; combining them key-by-key would
+        # otherwise average level-incompatible numbers. The hedonic fallback is
+        # already anchored at min_year, so we align everything to min_year.
+        national_idx = reanchor_dict(national_idx, min_year)
+        area_idx = reanchor_dicts(area_idx, min_year)
+        district_idx = reanchor_dicts(district_idx, min_year)
+        sector_idx = reanchor_dicts(sector_idx, min_year)
+
         # Shrinkage: national -> hedonic first, then hierarchical
         print("  Applying shrinkage...")
         national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
--- a/pipeline/transform/price_estimation/shrinkage.py
+++ b/pipeline/transform/price_estimation/shrinkage.py
@@ -13,6 +13,51 @@ SPATIAL_NEIGHBORS = 5
 SPATIAL_BLEND_K = 30


+def _base_value(index: dict[int, float], base_year: int) -> float:
+    """Value of an index dict at `base_year`, with forward/back-fill for gaps.
+
+    Each repeat-sales dict is anchored to 0 at its OWN earliest year, so its
+    values are log-levels relative to that origin. To express it on a common
+    origin we need its value at the shared `base_year`:
+      - exact hit: use it directly;
+      - base_year before the dict's history: back-fill, i.e. the earliest known
+        value (which is 0.0 by construction). We cannot observe the level move
+        between the global base and a later-starting cell, so we assume none,
+        matching forward_fill's back-fill convention;
+      - base_year inside a gap / after history: forward-fill the most recent
+        prior value.
+    """
+    if base_year in index:
+        return index[base_year]
+    years = sorted(index)
+    if not years or base_year < years[0]:
+        return index[years[0]] if years else 0.0
+    prior = [y for y in years if y <= base_year]
+    return index[prior[-1]]
+
+
+def reanchor_dict(index: dict[int, float], base_year: int) -> dict[int, float]:
+    """Re-anchor an index dict so index[base_year] == 0 (pure constant shift).
+
+    Subtracting the same constant from every year preserves all within-dict
+    year-to-year differences, so estimate.py's (current - sale) semantics are
+    unchanged; it only fixes the cross-dict level mismatch before blending.
+    """
+    if not index:
+        return index
+    shift = _base_value(index, base_year)
+    if shift == 0.0:
+        return index
+    return {y: v - shift for y, v in index.items()}
+
+
+def reanchor_dicts(
+    indices: dict[str, dict[int, float]], base_year: int
+) -> dict[str, dict[int, float]]:
+    """Re-anchor every index dict in a mapping to the common `base_year`."""
+    return {key: reanchor_dict(idx, base_year) for key, idx in indices.items()}
+
+
 def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
     """Shrink dict values toward parent using n/(n+k) weighting.