This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@ -11,9 +11,9 @@ from pathlib import Path
import numpy as np
import polars as pl
from pipeline.transform.price_estimation.estimate import guarded_blend_estimates
from pipeline.transform.price_estimation.index import build_index
from pipeline.transform.price_estimation.knn import (
KNN_BLEND_WEIGHT,
build_knn_pool,
knn_median_psm,
)
@ -115,7 +115,10 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
.exp()
)
.fill_null(pl.col("input_price").cast(pl.Float64))
# Keep null when the index can't be interpolated, matching production
# (estimate.py ships null there). compute_metrics filters to finite
# positive predictions, so these rows correctly drop from the Index n
# rather than silently degrading to the Naive prediction.
.alias("predicted"),
)
return test
@ -265,13 +268,12 @@ def main():
f" kNN estimates: {n_knn:,} of {len(test):,} ({n_knn / len(test) * 100:.1f}%)"
)
# Blend: (1-w)*index + w*kNN where both available
# Blend with the exact shipped estimator (stability gate + last-price cap +
# null-when-no-index) so the "Blended" stage reflects production accuracy.
# input_price is the backtest equivalent of production's "Last known price".
index_est = test["predicted"].to_numpy().astype(np.float64)
knn_valid = np.isfinite(knn_est) & (knn_est > 0)
blended = np.where(
knn_valid & np.isfinite(index_est),
(1 - KNN_BLEND_WEIGHT) * index_est + KNN_BLEND_WEIGHT * knn_est,
np.where(np.isfinite(index_est), index_est, knn_est),
blended = guarded_blend_estimates(
index_est, knn_est, test["input_price"].cast(pl.Float64).to_numpy()
)
actual = test["actual_price"].to_numpy().astype(np.float64)

View file

@ -19,6 +19,8 @@ from tqdm import tqdm
from pipeline.transform.price_estimation.shrinkage import (
blend_dicts,
hierarchical_shrinkage,
reanchor_dict,
reanchor_dicts,
shrink_dicts,
spatial_smooth,
)
@ -431,6 +433,17 @@ def build_index(
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
)
# Re-anchor every repeat-sales dict to the global base year before any
# shrinkage/smoothing/blending. solve_robust_index anchors each cell to
# log-index 0 at its OWN earliest year, so cells with shorter histories
# are measured from a later origin; combining them key-by-key would
# otherwise average level-incompatible numbers. The hedonic fallback is
# already anchored at min_year, so we align everything to min_year.
national_idx = reanchor_dict(national_idx, min_year)
area_idx = reanchor_dicts(area_idx, min_year)
district_idx = reanchor_dicts(district_idx, min_year)
sector_idx = reanchor_dicts(sector_idx, min_year)
# Shrinkage: national -> hedonic first, then hierarchical
print(" Applying shrinkage...")
national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)

View file

@ -13,6 +13,51 @@ SPATIAL_NEIGHBORS = 5
SPATIAL_BLEND_K = 30
def _base_value(index: dict[int, float], base_year: int) -> float:
"""Value of an index dict at `base_year`, with forward/back-fill for gaps.
Each repeat-sales dict is anchored to 0 at its OWN earliest year, so its
values are log-levels relative to that origin. To express it on a common
origin we need its value at the shared `base_year`:
- exact hit: use it directly;
- base_year before the dict's history: back-fill, i.e. the earliest known
value (which is 0.0 by construction). We cannot observe the level move
between the global base and a later-starting cell, so we assume none,
matching forward_fill's back-fill convention;
- base_year inside a gap / after history: forward-fill the most recent
prior value.
"""
if base_year in index:
return index[base_year]
years = sorted(index)
if not years or base_year < years[0]:
return index[years[0]] if years else 0.0
prior = [y for y in years if y <= base_year]
return index[prior[-1]]
def reanchor_dict(index: dict[int, float], base_year: int) -> dict[int, float]:
    """Shift an index dict by a constant so its level at `base_year` is 0.

    The same offset is subtracted from every year, so all year-to-year
    differences inside the dict are preserved and estimate.py's
    (current - sale) semantics are unaffected; only the cross-dict level
    mismatch is removed before blending.

    When `index` is empty or the offset is exactly 0.0 the input object itself
    is returned (no copy); otherwise a new dict is built.
    """
    # An empty dict has nothing to shift; treat it as a zero offset without
    # consulting the base-value lookup.
    offset = _base_value(index, base_year) if index else 0.0
    if offset == 0.0:
        return index
    return {year: level - offset for year, level in index.items()}
def reanchor_dicts(
    indices: dict[str, dict[int, float]], base_year: int
) -> dict[str, dict[int, float]]:
    """Apply `reanchor_dict` to each index dict in `indices`.

    Returns a new outer mapping with the same keys, in the same order, where
    each value has been re-anchored to the common `base_year`.
    """
    rebased: dict[str, dict[int, float]] = {}
    for cell_key, cell_index in indices.items():
        rebased[cell_key] = reanchor_dict(cell_index, base_year)
    return rebased
def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
"""Shrink dict values toward parent using n/(n+k) weighting.