idk
This commit is contained in:
parent
a04ac2d857
commit
d43da9708c
47 changed files with 4120 additions and 573 deletions
|
|
@ -11,9 +11,9 @@ from pathlib import Path
|
|||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.price_estimation.estimate import guarded_blend_estimates
|
||||
from pipeline.transform.price_estimation.index import build_index
|
||||
from pipeline.transform.price_estimation.knn import (
|
||||
KNN_BLEND_WEIGHT,
|
||||
build_knn_pool,
|
||||
knn_median_psm,
|
||||
)
|
||||
|
|
@ -115,7 +115,10 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
|||
.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
|
||||
.exp()
|
||||
)
|
||||
.fill_null(pl.col("input_price").cast(pl.Float64))
|
||||
# Keep null when the index can't be interpolated, matching production
|
||||
# (estimate.py ships null there). compute_metrics filters to finite
|
||||
# positive predictions, so these rows correctly drop from the Index n
|
||||
# rather than silently degrading to the Naive prediction.
|
||||
.alias("predicted"),
|
||||
)
|
||||
return test
|
||||
|
|
@ -265,13 +268,12 @@ def main():
|
|||
f" kNN estimates: {n_knn:,} of {len(test):,} ({n_knn / len(test) * 100:.1f}%)"
|
||||
)
|
||||
|
||||
# Blend: (1-w)*index + w*kNN where both available
|
||||
# Blend with the exact shipped estimator (stability gate + last-price cap +
|
||||
# null-when-no-index) so the "Blended" stage reflects production accuracy.
|
||||
# input_price is the backtest equivalent of production's "Last known price".
|
||||
index_est = test["predicted"].to_numpy().astype(np.float64)
|
||||
knn_valid = np.isfinite(knn_est) & (knn_est > 0)
|
||||
blended = np.where(
|
||||
knn_valid & np.isfinite(index_est),
|
||||
(1 - KNN_BLEND_WEIGHT) * index_est + KNN_BLEND_WEIGHT * knn_est,
|
||||
np.where(np.isfinite(index_est), index_est, knn_est),
|
||||
blended = guarded_blend_estimates(
|
||||
index_est, knn_est, test["input_price"].cast(pl.Float64).to_numpy()
|
||||
)
|
||||
|
||||
actual = test["actual_price"].to_numpy().astype(np.float64)
|
||||
|
|
|
|||
|
|
@ -19,6 +19,8 @@ from tqdm import tqdm
|
|||
from pipeline.transform.price_estimation.shrinkage import (
|
||||
blend_dicts,
|
||||
hierarchical_shrinkage,
|
||||
reanchor_dict,
|
||||
reanchor_dicts,
|
||||
shrink_dicts,
|
||||
spatial_smooth,
|
||||
)
|
||||
|
|
@ -431,6 +433,17 @@ def build_index(
|
|||
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
|
||||
)
|
||||
|
||||
# Re-anchor every repeat-sales dict to the global base year before any
|
||||
# shrinkage/smoothing/blending. solve_robust_index anchors each cell to
|
||||
# log-index 0 at its OWN earliest year, so cells with shorter histories
|
||||
# are measured from a later origin; combining them key-by-key would
|
||||
# otherwise average level-incompatible numbers. The hedonic fallback is
|
||||
# already anchored at min_year, so we align everything to min_year.
|
||||
national_idx = reanchor_dict(national_idx, min_year)
|
||||
area_idx = reanchor_dicts(area_idx, min_year)
|
||||
district_idx = reanchor_dicts(district_idx, min_year)
|
||||
sector_idx = reanchor_dicts(sector_idx, min_year)
|
||||
|
||||
# Shrinkage: national -> hedonic first, then hierarchical
|
||||
print(" Applying shrinkage...")
|
||||
national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
|
||||
|
|
|
|||
|
|
@ -13,6 +13,51 @@ SPATIAL_NEIGHBORS = 5
|
|||
SPATIAL_BLEND_K = 30
|
||||
|
||||
|
||||
def _base_value(index: dict[int, float], base_year: int) -> float:
|
||||
"""Value of an index dict at `base_year`, with forward/back-fill for gaps.
|
||||
|
||||
Each repeat-sales dict is anchored to 0 at its OWN earliest year, so its
|
||||
values are log-levels relative to that origin. To express it on a common
|
||||
origin we need its value at the shared `base_year`:
|
||||
- exact hit: use it directly;
|
||||
- base_year before the dict's history: back-fill, i.e. the earliest known
|
||||
value (which is 0.0 by construction). We cannot observe the level move
|
||||
between the global base and a later-starting cell, so we assume none,
|
||||
matching forward_fill's back-fill convention;
|
||||
- base_year inside a gap / after history: forward-fill the most recent
|
||||
prior value.
|
||||
"""
|
||||
if base_year in index:
|
||||
return index[base_year]
|
||||
years = sorted(index)
|
||||
if not years or base_year < years[0]:
|
||||
return index[years[0]] if years else 0.0
|
||||
prior = [y for y in years if y <= base_year]
|
||||
return index[prior[-1]]
|
||||
|
||||
|
||||
def reanchor_dict(index: dict[int, float], base_year: int) -> dict[int, float]:
    """Re-anchor an index dict so its value at `base_year` is 0.

    This is a pure constant shift: the same amount (the dict's value at
    `base_year`, as resolved by `_base_value`) is subtracted from every year.
    That preserves all within-dict year-to-year differences, so estimate.py's
    (current - sale) semantics are unchanged; it only fixes the cross-dict
    level mismatch before blending.

    Args:
        index: Mapping of year -> index value. May be empty.
        base_year: Year that should become the common origin.

    Returns:
        A new dict with the shifted values. The input is never mutated and
        never returned by reference, so callers may freely modify the result.
    """
    if not index:
        return {}
    shift = _base_value(index, base_year)
    if shift == 0.0:
        # Already anchored at base_year; still hand back a copy so the result
        # never aliases the caller's dict regardless of the data.
        return dict(index)
    return {year: value - shift for year, value in index.items()}
|
||||
|
||||
|
||||
def reanchor_dicts(
    indices: dict[str, dict[int, float]], base_year: int
) -> dict[str, dict[int, float]]:
    """Re-anchor every index dict in a mapping to the common `base_year`.

    Args:
        indices: Mapping of cell key -> (year -> index value) dict.
        base_year: Year that should become the shared origin of every dict.

    Returns:
        A new mapping with the same keys, each value being the result of
        `reanchor_dict` applied to the corresponding input dict.
    """
    return {
        key: reanchor_dict(cell_index, base_year)
        for key, cell_index in indices.items()
    }
|
||||
|
||||
|
||||
def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
|
||||
"""Shrink dict values toward parent using n/(n+k) weighting.
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue