Improve data

This commit is contained in:
Andras Schmelczer 2026-06-10 07:54:25 +01:00
parent b4d66a28c1
commit 85da1941aa
31 changed files with 901 additions and 319 deletions

View file

@ -25,6 +25,7 @@ from pipeline.transform.price_estimation.shrinkage import (
)
from pipeline.transform.price_estimation.utils import (
CURRENT_YEAR,
LATEST_COMPLETE_YEAR,
TEMPORAL_SMOOTHNESS_LAMBDA,
TYPE_GROUPS,
build_hedonic_features,
@ -395,14 +396,22 @@ def build_index(
The index is still forward-filled to CURRENT_YEAR.
postcodes_path: if provided, lat/lon are read from this file instead of input_path.
"""
pairs = extract_pairs(input_path, max_year2=max_pair_year)
# Solve the index only on COMPLETE calendar years: exclude the partial
# current year, whose thin repeat-sale set yields wild betas. The index is
# still forward-filled/trend-extrapolated to CURRENT_YEAR below, so the
# current year follows the established trend rather than a partial-year
# spike. Backtest passes a stricter max_pair_year, which is honoured.
estimation_cap = (
max_pair_year if max_pair_year is not None else LATEST_COMPLETE_YEAR + 1
)
pairs = extract_pairs(input_path, max_year2=estimation_cap)
centroids = extract_centroids(postcodes_path or input_path)
min_year = int(pairs["year1"].min())
max_year = CURRENT_YEAR
hedonic_idx = compute_hedonic_index(
input_path, min_year, max_year, max_sale_year=max_pair_year
input_path, min_year, max_year, max_sale_year=estimation_cap
)
# Precompute hierarchy