Improve data

2026-06-10 07:54:25 +01:00 · 2026-06-10 07:54:25 +01:00 · 85da1941aa
commit 85da1941aa
parent b4d66a28c1
31 changed files with 901 additions and 319 deletions
--- a/pipeline/transform/price_estimation/index.py
+++ b/pipeline/transform/price_estimation/index.py
@@ -25,6 +25,7 @@ from pipeline.transform.price_estimation.shrinkage import (
 )
 from pipeline.transform.price_estimation.utils import (
    CURRENT_YEAR,
+    LATEST_COMPLETE_YEAR,
    TEMPORAL_SMOOTHNESS_LAMBDA,
    TYPE_GROUPS,
    build_hedonic_features,
@@ -395,14 +396,22 @@ def build_index(
    The index is still forward-filled to CURRENT_YEAR.
    postcodes_path: if provided, lat/lon are read from this file instead of input_path.
    """
-    pairs = extract_pairs(input_path, max_year2=max_pair_year)
+    # Solve the index only on COMPLETE calendar years: exclude the partial
+    # current year, whose thin repeat-sale set yields wild betas. The index is
+    # still forward-filled/trend-extrapolated to CURRENT_YEAR below, so 2026
+    # follows the established trend rather than a partial-year spike. Backtest
+    # passes a stricter max_pair_year, which is honoured.
+    estimation_cap = (
+        max_pair_year if max_pair_year is not None else LATEST_COMPLETE_YEAR + 1
+    )
+    pairs = extract_pairs(input_path, max_year2=estimation_cap)
    centroids = extract_centroids(postcodes_path or input_path)

    min_year = int(pairs["year1"].min())
    max_year = CURRENT_YEAR

    hedonic_idx = compute_hedonic_index(
-        input_path, min_year, max_year, max_sale_year=max_pair_year
+        input_path, min_year, max_year, max_sale_year=estimation_cap
    )

    # Precompute hierarchy