Improve data pipeline

2026-06-01 20:10:03 +01:00 · 2026-06-01 20:10:03 +01:00 · f99bd4e5c9
commit f99bd4e5c9
parent e8345cbdc1
36 changed files with 966 additions and 129 deletions
--- a/pipeline/transform/price_estimation/index.py
+++ b/pipeline/transform/price_estimation/index.py
@@ -24,6 +24,7 @@ from pipeline.transform.price_estimation.shrinkage import (
 )
 from pipeline.transform.price_estimation.utils import (
    CURRENT_YEAR,
+    TEMPORAL_SMOOTHNESS_LAMBDA,
    TYPE_GROUPS,
    build_hedonic_features,
    extract_centroids,
@@ -165,12 +166,50 @@ def solve_robust_index(
    cols_arr = np.concatenate([col2[mask2], col1[mask1]])
    signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])

+    # Temporal smoothness prior: penalise curvature in the year betas with a
+    # second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
+    # encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
+    # zero target). This damps single-year index spikes without flattening
+    # genuine multi-year trends. Betas are ordered by calendar year; the baseline
+    # year (min_year, implicit beta=0) has no column, so the penalty spans the
+    # non-baseline years only. For cells with <3 betas there is no curvature to
+    # penalise and the solve is unchanged.
+    n_pen = 0
+    pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
+    pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
+    if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
+        sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
+        cols_by_year = [c for _, c in sorted(year_to_col.items())]
+        n_pen = n_cols - 2
+        pen_rows = np.repeat(n + np.arange(n_pen), 3)
+        pen_cols = np.empty(n_pen * 3, dtype=np.int64)
+        for k in range(n_pen):
+            pen_cols[3 * k : 3 * k + 3] = (
+                cols_by_year[k],
+                cols_by_year[k + 1],
+                cols_by_year[k + 2],
+            )
+        pen_rows_arr = pen_rows.astype(np.int64)
+        pen_cols_arr = pen_cols
+        pen_vals_arr = np.tile(
+            [sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
+        ).astype(np.float64)
+        pen_b = np.zeros(n_pen, dtype=np.float64)
+    n_total_rows = n + n_pen
+
    weights = base_weights.copy()

    for _ in range(IRLS_ITERATIONS):
        data = signs_arr * weights[rows_arr]
-        A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
-        b = log_ratios * weights
+        if n_pen:
+            all_data = np.concatenate([data, pen_vals_arr])
+            all_rows = np.concatenate([rows_arr, pen_rows_arr])
+            all_cols = np.concatenate([cols_arr, pen_cols_arr])
+            b = np.concatenate([log_ratios * weights, pen_b])
+        else:
+            all_data, all_rows, all_cols = data, rows_arr, cols_arr
+            b = log_ratios * weights
+        A = csc_matrix((all_data, (all_rows, all_cols)), shape=(n_total_rows, n_cols))
        betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]

        # Residuals