Improve data pipeline

This commit is contained in:
Andras Schmelczer 2026-06-01 20:10:03 +01:00
parent e8345cbdc1
commit f99bd4e5c9
36 changed files with 966 additions and 129 deletions

View file

@ -24,6 +24,7 @@ from pipeline.transform.price_estimation.shrinkage import (
)
from pipeline.transform.price_estimation.utils import (
CURRENT_YEAR,
TEMPORAL_SMOOTHNESS_LAMBDA,
TYPE_GROUPS,
build_hedonic_features,
extract_centroids,
@ -165,12 +166,50 @@ def solve_robust_index(
cols_arr = np.concatenate([col2[mask2], col1[mask1]])
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
# Temporal smoothness prior: penalise curvature in the year betas with a
# second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
# encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
# zero target). This damps single-year index spikes without flattening
# genuine multi-year trends. Betas are ordered by calendar year; the baseline
# year (min_year, implicit beta=0) has no column, so the penalty spans the
# non-baseline years only. The stencil is applied to consecutive entries of
# the year-sorted column list, so it is an exact second difference in time
# only when those years are contiguous (gaps between years are not rescaled).
# For cells with <3 betas there is no curvature to penalise and the solve is
# unchanged.
n_pen = 0
pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
cols_by_year = [c for _, c in sorted(year_to_col.items())]
n_pen = n_cols - 2
pen_rows = np.repeat(n + np.arange(n_pen), 3)
pen_cols = np.empty(n_pen * 3, dtype=np.int64)
for k in range(n_pen):
pen_cols[3 * k : 3 * k + 3] = (
cols_by_year[k],
cols_by_year[k + 1],
cols_by_year[k + 2],
)
pen_rows_arr = pen_rows.astype(np.int64)
pen_cols_arr = pen_cols
pen_vals_arr = np.tile(
[sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
).astype(np.float64)
pen_b = np.zeros(n_pen, dtype=np.float64)
n_total_rows = n + n_pen
weights = base_weights.copy()
for _ in range(IRLS_ITERATIONS):
data = signs_arr * weights[rows_arr]
A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
b = log_ratios * weights
if n_pen:
all_data = np.concatenate([data, pen_vals_arr])
all_rows = np.concatenate([rows_arr, pen_rows_arr])
all_cols = np.concatenate([cols_arr, pen_cols_arr])
b = np.concatenate([log_ratios * weights, pen_b])
else:
all_data, all_rows, all_cols = data, rows_arr, cols_arr
b = log_ratios * weights
A = csc_matrix((all_data, (all_rows, all_cols)), shape=(n_total_rows, n_cols))
betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]
# Residuals