Improve data pipeline
This commit is contained in:
parent
e8345cbdc1
commit
f99bd4e5c9
36 changed files with 966 additions and 129 deletions
|
|
@@ -24,6 +24,7 @@ from pipeline.transform.price_estimation.shrinkage import (
|
|||
)
|
||||
from pipeline.transform.price_estimation.utils import (
|
||||
CURRENT_YEAR,
|
||||
TEMPORAL_SMOOTHNESS_LAMBDA,
|
||||
TYPE_GROUPS,
|
||||
build_hedonic_features,
|
||||
extract_centroids,
|
||||
|
|
@@ -165,12 +166,50 @@ def solve_robust_index(
|
|||
cols_arr = np.concatenate([col2[mask2], col1[mask1]])
|
||||
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
|
||||
|
||||
# Temporal smoothness prior: penalise curvature in the year betas with a
|
||||
# second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
|
||||
# encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
|
||||
# zero target). This damps single-year index spikes without flattening
|
||||
# genuine multi-year trends. Betas are ordered by calendar year; the baseline
|
||||
# year (min_year, implicit beta=0) has no column, so the penalty spans the
|
||||
# non-baseline years only. For cells with <3 betas there is no curvature to
|
||||
# penalise and the solve is unchanged.
|
||||
n_pen = 0
|
||||
pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
|
||||
pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
|
||||
if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
|
||||
sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
|
||||
cols_by_year = [c for _, c in sorted(year_to_col.items())]
|
||||
n_pen = n_cols - 2
|
||||
pen_rows = np.repeat(n + np.arange(n_pen), 3)
|
||||
pen_cols = np.empty(n_pen * 3, dtype=np.int64)
|
||||
for k in range(n_pen):
|
||||
pen_cols[3 * k : 3 * k + 3] = (
|
||||
cols_by_year[k],
|
||||
cols_by_year[k + 1],
|
||||
cols_by_year[k + 2],
|
||||
)
|
||||
pen_rows_arr = pen_rows.astype(np.int64)
|
||||
pen_cols_arr = pen_cols
|
||||
pen_vals_arr = np.tile(
|
||||
[sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
|
||||
).astype(np.float64)
|
||||
pen_b = np.zeros(n_pen, dtype=np.float64)
|
||||
n_total_rows = n + n_pen
|
||||
|
||||
weights = base_weights.copy()
|
||||
|
||||
for _ in range(IRLS_ITERATIONS):
|
||||
data = signs_arr * weights[rows_arr]
|
||||
A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
|
||||
b = log_ratios * weights
|
||||
if n_pen:
|
||||
all_data = np.concatenate([data, pen_vals_arr])
|
||||
all_rows = np.concatenate([rows_arr, pen_rows_arr])
|
||||
all_cols = np.concatenate([cols_arr, pen_cols_arr])
|
||||
b = np.concatenate([log_ratios * weights, pen_b])
|
||||
else:
|
||||
all_data, all_rows, all_cols = data, rows_arr, cols_arr
|
||||
b = log_ratios * weights
|
||||
A = csc_matrix((all_data, (all_rows, all_cols)), shape=(n_total_rows, n_cols))
|
||||
betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]
|
||||
|
||||
# Residuals
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue