This commit is contained in:
Andras Schmelczer 2026-06-02 20:14:32 +01:00
parent fbfebc651c
commit aab85fe32e
33 changed files with 2016 additions and 283 deletions

View file

@@ -19,8 +19,7 @@ from tqdm import tqdm
from pipeline.transform.price_estimation.shrinkage import (
blend_dicts,
hierarchical_shrinkage,
reanchor_dict,
reanchor_dicts,
lift_onto_parent,
shrink_dicts,
spatial_smooth,
)
@@ -169,33 +168,47 @@ def solve_robust_index(
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
# Temporal smoothness prior: penalise curvature in the year betas with a
# second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
# encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
# zero target). This damps single-year index spikes without flattening
# genuine multi-year trends. Betas are ordered by calendar year; the baseline
# year (min_year, implicit beta=0) has no column, so the penalty spans the
# non-baseline years only. For cells with <3 betas there is no curvature to
# penalise and the solve is unchanged.
# second-difference penalty lambda * (d2 beta / dt2)^2, encoded as extra
# least-squares rows (sqrt(lambda) * [w0, w1, w2] against a zero target).
# The weights are the CALENDAR-SPACING-AWARE second-derivative coefficients
# for the consecutive triple (y0, y1, y2), so gap years are not treated as
# adjacent: a multi-year gap relaxes the penalty (correctly preserving a
# genuine level jump) instead of forcing a smooth ramp. For unit spacing
# (1, 1) these reduce to [1, -2, 1], leaving contiguous cells unchanged.
# This damps single-year index spikes without flattening genuine trends.
# Betas are ordered by calendar year; the baseline year (min_year, implicit
# beta=0) has no column, so the penalty spans the non-baseline years only.
# For cells with <3 betas there is no curvature to penalise and the solve is
# unchanged.
n_pen = 0
pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
cols_by_year = [c for _, c in sorted(year_to_col.items())]
years_sorted = sorted(year_to_col)
cols_by_year = [year_to_col[y] for y in years_sorted]
n_pen = n_cols - 2
pen_rows = np.repeat(n + np.arange(n_pen), 3)
pen_cols = np.empty(n_pen * 3, dtype=np.int64)
pen_vals = np.empty(n_pen * 3, dtype=np.float64)
for k in range(n_pen):
pen_cols[3 * k : 3 * k + 3] = (
cols_by_year[k],
cols_by_year[k + 1],
cols_by_year[k + 2],
)
y0, y1, y2 = years_sorted[k], years_sorted[k + 1], years_sorted[k + 2]
w0 = 2.0 / ((y1 - y0) * (y2 - y0))
w1 = -2.0 / ((y1 - y0) * (y2 - y1))
w2 = 2.0 / ((y2 - y1) * (y2 - y0))
pen_vals[3 * k : 3 * k + 3] = (
sqrt_lambda * w0,
sqrt_lambda * w1,
sqrt_lambda * w2,
)
pen_rows_arr = pen_rows.astype(np.int64)
pen_cols_arr = pen_cols
pen_vals_arr = np.tile(
[sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
).astype(np.float64)
pen_vals_arr = pen_vals
pen_b = np.zeros(n_pen, dtype=np.float64)
n_total_rows = n + n_pen
@@ -252,7 +265,11 @@ def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
idx = solve_robust_index(y1, y2, lr, w)
if idx:
indices[key] = idx
n_pairs[key] = len(y1)
# Count only information-bearing pairs: same-year (year1==year2) and
# baseline-baseline pairs cancel in the sparse solve and contribute
# zero information to the annual index, so including them would
# inflate the shrinkage weight n/(n+k) and under-shrink noisy sectors.
n_pairs[key] = int(np.count_nonzero(y2 != y1))
return indices, n_pairs
@@ -433,20 +450,17 @@ def build_index(
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
)
# Re-anchor every repeat-sales dict to the global base year before any
# shrinkage/smoothing/blending. solve_robust_index anchors each cell to
# log-index 0 at its OWN earliest year, so cells with shorter histories
# are measured from a later origin; combining them key-by-key would
# otherwise average level-incompatible numbers. The hedonic fallback is
# already anchored at min_year, so we align everything to min_year.
national_idx = reanchor_dict(national_idx, min_year)
area_idx = reanchor_dicts(area_idx, min_year)
district_idx = reanchor_dicts(district_idx, min_year)
sector_idx = reanchor_dicts(sector_idx, min_year)
# Shrinkage: national -> hedonic first, then hierarchical
# Shrinkage: national -> hedonic first, then hierarchical. Each cell is
# anchored to log-index 0 at its OWN earliest year (solve_robust_index),
# so cells with shorter histories sit on a later origin than their wider
# parents. Before each blend we lift the child onto its parent's base at
# the child's first year (lift_onto_parent) -- otherwise combining them
# key-by-key averages level-incompatible numbers. The hedonic fallback is
# anchored at the global min_year, so it serves as the base for national.
print(" Applying shrinkage...")
national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
national_shrunk = shrink_dicts(
lift_onto_parent(national_idx, hedonic_idx), hedonic_idx, national_n
)
sector_shrunk = hierarchical_shrinkage(
sector_idx,
sector_n,
@@ -459,6 +473,7 @@ def build_index(
sector_to_dist,
dist_to_area,
shrink_dicts,
lift_onto_parent,
)
# Spatial smoothing