idgf
This commit is contained in:
parent
fbfebc651c
commit
aab85fe32e
33 changed files with 2016 additions and 283 deletions
|
|
@ -19,8 +19,7 @@ from tqdm import tqdm
|
|||
from pipeline.transform.price_estimation.shrinkage import (
|
||||
blend_dicts,
|
||||
hierarchical_shrinkage,
|
||||
reanchor_dict,
|
||||
reanchor_dicts,
|
||||
lift_onto_parent,
|
||||
shrink_dicts,
|
||||
spatial_smooth,
|
||||
)
|
||||
|
|
@ -169,33 +168,47 @@ def solve_robust_index(
|
|||
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
|
||||
|
||||
# Temporal smoothness prior: penalise curvature in the year betas with a
|
||||
# second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
|
||||
# encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
|
||||
# zero target). This damps single-year index spikes without flattening
|
||||
# genuine multi-year trends. Betas are ordered by calendar year; the baseline
|
||||
# year (min_year, implicit beta=0) has no column, so the penalty spans the
|
||||
# non-baseline years only. For cells with <3 betas there is no curvature to
|
||||
# penalise and the solve is unchanged.
|
||||
# second-difference penalty lambda * (d2 beta / dt2)^2, encoded as extra
|
||||
# least-squares rows (sqrt(lambda) * [w0, w1, w2] against a zero target).
|
||||
# The weights are the CALENDAR-SPACING-AWARE second-derivative coefficients
|
||||
# for the consecutive triple (y0, y1, y2), so gap years are not treated as
|
||||
# adjacent: a multi-year gap relaxes the penalty (correctly preserving a
|
||||
# genuine level jump) instead of forcing a smooth ramp. For unit spacing
|
||||
# (1, 1) these reduce to [1, -2, 1], leaving contiguous cells unchanged.
|
||||
# This damps single-year index spikes without flattening genuine trends.
|
||||
# Betas are ordered by calendar year; the baseline year (min_year, implicit
|
||||
# beta=0) has no column, so the penalty spans the non-baseline years only.
|
||||
# For cells with <3 betas there is no curvature to penalise and the solve is
|
||||
# unchanged.
|
||||
n_pen = 0
|
||||
pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
|
||||
pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
|
||||
if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
|
||||
sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
|
||||
cols_by_year = [c for _, c in sorted(year_to_col.items())]
|
||||
years_sorted = sorted(year_to_col)
|
||||
cols_by_year = [year_to_col[y] for y in years_sorted]
|
||||
n_pen = n_cols - 2
|
||||
pen_rows = np.repeat(n + np.arange(n_pen), 3)
|
||||
pen_cols = np.empty(n_pen * 3, dtype=np.int64)
|
||||
pen_vals = np.empty(n_pen * 3, dtype=np.float64)
|
||||
for k in range(n_pen):
|
||||
pen_cols[3 * k : 3 * k + 3] = (
|
||||
cols_by_year[k],
|
||||
cols_by_year[k + 1],
|
||||
cols_by_year[k + 2],
|
||||
)
|
||||
y0, y1, y2 = years_sorted[k], years_sorted[k + 1], years_sorted[k + 2]
|
||||
w0 = 2.0 / ((y1 - y0) * (y2 - y0))
|
||||
w1 = -2.0 / ((y1 - y0) * (y2 - y1))
|
||||
w2 = 2.0 / ((y2 - y1) * (y2 - y0))
|
||||
pen_vals[3 * k : 3 * k + 3] = (
|
||||
sqrt_lambda * w0,
|
||||
sqrt_lambda * w1,
|
||||
sqrt_lambda * w2,
|
||||
)
|
||||
pen_rows_arr = pen_rows.astype(np.int64)
|
||||
pen_cols_arr = pen_cols
|
||||
pen_vals_arr = np.tile(
|
||||
[sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
|
||||
).astype(np.float64)
|
||||
pen_vals_arr = pen_vals
|
||||
pen_b = np.zeros(n_pen, dtype=np.float64)
|
||||
n_total_rows = n + n_pen
|
||||
|
||||
|
|
@ -252,7 +265,11 @@ def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
|
|||
idx = solve_robust_index(y1, y2, lr, w)
|
||||
if idx:
|
||||
indices[key] = idx
|
||||
n_pairs[key] = len(y1)
|
||||
# Count only information-bearing pairs: same-year (year1==year2) and
|
||||
# baseline-baseline pairs cancel in the sparse solve and contribute
|
||||
# zero information to the annual index, so including them would
|
||||
# inflate the shrinkage weight n/(n+k) and under-shrink noisy sectors.
|
||||
n_pairs[key] = int(np.count_nonzero(y2 != y1))
|
||||
return indices, n_pairs
|
||||
|
||||
|
||||
|
|
@ -433,20 +450,17 @@ def build_index(
|
|||
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
|
||||
)
|
||||
|
||||
# Re-anchor every repeat-sales dict to the global base year before any
|
||||
# shrinkage/smoothing/blending. solve_robust_index anchors each cell to
|
||||
# log-index 0 at its OWN earliest year, so cells with shorter histories
|
||||
# are measured from a later origin; combining them key-by-key would
|
||||
# otherwise average level-incompatible numbers. The hedonic fallback is
|
||||
# already anchored at min_year, so we align everything to min_year.
|
||||
national_idx = reanchor_dict(national_idx, min_year)
|
||||
area_idx = reanchor_dicts(area_idx, min_year)
|
||||
district_idx = reanchor_dicts(district_idx, min_year)
|
||||
sector_idx = reanchor_dicts(sector_idx, min_year)
|
||||
|
||||
# Shrinkage: national -> hedonic first, then hierarchical
|
||||
# Shrinkage: national -> hedonic first, then hierarchical. Each cell is
|
||||
# anchored to log-index 0 at its OWN earliest year (solve_robust_index),
|
||||
# so cells with shorter histories sit on a later origin than their wider
|
||||
# parents. Before each blend we lift the child onto its parent's base at
|
||||
# the child's first year (lift_onto_parent) -- otherwise combining them
|
||||
# key-by-key averages level-incompatible numbers. The hedonic fallback is
|
||||
# anchored at the global min_year, so it serves as the base for national.
|
||||
print(" Applying shrinkage...")
|
||||
national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
|
||||
national_shrunk = shrink_dicts(
|
||||
lift_onto_parent(national_idx, hedonic_idx), hedonic_idx, national_n
|
||||
)
|
||||
sector_shrunk = hierarchical_shrinkage(
|
||||
sector_idx,
|
||||
sector_n,
|
||||
|
|
@ -459,6 +473,7 @@ def build_index(
|
|||
sector_to_dist,
|
||||
dist_to_area,
|
||||
shrink_dicts,
|
||||
lift_onto_parent,
|
||||
)
|
||||
|
||||
# Spatial smoothing
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue