idgf

2026-06-02 20:14:32 +01:00 · 2026-06-02 20:14:32 +01:00 · aab85fe32e
commit aab85fe32e
parent fbfebc651c
33 changed files with 2016 additions and 283 deletions
--- a/pipeline/transform/price_estimation/index.py
+++ b/pipeline/transform/price_estimation/index.py
@@ -19,8 +19,7 @@ from tqdm import tqdm
 from pipeline.transform.price_estimation.shrinkage import (
    blend_dicts,
    hierarchical_shrinkage,
-    reanchor_dict,
-    reanchor_dicts,
+    lift_onto_parent,
    shrink_dicts,
    spatial_smooth,
 )
@@ -169,33 +168,47 @@ def solve_robust_index(
    signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])

    # Temporal smoothness prior: penalise curvature in the year betas with a
-    # second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
-    # encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
-    # zero target). This damps single-year index spikes without flattening
-    # genuine multi-year trends. Betas are ordered by calendar year; the baseline
-    # year (min_year, implicit beta=0) has no column, so the penalty spans the
-    # non-baseline years only. For cells with <3 betas there is no curvature to
-    # penalise and the solve is unchanged.
+    # second-difference penalty lambda * (d2 beta / dt2)^2, encoded as extra
+    # least-squares rows (sqrt(lambda) * [w0, w1, w2] against a zero target).
+    # The weights are the CALENDAR-SPACING-AWARE second-derivative coefficients
+    # for the consecutive triple (y0, y1, y2), so gap years are not treated as
+    # adjacent: a multi-year gap relaxes the penalty (correctly preserving a
+    # genuine level jump) instead of forcing a smooth ramp. For unit spacing
+    # (1, 1) these reduce to [1, -2, 1], leaving contiguous cells unchanged.
+    # This damps single-year index spikes without flattening genuine trends.
+    # Betas are ordered by calendar year; the baseline year (min_year, implicit
+    # beta=0) has no column, so the penalty spans the non-baseline years only.
+    # For cells with <3 betas there is no curvature to penalise and the solve is
+    # unchanged.
    n_pen = 0
    pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
    pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
    if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
        sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
-        cols_by_year = [c for _, c in sorted(year_to_col.items())]
+        years_sorted = sorted(year_to_col)
+        cols_by_year = [year_to_col[y] for y in years_sorted]
        n_pen = n_cols - 2
        pen_rows = np.repeat(n + np.arange(n_pen), 3)
        pen_cols = np.empty(n_pen * 3, dtype=np.int64)
+        pen_vals = np.empty(n_pen * 3, dtype=np.float64)
        for k in range(n_pen):
            pen_cols[3 * k : 3 * k + 3] = (
                cols_by_year[k],
                cols_by_year[k + 1],
                cols_by_year[k + 2],
            )
+            y0, y1, y2 = years_sorted[k], years_sorted[k + 1], years_sorted[k + 2]
+            w0 = 2.0 / ((y1 - y0) * (y2 - y0))
+            w1 = -2.0 / ((y1 - y0) * (y2 - y1))
+            w2 = 2.0 / ((y2 - y1) * (y2 - y0))
+            pen_vals[3 * k : 3 * k + 3] = (
+                sqrt_lambda * w0,
+                sqrt_lambda * w1,
+                sqrt_lambda * w2,
+            )
        pen_rows_arr = pen_rows.astype(np.int64)
        pen_cols_arr = pen_cols
-        pen_vals_arr = np.tile(
-            [sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
-        ).astype(np.float64)
+        pen_vals_arr = pen_vals
        pen_b = np.zeros(n_pen, dtype=np.float64)
    n_total_rows = n + n_pen

@@ -252,7 +265,11 @@ def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
        idx = solve_robust_index(y1, y2, lr, w)
        if idx:
            indices[key] = idx
-            n_pairs[key] = len(y1)
+            # Count only information-bearing pairs: same-year (year1==year2) and
+            # baseline-baseline pairs cancel in the sparse solve and contribute
+            # zero information to the annual index, so including them would
+            # inflate the shrinkage weight n/(n+k) and under-shrink noisy sectors.
+            n_pairs[key] = int(np.count_nonzero(y2 != y1))
    return indices, n_pairs


@@ -433,20 +450,17 @@ def build_index(
            f"  {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
        )

-        # Re-anchor every repeat-sales dict to the global base year before any
-        # shrinkage/smoothing/blending. solve_robust_index anchors each cell to
-        # log-index 0 at its OWN earliest year, so cells with shorter histories
-        # are measured from a later origin; combining them key-by-key would
-        # otherwise average level-incompatible numbers. The hedonic fallback is
-        # already anchored at min_year, so we align everything to min_year.
-        national_idx = reanchor_dict(national_idx, min_year)
-        area_idx = reanchor_dicts(area_idx, min_year)
-        district_idx = reanchor_dicts(district_idx, min_year)
-        sector_idx = reanchor_dicts(sector_idx, min_year)
-
-        # Shrinkage: national -> hedonic first, then hierarchical
+        # Shrinkage: national -> hedonic first, then hierarchical. Each cell is
+        # anchored to log-index 0 at its OWN earliest year (solve_robust_index),
+        # so cells with shorter histories sit on a later origin than their wider
+        # parents. Before each blend we lift the child onto its parent's base at
+        # the child's first year (lift_onto_parent) -- otherwise combining them
+        # key-by-key averages level-incompatible numbers. The hedonic fallback is
+        # anchored at the global min_year, so it serves as the base for national.
        print("  Applying shrinkage...")
-        national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
+        national_shrunk = shrink_dicts(
+            lift_onto_parent(national_idx, hedonic_idx), hedonic_idx, national_n
+        )
        sector_shrunk = hierarchical_shrinkage(
            sector_idx,
            sector_n,
@@ -459,6 +473,7 @@ def build_index(
            sector_to_dist,
            dist_to_area,
            shrink_dicts,
+            lift_onto_parent,
        )

        # Spatial smoothing