"""Hierarchical repeat-sales price index. Stratified by property type and postcode sector, with IRLS Huber regression, hierarchical shrinkage (sector → district → area → national → hedonic), and KD-tree spatial smoothing for sparse sectors. Output: price_index.parquet — sector x type_group x year -> log_index """ import argparse from pathlib import Path import numpy as np import polars as pl from scipy.sparse import csc_matrix from scipy.sparse.linalg import lsqr from tqdm import tqdm from pipeline.transform.price_estimation.shrinkage import ( blend_dicts, hierarchical_shrinkage, lift_onto_parent, shrink_dicts, spatial_smooth, ) from pipeline.transform.price_estimation.utils import ( CURRENT_YEAR, LATEST_COMPLETE_YEAR, TEMPORAL_SMOOTHNESS_LAMBDA, TYPE_GROUPS, build_hedonic_features, extract_centroids, hierarchy_keys, sector_expr, type_group_expr, ) MIN_PAIRS = 5 OUTLIER_THRESHOLD = 3.0 # hard pre-filter; Huber handles the rest HUBER_K = 1.345 IRLS_ITERATIONS = 5 def extract_pairs(input_path: Path, max_year2: int | None = None) -> pl.DataFrame: """Extract consecutive repeat-sale pairs. If max_year2 is set, only pairs where year2 < max_year2 are included (for temporal holdout in backtesting). """ print("Extracting repeat-sale pairs...") df = ( pl.scan_parquet(input_path) .select("Postcode", "historical_prices", "Property type") .filter( pl.col("Postcode").is_not_null(), pl.col("historical_prices").list.len() >= 2, ) .with_columns(sector_expr(), type_group_expr()) .collect() ) print(f" {len(df):,} properties with 2+ transactions") pairs = ( df.lazy() .with_columns( pl.col("historical_prices") .list.slice(0, pl.col("historical_prices").list.len() - 1) .alias("from_txn"), pl.col("historical_prices").list.slice(1).alias("to_txn"), ) .explode("from_txn", "to_txn") .with_columns( pl.col("from_txn").struct.field("year").alias("year1"), pl.col("from_txn").struct.field("month").alias("month1"), pl.col("from_txn").struct.field("price").alias("price1"), pl.col("to_txn").struct.field("year").alias("year2"), pl.col("to_txn").struct.field("month").alias("month2"), pl.col("to_txn").struct.field("price").alias("price2"), ) .with_columns( ( pl.col("year1").cast(pl.Float64) + (pl.col("month1").cast(pl.Float64) - 1.0) / 12.0 ).alias("frac_year1"), ( pl.col("year2").cast(pl.Float64) + (pl.col("month2").cast(pl.Float64) - 1.0) / 12.0 ).alias("frac_year2"), ) .select( "sector", "type_group", "year1", "price1", "year2", "price2", "frac_year1", "frac_year2", ) .filter( pl.col("price1") > 0, pl.col("price2") > 0, pl.col("frac_year2") > pl.col("frac_year1"), ) .with_columns( (pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64)) .log() .alias("log_ratio"), ( 1.0 / (pl.col("frac_year2") - pl.col("frac_year1")).cast(pl.Float64).sqrt() ).alias("weight"), ) .filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD) .collect() ) if max_year2 is not None: pairs = pairs.filter(pl.col("year2") < max_year2) # Add hierarchy columns pairs = pairs.with_columns( pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"), ).with_columns( pl.col("district").str.replace(r"\d.*$", "").alias("area"), ) print(f" {len(pairs):,} pairs extracted") return pairs def solve_robust_index( years1: np.ndarray, years2: np.ndarray, log_ratios: np.ndarray, base_weights: np.ndarray, ) -> dict[int, float]: """IRLS Huber M-estimation for the Case-Shiller repeat-sales model.""" n = len(years1) if n < MIN_PAIRS: return {} all_years = np.union1d(years1, years2) min_year = int(all_years.min()) col = 0 year_to_col = {} for y in all_years: iy = int(y) if iy != min_year: year_to_col[iy] = col col += 1 n_cols = len(year_to_col) if n_cols == 0: return {} # Vectorized column index mapping col2 = np.full(n, -1, dtype=np.int32) col1 = np.full(n, -1, dtype=np.int32) for year, c in year_to_col.items(): col2[years2 == year] = c col1[years1 == year] = c # Sparse matrix structure (fixed across iterations) mask2 = col2 >= 0 mask1 = col1 >= 0 rows_arr = np.concatenate([np.where(mask2)[0], np.where(mask1)[0]]) cols_arr = np.concatenate([col2[mask2], col1[mask1]]) signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())]) # Temporal smoothness prior: penalise curvature in the year betas with a # second-difference penalty lambda * (d2 beta / dt2)^2, encoded as extra # least-squares rows (sqrt(lambda) * [w0, w1, w2] against a zero target). # The weights are the CALENDAR-SPACING-AWARE second-derivative coefficients # for the consecutive triple (y0, y1, y2), so gap years are not treated as # adjacent: a multi-year gap relaxes the penalty (correctly preserving a # genuine level jump) instead of forcing a smooth ramp. For unit spacing # (1, 1) these reduce to [1, -2, 1], leaving contiguous cells unchanged. # This damps single-year index spikes without flattening genuine trends. # Betas are ordered by calendar year; the baseline year (min_year, implicit # beta=0) has no column, so the penalty spans the non-baseline years only. # For cells with <3 betas there is no curvature to penalise and the solve is # unchanged. n_pen = 0 pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64) pen_vals_arr = pen_b = np.empty(0, dtype=np.float64) if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3: sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA)) years_sorted = sorted(year_to_col) cols_by_year = [year_to_col[y] for y in years_sorted] n_pen = n_cols - 2 pen_rows = np.repeat(n + np.arange(n_pen), 3) pen_cols = np.empty(n_pen * 3, dtype=np.int64) pen_vals = np.empty(n_pen * 3, dtype=np.float64) for k in range(n_pen): pen_cols[3 * k : 3 * k + 3] = ( cols_by_year[k], cols_by_year[k + 1], cols_by_year[k + 2], ) y0, y1, y2 = years_sorted[k], years_sorted[k + 1], years_sorted[k + 2] w0 = 2.0 / ((y1 - y0) * (y2 - y0)) w1 = -2.0 / ((y1 - y0) * (y2 - y1)) w2 = 2.0 / ((y2 - y1) * (y2 - y0)) pen_vals[3 * k : 3 * k + 3] = ( sqrt_lambda * w0, sqrt_lambda * w1, sqrt_lambda * w2, ) pen_rows_arr = pen_rows.astype(np.int64) pen_cols_arr = pen_cols pen_vals_arr = pen_vals pen_b = np.zeros(n_pen, dtype=np.float64) n_total_rows = n + n_pen weights = base_weights.copy() for _ in range(IRLS_ITERATIONS): data = signs_arr * weights[rows_arr] if n_pen: all_data = np.concatenate([data, pen_vals_arr]) all_rows = np.concatenate([rows_arr, pen_rows_arr]) all_cols = np.concatenate([cols_arr, pen_cols_arr]) b = np.concatenate([log_ratios * weights, pen_b]) else: all_data, all_rows, all_cols = data, rows_arr, cols_arr b = log_ratios * weights A = csc_matrix((all_data, (all_rows, all_cols)), shape=(n_total_rows, n_cols)) betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0] # Residuals predicted = np.zeros(n) predicted[mask2] += betas[col2[mask2]] predicted[mask1] -= betas[col1[mask1]] residuals = log_ratios - predicted # Huber reweighting abs_r = np.abs(residuals) huber_w = np.where(abs_r <= HUBER_K, 1.0, HUBER_K / np.maximum(abs_r, 1e-10)) weights = base_weights * huber_w index = {min_year: 0.0} for year, c in year_to_col.items(): index[year] = float(betas[c]) return index def compute_indices_for_level(pairs: pl.DataFrame, group_col: str): """Solve robust indices for each group. Returns (indices, n_pairs) dicts.""" groups = pairs.group_by(group_col).agg( pl.col("year1"), pl.col("year2"), pl.col("log_ratio"), pl.col("weight"), ) indices = {} n_pairs = {} for row in tqdm( groups.iter_rows(named=True), total=len(groups), desc=f" {group_col}" ): key = row[group_col] y1 = np.array(row["year1"], dtype=np.int32) y2 = np.array(row["year2"], dtype=np.int32) lr = np.array(row["log_ratio"], dtype=np.float64) w = np.array(row["weight"], dtype=np.float64) idx = solve_robust_index(y1, y2, lr, w) if idx: indices[key] = idx # Count only information-bearing pairs: same-year (year1==year2) and # baseline-baseline pairs cancel in the sparse solve and contribute # zero information to the annual index, so including them would # inflate the shrinkage weight n/(n+k) and under-shrink noisy sectors. n_pairs[key] = int(np.count_nonzero(y2 != y1)) return indices, n_pairs def compute_hedonic_index( input_path: Path, min_year: int, max_year: int, max_sale_year: int | None = None, ) -> dict[int, float]: """Quality-adjusted hedonic index: regress log(price) on features, average residual by year. Used as the ultimate shrinkage fallback for the repeat-sales index. If max_sale_year is set, only sales before that year are used (backtesting holdout). """ effective_max = max_sale_year - 1 if max_sale_year is not None else max_year print("Computing hedonic index...") df = ( pl.scan_parquet(input_path) .select( "Last known price", "Date of last transaction", "Property type", "Total floor area (sqm)", ) .filter( pl.col("Last known price").is_not_null(), pl.col("Total floor area (sqm)").is_not_null(), pl.col("Total floor area (sqm)") > 0, ) .with_columns( pl.col("Date of last transaction").dt.year().alias("sale_year"), type_group_expr(), ) .filter( pl.col("type_group").is_not_null(), pl.col("sale_year").is_not_null(), pl.col("sale_year") >= min_year, pl.col("sale_year") <= effective_max, ) .collect() ) print(f" {len(df):,} complete cases for hedonic model") # Target log_price = np.log(df["Last known price"].to_numpy().astype(np.float64)) sale_years = df["sale_year"].to_numpy() # Build feature matrix (5 hedonic features + intercept) X = build_hedonic_features(df) F = np.hstack([X, np.ones((len(df), 1), dtype=np.float32)]) print(f" Feature matrix: {F.shape[0]:,} x {F.shape[1]}") # Step 1: regress log(price) on features -> quality score betas = np.linalg.lstsq(F.astype(np.float64), log_price, rcond=None)[0] quality_score = F.astype(np.float64) @ betas residuals = log_price - quality_score # Step 2: average residual by year = hedonic index hedonic = {} for y in range(min_year, max_year + 1): mask = sale_years == y if mask.sum() > 0: hedonic[y] = float(np.mean(residuals[mask])) # Normalize: min_year = 0 base = hedonic.get(min_year, 0.0) for y in hedonic: hedonic[y] -= base print( f" Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}" ) return hedonic EXTRAPOLATION_YEARS = 3 def forward_fill(index: dict, min_year: int, max_year: int) -> dict: """Forward-fill missing years, with linear extrapolation beyond last known year.""" if not index: return {y: 0.0 for y in range(min_year, max_year + 1)} sorted_years = sorted(index.keys()) last_known_year = sorted_years[-1] # Forward fill up to last known year filled = {} last = 0.0 for y in range(min_year, last_known_year + 1): if y in index: last = index[y] filled[y] = last # Linear extrapolation beyond last known year if last_known_year < max_year: recent = [ (y, index[y]) for y in sorted_years if y >= last_known_year - EXTRAPOLATION_YEARS ] if len(recent) >= 2: years_arr = np.array([r[0] for r in recent], dtype=np.float64) vals_arr = np.array([r[1] for r in recent], dtype=np.float64) slope = np.polyfit(years_arr, vals_arr, 1)[0] for y in range(last_known_year + 1, max_year + 1): filled[y] = index[last_known_year] + slope * (y - last_known_year) else: for y in range(last_known_year + 1, max_year + 1): filled[y] = index[last_known_year] return filled def build_index( input_path: Path, max_pair_year: int | None = None, postcodes_path: Path | None = None, ) -> pl.DataFrame: """Build the full price index from raw data. If max_pair_year is set, only pairs before that year are used (backtesting holdout). The index is still forward-filled to CURRENT_YEAR. postcodes_path: if provided, lat/lon are read from this file instead of input_path. """ # Solve the index only on COMPLETE calendar years: exclude the partial # current year, whose thin repeat-sale set yields wild betas. The index is # still forward-filled/trend-extrapolated to CURRENT_YEAR below, so 2026 # follows the established trend rather than a partial-year spike. Backtest # passes a stricter max_pair_year, which is honoured. estimation_cap = ( max_pair_year if max_pair_year is not None else LATEST_COMPLETE_YEAR + 1 ) pairs = extract_pairs(input_path, max_year2=estimation_cap) centroids = extract_centroids(postcodes_path or input_path) min_year = int(pairs["year1"].min()) max_year = CURRENT_YEAR hedonic_idx = compute_hedonic_index( input_path, min_year, max_year, max_sale_year=estimation_cap ) # Precompute hierarchy all_sectors = pairs["sector"].unique().to_list() sector_to_dist = {} dist_to_area = {} for s in all_sectors: d, a = hierarchy_keys(s) sector_to_dist[s] = d dist_to_area[d] = a # Process each type group + "All" all_type_groups = ["All"] + TYPE_GROUPS final = {} # {type_group: {sector: {year: log_index}}} final_n = {} # {type_group: {sector: n_pairs}} for tg in all_type_groups: print(f"\n--- {tg} ---") typed = pairs if tg == "All" else pairs.filter(pl.col("type_group") == tg) if len(typed) < MIN_PAIRS: print(f" Skipping (only {len(typed)} pairs)") final[tg] = {s: dict(hedonic_idx) for s in all_sectors} final_n[tg] = {s: 0 for s in all_sectors} continue print(f" {len(typed):,} pairs") # National np_arrs = typed.select("year1", "year2", "log_ratio", "weight") national_idx = solve_robust_index( np_arrs["year1"].to_numpy(), np_arrs["year2"].to_numpy(), np_arrs["log_ratio"].to_numpy(), np_arrs["weight"].to_numpy(), ) national_n = len(typed) print(f" National: {len(national_idx)} years") # Area, district, sector print(" Computing per-level indices:") area_idx, area_n = compute_indices_for_level(typed, "area") district_idx, district_n = compute_indices_for_level(typed, "district") sector_idx, sector_n = compute_indices_for_level(typed, "sector") print( f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors" ) # Shrinkage: national -> hedonic first, then hierarchical. Each cell is # anchored to log-index 0 at its OWN earliest year (solve_robust_index), # so cells with shorter histories sit on a later origin than their wider # parents. Before each blend we lift the child onto its parent's base at # the child's first year (lift_onto_parent) -- otherwise combining them # key-by-key averages level-incompatible numbers. The hedonic fallback is # anchored at the global min_year, so it serves as the base for national. print(" Applying shrinkage...") national_shrunk = shrink_dicts( lift_onto_parent(national_idx, hedonic_idx), hedonic_idx, national_n ) sector_shrunk = hierarchical_shrinkage( sector_idx, sector_n, district_idx, district_n, area_idx, area_n, national_shrunk, all_sectors, sector_to_dist, dist_to_area, shrink_dicts, lift_onto_parent, ) # Spatial smoothing print(" Spatial smoothing...") sector_smoothed = spatial_smooth( sector_shrunk, centroids, sector_n, blend_dicts ) # Forward fill for sec in all_sectors: sector_smoothed[sec] = forward_fill( sector_smoothed.get(sec, hedonic_idx), min_year, max_year ) final[tg] = sector_smoothed final_n[tg] = sector_n # Assemble output print("\nAssembling output...") rows = [] for tg in all_type_groups: for sec in all_sectors: n = final_n[tg].get(sec, 0) for year, log_idx in final[tg][sec].items(): rows.append((sec, tg, year, log_idx, n)) return pl.DataFrame( rows, schema={ "sector": pl.String, "type_group": pl.String, "year": pl.Int32, "log_index": pl.Float64, "n_pairs": pl.Int64, }, orient="row", ).sort("type_group", "sector", "year") def main(): parser = argparse.ArgumentParser( description="Build improved repeat-sales price index" ) parser.add_argument("--input", type=Path, required=True) parser.add_argument( "--postcodes", type=Path, required=True, help="Path to postcode.parquet (for lat/lon centroids)", ) parser.add_argument("--output", type=Path, required=True) args = parser.parse_args() result = build_index(args.input, postcodes_path=args.postcodes) result.write_parquet(args.output) size_mb = args.output.stat().st_size / (1024 * 1024) print(f"\nWrote {args.output} ({size_mb:.1f} MB)") print( f" {result['sector'].n_unique():,} sectors x {result['type_group'].n_unique()} types x {result['year'].n_unique()} years = {len(result):,} rows" ) if __name__ == "__main__": main()