lmao

2026-02-15 22:39:49 +00:00 · 2026-02-15 22:39:49 +00:00 · 524580eb25
commit 524580eb25
parent 03445188ea
102 changed files with 36625 additions and 1295 deletions
--- a/pipeline/transform/price_estimation/__init__.py
+++ b/pipeline/transform/price_estimation/__init__.py
--- a/pipeline/transform/price_estimation/backtest.py
+++ b/pipeline/transform/price_estimation/backtest.py
@ -0,0 +1,292 @@
+"""Backtest price estimation on held-out recent sales.
+
+Uses temporal holdout: index built from pairs before TEST_YEAR_MIN only.
+Test set: properties with 2+ sales where the last sale >= TEST_YEAR_MIN.
+Evaluates: Naive vs Index vs kNN vs Blended.
+"""
+
+import argparse
+from pathlib import Path
+
+import numpy as np
+import polars as pl
+
+from pipeline.transform.price_estimation.index import build_index
+from pipeline.transform.price_estimation.knn import (
+    KNN_BLEND_WEIGHT,
+    build_knn_pool,
+    knn_median_psm,
+)
+from pipeline.transform.price_estimation.utils import (
+    CURRENT_YEAR,
+    MAX_LOG_ADJUSTMENT,
+    compute_seasonal_factors,
+    interpolate_log_index,
+    sector_expr,
+    type_group_expr,
+)
+
+# First calendar year of the held-out period.  extract_test_set() keeps only
+# properties whose last sale year is >= this value, and main() passes it as the
+# cut-off when building the index, the seasonal factors and the kNN pool.
+TEST_YEAR_MIN = 2022
+
+
+def extract_test_set(input_path: Path) -> pl.DataFrame:
+    """Extract test pairs: second-to-last sale as input, last sale as ground truth."""
+    print("Loading test set...")
+    df = (
+        pl.scan_parquet(input_path)
+        .filter(
+            pl.col("Postcode").is_not_null(),
+            pl.col("historical_prices").list.len() >= 2,
+        )
+        .with_columns(
+            sector_expr(),
+            type_group_expr(),
+            # Last sale (ground truth)
+            pl.col("historical_prices")
+            .list.last()
+            .struct.field("year")
+            .alias("actual_year"),
+            pl.col("historical_prices")
+            .list.last()
+            .struct.field("month")
+            .alias("actual_month"),
+            pl.col("historical_prices")
+            .list.last()
+            .struct.field("price")
+            .alias("actual_price"),
+            # Second-to-last sale (input)
+            pl.col("historical_prices")
+            .list.get(-2)
+            .struct.field("year")
+            .alias("input_year"),
+            pl.col("historical_prices")
+            .list.get(-2)
+            .struct.field("month")
+            .alias("input_month"),
+            pl.col("historical_prices")
+            .list.get(-2)
+            .struct.field("price")
+            .alias("input_price"),
+        )
+        .with_columns(
+            (
+                pl.col("actual_year").cast(pl.Float64)
+                + (pl.col("actual_month").cast(pl.Float64) - 1.0) / 12.0
+            ).alias("actual_frac_year"),
+            (
+                pl.col("input_year").cast(pl.Float64)
+                + (pl.col("input_month").cast(pl.Float64) - 1.0) / 12.0
+            ).alias("input_frac_year"),
+        )
+        .filter(
+            pl.col("actual_year") >= TEST_YEAR_MIN,
+            pl.col("input_price") > 0,
+            pl.col("actual_price") > 0,
+            pl.col("actual_frac_year") > pl.col("input_frac_year"),
+        )
+        .collect()
+    )
+    print(f"  {len(df):,} test pairs (last sale {TEST_YEAR_MIN}-{CURRENT_YEAR})")
+    return df
+
+
+def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
+    """Index-based prediction with interpolation, capping, and seasonal adjustment."""
+    test = interpolate_log_index(
+        index, test, "sector", "type_group", "input_frac_year", "log_index_input"
+    )
+    test = interpolate_log_index(
+        index, test, "sector", "type_group", "actual_frac_year", "log_index_actual"
+    )
+
+    test = test.with_columns(
+        (
+            pl.col("input_price").cast(pl.Float64)
+            * (pl.col("log_index_actual") - pl.col("log_index_input"))
+            .clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
+            .exp()
+            * pl.col("_seasonal_adj")
+        )
+        .fill_null(pl.col("input_price").cast(pl.Float64))
+        .alias("predicted"),
+    )
+    return test
+
+
+def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
+    valid = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0) & (predicted > 0)
+    actual = actual[valid]
+    predicted = predicted[valid]
+
+    ape = np.abs(predicted - actual) / actual
+    signed_err = predicted - actual
+
+    return {
+        "MdAPE (%)": float(np.median(ape) * 100),
+        "% within 10%": float(np.mean(ape <= 0.10) * 100),
+        "% within 20%": float(np.mean(ape <= 0.20) * 100),
+        "% within 30%": float(np.mean(ape <= 0.30) * 100),
+        "MAE (£)": float(np.mean(np.abs(signed_err))),
+        "Mean signed error (£)": float(np.mean(signed_err)),
+        "n": int(len(actual)),
+    }
+
+
+def print_metrics_table(metrics_by_stage: dict):
+    stages = list(metrics_by_stage.keys())
+    col_w = 15
+    width = 25 + col_w * len(stages)
+
+    print("\n" + "=" * width)
+    print(f"BACKTEST RESULTS (holdout: sales >= {TEST_YEAR_MIN})")
+    print("=" * width)
+
+    metric_names = [
+        "MdAPE (%)",
+        "% within 10%",
+        "% within 20%",
+        "% within 30%",
+        "MAE (£)",
+        "Mean signed error (£)",
+        "n",
+    ]
+
+    header = f"{'Metric':<25s}"
+    for stage in stages:
+        header += f" {stage:>{col_w - 1}s}"
+    print(header)
+    print("-" * width)
+
+    for metric in metric_names:
+        row = f"{metric:<25s}"
+        for stage in stages:
+            val = metrics_by_stage[stage][metric]
+            if metric == "n":
+                row += f" {val:>{col_w - 1},d}"
+            elif "£" in metric:
+                row += f" {val:>{col_w - 2},.0f}"
+            else:
+                row += f" {val:>{col_w - 2}.1f}%"
+        print(row)
+
+    print("=" * width)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Backtest price estimation model")
+    parser.add_argument(
+        "--input", type=Path, required=True, help="Path to wide.parquet"
+    )
+    parser.add_argument(
+        "--output", type=Path, required=True, help="Output backtest_results.parquet"
+    )
+    args = parser.parse_args()
+
+    # Build index from pre-test data only (temporal holdout)
+    print(f"Building price index (pairs with year2 < {TEST_YEAR_MIN})...")
+    index = build_index(args.input, max_pair_year=TEST_YEAR_MIN)
+    print(
+        f"\nHoldout index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
+        f"{index['type_group'].n_unique()} type groups"
+    )
+
+    # Compute seasonal factors from pre-test data only
+    seasonal = compute_seasonal_factors(args.input, max_sale_year=TEST_YEAR_MIN)
+    months = [
+        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+    ]
+    print(
+        f"Seasonal factors: {', '.join(f'{m}={f:.3f}' for m, f in zip(months, seasonal))}"
+    )
+
+    test = extract_test_set(args.input)
+
+    # Compute seasonal adjustment for each test pair
+    input_months = test["input_month"].fill_null(6).to_numpy().astype(np.int32)
+    actual_months = test["actual_month"].fill_null(6).to_numpy().astype(np.int32)
+    seasonal_adj = seasonal[actual_months - 1] / seasonal[input_months - 1]
+    test = test.with_columns(
+        pl.Series("_seasonal_adj", seasonal_adj, dtype=pl.Float64),
+    )
+
+    print("\nPredicting with price index...")
+    test = predict(test, index)
+
+    # --- kNN ---
+    ref_fy = float(TEST_YEAR_MIN)
+    trees = build_knn_pool(args.input, index, ref_fy, max_sale_year=TEST_YEAR_MIN)
+
+    # Interpolate log_index at reference year for temporal adjustment
+    test = test.with_columns(pl.lit(ref_fy).alias("_ref_fy"))
+    test = interpolate_log_index(
+        index, test, "sector", "type_group", "_ref_fy", "_log_index_ref"
+    )
+
+    lat = test["lat"].cast(pl.Float64).to_numpy()
+    lon = test["lon"].cast(pl.Float64).to_numpy()
+    tg = test["type_group"].to_numpy()
+    fa = test["Total floor area (sqm)"].cast(pl.Float64).fill_null(0.0).to_numpy()
+
+    print("\nComputing kNN estimates...")
+    knn_psm = knn_median_psm(trees, lat, lon, tg)
+
+    # Temporal adjustment: pool PSM is at ref, adjust to actual
+    log_idx_actual = test["log_index_actual"].to_numpy().astype(np.float64)
+    log_idx_ref = test["_log_index_ref"].to_numpy().astype(np.float64)
+    temporal_adj = np.where(
+        np.isfinite(log_idx_actual) & np.isfinite(log_idx_ref),
+        np.exp(log_idx_actual - log_idx_ref),
+        1.0,
+    )
+    knn_est = knn_psm * fa * temporal_adj
+
+    n_knn = int((np.isfinite(knn_est) & (knn_est > 0)).sum())
+    print(f"  kNN estimates: {n_knn:,} of {len(test):,} ({n_knn / len(test) * 100:.1f}%)")
+
+    # Blend: (1-w)*index + w*kNN where both available
+    index_est = test["predicted"].to_numpy().astype(np.float64)
+    knn_valid = np.isfinite(knn_est) & (knn_est > 0)
+    blended = np.where(
+        knn_valid & np.isfinite(index_est),
+        (1 - KNN_BLEND_WEIGHT) * index_est + KNN_BLEND_WEIGHT * knn_est,
+        np.where(np.isfinite(index_est), index_est, knn_est),
+    )
+
+    actual = test["actual_price"].to_numpy().astype(np.float64)
+
+    metrics = {
+        "Naive": compute_metrics(
+            actual, test["input_price"].to_numpy().astype(np.float64)
+        ),
+        "Index": compute_metrics(actual, index_est),
+        "kNN": compute_metrics(actual, knn_est),
+        "Blended": compute_metrics(actual, blended),
+    }
+
+    print_metrics_table(metrics)
+
+    # Save results
+    result = test.select(
+        "Postcode",
+        "sector",
+        "input_year",
+        "input_frac_year",
+        "input_price",
+        "actual_year",
+        "actual_frac_year",
+        "actual_price",
+        "predicted",
+    ).with_columns(
+        pl.Series("knn_predicted", knn_est, dtype=pl.Float64),
+        pl.Series("blended", blended, dtype=pl.Float64),
+    )
+
+    result.write_parquet(args.output)
+    size_mb = args.output.stat().st_size / (1024 * 1024)
+    print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
+    print(f"  {len(result):,} rows")
+
+
+# Run the backtest only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/pipeline/transform/price_estimation/estimate.py
+++ b/pipeline/transform/price_estimation/estimate.py
@ -0,0 +1,204 @@
+"""Augment wide.parquet with estimated current prices.
+
+For properties with a known prior sale, applies the repeat-sales price index
+to adjust the last known price to the current date, then blends with kNN
+estimates from nearby recently-sold properties. Includes:
+- Capping extreme index adjustments
+- Seasonal month-of-sale adjustment
+- kNN spatial blending
+
+Modifies wide.parquet in-place.
+"""
+
+import argparse
+from pathlib import Path
+
+import numpy as np
+import polars as pl
+
+from pipeline.transform.price_estimation.knn import (
+    KNN_BLEND_WEIGHT,
+    build_knn_pool,
+    knn_median_psm,
+)
+from pipeline.transform.price_estimation.utils import (
+    CURRENT_FRAC_YEAR,
+    CURRENT_MONTH,
+    MAX_LOG_ADJUSTMENT,
+    compute_seasonal_factors,
+    interpolate_log_index,
+    sector_expr,
+    type_group_expr,
+)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Augment wide.parquet with estimated current prices"
+    )
+    parser.add_argument(
+        "--input",
+        type=Path,
+        required=True,
+        help="Path to wide.parquet (modified in-place)",
+    )
+    parser.add_argument(
+        "--index", type=Path, required=True, help="Path to price_index.parquet"
+    )
+    args = parser.parse_args()
+
+    print("Loading wide.parquet...")
+    df = pl.read_parquet(args.input)
+    print(f"  {len(df):,} rows, {len(df.columns)} columns")
+
+    # Drop existing estimated columns if re-running
+    for col in ["Estimated current price", "Est. price per sqm"]:
+        if col in df.columns:
+            df = df.drop(col)
+
+    # Compute seasonal factors
+    seasonal = compute_seasonal_factors(args.input)
+    months = [
+        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+    ]
+    print(
+        f"  Seasonal factors: {', '.join(f'{m}={f:.3f}' for m, f in zip(months, seasonal))}"
+    )
+
+    # Build seasonal adjustment: seasonal[current_month] / seasonal[sale_month]
+    sale_month = (
+        df["Date of last transaction"]
+        .dt.month()
+        .fill_null(6)
+        .to_numpy()
+        .astype(np.int32)
+    )
+    seasonal_adj = seasonal[CURRENT_MONTH - 1] / seasonal[sale_month - 1]
+
+    # Derive helper columns
+    df = df.with_columns(
+        sector_expr().alias("_sector"),
+        (
+            pl.col("Date of last transaction").dt.year().cast(pl.Float64)
+            + (pl.col("Date of last transaction").dt.month().cast(pl.Float64) - 1.0)
+            / 12.0
+        ).alias("_sale_frac_year"),
+        type_group_expr().alias("_type_group"),
+        pl.lit(CURRENT_FRAC_YEAR).alias("_current_frac_year"),
+        pl.Series("_seasonal_adj", seasonal_adj, dtype=pl.Float64),
+    )
+
+    index = pl.read_parquet(args.index)
+    print(
+        f"  Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
+        f"{index['type_group'].n_unique()} type groups"
+    )
+
+    print("\nApplying repeat-sales index with fractional year interpolation...")
+
+    df = interpolate_log_index(
+        index, df, "_sector", "_type_group", "_sale_frac_year", "_log_index_sale_interp"
+    )
+    df = interpolate_log_index(
+        index,
+        df,
+        "_sector",
+        "_type_group",
+        "_current_frac_year",
+        "_log_index_current_interp",
+    )
+
+    # Compute index-adjusted estimate with cap and seasonal adjustment
+    has_price = (
+        pl.col("Last known price").is_not_null()
+        & pl.col("Postcode").is_not_null()
+        & pl.col("Date of last transaction").is_not_null()
+    )
+
+    df = df.with_columns(
+        pl.when(has_price)
+        .then(
+            pl.col("Last known price").cast(pl.Float64)
+            * (
+                pl.col("_log_index_current_interp") - pl.col("_log_index_sale_interp")
+            )
+            .clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
+            .exp()
+            * pl.col("_seasonal_adj")
+        )
+        .otherwise(pl.lit(None))
+        .alias("Estimated current price"),
+    )
+
+    n_estimated = df.filter(pl.col("Estimated current price").is_not_null()).height
+    n_with_price = df.filter(has_price).height
+    print(
+        f"  {n_estimated:,} of {n_with_price:,} properties estimated "
+        f"({n_estimated / max(n_with_price, 1) * 100:.1f}%)"
+    )
+
+    # --- kNN blending ---
+    print("\nBuilding kNN estimates...")
+    trees = build_knn_pool(args.input, index, CURRENT_FRAC_YEAR)
+
+    lat = df["lat"].cast(pl.Float64).to_numpy()
+    lon = df["lon"].cast(pl.Float64).to_numpy()
+    tg = df["_type_group"].fill_null("").to_numpy()
+    fa = df["Total floor area (sqm)"].cast(pl.Float64).fill_null(0.0).to_numpy()
+
+    knn_psm = knn_median_psm(trees, lat, lon, tg)
+    knn_est = knn_psm * fa  # No temporal adj: ref == current
+
+    df = df.with_columns(
+        pl.Series("_knn_est", knn_est, dtype=pl.Float64),
+    )
+
+    # Blend: where kNN available, use weighted average; else keep index
+    df = df.with_columns(
+        pl.when(
+            pl.col("Estimated current price").is_not_null()
+            & pl.col("_knn_est").is_not_null()
+            & pl.col("_knn_est").is_finite()
+            & (pl.col("_knn_est") > 0)
+        )
+        .then(
+            (1 - KNN_BLEND_WEIGHT) * pl.col("Estimated current price")
+            + KNN_BLEND_WEIGHT * pl.col("_knn_est")
+        )
+        .when(pl.col("Estimated current price").is_not_null())
+        .then(pl.col("Estimated current price"))
+        .otherwise(pl.lit(None))
+        .alias("Estimated current price"),
+    )
+
+    n_blended = df.filter(
+        pl.col("_knn_est").is_not_null()
+        & pl.col("_knn_est").is_finite()
+        & (pl.col("_knn_est") > 0)
+        & pl.col("Estimated current price").is_not_null()
+    ).height
+    print(f"  kNN blended: {n_blended:,} of {n_estimated:,} estimates")
+
+    # Derive estimated price per sqm where both estimated price and floor area exist
+    df = df.with_columns(
+        (pl.col("Estimated current price") / pl.col("Total floor area (sqm)"))
+        .round(0)
+        .cast(pl.Int32, strict=False)
+        .alias("Est. price per sqm"),
+    )
+
+    # Drop all temporary columns
+    temp_cols = [c for c in df.columns if c.startswith("_") or c.startswith("log_idx_")]
+    df = df.drop(temp_cols)
+
+    df.write_parquet(args.input)
+    size_mb = args.input.stat().st_size / (1024 * 1024)
+    print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
+    print(
+        f"  {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
+    )
+
+
+# Run the estimation only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/pipeline/transform/price_estimation/index.py
+++ b/pipeline/transform/price_estimation/index.py
@ -0,0 +1,465 @@
+"""Hierarchical repeat-sales price index.
+
+Stratified by property type and postcode sector, with IRLS Huber regression,
+hierarchical shrinkage (sector → district → area → national → hedonic),
+and KD-tree spatial smoothing for sparse sectors.
+
+Output: price_index.parquet — sector x type_group x year -> log_index
+"""
+
+import argparse
+from pathlib import Path
+
+import numpy as np
+import polars as pl
+from scipy.sparse import csc_matrix
+from scipy.sparse.linalg import lsqr
+from tqdm import tqdm
+
+from pipeline.transform.price_estimation.shrinkage import (
+    blend_dicts,
+    hierarchical_shrinkage,
+    shrink_dicts,
+    spatial_smooth,
+)
+from pipeline.transform.price_estimation.utils import (
+    CURRENT_YEAR,
+    TYPE_GROUPS,
+    build_hedonic_features,
+    extract_centroids,
+    hierarchy_keys,
+    sector_expr,
+    type_group_expr,
+)
+
+# Fewest repeat-sale pairs for which solve_robust_index() will fit an index;
+# build_index() also skips a whole type group that has fewer pairs than this.
+MIN_PAIRS = 5
+# Largest |log(price2 / price1)| that extract_pairs() keeps.
+OUTLIER_THRESHOLD = 3.0  # hard pre-filter; Huber handles the rest
+# Absolute residual above which solve_robust_index() down-weights a pair by
+# HUBER_K / |residual|.
+HUBER_K = 1.345
+# Number of solve-and-reweight passes in solve_robust_index().
+IRLS_ITERATIONS = 5
+
+
+def extract_pairs(input_path: Path, max_year2: int | None = None) -> pl.DataFrame:
+    """Extract consecutive repeat-sale pairs.
+
+    If max_year2 is set, only pairs where year2 < max_year2 are included
+    (for temporal holdout in backtesting).
+    """
+    print("Extracting repeat-sale pairs...")
+    df = (
+        pl.scan_parquet(input_path)
+        .select("Postcode", "historical_prices", "Property type")
+        .filter(
+            pl.col("Postcode").is_not_null(),
+            pl.col("historical_prices").list.len() >= 2,
+        )
+        .with_columns(sector_expr(), type_group_expr())
+        .collect()
+    )
+    print(f"  {len(df):,} properties with 2+ transactions")
+
+    pairs = (
+        df.lazy()
+        .with_columns(
+            pl.col("historical_prices")
+            .list.slice(0, pl.col("historical_prices").list.len() - 1)
+            .alias("from_txn"),
+            pl.col("historical_prices").list.slice(1).alias("to_txn"),
+        )
+        .explode("from_txn", "to_txn")
+        .with_columns(
+            pl.col("from_txn").struct.field("year").alias("year1"),
+            pl.col("from_txn").struct.field("month").alias("month1"),
+            pl.col("from_txn").struct.field("price").alias("price1"),
+            pl.col("to_txn").struct.field("year").alias("year2"),
+            pl.col("to_txn").struct.field("month").alias("month2"),
+            pl.col("to_txn").struct.field("price").alias("price2"),
+        )
+        .with_columns(
+            (
+                pl.col("year1").cast(pl.Float64)
+                + (pl.col("month1").cast(pl.Float64) - 1.0) / 12.0
+            ).alias("frac_year1"),
+            (
+                pl.col("year2").cast(pl.Float64)
+                + (pl.col("month2").cast(pl.Float64) - 1.0) / 12.0
+            ).alias("frac_year2"),
+        )
+        .select(
+            "sector",
+            "type_group",
+            "year1",
+            "price1",
+            "year2",
+            "price2",
+            "frac_year1",
+            "frac_year2",
+        )
+        .filter(
+            pl.col("price1") > 0,
+            pl.col("price2") > 0,
+            pl.col("frac_year2") > pl.col("frac_year1"),
+        )
+        .with_columns(
+            (pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64))
+            .log()
+            .alias("log_ratio"),
+            (
+                1.0
+                / (pl.col("frac_year2") - pl.col("frac_year1"))
+                .cast(pl.Float64)
+                .sqrt()
+            ).alias("weight"),
+        )
+        .filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
+        .collect()
+    )
+
+    if max_year2 is not None:
+        pairs = pairs.filter(pl.col("year2") < max_year2)
+
+    # Add hierarchy columns
+    pairs = pairs.with_columns(
+        pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
+    ).with_columns(
+        pl.col("district").str.replace(r"\d.*$", "").alias("area"),
+    )
+
+    print(f"  {len(pairs):,} pairs extracted")
+    return pairs
+
+
+def solve_robust_index(
+    years1: np.ndarray,
+    years2: np.ndarray,
+    log_ratios: np.ndarray,
+    base_weights: np.ndarray,
+) -> dict[int, float]:
+    """IRLS Huber M-estimation for the Case-Shiller repeat-sales model."""
+    n = len(years1)
+    if n < MIN_PAIRS:
+        return {}
+
+    all_years = np.union1d(years1, years2)
+    min_year = int(all_years.min())
+
+    col = 0
+    year_to_col = {}
+    for y in all_years:
+        iy = int(y)
+        if iy != min_year:
+            year_to_col[iy] = col
+            col += 1
+    n_cols = len(year_to_col)
+    if n_cols == 0:
+        return {}
+
+    # Vectorized column index mapping
+    col2 = np.full(n, -1, dtype=np.int32)
+    col1 = np.full(n, -1, dtype=np.int32)
+    for year, c in year_to_col.items():
+        col2[years2 == year] = c
+        col1[years1 == year] = c
+
+    # Sparse matrix structure (fixed across iterations)
+    mask2 = col2 >= 0
+    mask1 = col1 >= 0
+    rows_arr = np.concatenate([np.where(mask2)[0], np.where(mask1)[0]])
+    cols_arr = np.concatenate([col2[mask2], col1[mask1]])
+    signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
+
+    weights = base_weights.copy()
+
+    for _ in range(IRLS_ITERATIONS):
+        data = signs_arr * weights[rows_arr]
+        A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
+        b = log_ratios * weights
+        betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]
+
+        # Residuals
+        predicted = np.zeros(n)
+        predicted[mask2] += betas[col2[mask2]]
+        predicted[mask1] -= betas[col1[mask1]]
+        residuals = log_ratios - predicted
+
+        # Huber reweighting
+        abs_r = np.abs(residuals)
+        huber_w = np.where(abs_r <= HUBER_K, 1.0, HUBER_K / np.maximum(abs_r, 1e-10))
+        weights = base_weights * huber_w
+
+    index = {min_year: 0.0}
+    for year, c in year_to_col.items():
+        index[year] = float(betas[c])
+    return index
+
+
+def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
+    """Solve robust indices for each group. Returns (indices, n_pairs) dicts."""
+    groups = pairs.group_by(group_col).agg(
+        pl.col("year1"),
+        pl.col("year2"),
+        pl.col("log_ratio"),
+        pl.col("weight"),
+    )
+    indices = {}
+    n_pairs = {}
+    for row in tqdm(
+        groups.iter_rows(named=True), total=len(groups), desc=f"    {group_col}"
+    ):
+        key = row[group_col]
+        y1 = np.array(row["year1"], dtype=np.int32)
+        y2 = np.array(row["year2"], dtype=np.int32)
+        lr = np.array(row["log_ratio"], dtype=np.float64)
+        w = np.array(row["weight"], dtype=np.float64)
+        idx = solve_robust_index(y1, y2, lr, w)
+        if idx:
+            indices[key] = idx
+            n_pairs[key] = len(y1)
+    return indices, n_pairs
+
+
+def compute_hedonic_index(
+    input_path: Path,
+    min_year: int,
+    max_year: int,
+    max_sale_year: int | None = None,
+) -> dict[int, float]:
+    """Quality-adjusted hedonic index: regress log(price) on features, average residual by year.
+
+    Used as the ultimate shrinkage fallback for the repeat-sales index.
+    If max_sale_year is set, only sales before that year are used (backtesting holdout).
+    """
+    effective_max = max_sale_year - 1 if max_sale_year is not None else max_year
+    print("Computing hedonic index...")
+    df = (
+        pl.scan_parquet(input_path)
+        .select(
+            "Last known price",
+            "Date of last transaction",
+            "Property type",
+            "Total floor area (sqm)",
+        )
+        .filter(
+            pl.col("Last known price").is_not_null(),
+            pl.col("Total floor area (sqm)").is_not_null(),
+            pl.col("Total floor area (sqm)") > 0,
+        )
+        .with_columns(
+            pl.col("Date of last transaction").dt.year().alias("sale_year"),
+            type_group_expr(),
+        )
+        .filter(
+            pl.col("type_group").is_not_null(),
+            pl.col("sale_year").is_not_null(),
+            pl.col("sale_year") >= min_year,
+            pl.col("sale_year") <= effective_max,
+        )
+        .collect()
+    )
+    print(f"  {len(df):,} complete cases for hedonic model")
+
+    # Target
+    log_price = np.log(df["Last known price"].to_numpy().astype(np.float64))
+    sale_years = df["sale_year"].to_numpy()
+
+    # Build feature matrix (5 hedonic features + intercept)
+    X = build_hedonic_features(df)
+    F = np.hstack([X, np.ones((len(df), 1), dtype=np.float32)])
+    print(f"  Feature matrix: {F.shape[0]:,} x {F.shape[1]}")
+
+    # Step 1: regress log(price) on features -> quality score
+    betas = np.linalg.lstsq(F.astype(np.float64), log_price, rcond=None)[0]
+    quality_score = F.astype(np.float64) @ betas
+    residuals = log_price - quality_score
+
+    # Step 2: average residual by year = hedonic index
+    hedonic = {}
+    for y in range(min_year, max_year + 1):
+        mask = sale_years == y
+        if mask.sum() > 0:
+            hedonic[y] = float(np.mean(residuals[mask]))
+
+    # Normalize: min_year = 0
+    base = hedonic.get(min_year, 0.0)
+    for y in hedonic:
+        hedonic[y] -= base
+
+    print(
+        f"  Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}"
+    )
+    return hedonic
+
+
+# forward_fill() fits its extrapolation slope to the known index points whose
+# year is no more than this many years before the last known year.
+EXTRAPOLATION_YEARS = 3
+
+
+def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
+    """Forward-fill missing years, with linear extrapolation beyond last known year."""
+    if not index:
+        return {y: 0.0 for y in range(min_year, max_year + 1)}
+
+    sorted_years = sorted(index.keys())
+    last_known_year = sorted_years[-1]
+
+    # Forward fill up to last known year
+    filled = {}
+    last = 0.0
+    for y in range(min_year, last_known_year + 1):
+        if y in index:
+            last = index[y]
+        filled[y] = last
+
+    # Linear extrapolation beyond last known year
+    if last_known_year < max_year:
+        recent = [
+            (y, index[y])
+            for y in sorted_years
+            if y >= last_known_year - EXTRAPOLATION_YEARS
+        ]
+        if len(recent) >= 2:
+            years_arr = np.array([r[0] for r in recent], dtype=np.float64)
+            vals_arr = np.array([r[1] for r in recent], dtype=np.float64)
+            slope = np.polyfit(years_arr, vals_arr, 1)[0]
+            for y in range(last_known_year + 1, max_year + 1):
+                filled[y] = index[last_known_year] + slope * (y - last_known_year)
+        else:
+            for y in range(last_known_year + 1, max_year + 1):
+                filled[y] = index[last_known_year]
+
+    return filled
+
+
+def build_index(input_path: Path, max_pair_year: int | None = None) -> pl.DataFrame:
+    """Build the full price index from raw data.
+
+    If max_pair_year is set, only pairs before that year are used (backtesting holdout).
+    The index is still forward-filled to CURRENT_YEAR.
+    """
+    pairs = extract_pairs(input_path, max_year2=max_pair_year)
+    centroids = extract_centroids(input_path)
+
+    min_year = int(pairs["year1"].min())
+    max_year = CURRENT_YEAR
+
+    hedonic_idx = compute_hedonic_index(
+        input_path, min_year, max_year, max_sale_year=max_pair_year
+    )
+
+    # Precompute hierarchy
+    all_sectors = pairs["sector"].unique().to_list()
+    sector_to_dist = {}
+    dist_to_area = {}
+    for s in all_sectors:
+        d, a = hierarchy_keys(s)
+        sector_to_dist[s] = d
+        dist_to_area[d] = a
+
+    # Process each type group + "All"
+    all_type_groups = ["All"] + TYPE_GROUPS
+    final = {}  # {type_group: {sector: {year: log_index}}}
+    final_n = {}  # {type_group: {sector: n_pairs}}
+
+    for tg in all_type_groups:
+        print(f"\n--- {tg} ---")
+        typed = pairs if tg == "All" else pairs.filter(pl.col("type_group") == tg)
+        if len(typed) < MIN_PAIRS:
+            print(f"  Skipping (only {len(typed)} pairs)")
+            final[tg] = {s: dict(hedonic_idx) for s in all_sectors}
+            final_n[tg] = {s: 0 for s in all_sectors}
+            continue
+
+        print(f"  {len(typed):,} pairs")
+
+        # National
+        np_arrs = typed.select("year1", "year2", "log_ratio", "weight")
+        national_idx = solve_robust_index(
+            np_arrs["year1"].to_numpy(),
+            np_arrs["year2"].to_numpy(),
+            np_arrs["log_ratio"].to_numpy(),
+            np_arrs["weight"].to_numpy(),
+        )
+        national_n = len(typed)
+        print(f"  National: {len(national_idx)} years")
+
+        # Area, district, sector
+        print("  Computing per-level indices:")
+        area_idx, area_n = compute_indices_for_level(typed, "area")
+        district_idx, district_n = compute_indices_for_level(typed, "district")
+        sector_idx, sector_n = compute_indices_for_level(typed, "sector")
+        print(
+            f"  {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
+        )
+
+        # Shrinkage: national -> hedonic first, then hierarchical
+        print("  Applying shrinkage...")
+        national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
+        sector_shrunk = hierarchical_shrinkage(
+            sector_idx,
+            sector_n,
+            district_idx,
+            district_n,
+            area_idx,
+            area_n,
+            national_shrunk,
+            all_sectors,
+            sector_to_dist,
+            dist_to_area,
+            shrink_dicts,
+        )
+
+        # Spatial smoothing
+        print("  Spatial smoothing...")
+        sector_smoothed = spatial_smooth(
+            sector_shrunk, centroids, sector_n, blend_dicts
+        )
+
+        # Forward fill
+        for sec in all_sectors:
+            sector_smoothed[sec] = forward_fill(
+                sector_smoothed.get(sec, hedonic_idx), min_year, max_year
+            )
+
+        final[tg] = sector_smoothed
+        final_n[tg] = sector_n
+
+    # Assemble output
+    print("\nAssembling output...")
+    rows = []
+    for tg in all_type_groups:
+        for sec in all_sectors:
+            n = final_n[tg].get(sec, 0)
+            for year, log_idx in final[tg][sec].items():
+                rows.append((sec, tg, year, log_idx, n))
+
+    return pl.DataFrame(
+        rows,
+        schema={
+            "sector": pl.String,
+            "type_group": pl.String,
+            "year": pl.Int32,
+            "log_index": pl.Float64,
+            "n_pairs": pl.Int64,
+        },
+        orient="row",
+    ).sort("type_group", "sector", "year")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Build improved repeat-sales price index"
+    )
+    parser.add_argument("--input", type=Path, required=True)
+    parser.add_argument("--output", type=Path, required=True)
+    args = parser.parse_args()
+
+    result = build_index(args.input)
+
+    result.write_parquet(args.output)
+    size_mb = args.output.stat().st_size / (1024 * 1024)
+    print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
+    print(
+        f"  {result['sector'].n_unique():,} sectors x {result['type_group'].n_unique()} types x {result['year'].n_unique()} years = {len(result):,} rows"
+    )
+
+
+# Build the index only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/pipeline/transform/price_estimation/knn.py
+++ b/pipeline/transform/price_estimation/knn.py
@ -0,0 +1,161 @@
+"""kNN price estimation using nearby recently-sold properties.
+
+For each target property, finds k nearest sold properties of the same type,
+computes the median index-adjusted price-per-sqm, and multiplies by the
+target's floor area to produce an estimate.
+"""
+
+from pathlib import Path
+
+import numpy as np
+import polars as pl
+from scipy.spatial import KDTree
+
+from pipeline.transform.price_estimation.utils import (
+    TYPE_GROUPS,
+    interpolate_log_index,
+    sector_expr,
+    type_group_expr,
+)
+
+# Default number of nearest neighbours queried per target (default `k` of knn_median_psm).
+KNN_K = 20
+# Minimum pool size for a type_group to get a KD-tree, and minimum usable k at query time.
+KNN_MIN_NEIGHBORS = 5
+# NOTE(review): not used in this module; presumably the weight of the kNN estimate
+# when it is blended with the index-based estimate — confirm at the import site.
+KNN_BLEND_WEIGHT = 0.35
+
+
+def _scale_coords(lat: np.ndarray, lon: np.ndarray) -> np.ndarray:
+    """Map lat/lon in degrees to planar points for approximate distance queries.
+
+    Latitude is kept as-is; each longitude is multiplied by the cosine of its
+    own latitude. Returns an array with one [lat, scaled_lon] row per input.
+    """
+    lon_shrink = np.cos(np.radians(lat))
+    scaled_lon = lon * lon_shrink
+    return np.column_stack([lat, scaled_lon])
+
+
+def build_knn_pool(
+    input_path: Path,
+    index: pl.DataFrame,
+    ref_frac_year: float,
+    max_sale_year: int | None = None,
+) -> dict[str, tuple[KDTree, np.ndarray]]:
+    """Build one KD-tree per type_group over index-adjusted price-per-sqm.
+
+    Every pool property's last sale price is moved to `ref_frac_year` with
+    the price index, divided by floor area, and stored alongside a KD-tree of
+    the scaled coordinates of the same rows.
+
+    Args:
+        input_path: Parquet file holding the property table.
+        index: Price index table, as consumed by `interpolate_log_index`.
+        ref_frac_year: Fractional year all prices are adjusted to.
+        max_sale_year: When given, only sales strictly before this year are kept.
+
+    Returns:
+        Mapping type_group -> (KDTree over scaled lat/lon, adjusted_psm array).
+        Type groups with fewer than KNN_MIN_NEIGHBORS usable rows are omitted.
+    """
+    print("Building kNN pool...")
+    floor_area = pl.col("Total floor area (sqm)")
+    last_price = pl.col("Last known price")
+    sale_date = pl.col("Date of last transaction")
+
+    lazy_pool = (
+        pl.scan_parquet(input_path)
+        .select(
+            "Postcode",
+            "Property type",
+            "lat",
+            "lon",
+            "Total floor area (sqm)",
+            "Last known price",
+            "Date of last transaction",
+        )
+        .filter(
+            pl.col("lat").is_not_null(),
+            pl.col("lon").is_not_null(),
+            floor_area.is_not_null(),
+            floor_area > 0,
+            last_price.is_not_null(),
+            last_price > 0,
+            pl.col("Postcode").is_not_null(),
+            sale_date.is_not_null(),
+        )
+    )
+    if max_sale_year is not None:
+        lazy_pool = lazy_pool.filter(sale_date.dt.year() < max_sale_year)
+
+    # Sale date as a fractional year at month resolution (January -> .0).
+    sale_frac_year = (
+        sale_date.dt.year().cast(pl.Float64)
+        + (sale_date.dt.month().cast(pl.Float64) - 1.0) / 12.0
+    )
+    pool = lazy_pool.with_columns(
+        sector_expr(),
+        type_group_expr(),
+        sale_frac_year.alias("_sale_fy"),
+        pl.lit(ref_frac_year).alias("_ref_fy"),
+    ).collect()
+    pool = pool.filter(pl.col("type_group").is_not_null())
+    print(f"  {len(pool):,} pool properties with lat/lon, floor area, price")
+
+    # Log index at the sale date and at the reference date.
+    for frac_col, out_col in (("_sale_fy", "_li_sale"), ("_ref_fy", "_li_ref")):
+        pool = interpolate_log_index(
+            index, pool, "sector", "type_group", frac_col, out_col
+        )
+
+    # adjusted_psm = price / floor_area * exp(log_index_ref - log_index_sale)
+    index_ratio = (pl.col("_li_ref") - pl.col("_li_sale")).exp()
+    adjusted_psm = (
+        last_price.cast(pl.Float64) / floor_area.cast(pl.Float64) * index_ratio
+    )
+    pool = pool.with_columns(adjusted_psm.alias("_adj_psm")).filter(
+        pl.col("_adj_psm").is_not_null(),
+        pl.col("_adj_psm").is_finite(),
+        pl.col("_adj_psm") > 0,
+    )
+    print(f"  {len(pool):,} after index adjustment")
+
+    # One KD-tree per type group; groups that are too small are left out.
+    trees: dict[str, tuple[KDTree, np.ndarray]] = {}
+    for tg in TYPE_GROUPS:
+        members = pool.filter(pl.col("type_group") == tg)
+        n = len(members)
+        if n < KNN_MIN_NEIGHBORS:
+            continue
+        lat, lon, psm = (
+            members[col].to_numpy().astype(np.float64)
+            for col in ("lat", "lon", "_adj_psm")
+        )
+        trees[tg] = (KDTree(_scale_coords(lat, lon)), psm)
+        print(f"    {tg}: {n:,}")
+
+    return trees
+
+
+def knn_median_psm(
+    trees: dict[str, tuple[KDTree, np.ndarray]],
+    lat: np.ndarray,
+    lon: np.ndarray,
+    type_groups: np.ndarray,
+    k: int = KNN_K,
+) -> np.ndarray:
+    """Median adjusted price-per-sqm of the k nearest pool properties per target.
+
+    Targets are matched only against the tree of their own type_group. The
+    values are at the reference date the pool was built with.
+
+    Returns:
+        One value per target; NaN where the target has non-finite coordinates,
+        its type_group has no tree, or fewer than KNN_MIN_NEIGHBORS neighbours
+        can be queried.
+    """
+    medians = np.full(len(lat), np.nan)
+
+    for tg, (tree, psm) in trees.items():
+        usable = (type_groups == tg) & np.isfinite(lat) & np.isfinite(lon)
+        rows = np.where(usable)[0]
+        # Never ask for more neighbours than the pool holds.
+        n_neighbors = min(k, len(psm))
+        if len(rows) == 0 or n_neighbors < KNN_MIN_NEIGHBORS:
+            continue
+
+        targets = _scale_coords(lat[rows], lon[rows])
+        _, neighbor_idx = tree.query(targets, k=n_neighbors)
+        if neighbor_idx.ndim == 1:
+            # k == 1 yields a flat index array; restore the neighbour axis.
+            neighbor_idx = neighbor_idx.reshape(-1, 1)
+
+        medians[rows] = np.nanmedian(psm[neighbor_idx], axis=1)
+
+    return medians
--- a/pipeline/transform/price_estimation/shrinkage.py
+++ b/pipeline/transform/price_estimation/shrinkage.py
@ -0,0 +1,140 @@
+"""Hierarchical shrinkage and spatial smoothing for sector-level estimates."""
+
+from typing import Callable, TypeVar
+
+import numpy as np
+from scipy.spatial import KDTree
+
+from pipeline.transform.price_estimation.utils import SHRINKAGE_K
+
+# Generic value type carried through shrinkage/smoothing: whatever the
+# caller-supplied shrink_fn / blend_fn operate on.
+V = TypeVar("V")
+
+# Number of nearest sectors blended into each sector by spatial_smooth.
+SPATIAL_NEIGHBORS = 5
+# Prior strength for spatial smoothing: a sector with n observations keeps
+# weight n / (n + SPATIAL_BLEND_K) on its own value.
+SPATIAL_BLEND_K = 30
+
+
+def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
+    """Blend `raw` toward `parent` key by key with weight n / (n + SHRINKAGE_K).
+
+    The result covers the union of both key sets. A key present on only one
+    side uses that side's value for both terms, so it passes through unchanged
+    (up to rounding).
+    """
+    raw_weight = n / (n + SHRINKAGE_K)
+    shrunk = {}
+    for key in set(raw) | set(parent):
+        # Every key in the union exists on at least one side, so the missing
+        # side can always borrow from the other.
+        raw_term = raw[key] if key in raw else parent[key]
+        parent_term = parent[key] if key in parent else raw[key]
+        shrunk[key] = raw_weight * raw_term + (1 - raw_weight) * parent_term
+    return shrunk
+
+
+def hierarchical_shrinkage(
+    sector_vals: dict[str, V],
+    sector_n: dict[str, int],
+    district_vals: dict[str, V],
+    district_n: dict[str, int],
+    area_vals: dict[str, V],
+    area_n: dict[str, int],
+    top_level: V,
+    all_sectors: list[str],
+    sector_to_dist: dict[str, str],
+    dist_to_area: dict[str, str],
+    shrink_fn: Callable[[V, V, int], V],
+) -> dict[str, V]:
+    """Top-down hierarchical shrinkage: area->top, district->area, sector->district.
+
+    Args:
+        sector_vals, district_vals, area_vals: Raw estimates per level.
+        sector_n, district_n, area_n: Observation counts; must contain every
+            key of the matching `*_vals` dict (a missing key raises KeyError).
+        top_level: Ultimate fallback value (e.g. national shrunk toward
+            hedonic, or just national).
+        all_sectors: Every sector that must appear in the result.
+        sector_to_dist, dist_to_area: Hierarchy lookups.
+        shrink_fn: `shrink_fn(raw, parent, n)` blends raw toward parent.
+
+    Returns:
+        Shrunk value for every sector in `sector_vals` and `all_sectors`.
+        Sectors without their own value inherit the nearest available
+        ancestor (district, then area, then `top_level`).
+    """
+    # Area -> top level
+    area_shrunk = {
+        area: shrink_fn(val, top_level, area_n[area])
+        for area, val in area_vals.items()
+    }
+
+    # District -> area (top level when the area has no estimate)
+    district_shrunk = {}
+    for dist, val in district_vals.items():
+        parent = area_shrunk.get(dist_to_area.get(dist, ""), top_level)
+        district_shrunk[dist] = shrink_fn(val, parent, district_n[dist])
+
+    def _nearest_ancestor(sec: str) -> V:
+        """Return the sector's district estimate, else its area's, else top_level."""
+        d = sector_to_dist.get(sec, "")
+        if d in district_shrunk:
+            return district_shrunk[d]
+        return area_shrunk.get(dist_to_area.get(d, ""), top_level)
+
+    # Sector -> nearest ancestor. Using the same district -> area -> top chain
+    # as the fill step below keeps a sector whose district has no estimate
+    # from skipping its area and being pulled straight to the top level.
+    sector_shrunk = {
+        sec: shrink_fn(val, _nearest_ancestor(sec), sector_n[sec])
+        for sec, val in sector_vals.items()
+    }
+
+    # Fill sectors without their own values
+    for sec in all_sectors:
+        if sec not in sector_shrunk:
+            sector_shrunk[sec] = _nearest_ancestor(sec)
+
+    return sector_shrunk
+
+
+def spatial_smooth(
+    sector_values: dict[str, V],
+    centroids: dict[str, tuple[float, float]],
+    counts: dict[str, int],
+    blend_fn: Callable[[V, list[V], float, list[float]], V],
+) -> dict[str, V]:
+    """Blend sparse sector values with their nearest neighbouring sectors.
+
+    A sector with n observations keeps weight n / (n + SPATIAL_BLEND_K); the
+    remainder is split across up to SPATIAL_NEIGHBORS nearest sectors in
+    proportion to inverse distance. Sectors whose own weight exceeds 0.95, or
+    that have no centroid, are returned unchanged. Neighbour values are always
+    read from the input, never from already-smoothed results.
+
+    Returns `sector_values` itself when too few sectors have centroids,
+    otherwise a new dict.
+    """
+    located = [s for s in sector_values if s in centroids]
+    if len(located) <= SPATIAL_NEIGHBORS:
+        return sector_values
+
+    latlon = np.array([centroids[s] for s in located])
+    # Scale longitude by cos(mean_lat) for approximate Euclidean distance
+    lon_scale = np.cos(np.radians(np.mean(latlon[:, 0])))
+    points = np.column_stack([latlon[:, 0], latlon[:, 1] * lon_scale])
+    tree = KDTree(points)
+
+    smoothed = dict(sector_values)
+    for sec, point in zip(located, points):
+        n = counts.get(sec, 0)
+        self_w = n / (n + SPATIAL_BLEND_K)
+        if self_w > 0.95:
+            continue  # enough data, skip smoothing
+
+        dists, idxs = tree.query(point, k=SPATIAL_NEIGHBORS + 1)
+        # The first hit is the query point itself (distance ~0). Any other
+        # zero-distance hit is dropped to avoid dividing by zero.
+        weighted = [
+            (1.0 / d, sector_values[located[j]])
+            for d, j in zip(dists[1:], idxs[1:])
+            if d > 0 and located[j] in sector_values
+        ]
+        if not weighted:
+            continue
+
+        inv_dists = [inv for inv, _ in weighted]
+        neighbor_vals = [val for _, val in weighted]
+        total_inv = sum(inv_dists)
+        nbr_w = 1.0 - self_w
+        neighbor_ws = [inv / total_inv * nbr_w for inv in inv_dists]
+
+        smoothed[sec] = blend_fn(sector_values[sec], neighbor_vals, self_w, neighbor_ws)
+
+    return smoothed
+
+
+def blend_dicts(
+    self_val: dict, neighbor_vals: list[dict], self_w: float, neighbor_ws: list[float]
+) -> dict:
+    """Blend dict values by weighted sum across all keys.
+
+    Args:
+        self_val: The sector's own values.
+        neighbor_vals: One dict per neighbour.
+        self_w: Weight of `self_val`.
+        neighbor_ws: Weight of each neighbour, aligned with `neighbor_vals`.
+
+    Returns:
+        Dict over the union of all keys. When every weighted source has a key,
+        the value is the plain weighted sum. When some sources lack the key,
+        their weight is redistributed proportionally over the sources that do
+        have it, so a gap is not treated as a value of 0.0 that drags the
+        blend toward zero. If the sources that have the key all carry zero
+        weight, their unweighted mean (times the total weight) is used.
+    """
+    sources = [(self_val, self_w), *zip(neighbor_vals, neighbor_ws)]
+    total_w = sum(w for _, w in sources)
+
+    all_keys: set = set(self_val)
+    for nv in neighbor_vals:
+        all_keys |= set(nv)
+
+    result = {}
+    for k in all_keys:
+        present = [(src[k], w) for src, w in sources if k in src]
+        val = 0.0
+        present_w = 0.0
+        for v, w in present:
+            val += w * v
+            present_w += w
+        if len(present) < len(sources):
+            # Some source has no value for this key: rescale so the sources
+            # that do have one carry the full weight between them.
+            if present_w > 0:
+                val *= total_w / present_w
+            elif present:
+                val = total_w * sum(v for v, _ in present) / len(present)
+        result[k] = val
+    return result
--- a/pipeline/transform/price_estimation/utils.py
+++ b/pipeline/transform/price_estimation/utils.py
@ -0,0 +1,233 @@
+"""Shared utilities for price estimation modules."""
+
+from datetime import date
+from pathlib import Path
+
+import numpy as np
+import polars as pl
+
+# Snapshot of "now", taken once at import time so the three date constants
+# below always agree with each other.
+_today = date.today()
+# Calendar year of the run. Taken from the same snapshot as CURRENT_FRAC_YEAR
+# so the two cannot drift apart when the year rolls over.
+CURRENT_YEAR = _today.year
+# Today as a fractional year at month resolution (January -> .0, December -> 11/12).
+CURRENT_FRAC_YEAR = _today.year + (_today.month - 1) / 12
+CURRENT_MONTH = _today.month
+
+# Cap on log(index_ratio) to prevent wild estimates from thin sectors
+MAX_LOG_ADJUSTMENT = 3.0  # ~20x max price change
+# Raw "Property type" values that type_group_expr maps to the "Terraced" group.
+TERRACE_TYPES = [
+    "Mid-Terrace",
+    "End-Terrace",
+    "Enclosed Mid-Terrace",
+    "Enclosed End-Terrace",
+    "Terraced",
+]
+# Raw "Property type" values that type_group_expr maps to the "Flats" group.
+FLAT_TYPES = ["Flats/Maisonettes", "Flat", "Maisonette"]
+# Every type_group value that type_group_expr can produce.
+TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats", "Bungalow"]
+# Prior strength for n / (n + k) shrinkage weights.
+SHRINKAGE_K = 50
+
+
+def type_group_expr():
+    """Polars expression mapping "Property type" to a coarse `type_group`.
+
+    Terrace variants become "Terraced", flat/maisonette variants become
+    "Flats", "Bungalow", "Detached" and "Semi-Detached" pass through, and
+    anything else becomes null.
+    """
+    prop_type = pl.col("Property type")
+    grouped = (
+        pl.when(prop_type.is_in(TERRACE_TYPES))
+        .then(pl.lit("Terraced"))
+        .when(prop_type.is_in(FLAT_TYPES))
+        .then(pl.lit("Flats"))
+        .when(prop_type == "Bungalow")
+        .then(pl.lit("Bungalow"))
+        .when(prop_type.is_in(["Detached", "Semi-Detached"]))
+        .then(prop_type)
+        .otherwise(pl.lit(None))
+    )
+    return grouped.alias("type_group")
+
+
+def sector_expr():
+    """Polars expression deriving `sector` from "Postcode".
+
+    Drops the last two characters of the postcode and strips surrounding
+    whitespace from what remains.
+    """
+    postcode = pl.col("Postcode")
+    kept_length = postcode.str.len_chars() - 2
+    sector = postcode.str.slice(0, kept_length).str.strip_chars()
+    return sector.alias("sector")
+
+
+def hierarchy_keys(sector: str) -> tuple[str, str]:
+    """Return (district, area) for a sector string.
+
+    The district is everything before the last space (the whole string when
+    there is no space); the area is the district's leading run of letters.
+    """
+    head, sep, _ = sector.rpartition(" ")
+    district = head if sep else sector
+
+    # Position of the first non-letter marks the end of the area prefix.
+    area_len = next(
+        (pos for pos, ch in enumerate(district) if not ch.isalpha()),
+        len(district),
+    )
+    return district, district[:area_len]
+
+
+# Type groups that get their own dummy column in build_hedonic_features;
+# "Detached" is left out as the reference category.
+NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats", "Bungalow"]
+
+
+def build_hedonic_features(df: pl.DataFrame) -> np.ndarray:
+    """Build the hedonic feature matrix, one row per row of `df`.
+
+    Columns: log(floor area), with the area clamped to at least 1.0 before the
+    log, followed by one 0/1 dummy per entry of NON_REF_TYPES (Detached is the
+    reference category and has no column).
+    """
+    floor_area = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
+    type_group = df["type_group"].to_numpy()
+
+    columns = [np.log(np.maximum(floor_area, 1.0)).reshape(-1, 1)]
+    columns.extend(
+        (type_group == t).astype(np.float32).reshape(-1, 1) for t in NON_REF_TYPES
+    )
+    return np.hstack(columns)
+
+
+def interpolate_log_index(
+    index: pl.DataFrame,
+    df: pl.DataFrame,
+    sector_col: str,
+    type_col: str,
+    frac_year_col: str,
+    output_alias: str,
+) -> pl.DataFrame:
+    """Look up log_index at fractional years by linear interpolation.
+
+    The index is joined at the whole years below and above each fractional
+    year and blended by the fractional part; e.g. 2019.75 gives
+    0.25*idx_2019 + 0.75*idx_2020. When only one of the two years has an
+    index value, that value is used as-is; when neither does, the result is
+    null.
+
+    Returns `df` with one extra column named `output_alias`.
+    """
+    prefix = f"_{output_alias}"
+    lo_val, hi_val = f"{prefix}_floor", f"{prefix}_ceil"
+    lo_year, hi_year = f"{prefix}_floor_year", f"{prefix}_ceil_year"
+    weight = f"{prefix}_frac"
+
+    frac_year = pl.col(frac_year_col)
+    df = df.with_columns(
+        frac_year.floor().cast(pl.Int32).alias(lo_year),
+        frac_year.ceil().cast(pl.Int32).alias(hi_year),
+        (frac_year - frac_year.floor()).alias(weight),
+    )
+
+    # Same lookup twice: once for the year below, once for the year above.
+    for year_name, value_name in ((lo_year, lo_val), (hi_year, hi_val)):
+        df = join_type_stratified_index(
+            df, index, sector_col, type_col, year_name, value_name
+        )
+
+    # Interpolate: (1-frac)*floor + frac*ceil, with fallbacks
+    lo, hi, w = pl.col(lo_val), pl.col(hi_val), pl.col(weight)
+    interpolated = (
+        pl.when(lo.is_not_null() & hi.is_not_null())
+        .then((1.0 - w) * lo + w * hi)
+        .when(lo.is_not_null())
+        .then(lo)
+        .when(hi.is_not_null())
+        .then(hi)
+        .otherwise(pl.lit(None))
+    )
+    return df.with_columns(
+        interpolated.alias(output_alias),
+    ).drop(lo_val, hi_val, lo_year, hi_year, weight)
+
+
+def extract_centroids(input_path) -> dict[str, tuple[float, float]]:
+    """Compute mean lat/lon per postcode sector.
+
+    Only rows with a postcode and BOTH coordinates are used, so the two means
+    are taken over the same rows and no sector can end up with a missing
+    longitude.
+
+    Args:
+        input_path: Parquet file with "Postcode", "lat" and "lon" columns.
+
+    Returns:
+        Mapping sector -> (mean_lat, mean_lon).
+    """
+    print("Computing sector centroids...")
+    df = (
+        pl.scan_parquet(input_path)
+        .select("Postcode", "lat", "lon")
+        .filter(
+            pl.col("Postcode").is_not_null(),
+            pl.col("lat").is_not_null(),
+            # A row without lon would skew the lat mean and could leave a
+            # sector with a null lon centroid.
+            pl.col("lon").is_not_null(),
+        )
+        .with_columns(sector_expr())
+        .group_by("sector")
+        .agg(pl.col("lat").mean(), pl.col("lon").mean())
+        .collect()
+    )
+    centroids = {
+        row["sector"]: (row["lat"], row["lon"]) for row in df.iter_rows(named=True)
+    }
+    print(f"  {len(centroids):,} sector centroids")
+    return centroids
+
+
+def join_type_stratified_index(
+    df: pl.DataFrame,
+    index: pl.DataFrame,
+    sector_col: str,
+    type_col: str,
+    year_col: str,
+    output_alias: str,
+) -> pl.DataFrame:
+    """Attach log_index to `df`, preferring the type-specific index over "All".
+
+    Each row is matched on (sector, type, year) against index rows whose
+    type_group is not "All", and on (sector, year) against the "All" rows.
+    The typed value wins; the "All" value fills in where it is null.
+
+    Returns `df` with one extra column named `output_alias`.
+    """
+    typed_tmp = f"_{output_alias}_typed"
+    all_tmp = f"_{output_alias}_all"
+
+    typed_lookup = index.filter(pl.col("type_group") != "All").select(
+        "sector", "type_group", "year", pl.col("log_index").alias(typed_tmp)
+    )
+    all_lookup = index.filter(pl.col("type_group") == "All").select(
+        "sector", "year", pl.col("log_index").alias(all_tmp)
+    )
+
+    with_typed = df.join(
+        typed_lookup,
+        left_on=[sector_col, type_col, year_col],
+        right_on=["sector", "type_group", "year"],
+        how="left",
+    )
+    with_both = with_typed.join(
+        all_lookup,
+        left_on=[sector_col, year_col],
+        right_on=["sector", "year"],
+        how="left",
+    )
+
+    return with_both.with_columns(
+        pl.col(typed_tmp).fill_null(pl.col(all_tmp)).alias(output_alias),
+    ).drop(typed_tmp, all_tmp)
+
+
+def compute_seasonal_factors(
+    input_path: Path, max_sale_year: int | None = None
+) -> np.ndarray:
+    """Compute 12 multiplicative monthly price factors from price-per-sqm.
+
+    Detrends by dividing each month's median £/sqm by the mean of the monthly
+    medians of the same year, then averages those ratios across years.
+
+    Args:
+        input_path: Parquet file with price, floor area and sale date columns.
+        max_sale_year: When given, only sales strictly before this year are used.
+
+    Returns:
+        Array of exactly 12 factors (index 0 = January), normalized so the
+        mean is 1.0. A calendar month with no usable sales gets a neutral
+        factor of 1.0; if there are no usable sales at all, every factor is 1.0.
+    """
+    query = (
+        pl.scan_parquet(input_path)
+        .select("Last known price", "Total floor area (sqm)", "Date of last transaction")
+        .filter(
+            pl.col("Last known price").is_not_null(),
+            pl.col("Last known price") > 0,
+            pl.col("Total floor area (sqm)").is_not_null(),
+            pl.col("Total floor area (sqm)") > 0,
+            pl.col("Date of last transaction").is_not_null(),
+        )
+        .with_columns(
+            (
+                pl.col("Last known price").cast(pl.Float64)
+                / pl.col("Total floor area (sqm)").cast(pl.Float64)
+            ).alias("psm"),
+            pl.col("Date of last transaction").dt.month().alias("month"),
+            pl.col("Date of last transaction").dt.year().alias("year"),
+        )
+    )
+    if max_sale_year is not None:
+        query = query.filter(pl.col("year") < max_sale_year)
+
+    monthly = (
+        query.group_by("year", "month")
+        .agg(pl.col("psm").median().alias("median_psm"))
+        .with_columns(
+            pl.col("median_psm").mean().over("year").alias("year_mean"),
+        )
+        .with_columns(
+            (pl.col("median_psm") / pl.col("year_mean")).alias("ratio"),
+        )
+        .group_by("month")
+        .agg(pl.col("ratio").mean().alias("factor"))
+        .sort("month")
+        .collect()
+    )
+
+    # Place each factor at its calendar slot so index i always means month
+    # i + 1, even when some months never occur in the data.
+    factors = np.full(12, np.nan, dtype=np.float64)
+    month_idx = monthly["month"].to_numpy().astype(np.int64) - 1
+    factors[month_idx] = monthly["factor"].to_numpy().astype(np.float64)
+
+    observed = np.isfinite(factors)
+    if not observed.any():
+        return np.ones(12, dtype=np.float64)
+
+    # Normalize over the observed months, then give unobserved months the
+    # neutral factor; the overall mean stays 1.0.
+    factors = factors / factors[observed].mean()
+    factors[~observed] = 1.0
+    return factors