perfect-postcode/pipeline/transform/price_estimation/index.py

"""Hierarchical repeat-sales price index.

Stratified by property type and postcode sector, with IRLS Huber regression,
hierarchical shrinkage (sector → district → area → national → hedonic),
and KD-tree spatial smoothing for sparse sectors.

Output: price_index.parquet — sector x type_group x year -> log_index
"""

import argparse
from pathlib import Path

import numpy as np
import polars as pl
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import lsqr
from tqdm import tqdm

from pipeline.transform.price_estimation.shrinkage import (
    blend_dicts,
    hierarchical_shrinkage,
    shrink_dicts,
    spatial_smooth,
)
from pipeline.transform.price_estimation.utils import (
    CURRENT_YEAR,
    TYPE_GROUPS,
    build_hedonic_features,
    extract_centroids,
    hierarchy_keys,
    sector_expr,
    type_group_expr,
)

MIN_PAIRS = 5
OUTLIER_THRESHOLD = 3.0  # hard pre-filter; Huber handles the rest
HUBER_K = 1.345
IRLS_ITERATIONS = 5


def extract_pairs(input_path: Path, max_year2: int | None = None) -> pl.DataFrame:
    """Extract consecutive repeat-sale pairs.

    If max_year2 is set, only pairs where year2 < max_year2 are included
    (for temporal holdout in backtesting).
    """
    print("Extracting repeat-sale pairs...")
    df = (
        pl.scan_parquet(input_path)
        .select("Postcode", "historical_prices", "Property type")
        .filter(
            pl.col("Postcode").is_not_null(),
            pl.col("historical_prices").list.len() >= 2,
        )
        .with_columns(sector_expr(), type_group_expr())
        .collect()
    )
    print(f"  {len(df):,} properties with 2+ transactions")

    pairs = (
        df.lazy()
        .with_columns(
            pl.col("historical_prices")
            .list.slice(0, pl.col("historical_prices").list.len() - 1)
            .alias("from_txn"),
            pl.col("historical_prices").list.slice(1).alias("to_txn"),
        )
        .explode("from_txn", "to_txn")
        .with_columns(
            pl.col("from_txn").struct.field("year").alias("year1"),
            pl.col("from_txn").struct.field("month").alias("month1"),
            pl.col("from_txn").struct.field("price").alias("price1"),
            pl.col("to_txn").struct.field("year").alias("year2"),
            pl.col("to_txn").struct.field("month").alias("month2"),
            pl.col("to_txn").struct.field("price").alias("price2"),
        )
        .with_columns(
            (
                pl.col("year1").cast(pl.Float64)
                + (pl.col("month1").cast(pl.Float64) - 1.0) / 12.0
            ).alias("frac_year1"),
            (
                pl.col("year2").cast(pl.Float64)
                + (pl.col("month2").cast(pl.Float64) - 1.0) / 12.0
            ).alias("frac_year2"),
        )
        .select(
            "sector",
            "type_group",
            "year1",
            "price1",
            "year2",
            "price2",
            "frac_year1",
            "frac_year2",
        )
        .filter(
            pl.col("price1") > 0,
            pl.col("price2") > 0,
            pl.col("frac_year2") > pl.col("frac_year1"),
        )
        .with_columns(
            (pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64))
            .log()
            .alias("log_ratio"),
            (
                1.0
                / (pl.col("frac_year2") - pl.col("frac_year1")).cast(pl.Float64).sqrt()
            ).alias("weight"),
        )
        .filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
        .collect()
    )

    if max_year2 is not None:
        pairs = pairs.filter(pl.col("year2") < max_year2)

    # Add hierarchy columns
    pairs = pairs.with_columns(
        pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
    ).with_columns(
        pl.col("district").str.replace(r"\d.*$", "").alias("area"),
    )

    print(f"  {len(pairs):,} pairs extracted")
    return pairs


def solve_robust_index(
    years1: np.ndarray,
    years2: np.ndarray,
    log_ratios: np.ndarray,
    base_weights: np.ndarray,
) -> dict[int, float]:
    """IRLS Huber M-estimation for the Case-Shiller repeat-sales model."""
    n = len(years1)
    if n < MIN_PAIRS:
        return {}

    all_years = np.union1d(years1, years2)
    min_year = int(all_years.min())

    col = 0
    year_to_col = {}
    for y in all_years:
        iy = int(y)
        if iy != min_year:
            year_to_col[iy] = col
            col += 1
    n_cols = len(year_to_col)
    if n_cols == 0:
        return {}

    # Vectorized column index mapping
    col2 = np.full(n, -1, dtype=np.int32)
    col1 = np.full(n, -1, dtype=np.int32)
    for year, c in year_to_col.items():
        col2[years2 == year] = c
        col1[years1 == year] = c

    # Sparse matrix structure (fixed across iterations)
    mask2 = col2 >= 0
    mask1 = col1 >= 0
    rows_arr = np.concatenate([np.where(mask2)[0], np.where(mask1)[0]])
    cols_arr = np.concatenate([col2[mask2], col1[mask1]])
    signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])

    weights = base_weights.copy()

    for _ in range(IRLS_ITERATIONS):
        data = signs_arr * weights[rows_arr]
        A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
        b = log_ratios * weights
        betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]

        # Residuals
        predicted = np.zeros(n)
        predicted[mask2] += betas[col2[mask2]]
        predicted[mask1] -= betas[col1[mask1]]
        residuals = log_ratios - predicted

        # Huber reweighting
        abs_r = np.abs(residuals)
        huber_w = np.where(abs_r <= HUBER_K, 1.0, HUBER_K / np.maximum(abs_r, 1e-10))
        weights = base_weights * huber_w

    index = {min_year: 0.0}
    for year, c in year_to_col.items():
        index[year] = float(betas[c])
    return index


def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
    """Solve robust indices for each group. Returns (indices, n_pairs) dicts."""
    groups = pairs.group_by(group_col).agg(
        pl.col("year1"),
        pl.col("year2"),
        pl.col("log_ratio"),
        pl.col("weight"),
    )
    indices = {}
    n_pairs = {}
    for row in tqdm(
        groups.iter_rows(named=True), total=len(groups), desc=f"    {group_col}"
    ):
        key = row[group_col]
        y1 = np.array(row["year1"], dtype=np.int32)
        y2 = np.array(row["year2"], dtype=np.int32)
        lr = np.array(row["log_ratio"], dtype=np.float64)
        w = np.array(row["weight"], dtype=np.float64)
        idx = solve_robust_index(y1, y2, lr, w)
        if idx:
            indices[key] = idx
            n_pairs[key] = len(y1)
    return indices, n_pairs


def compute_hedonic_index(
    input_path: Path,
    min_year: int,
    max_year: int,
    max_sale_year: int | None = None,
) -> dict[int, float]:
    """Quality-adjusted hedonic index: regress log(price) on features, average residual by year.

    Used as the ultimate shrinkage fallback for the repeat-sales index.
    If max_sale_year is set, only sales before that year are used (backtesting holdout).
    """
    effective_max = max_sale_year - 1 if max_sale_year is not None else max_year
    print("Computing hedonic index...")
    df = (
        pl.scan_parquet(input_path)
        .select(
            "Last known price",
            "Date of last transaction",
            "Property type",
            "Total floor area (sqm)",
        )
        .filter(
            pl.col("Last known price").is_not_null(),
            pl.col("Total floor area (sqm)").is_not_null(),
            pl.col("Total floor area (sqm)") > 0,
        )
        .with_columns(
            pl.col("Date of last transaction").dt.year().alias("sale_year"),
            type_group_expr(),
        )
        .filter(
            pl.col("type_group").is_not_null(),
            pl.col("sale_year").is_not_null(),
            pl.col("sale_year") >= min_year,
            pl.col("sale_year") <= effective_max,
        )
        .collect()
    )
    print(f"  {len(df):,} complete cases for hedonic model")

    # Target
    log_price = np.log(df["Last known price"].to_numpy().astype(np.float64))
    sale_years = df["sale_year"].to_numpy()

    # Build feature matrix (5 hedonic features + intercept)
    X = build_hedonic_features(df)
    F = np.hstack([X, np.ones((len(df), 1), dtype=np.float32)])
    print(f"  Feature matrix: {F.shape[0]:,} x {F.shape[1]}")

    # Step 1: regress log(price) on features -> quality score
    betas = np.linalg.lstsq(F.astype(np.float64), log_price, rcond=None)[0]
    quality_score = F.astype(np.float64) @ betas
    residuals = log_price - quality_score

    # Step 2: average residual by year = hedonic index
    hedonic = {}
    for y in range(min_year, max_year + 1):
        mask = sale_years == y
        if mask.sum() > 0:
            hedonic[y] = float(np.mean(residuals[mask]))

    # Normalize: min_year = 0
    base = hedonic.get(min_year, 0.0)
    for y in hedonic:
        hedonic[y] -= base

    print(
        f"  Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}"
    )
    return hedonic


EXTRAPOLATION_YEARS = 3


def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
    """Forward-fill missing years, with linear extrapolation beyond last known year."""
    if not index:
        return {y: 0.0 for y in range(min_year, max_year + 1)}

    sorted_years = sorted(index.keys())
    last_known_year = sorted_years[-1]

    # Forward fill up to last known year
    filled = {}
    last = 0.0
    for y in range(min_year, last_known_year + 1):
        if y in index:
            last = index[y]
        filled[y] = last

    # Linear extrapolation beyond last known year
    if last_known_year < max_year:
        recent = [
            (y, index[y])
            for y in sorted_years
            if y >= last_known_year - EXTRAPOLATION_YEARS
        ]
        if len(recent) >= 2:
            years_arr = np.array([r[0] for r in recent], dtype=np.float64)
            vals_arr = np.array([r[1] for r in recent], dtype=np.float64)
            slope = np.polyfit(years_arr, vals_arr, 1)[0]
            for y in range(last_known_year + 1, max_year + 1):
                filled[y] = index[last_known_year] + slope * (y - last_known_year)
        else:
            for y in range(last_known_year + 1, max_year + 1):
                filled[y] = index[last_known_year]

    return filled


def build_index(
    input_path: Path,
    max_pair_year: int | None = None,
    postcodes_path: Path | None = None,
) -> pl.DataFrame:
    """Build the full price index from raw data.

    If max_pair_year is set, only pairs before that year are used (backtesting holdout).
    The index is still forward-filled to CURRENT_YEAR.
    postcodes_path: if provided, lat/lon are read from this file instead of input_path.
    """
    pairs = extract_pairs(input_path, max_year2=max_pair_year)
    centroids = extract_centroids(postcodes_path or input_path)

    min_year = int(pairs["year1"].min())
    max_year = CURRENT_YEAR

    hedonic_idx = compute_hedonic_index(
        input_path, min_year, max_year, max_sale_year=max_pair_year
    )

    # Precompute hierarchy
    all_sectors = pairs["sector"].unique().to_list()
    sector_to_dist = {}
    dist_to_area = {}
    for s in all_sectors:
        d, a = hierarchy_keys(s)
        sector_to_dist[s] = d
        dist_to_area[d] = a

    # Process each type group + "All"
    all_type_groups = ["All"] + TYPE_GROUPS
    final = {}  # {type_group: {sector: {year: log_index}}}
    final_n = {}  # {type_group: {sector: n_pairs}}

    for tg in all_type_groups:
        print(f"\n--- {tg} ---")
        typed = pairs if tg == "All" else pairs.filter(pl.col("type_group") == tg)
        if len(typed) < MIN_PAIRS:
            print(f"  Skipping (only {len(typed)} pairs)")
            final[tg] = {s: dict(hedonic_idx) for s in all_sectors}
            final_n[tg] = {s: 0 for s in all_sectors}
            continue

        print(f"  {len(typed):,} pairs")

        # National
        np_arrs = typed.select("year1", "year2", "log_ratio", "weight")
        national_idx = solve_robust_index(
            np_arrs["year1"].to_numpy(),
            np_arrs["year2"].to_numpy(),
            np_arrs["log_ratio"].to_numpy(),
            np_arrs["weight"].to_numpy(),
        )
        national_n = len(typed)
        print(f"  National: {len(national_idx)} years")

        # Area, district, sector
        print("  Computing per-level indices:")
        area_idx, area_n = compute_indices_for_level(typed, "area")
        district_idx, district_n = compute_indices_for_level(typed, "district")
        sector_idx, sector_n = compute_indices_for_level(typed, "sector")
        print(
            f"  {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
        )

        # Shrinkage: national -> hedonic first, then hierarchical
        print("  Applying shrinkage...")
        national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
        sector_shrunk = hierarchical_shrinkage(
            sector_idx,
            sector_n,
            district_idx,
            district_n,
            area_idx,
            area_n,
            national_shrunk,
            all_sectors,
            sector_to_dist,
            dist_to_area,
            shrink_dicts,
        )

        # Spatial smoothing
        print("  Spatial smoothing...")
        sector_smoothed = spatial_smooth(
            sector_shrunk, centroids, sector_n, blend_dicts
        )

        # Forward fill
        for sec in all_sectors:
            sector_smoothed[sec] = forward_fill(
                sector_smoothed.get(sec, hedonic_idx), min_year, max_year
            )

        final[tg] = sector_smoothed
        final_n[tg] = sector_n

    # Assemble output
    print("\nAssembling output...")
    rows = []
    for tg in all_type_groups:
        for sec in all_sectors:
            n = final_n[tg].get(sec, 0)
            for year, log_idx in final[tg][sec].items():
                rows.append((sec, tg, year, log_idx, n))

    return pl.DataFrame(
        rows,
        schema={
            "sector": pl.String,
            "type_group": pl.String,
            "year": pl.Int32,
            "log_index": pl.Float64,
            "n_pairs": pl.Int64,
        },
        orient="row",
    ).sort("type_group", "sector", "year")


def main():
    parser = argparse.ArgumentParser(
        description="Build improved repeat-sales price index"
    )
    parser.add_argument("--input", type=Path, required=True)
    parser.add_argument(
        "--postcodes",
        type=Path,
        required=True,
        help="Path to postcode.parquet (for lat/lon centroids)",
    )
    parser.add_argument("--output", type=Path, required=True)
    args = parser.parse_args()

    result = build_index(args.input, postcodes_path=args.postcodes)

    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
    print(
        f"  {result['sector'].n_unique():,} sectors x {result['type_group'].n_unique()} types x {result['year'].n_unique()} years = {len(result):,} rows"
    )


if __name__ == "__main__":
    main()