# perfect-postcode/pipeline/transform/price_backtest.py

"""Backtesting: Evaluate price index model on held-out recent sales.

Test set: properties with 2+ sales where the last sale is 2022-2025.
Uses the second-to-last sale as input, predicts the last sale price.
Compares index-based prediction against a naive baseline (raw input price).
Uses type-stratified index when available, falling back to "All" type.

Output: backtest_results.parquet with predictions vs actuals.
"""

import argparse
import json
from pathlib import Path

import numpy as np
import polars as pl

from pipeline.transform._price_utils import (
    CURRENT_YEAR,
    HEDONIC_COLUMNS,
    sector_expr,
    type_group_expr,
)

# Earliest year the held-out (last) sale may fall in for a pair to be tested.
TEST_YEAR_MIN = 2022


def extract_test_set(
    input_path: Path, include_hedonic_cols: bool = False
) -> pl.DataFrame:
    """Build (input sale, held-out sale) pairs from each property's history.

    For every row with a postcode and at least two entries in
    ``historical_prices``, the last entry becomes the ground truth
    (``actual_year`` / ``actual_price``) and the one before it becomes the
    model input (``input_year`` / ``input_price``).  Pairs are kept only when
    the held-out sale is in ``TEST_YEAR_MIN`` or later, both prices are
    positive, and the held-out year is strictly after the input year.

    Args:
        input_path: Parquet file to scan lazily.
        include_hedonic_cols: When true, also load every column named in
            ``HEDONIC_COLUMNS``.

    Returns:
        The collected frame of test pairs.
    """
    print("Loading test set...")

    wanted = ["Postcode", "historical_prices", "Property type"]
    if include_hedonic_cols:
        # dict.fromkeys drops repeats while keeping first-seen order.
        wanted = list(dict.fromkeys([*wanted, *HEDONIC_COLUMNS]))

    # NOTE(review): taking the last two list entries presumes the history is
    # ordered oldest -> newest upstream; the year filter below only rejects
    # pairs that end up out of order.
    history = pl.col("historical_prices")
    held_out_sale = history.list.last()
    prior_sale = history.list.get(-2)

    lazy = pl.scan_parquet(input_path).select(wanted)
    lazy = lazy.filter(
        pl.col("Postcode").is_not_null(),
        history.list.len() >= 2,
    )
    lazy = lazy.with_columns(
        # Project helpers; presumably they add the `sector` and `type_group`
        # columns that later stages join on.
        sector_expr(),
        type_group_expr(),
        held_out_sale.struct.field("year").alias("actual_year"),
        held_out_sale.struct.field("price").alias("actual_price"),
        prior_sale.struct.field("year").alias("input_year"),
        prior_sale.struct.field("price").alias("input_price"),
    )
    lazy = lazy.filter(
        pl.col("actual_year") >= TEST_YEAR_MIN,
        pl.col("input_price") > 0,
        pl.col("actual_price") > 0,
        pl.col("actual_year") > pl.col("input_year"),
    )
    df = lazy.collect()

    print(f"  {len(df):,} test pairs (last sale {TEST_YEAR_MIN}-{CURRENT_YEAR})")
    return df


def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
    """Roll each input price forward to the held-out year using the index.

    ``predicted = input_price * exp(log_index_actual - log_index_input)``.
    When ``index`` has a ``type_group`` column, the type-specific series is
    looked up first and the ``"All"`` series fills any gap.  If either log
    index is still missing, the prediction falls back to the raw input price.

    Args:
        test: Test pairs with ``sector``, ``input_year``, ``actual_year`` and
            ``input_price`` (plus ``type_group`` for a stratified index).
        index: Price index with ``sector``, ``year``, ``log_index`` and,
            optionally, ``type_group``.

    Returns:
        ``test`` with the looked-up log-index columns and ``predicted`` added.
    """

    def attach(frame, idx, by, year_col, out_name):
        """Left-join ``log_index`` from ``idx`` onto ``frame`` as ``out_name``."""
        lookup = idx.select(*by, "year", pl.col("log_index").alias(out_name))
        return frame.join(
            lookup,
            left_on=[*by, year_col],
            right_on=[*by, "year"],
            how="left",
        )

    sector_only = ["sector"]

    if "type_group" in index.columns:
        idx_typed = index.filter(pl.col("type_group") != "All")
        idx_all = index.filter(pl.col("type_group") == "All")
        sector_and_type = ["sector", "type_group"]

        # Same join order as before: input year (typed, All), then actual
        # year (typed, All).
        lookups = (
            (idx_typed, sector_and_type, "input_year", "li_in_typed"),
            (idx_all, sector_only, "input_year", "li_in_all"),
            (idx_typed, sector_and_type, "actual_year", "li_act_typed"),
            (idx_all, sector_only, "actual_year", "li_act_all"),
        )
        for idx, by, year_col, out_name in lookups:
            test = attach(test, idx, by, year_col, out_name)

        # NOTE(review): the fallback is applied to each endpoint on its own,
        # so one pair can combine a typed index at one year with the "All"
        # index at the other -- confirm that mixing series is intended.
        test = test.with_columns(
            pl.col("li_in_typed")
            .fill_null(pl.col("li_in_all"))
            .alias("log_index_input"),
            pl.col("li_act_typed")
            .fill_null(pl.col("li_act_all"))
            .alias("log_index_actual"),
        )
    else:
        # Unstratified index: one series per sector.
        test = attach(test, index, sector_only, "input_year", "log_index_input")
        test = attach(test, index, sector_only, "actual_year", "log_index_actual")

    base_price = pl.col("input_price").cast(pl.Float64)
    growth = (pl.col("log_index_actual") - pl.col("log_index_input")).exp()
    return test.with_columns(
        # A null growth factor (no index coverage) degrades to the naive guess.
        (base_price * growth).fill_null(base_price).alias("predicted"),
    )


def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
    """Summarise prediction accuracy against observed prices.

    A pair is scored only when both values are finite and the actual price is
    strictly positive; everything else is dropped before any statistic is
    computed.

    Args:
        actual: Observed prices.  Any array-like of numbers is accepted.
        predicted: Predicted prices, aligned element-wise with ``actual``.

    Returns:
        A dict with the keys ``"MdAPE (%)"``, ``"% within 10%"``,
        ``"% within 20%"``, ``"% within 30%"``, ``"MAE (£)"``,
        ``"Mean signed error (£)"`` (all floats) and ``"n"`` (int, the number
        of scored pairs).  When no pair is scoreable every float metric is
        NaN and ``"n"`` is 0.
    """
    # Coerce up front so lists/tuples and integer arrays work like ndarrays.
    actual = np.asarray(actual, dtype=np.float64)
    predicted = np.asarray(predicted, dtype=np.float64)

    valid = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0)
    if not valid.any():
        # Same values numpy would yield for empty slices, without relying on
        # its "mean of empty slice" warnings.
        nan = float("nan")
        return {
            "MdAPE (%)": nan,
            "% within 10%": nan,
            "% within 20%": nan,
            "% within 30%": nan,
            "MAE (£)": nan,
            "Mean signed error (£)": nan,
            "n": 0,
        }

    actual = actual[valid]
    predicted = predicted[valid]

    signed_err = predicted - actual
    ape = np.abs(signed_err) / actual  # absolute percentage error, as a fraction

    return {
        "MdAPE (%)": float(np.median(ape) * 100),
        "% within 10%": float(np.mean(ape <= 0.10) * 100),
        "% within 20%": float(np.mean(ape <= 0.20) * 100),
        "% within 30%": float(np.mean(ape <= 0.30) * 100),
        "MAE (£)": float(np.mean(np.abs(signed_err))),
        "Mean signed error (£)": float(np.mean(signed_err)),
        "n": int(len(actual)),
    }


def print_metrics_table(metrics_by_stage: dict):
    """Print one aligned column of metrics per stage.

    Args:
        metrics_by_stage: Maps a stage label (e.g. ``"Naive"``) to a metrics
            dict holding the keys listed in ``metric_names`` below.  Stages
            are printed left to right in the dict's iteration order.

    The rule lines and every cell are sized from the stage list, so the table
    stays aligned for any number of stages and for labels longer than the
    default 14-character column.
    """
    metric_names = [
        "MdAPE (%)",
        "% within 10%",
        "% within 20%",
        "% within 30%",
        "MAE (£)",
        "Mean signed error (£)",
        "n",
    ]
    stages = list(metrics_by_stage)

    label_w = 25
    col_w = max([14, *(len(stage) for stage in stages)])
    pct_w = col_w - 1  # the trailing "%" takes the last character of the cell

    header = f"{'Metric':<{label_w}s}" + "".join(
        f" {stage:>{col_w}s}" for stage in stages
    )
    table_w = len(header)

    print("\n" + "=" * table_w)
    print("BACKTEST RESULTS")
    print("=" * table_w)
    print(header)
    print("-" * table_w)

    for metric in metric_names:
        cells = []
        for stage in stages:
            val = metrics_by_stage[stage][metric]
            if metric == "n":
                cells.append(f" {val:>{col_w},d}")
            elif "£" in metric:
                # Full column width: currency cells carry no suffix.
                cells.append(f" {val:>{col_w},.0f}")
            else:
                cells.append(f" {val:>{pct_w}.1f}%")
        print(f"{metric:<{label_w}s}" + "".join(cells))

    print("=" * table_w)


def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the backtest script."""
    parser = argparse.ArgumentParser(description="Backtest price estimation model")
    parser.add_argument(
        "--input", type=Path, required=True, help="Path to wide.parquet"
    )
    parser.add_argument(
        "--index", type=Path, required=True, help="Path to price_index.parquet"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output backtest_results.parquet"
    )
    parser.add_argument(
        "--hedonic-model",
        type=Path,
        default=None,
        help="Path to hedonic_model.json (optional)",
    )
    return parser


def _hedonic_log_prices(
    type_models: dict, types: list, sectors: list, log_fa: np.ndarray
) -> np.ndarray:
    """Return ``beta_fa * log_fa + intercept`` per row; NaN if the type has no model."""
    log_hedonic = np.full(len(types), np.nan)
    for i, (type_group, sector) in enumerate(zip(types, sectors)):
        tm = type_models.get(type_group)
        if tm is None:
            continue
        # Sector-specific intercept when fitted, national one otherwise.
        alpha = tm["sector_intercepts"].get(sector, tm["national_intercept"])
        log_hedonic[i] = tm["beta_fa"] * log_fa[i] + alpha
    return log_hedonic


def _blend_with_hedonic(test: "pl.DataFrame", type_models: dict) -> "np.ndarray | None":
    """Blend index predictions with hedonic estimates; return one value per test row.

    Rows are eligible when they have a positive floor area and a type group.
    For eligible rows with a finite hedonic estimate the blend is done in log
    space with weight ``w = hold_years / (hold_years + tau)`` on the hedonic
    estimate; ``tau`` is picked from a fixed grid by lowest MdAPE.  All other
    rows keep the index prediction.

    NOTE(review): tau is selected on the same rows the "Blended" metrics are
    later reported on, and the sweep scores every eligible row although the
    printed count covers only rows with a finite hedonic estimate.

    Returns:
        The blended predictions aligned with ``test``, or ``None`` when no
        row is eligible.
    """
    hedonic_mask = (
        pl.col("Total floor area (sqm)").is_not_null()
        & (pl.col("Total floor area (sqm)") > 0)
        & pl.col("type_group").is_not_null()
    )
    eligible_mask = test.select(hedonic_mask).to_series()
    eligible = test.filter(eligible_mask)
    if len(eligible) == 0:
        return None

    def as_f64(name: str) -> np.ndarray:
        return eligible[name].to_numpy().astype(np.float64)

    log_fa = np.log(np.maximum(as_f64("Total floor area (sqm)"), 1.0))
    log_hedonic = _hedonic_log_prices(
        type_models,
        eligible["type_group"].to_list(),
        eligible["sector"].to_list(),
        log_fa,
    )
    valid = np.isfinite(log_hedonic)

    # Hold years: input_year to actual_year (simulating real prediction)
    hold_years = np.maximum(as_f64("actual_year") - as_f64("input_year"), 0.0)
    log_index_pred = np.log(np.maximum(as_f64("predicted"), 1.0))
    actual_eligible = as_f64("actual_price")

    def blend(tau: float) -> np.ndarray:
        """Blended price per eligible row for the given tau."""
        blend_w = hold_years / (hold_years + tau)
        log_blended = np.where(
            valid,
            (1 - blend_w) * log_index_pred + blend_w * log_hedonic,
            log_index_pred,
        )
        return np.exp(log_blended)

    best_tau = 15.0
    best_mdape = float("inf")

    print(f"\n  tau sweep ({valid.sum():,} eligible properties):")
    for tau in (5.0, 10.0, 15.0, 20.0, 30.0):
        m = compute_metrics(actual_eligible, blend(tau))
        marker = ""
        if m["MdAPE (%)"] < best_mdape:
            best_mdape = m["MdAPE (%)"]
            best_tau = tau
            marker = " <-- best"
        print(
            f"    tau={tau:>4.0f}: MdAPE={m['MdAPE (%)']:>5.1f}%, "
            f"within 10%={m['% within 10%']:>5.1f}%{marker}"
        )

    print(f"\n  Best tau = {best_tau}")

    # Start from the index prediction (astype returns a fresh, writable
    # array) and overwrite the eligible rows in one vectorised assignment.
    blended_all = test["predicted"].to_numpy().astype(np.float64)
    blended_all[eligible_mask.arg_true().to_numpy()] = blend(best_tau)
    return blended_all


def main():
    """Run the backtest: load inputs, predict, report metrics, write results.

    Reads the price index and the wide property table, builds the test pairs,
    predicts the held-out price with the index (and, when ``--hedonic-model``
    is given, with a hedonic blend), prints a metrics table and writes the
    per-pair predictions to ``--output`` as parquet.
    """
    args = _build_arg_parser().parse_args()

    index = pl.read_parquet(args.index)
    summary = (
        f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors"
    )
    if "type_group" in index.columns:
        summary += f", {index['type_group'].n_unique()} type groups"
    print(summary)

    has_hedonic = args.hedonic_model is not None
    test = extract_test_set(args.input, include_hedonic_cols=has_hedonic)

    print("\nPredicting with price index...")
    test = predict(test, index)

    # Compute and print metrics
    actual = test["actual_price"].to_numpy().astype(np.float64)
    metrics = {
        "Naive": compute_metrics(
            actual, test["input_price"].to_numpy().astype(np.float64)
        ),
        "Index": compute_metrics(
            actual, test["predicted"].to_numpy().astype(np.float64)
        ),
    }

    # Hedonic blending
    if has_hedonic:
        print("\nApplying hedonic blending...")
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(args.hedonic_model, encoding="utf-8") as f:
            model = json.load(f)

        blended_all = _blend_with_hedonic(test, model["type_models"])
        if blended_all is not None:
            test = test.with_columns(
                pl.Series("blended", blended_all, dtype=pl.Float64),
            )
            metrics["Blended"] = compute_metrics(actual, blended_all)

    print_metrics_table(metrics)

    # Save results
    result_cols = [
        "Postcode",
        "sector",
        "input_year",
        "input_price",
        "actual_year",
        "actual_price",
        "predicted",
    ]
    if "blended" in test.columns:
        result_cols.append("blended")
    result = test.select(result_cols)

    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
    print(f"  {len(result):,} rows")


# Run the backtest only when this file is executed as a script.
if __name__ == "__main__":
    main()