"""Backtesting: Evaluate price index model on held-out recent sales. Test set: properties with 2+ sales where the last sale is 2022-2025. Uses the second-to-last sale as input, predicts the last sale price. Compares index-based prediction against a naive baseline (raw input price). Uses type-stratified index when available, falling back to "All" type. Output: backtest_results.parquet with predictions vs actuals. """ import argparse import json from pathlib import Path import numpy as np import polars as pl from pipeline.transform._price_utils import ( CURRENT_YEAR, HEDONIC_COLUMNS, sector_expr, type_group_expr, ) TEST_YEAR_MIN = 2022 def extract_test_set( input_path: Path, include_hedonic_cols: bool = False ) -> pl.DataFrame: """Extract test pairs: second-to-last sale as input, last sale as ground truth.""" print("Loading test set...") cols = ["Postcode", "historical_prices", "Property type"] if include_hedonic_cols: for c in HEDONIC_COLUMNS: if c not in cols: cols.append(c) df = ( pl.scan_parquet(input_path) .select(cols) .filter( pl.col("Postcode").is_not_null(), pl.col("historical_prices").list.len() >= 2, ) .with_columns( sector_expr(), type_group_expr(), # Last sale (ground truth) pl.col("historical_prices") .list.last() .struct.field("year") .alias("actual_year"), pl.col("historical_prices") .list.last() .struct.field("price") .alias("actual_price"), # Second-to-last sale (input) pl.col("historical_prices") .list.get(-2) .struct.field("year") .alias("input_year"), pl.col("historical_prices") .list.get(-2) .struct.field("price") .alias("input_price"), ) .filter( pl.col("actual_year") >= TEST_YEAR_MIN, pl.col("input_price") > 0, pl.col("actual_price") > 0, pl.col("actual_year") > pl.col("input_year"), ) .collect() ) print(f" {len(df):,} test pairs (last sale {TEST_YEAR_MIN}-{CURRENT_YEAR})") return df def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame: """Index-based prediction with type-stratified fallback.""" has_type_group = "type_group" in index.columns if has_type_group: idx_typed = index.filter(pl.col("type_group") != "All") idx_all = index.filter(pl.col("type_group") == "All") # Join type-specific index at input year test = test.join( idx_typed.select( "sector", "type_group", "year", pl.col("log_index").alias("li_in_typed") ), left_on=["sector", "type_group", "input_year"], right_on=["sector", "type_group", "year"], how="left", ) # Join "All" index at input year test = test.join( idx_all.select("sector", "year", pl.col("log_index").alias("li_in_all")), left_on=["sector", "input_year"], right_on=["sector", "year"], how="left", ) # Join type-specific index at actual year test = test.join( idx_typed.select( "sector", "type_group", "year", pl.col("log_index").alias("li_act_typed"), ), left_on=["sector", "type_group", "actual_year"], right_on=["sector", "type_group", "year"], how="left", ) # Join "All" index at actual year test = test.join( idx_all.select("sector", "year", pl.col("log_index").alias("li_act_all")), left_on=["sector", "actual_year"], right_on=["sector", "year"], how="left", ) test = test.with_columns( pl.col("li_in_typed") .fill_null(pl.col("li_in_all")) .alias("log_index_input"), pl.col("li_act_typed") .fill_null(pl.col("li_act_all")) .alias("log_index_actual"), ) else: # Unstratified index test = test.join( index.select( "sector", "year", pl.col("log_index").alias("log_index_input") ), left_on=["sector", "input_year"], right_on=["sector", "year"], how="left", ) test = test.join( index.select( "sector", "year", pl.col("log_index").alias("log_index_actual") ), left_on=["sector", "actual_year"], right_on=["sector", "year"], how="left", ) test = test.with_columns( ( pl.col("input_price").cast(pl.Float64) * (pl.col("log_index_actual") - pl.col("log_index_input")).exp() ) .fill_null(pl.col("input_price").cast(pl.Float64)) .alias("predicted"), ) return test def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict: valid = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0) actual = actual[valid] predicted = predicted[valid] ape = np.abs(predicted - actual) / actual signed_err = predicted - actual return { "MdAPE (%)": float(np.median(ape) * 100), "% within 10%": float(np.mean(ape <= 0.10) * 100), "% within 20%": float(np.mean(ape <= 0.20) * 100), "% within 30%": float(np.mean(ape <= 0.30) * 100), "MAE (£)": float(np.mean(np.abs(signed_err))), "Mean signed error (£)": float(np.mean(signed_err)), "n": int(len(actual)), } def print_metrics_table(metrics_by_stage: dict): print("\n" + "=" * 55) print("BACKTEST RESULTS") print("=" * 55) metric_names = [ "MdAPE (%)", "% within 10%", "% within 20%", "% within 30%", "MAE (£)", "Mean signed error (£)", "n", ] stages = list(metrics_by_stage.keys()) header = f"{'Metric':<25s}" for stage in stages: header += f" {stage:>14s}" print(header) print("-" * 55) for metric in metric_names: row = f"{metric:<25s}" for stage in stages: val = metrics_by_stage[stage][metric] if metric == "n": row += f" {val:>14,d}" elif "£" in metric: row += f" {val:>13,.0f}" else: row += f" {val:>13.1f}%" print(row) print("=" * 55) def main(): parser = argparse.ArgumentParser(description="Backtest price estimation model") parser.add_argument( "--input", type=Path, required=True, help="Path to wide.parquet" ) parser.add_argument( "--index", type=Path, required=True, help="Path to price_index.parquet" ) parser.add_argument( "--output", type=Path, required=True, help="Output backtest_results.parquet" ) parser.add_argument( "--hedonic-model", type=Path, default=None, help="Path to hedonic_model.json (optional)", ) args = parser.parse_args() index = pl.read_parquet(args.index) has_type_group = "type_group" in index.columns if has_type_group: print( f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, " f"{index['type_group'].n_unique()} type groups" ) else: print( f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors" ) has_hedonic = args.hedonic_model is not None test = extract_test_set(args.input, include_hedonic_cols=has_hedonic) print("\nPredicting with price index...") test = predict(test, index) # Compute and print metrics actual = test["actual_price"].to_numpy().astype(np.float64) metrics = { "Naive": compute_metrics( actual, test["input_price"].to_numpy().astype(np.float64) ), "Index": compute_metrics( actual, test["predicted"].to_numpy().astype(np.float64) ), } # Hedonic blending if has_hedonic: print("\nApplying hedonic blending...") with open(args.hedonic_model) as f: model = json.load(f) type_models = model["type_models"] # Identify eligible rows for hedonic estimate hedonic_mask = ( pl.col("Total floor area (sqm)").is_not_null() & (pl.col("Total floor area (sqm)") > 0) & pl.col("type_group").is_not_null() ) eligible_mask = test.select(hedonic_mask).to_series() eligible = test.filter(eligible_mask) if len(eligible) > 0: log_fa = np.log( np.maximum( eligible["Total floor area (sqm)"].to_numpy().astype(np.float64), 1.0, ) ) sectors = eligible["sector"].to_list() types = eligible["type_group"].to_list() # Per-type hedonic prediction log_hedonic = np.empty(len(eligible)) for i in range(len(eligible)): tm = type_models.get(types[i]) if tm is None: log_hedonic[i] = np.nan continue alpha = tm["sector_intercepts"].get( sectors[i], tm["national_intercept"] ) log_hedonic[i] = tm["beta_fa"] * log_fa[i] + alpha valid = np.isfinite(log_hedonic) # Hold years: input_year to actual_year (simulating real prediction) input_years = eligible["input_year"].to_numpy().astype(np.float64) actual_years = eligible["actual_year"].to_numpy().astype(np.float64) hold_years = np.maximum(actual_years - input_years, 0.0) log_index_pred = np.log( np.maximum(eligible["predicted"].to_numpy().astype(np.float64), 1.0) ) # Sweep tau values (only on valid hedonic rows) tau_values = [5.0, 10.0, 15.0, 20.0, 30.0] actual_eligible = eligible["actual_price"].to_numpy().astype(np.float64) best_tau = 15.0 best_mdape = float("inf") print(f"\n tau sweep ({valid.sum():,} eligible properties):") for tau in tau_values: blend_w = hold_years / (hold_years + tau) log_blended = np.where( valid, (1 - blend_w) * log_index_pred + blend_w * log_hedonic, log_index_pred, ) blended = np.exp(log_blended) m = compute_metrics(actual_eligible, blended) marker = "" if m["MdAPE (%)"] < best_mdape: best_mdape = m["MdAPE (%)"] best_tau = tau marker = " <-- best" print( f" tau={tau:>4.0f}: MdAPE={m['MdAPE (%)']:>5.1f}%, " f"within 10%={m['% within 10%']:>5.1f}%{marker}" ) print(f"\n Best tau = {best_tau}") # Compute blended predictions with best tau for full test set blend_w = hold_years / (hold_years + best_tau) log_blended = np.where( valid, (1 - blend_w) * log_index_pred + blend_w * log_hedonic, log_index_pred, ) blended_eligible = np.exp(log_blended) # Merge back: for non-eligible rows, use index prediction blended_all = test["predicted"].to_numpy().astype(np.float64).copy() eligible_indices = eligible_mask.arg_true() for i, idx in enumerate(eligible_indices): blended_all[idx] = blended_eligible[i] test = test.with_columns( pl.Series("blended", blended_all, dtype=pl.Float64), ) metrics["Blended"] = compute_metrics(actual, blended_all) print_metrics_table(metrics) # Save results result_cols = [ "Postcode", "sector", "input_year", "input_price", "actual_year", "actual_price", "predicted", ] if "blended" in test.columns: result_cols.append("blended") result = test.select(result_cols) result.write_parquet(args.output) size_mb = args.output.stat().st_size / (1024 * 1024) print(f"\nWrote {args.output} ({size_mb:.1f} MB)") print(f" {len(result):,} rows") if __name__ == "__main__": main()