perfect-postcode/pipeline/transform/price_backtest.py
2026-02-15 22:39:53 +00:00

385 lines
12 KiB
Python

"""Backtesting: Evaluate price index model on held-out recent sales.
Test set: properties with 2+ sales where the last sale is 2022-2025.
Uses the second-to-last sale as input, predicts the last sale price.
Compares index-based prediction against a naive baseline (raw input price).
Uses type-stratified index when available, falling back to "All" type.
Output: backtest_results.parquet with predictions vs actuals.
"""
import argparse
import json
from pathlib import Path
import numpy as np
import polars as pl
from pipeline.transform._price_utils import (
CURRENT_YEAR,
HEDONIC_COLUMNS,
sector_expr,
type_group_expr,
)
# Earliest year the final (ground-truth) sale may fall in for a property to be
# included in the backtest set; used as the lower bound on ``actual_year``.
TEST_YEAR_MIN = 2022
def extract_test_set(
    input_path: Path, include_hedonic_cols: bool = False
) -> pl.DataFrame:
    """Build the backtest pairs from the wide property table.

    For every property with a postcode and at least two entries in
    ``historical_prices``, the last entry becomes the ground truth
    (``actual_year`` / ``actual_price``) and the second-to-last entry becomes
    the model input (``input_year`` / ``input_price``).

    Args:
        input_path: Parquet file to scan.
        include_hedonic_cols: When true, every name in ``HEDONIC_COLUMNS`` is
            read as well (duplicates of the base columns are skipped).

    Returns:
        A collected DataFrame restricted to pairs whose last sale is in
        ``TEST_YEAR_MIN`` or later, whose prices are both positive and whose
        last sale year is strictly after the input sale year.
    """
    print("Loading test set...")

    wanted = ["Postcode", "historical_prices", "Property type"]
    if include_hedonic_cols:
        # dict.fromkeys keeps first-seen order while dropping repeats.
        wanted = list(dict.fromkeys([*wanted, *HEDONIC_COLUMNS]))

    # NOTE(review): taking the last two list entries assumes
    # ``historical_prices`` is stored in chronological order - confirm upstream.
    history = pl.col("historical_prices")
    final_sale = history.list.last()
    prior_sale = history.list.get(-2)

    lazy = pl.scan_parquet(input_path).select(wanted)
    lazy = lazy.filter(
        pl.col("Postcode").is_not_null(),
        history.list.len() >= 2,
    )
    lazy = lazy.with_columns(
        # Helper expressions from _price_utils; presumably they add the
        # ``sector`` and ``type_group`` columns used by later stages.
        sector_expr(),
        type_group_expr(),
        # Ground truth: the most recent sale.
        final_sale.struct.field("year").alias("actual_year"),
        final_sale.struct.field("price").alias("actual_price"),
        # Model input: the sale immediately before it.
        prior_sale.struct.field("year").alias("input_year"),
        prior_sale.struct.field("price").alias("input_price"),
    )
    lazy = lazy.filter(
        pl.col("actual_year") >= TEST_YEAR_MIN,
        pl.col("input_price") > 0,
        pl.col("actual_price") > 0,
        pl.col("actual_year") > pl.col("input_year"),
    )

    pairs = lazy.collect()
    print(f" {len(pairs):,} test pairs (last sale {TEST_YEAR_MIN}-{CURRENT_YEAR})")
    return pairs
def _attach_log_index(frame, index, keys, year_col, alias):
    """Left-join ``log_index`` from *index* onto *frame* under the name *alias*.

    Rows are matched on *keys* plus ``frame[year_col] == index["year"]``.
    """
    lookup = index.select(*keys, "year", pl.col("log_index").alias(alias))
    return frame.join(
        lookup,
        left_on=[*keys, year_col],
        right_on=[*keys, "year"],
        how="left",
    )


def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
    """Scale each input price by the index movement between the two sale years.

    ``predicted = input_price * exp(log_index_actual - log_index_input)``.
    When either index value is missing the prediction falls back to the raw
    input price.

    If *index* has a ``type_group`` column, the type-specific value is used
    where present and the ``"All"`` value otherwise. That fallback is applied
    independently for the input year and the actual year.

    Args:
        test: Test pairs with ``sector``, ``input_year``, ``actual_year`` and
            ``input_price`` (plus ``type_group`` for a stratified index).
        index: Price index with ``sector``, ``year``, ``log_index`` and,
            optionally, ``type_group``.

    Returns:
        *test* with the joined index columns and a ``predicted`` column added.
    """
    year_columns = (("input_year", "in", "input"), ("actual_year", "act", "actual"))

    if "type_group" not in index.columns:
        # Unstratified index: a single lookup per sale year.
        for year_col, _, suffix in year_columns:
            test = _attach_log_index(
                test, index, ["sector"], year_col, f"log_index_{suffix}"
            )
    else:
        typed_index = index.filter(pl.col("type_group") != "All")
        overall_index = index.filter(pl.col("type_group") == "All")

        # Same join order as before (typed then "All", input year first) so
        # the resulting column order is unchanged.
        for year_col, tag, _ in year_columns:
            test = _attach_log_index(
                test, typed_index, ["sector", "type_group"], year_col, f"li_{tag}_typed"
            )
            test = _attach_log_index(
                test, overall_index, ["sector"], year_col, f"li_{tag}_all"
            )

        # NOTE(review): because the fallback is per year, a row can pair a
        # type-specific value at one year with an "All" value at the other.
        # Confirm the two series are comparable, or fall back as a pair.
        test = test.with_columns(
            pl.coalesce("li_in_typed", "li_in_all").alias("log_index_input"),
            pl.coalesce("li_act_typed", "li_act_all").alias("log_index_actual"),
        )

    base_price = pl.col("input_price").cast(pl.Float64)
    log_growth = pl.col("log_index_actual") - pl.col("log_index_input")
    scaled_price = base_price * log_growth.exp()

    # A null on either side of the difference makes the product null; those
    # rows keep the input price (i.e. the naive baseline).
    return test.with_columns(scaled_price.fill_null(base_price).alias("predicted"))
def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
    """Summarise prediction accuracy against actual sale prices.

    Pairs are dropped before scoring when either value is non-finite or when
    the actual price is not strictly positive (it is the APE denominator).

    Args:
        actual: Ground-truth prices (array-like).
        predicted: Predicted prices, aligned element-wise with ``actual``.

    Returns:
        Dict with keys ``"MdAPE (%)"``, ``"% within 10%"``, ``"% within 20%"``,
        ``"% within 30%"``, ``"MAE (£)"``, ``"Mean signed error (£)"`` and
        ``"n"`` (number of scored pairs). If no pair survives filtering, every
        metric is NaN and ``n`` is 0.
    """
    actual = np.asarray(actual, dtype=np.float64)
    predicted = np.asarray(predicted, dtype=np.float64)

    valid = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0)
    actual = actual[valid]
    predicted = predicted[valid]
    n = int(actual.size)

    if n == 0:
        # np.median / np.mean on an empty array return NaN but emit
        # RuntimeWarnings; return the same NaNs explicitly instead.
        nan = float("nan")
        return {
            "MdAPE (%)": nan,
            "% within 10%": nan,
            "% within 20%": nan,
            "% within 30%": nan,
            "MAE (£)": nan,
            "Mean signed error (£)": nan,
            "n": 0,
        }

    signed_err = predicted - actual
    ape = np.abs(signed_err) / actual
    return {
        "MdAPE (%)": float(np.median(ape) * 100),
        "% within 10%": float(np.mean(ape <= 0.10) * 100),
        "% within 20%": float(np.mean(ape <= 0.20) * 100),
        "% within 30%": float(np.mean(ape <= 0.30) * 100),
        "MAE (£)": float(np.mean(np.abs(signed_err))),
        "Mean signed error (£)": float(np.mean(signed_err)),
        "n": n,
    }
def print_metrics_table(metrics_by_stage: dict) -> None:
    """Print a fixed-width comparison table with one column per stage.

    Rows are the metrics in ``metric_names``; ``"n"`` is printed as a
    thousands-separated integer, metrics containing ``"£"`` as rounded
    thousands-separated amounts, and everything else as a one-decimal
    percentage.

    Args:
        metrics_by_stage: Maps a stage label (used as the column header) to a
            metrics dict containing every key in ``metric_names``.
    """
    label_width = 25
    column_width = 15  # one separating space plus a 14-character header field
    metric_names = [
        "MdAPE (%)",
        "% within 10%",
        "% within 20%",
        "% within 30%",
        "MAE (£)",
        "Mean signed error (£)",
        "n",
    ]
    stages = list(metrics_by_stage)

    # A fixed 55-character rule only spans two stage columns; size the rules
    # to the actual table so a third stage (e.g. "Blended") is covered too.
    rule_width = max(55, label_width + column_width * len(stages))
    heavy_rule = "=" * rule_width

    print("\n" + heavy_rule)
    print("BACKTEST RESULTS")
    print(heavy_rule)
    print(f"{'Metric':<25s}" + "".join(f" {stage:>14s}" for stage in stages))
    print("-" * rule_width)

    for metric in metric_names:
        row = f"{metric:<25s}"
        for stage in stages:
            val = metrics_by_stage[stage][metric]
            if metric == "n":
                row += f" {val:>14,d}"
            elif "£" in metric:
                row += f" {val:>13,.0f}"
            else:
                row += f" {val:>13.1f}%"
        print(row)
    print(heavy_rule)
def _hedonic_log_estimates(type_models, types, sectors, log_fa):
    """Return per-row hedonic log-price estimates (NaN where the type has no model)."""
    estimates = np.full(len(types), np.nan)
    for i, (type_group, sector) in enumerate(zip(types, sectors)):
        tm = type_models.get(type_group)
        if tm is None:
            continue
        # Sector-specific intercept when the model has one, else the national one.
        alpha = tm["sector_intercepts"].get(sector, tm["national_intercept"])
        estimates[i] = tm["beta_fa"] * log_fa[i] + alpha
    return estimates


def _blend_log_prices(log_index_pred, log_hedonic, hold_years, tau, valid):
    """Blend index and hedonic log prices with weight ``hold / (hold + tau)``.

    Rows where *valid* is false keep the index prediction unchanged.
    """
    blend_w = hold_years / (hold_years + tau)
    return np.where(
        valid,
        (1 - blend_w) * log_index_pred + blend_w * log_hedonic,
        log_index_pred,
    )


def main():
    """CLI entry point: run the backtest and write predictions vs actuals.

    Arguments:
        --input: wide.parquet with the property table.
        --index: price_index.parquet.
        --output: destination parquet for the per-property results.
        --hedonic-model: optional hedonic_model.json; when given, a blended
            index/hedonic prediction is evaluated as a third stage.
    """
    parser = argparse.ArgumentParser(description="Backtest price estimation model")
    parser.add_argument(
        "--input", type=Path, required=True, help="Path to wide.parquet"
    )
    parser.add_argument(
        "--index", type=Path, required=True, help="Path to price_index.parquet"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output backtest_results.parquet"
    )
    parser.add_argument(
        "--hedonic-model",
        type=Path,
        default=None,
        help="Path to hedonic_model.json (optional)",
    )
    args = parser.parse_args()

    index = pl.read_parquet(args.index)
    summary = (
        f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors"
    )
    if "type_group" in index.columns:
        summary += f", {index['type_group'].n_unique()} type groups"
    print(summary)

    has_hedonic = args.hedonic_model is not None
    test = extract_test_set(args.input, include_hedonic_cols=has_hedonic)

    print("\nPredicting with price index...")
    test = predict(test, index)

    # Compute metrics for the naive baseline and the index prediction.
    actual = test["actual_price"].to_numpy().astype(np.float64)
    metrics = {
        "Naive": compute_metrics(
            actual, test["input_price"].to_numpy().astype(np.float64)
        ),
        "Index": compute_metrics(
            actual, test["predicted"].to_numpy().astype(np.float64)
        ),
    }

    # Hedonic blending
    if has_hedonic:
        print("\nApplying hedonic blending...")
        with open(args.hedonic_model, encoding="utf-8") as f:
            model = json.load(f)
        type_models = model["type_models"]

        # Rows eligible for a hedonic estimate: positive floor area and a type.
        hedonic_mask = (
            pl.col("Total floor area (sqm)").is_not_null()
            & (pl.col("Total floor area (sqm)") > 0)
            & pl.col("type_group").is_not_null()
        )
        eligible_mask = test.select(hedonic_mask).to_series()
        eligible = test.filter(eligible_mask)

        if len(eligible) > 0:
            log_fa = np.log(
                np.maximum(
                    eligible["Total floor area (sqm)"].to_numpy().astype(np.float64),
                    1.0,
                )
            )
            log_hedonic = _hedonic_log_estimates(
                type_models,
                eligible["type_group"].to_list(),
                eligible["sector"].to_list(),
                log_fa,
            )
            valid = np.isfinite(log_hedonic)

            # Hold years: input_year to actual_year (simulating real prediction)
            input_years = eligible["input_year"].to_numpy().astype(np.float64)
            actual_years = eligible["actual_year"].to_numpy().astype(np.float64)
            hold_years = np.maximum(actual_years - input_years, 0.0)
            log_index_pred = np.log(
                np.maximum(eligible["predicted"].to_numpy().astype(np.float64), 1.0)
            )

            # Sweep tau values, scoring only rows with a valid hedonic
            # estimate: the other rows are unaffected by tau and would only
            # dilute the comparison (and the printed count refers to them).
            # NOTE(review): tau is selected on the same rows the "Blended"
            # stage is then scored on, so that column is an in-sample figure.
            tau_values = [5.0, 10.0, 15.0, 20.0, 30.0]
            actual_eligible = eligible["actual_price"].to_numpy().astype(np.float64)
            best_tau = 15.0
            best_mdape = float("inf")
            print(f"\n tau sweep ({valid.sum():,} eligible properties):")
            for tau in tau_values:
                blended = np.exp(
                    _blend_log_prices(
                        log_index_pred, log_hedonic, hold_years, tau, valid
                    )
                )
                m = compute_metrics(actual_eligible[valid], blended[valid])
                marker = ""
                if m["MdAPE (%)"] < best_mdape:
                    best_mdape = m["MdAPE (%)"]
                    best_tau = tau
                    marker = " <-- best"
                print(
                    f" tau={tau:>4.0f}: MdAPE={m['MdAPE (%)']:>5.1f}%, "
                    f"within 10%={m['% within 10%']:>5.1f}%{marker}"
                )
            print(f"\n Best tau = {best_tau}")

            # Blended predictions at the best tau for the eligible rows.
            blended_eligible = np.exp(
                _blend_log_prices(
                    log_index_pred, log_hedonic, hold_years, best_tau, valid
                )
            )

            # Merge back: non-eligible rows keep the index prediction.
            # arg_true() yields the eligible row positions in the same order
            # that filter() kept them, so one scatter assignment suffices.
            blended_all = np.array(test["predicted"].to_numpy(), dtype=np.float64)
            blended_all[eligible_mask.arg_true().to_numpy()] = blended_eligible
            test = test.with_columns(
                pl.Series("blended", blended_all, dtype=pl.Float64),
            )
            metrics["Blended"] = compute_metrics(actual, blended_all)

    print_metrics_table(metrics)

    # Save results
    result_cols = [
        "Postcode",
        "sector",
        "input_year",
        "input_price",
        "actual_year",
        "actual_price",
        "predicted",
    ]
    if "blended" in test.columns:
        result_cols.append("blended")
    result = test.select(result_cols)
    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
    print(f" {len(result):,} rows")


if __name__ == "__main__":
    main()