Changes
This commit is contained in:
parent
3a3f899ea2
commit
128b3191e7
68 changed files with 28060 additions and 1152 deletions
|
|
@ -9,45 +9,60 @@ Output: backtest_results.parquet with predictions vs actuals.
|
|||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
CURRENT_YEAR = 2025
|
||||
from pipeline.transform._price_utils import (
|
||||
CURRENT_YEAR,
|
||||
HEDONIC_COLUMNS,
|
||||
sector_expr,
|
||||
type_group_expr,
|
||||
)
|
||||
|
||||
TEST_YEAR_MIN = 2022
|
||||
TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
|
||||
|
||||
|
||||
def type_group_expr():
    """Build the polars expression that derives the ``type_group`` column.

    The expression reads the "Property type" column and maps it as follows,
    with the branches checked in this order:

    * any label listed in ``TERRACE_TYPES`` -> "Terraced"
    * "Flats/Maisonettes" -> "Flats"
    * "Detached" or "Semi-Detached" -> kept unchanged
    * any other value -> null
    """
    property_type = pl.col("Property type")

    # Name each predicate so the when/then chain below reads like a table.
    is_terrace = property_type.is_in(TERRACE_TYPES)
    is_flat = property_type == "Flats/Maisonettes"
    is_house = property_type.is_in(["Detached", "Semi-Detached"])

    grouped = (
        pl.when(is_terrace)
        .then(pl.lit("Terraced"))
        .when(is_flat)
        .then(pl.lit("Flats"))
        .when(is_house)
        .then(property_type)
        .otherwise(pl.lit(None))
    )
    return grouped.alias("type_group")
|
||||
|
||||
|
||||
def extract_test_set(input_path: Path) -> pl.DataFrame:
|
||||
def extract_test_set(
|
||||
input_path: Path, include_hedonic_cols: bool = False
|
||||
) -> pl.DataFrame:
|
||||
"""Extract test pairs: second-to-last sale as input, last sale as ground truth."""
|
||||
print("Loading test set...")
|
||||
cols = ["Postcode", "historical_prices", "Property type"]
|
||||
if include_hedonic_cols:
|
||||
for c in HEDONIC_COLUMNS:
|
||||
if c not in cols:
|
||||
cols.append(c)
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select("Postcode", "historical_prices", "Property type")
|
||||
.select(cols)
|
||||
.filter(
|
||||
pl.col("Postcode").is_not_null(),
|
||||
pl.col("historical_prices").list.len() >= 2,
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
|
||||
sector_expr(),
|
||||
type_group_expr(),
|
||||
# Last sale (ground truth)
|
||||
pl.col("historical_prices").list.last().struct.field("year").alias("actual_year"),
|
||||
pl.col("historical_prices").list.last().struct.field("price").alias("actual_price"),
|
||||
pl.col("historical_prices")
|
||||
.list.last()
|
||||
.struct.field("year")
|
||||
.alias("actual_year"),
|
||||
pl.col("historical_prices")
|
||||
.list.last()
|
||||
.struct.field("price")
|
||||
.alias("actual_price"),
|
||||
# Second-to-last sale (input)
|
||||
pl.col("historical_prices").list.get(-2).struct.field("year").alias("input_year"),
|
||||
pl.col("historical_prices").list.get(-2).struct.field("price").alias("input_price"),
|
||||
pl.col("historical_prices")
|
||||
.list.get(-2)
|
||||
.struct.field("year")
|
||||
.alias("input_year"),
|
||||
pl.col("historical_prices")
|
||||
.list.get(-2)
|
||||
.struct.field("price")
|
||||
.alias("input_price"),
|
||||
)
|
||||
.filter(
|
||||
pl.col("actual_year") >= TEST_YEAR_MIN,
|
||||
|
|
@ -71,7 +86,9 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
|||
|
||||
# Join type-specific index at input year
|
||||
test = test.join(
|
||||
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_in_typed")),
|
||||
idx_typed.select(
|
||||
"sector", "type_group", "year", pl.col("log_index").alias("li_in_typed")
|
||||
),
|
||||
left_on=["sector", "type_group", "input_year"],
|
||||
right_on=["sector", "type_group", "year"],
|
||||
how="left",
|
||||
|
|
@ -85,7 +102,12 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
|||
)
|
||||
# Join type-specific index at actual year
|
||||
test = test.join(
|
||||
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_act_typed")),
|
||||
idx_typed.select(
|
||||
"sector",
|
||||
"type_group",
|
||||
"year",
|
||||
pl.col("log_index").alias("li_act_typed"),
|
||||
),
|
||||
left_on=["sector", "type_group", "actual_year"],
|
||||
right_on=["sector", "type_group", "year"],
|
||||
how="left",
|
||||
|
|
@ -99,19 +121,27 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
|||
)
|
||||
|
||||
test = test.with_columns(
|
||||
pl.col("li_in_typed").fill_null(pl.col("li_in_all")).alias("log_index_input"),
|
||||
pl.col("li_act_typed").fill_null(pl.col("li_act_all")).alias("log_index_actual"),
|
||||
pl.col("li_in_typed")
|
||||
.fill_null(pl.col("li_in_all"))
|
||||
.alias("log_index_input"),
|
||||
pl.col("li_act_typed")
|
||||
.fill_null(pl.col("li_act_all"))
|
||||
.alias("log_index_actual"),
|
||||
)
|
||||
else:
|
||||
# Unstratified index
|
||||
test = test.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
|
||||
index.select(
|
||||
"sector", "year", pl.col("log_index").alias("log_index_input")
|
||||
),
|
||||
left_on=["sector", "input_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
test = test.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
|
||||
index.select(
|
||||
"sector", "year", pl.col("log_index").alias("log_index_actual")
|
||||
),
|
||||
left_on=["sector", "actual_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
|
|
@ -121,7 +151,9 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
|||
(
|
||||
pl.col("input_price").cast(pl.Float64)
|
||||
* (pl.col("log_index_actual") - pl.col("log_index_input")).exp()
|
||||
).fill_null(pl.col("input_price").cast(pl.Float64)).alias("predicted"),
|
||||
)
|
||||
.fill_null(pl.col("input_price").cast(pl.Float64))
|
||||
.alias("predicted"),
|
||||
)
|
||||
return test
|
||||
|
||||
|
|
@ -150,7 +182,15 @@ def print_metrics_table(metrics_by_stage: dict):
|
|||
print("BACKTEST RESULTS")
|
||||
print("=" * 55)
|
||||
|
||||
metric_names = ["MdAPE (%)", "% within 10%", "% within 20%", "% within 30%", "MAE (£)", "Mean signed error (£)", "n"]
|
||||
metric_names = [
|
||||
"MdAPE (%)",
|
||||
"% within 10%",
|
||||
"% within 20%",
|
||||
"% within 30%",
|
||||
"MAE (£)",
|
||||
"Mean signed error (£)",
|
||||
"n",
|
||||
]
|
||||
stages = list(metrics_by_stage.keys())
|
||||
|
||||
header = f"{'Metric':<25s}"
|
||||
|
|
@ -176,20 +216,37 @@ def print_metrics_table(metrics_by_stage: dict):
|
|||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Backtest price estimation model")
|
||||
parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
|
||||
parser.add_argument("--index", type=Path, required=True, help="Path to price_index.parquet")
|
||||
parser.add_argument("--output", type=Path, required=True, help="Output backtest_results.parquet")
|
||||
parser.add_argument(
|
||||
"--input", type=Path, required=True, help="Path to wide.parquet"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--index", type=Path, required=True, help="Path to price_index.parquet"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output backtest_results.parquet"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hedonic-model",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Path to hedonic_model.json (optional)",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
index = pl.read_parquet(args.index)
|
||||
has_type_group = "type_group" in index.columns
|
||||
if has_type_group:
|
||||
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
|
||||
f"{index['type_group'].n_unique()} type groups")
|
||||
print(
|
||||
f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
|
||||
f"{index['type_group'].n_unique()} type groups"
|
||||
)
|
||||
else:
|
||||
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
|
||||
print(
|
||||
f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors"
|
||||
)
|
||||
|
||||
test = extract_test_set(args.input)
|
||||
has_hedonic = args.hedonic_model is not None
|
||||
test = extract_test_set(args.input, include_hedonic_cols=has_hedonic)
|
||||
|
||||
print("\nPredicting with price index...")
|
||||
test = predict(test, index)
|
||||
|
|
@ -197,19 +254,126 @@ def main():
|
|||
# Compute and print metrics
|
||||
actual = test["actual_price"].to_numpy().astype(np.float64)
|
||||
metrics = {
|
||||
"Naive": compute_metrics(actual, test["input_price"].to_numpy().astype(np.float64)),
|
||||
"Index": compute_metrics(actual, test["predicted"].to_numpy().astype(np.float64)),
|
||||
"Naive": compute_metrics(
|
||||
actual, test["input_price"].to_numpy().astype(np.float64)
|
||||
),
|
||||
"Index": compute_metrics(
|
||||
actual, test["predicted"].to_numpy().astype(np.float64)
|
||||
),
|
||||
}
|
||||
|
||||
# Hedonic blending
|
||||
if has_hedonic:
|
||||
print("\nApplying hedonic blending...")
|
||||
with open(args.hedonic_model) as f:
|
||||
model = json.load(f)
|
||||
type_models = model["type_models"]
|
||||
|
||||
# Identify eligible rows for hedonic estimate
|
||||
hedonic_mask = (
|
||||
pl.col("Total floor area (sqm)").is_not_null()
|
||||
& (pl.col("Total floor area (sqm)") > 0)
|
||||
& pl.col("type_group").is_not_null()
|
||||
)
|
||||
eligible_mask = test.select(hedonic_mask).to_series()
|
||||
eligible = test.filter(eligible_mask)
|
||||
|
||||
if len(eligible) > 0:
|
||||
log_fa = np.log(
|
||||
np.maximum(
|
||||
eligible["Total floor area (sqm)"].to_numpy().astype(np.float64),
|
||||
1.0,
|
||||
)
|
||||
)
|
||||
sectors = eligible["sector"].to_list()
|
||||
types = eligible["type_group"].to_list()
|
||||
|
||||
# Per-type hedonic prediction
|
||||
log_hedonic = np.empty(len(eligible))
|
||||
for i in range(len(eligible)):
|
||||
tm = type_models.get(types[i])
|
||||
if tm is None:
|
||||
log_hedonic[i] = np.nan
|
||||
continue
|
||||
alpha = tm["sector_intercepts"].get(
|
||||
sectors[i], tm["national_intercept"]
|
||||
)
|
||||
log_hedonic[i] = tm["beta_fa"] * log_fa[i] + alpha
|
||||
|
||||
valid = np.isfinite(log_hedonic)
|
||||
|
||||
# Hold years: input_year to actual_year (simulating real prediction)
|
||||
input_years = eligible["input_year"].to_numpy().astype(np.float64)
|
||||
actual_years = eligible["actual_year"].to_numpy().astype(np.float64)
|
||||
hold_years = np.maximum(actual_years - input_years, 0.0)
|
||||
|
||||
log_index_pred = np.log(
|
||||
np.maximum(eligible["predicted"].to_numpy().astype(np.float64), 1.0)
|
||||
)
|
||||
|
||||
# Sweep tau values (only on valid hedonic rows)
|
||||
tau_values = [5.0, 10.0, 15.0, 20.0, 30.0]
|
||||
actual_eligible = eligible["actual_price"].to_numpy().astype(np.float64)
|
||||
best_tau = 15.0
|
||||
best_mdape = float("inf")
|
||||
|
||||
print(f"\n tau sweep ({valid.sum():,} eligible properties):")
|
||||
for tau in tau_values:
|
||||
blend_w = hold_years / (hold_years + tau)
|
||||
log_blended = np.where(
|
||||
valid,
|
||||
(1 - blend_w) * log_index_pred + blend_w * log_hedonic,
|
||||
log_index_pred,
|
||||
)
|
||||
blended = np.exp(log_blended)
|
||||
m = compute_metrics(actual_eligible, blended)
|
||||
marker = ""
|
||||
if m["MdAPE (%)"] < best_mdape:
|
||||
best_mdape = m["MdAPE (%)"]
|
||||
best_tau = tau
|
||||
marker = " <-- best"
|
||||
print(
|
||||
f" tau={tau:>4.0f}: MdAPE={m['MdAPE (%)']:>5.1f}%, "
|
||||
f"within 10%={m['% within 10%']:>5.1f}%{marker}"
|
||||
)
|
||||
|
||||
print(f"\n Best tau = {best_tau}")
|
||||
|
||||
# Compute blended predictions with best tau for full test set
|
||||
blend_w = hold_years / (hold_years + best_tau)
|
||||
log_blended = np.where(
|
||||
valid,
|
||||
(1 - blend_w) * log_index_pred + blend_w * log_hedonic,
|
||||
log_index_pred,
|
||||
)
|
||||
blended_eligible = np.exp(log_blended)
|
||||
|
||||
# Merge back: for non-eligible rows, use index prediction
|
||||
blended_all = test["predicted"].to_numpy().astype(np.float64).copy()
|
||||
eligible_indices = eligible_mask.arg_true()
|
||||
for i, idx in enumerate(eligible_indices):
|
||||
blended_all[idx] = blended_eligible[i]
|
||||
|
||||
test = test.with_columns(
|
||||
pl.Series("blended", blended_all, dtype=pl.Float64),
|
||||
)
|
||||
metrics["Blended"] = compute_metrics(actual, blended_all)
|
||||
|
||||
print_metrics_table(metrics)
|
||||
|
||||
# Save results
|
||||
result = test.select(
|
||||
"Postcode", "sector",
|
||||
"input_year", "input_price",
|
||||
"actual_year", "actual_price",
|
||||
result_cols = [
|
||||
"Postcode",
|
||||
"sector",
|
||||
"input_year",
|
||||
"input_price",
|
||||
"actual_year",
|
||||
"actual_price",
|
||||
"predicted",
|
||||
)
|
||||
]
|
||||
if "blended" in test.columns:
|
||||
result_cols.append("blended")
|
||||
result = test.select(result_cols)
|
||||
|
||||
result.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue