This commit is contained in:
Andras Schmelczer 2026-02-08 18:40:17 +00:00
parent 6c90cf3c0f
commit 5b68c8da04
14 changed files with 687 additions and 551 deletions

View file

@ -3,6 +3,7 @@
Test set: properties with 2+ sales where the last sale is 2022-2025.
Uses the second-to-last sale as input, predicts the last sale price.
Compares index-based prediction against a naive baseline (raw input price).
Uses type-stratified index when available, falling back to "All" type.
Output: backtest_results.parquet with predictions vs actuals.
"""
@ -15,6 +16,17 @@ import polars as pl
# NOTE(review): these two years match the "last sale is 2022-2025" test window
# described in the module docstring; their usage is outside this view — confirm.
CURRENT_YEAR = 2025
TEST_YEAR_MIN = 2022
# "Property type" values that type_group_expr() collapses into the single
# "Terraced" type group.
TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
def type_group_expr():
    """Build the polars expression that derives the ``type_group`` column.

    The expression reads the "Property type" column and maps it as follows:

    * any value listed in ``TERRACE_TYPES``  -> "Terraced"
    * "Flats/Maisonettes"                    -> "Flats"
    * "Detached" / "Semi-Detached"           -> kept unchanged
    * anything else                          -> null

    Returns:
        A polars expression aliased to "type_group".
    """
    property_type = pl.col("Property type")

    # Name each predicate so the mapping chain below reads as a lookup table.
    is_terrace = property_type.is_in(TERRACE_TYPES)
    is_flat = property_type == "Flats/Maisonettes"
    keeps_own_label = property_type.is_in(["Detached", "Semi-Detached"])

    grouped = (
        pl.when(is_terrace).then(pl.lit("Terraced"))
        .when(is_flat).then(pl.lit("Flats"))
        .when(keeps_own_label).then(property_type)
        # Unrecognised property types get no group (null).
        .otherwise(pl.lit(None))
    )
    return grouped.alias("type_group")
def extract_test_set(input_path: Path) -> pl.DataFrame:
@ -22,13 +34,14 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
print("Loading test set...")
df = (
pl.scan_parquet(input_path)
.select("Postcode", "historical_prices")
.select("Postcode", "historical_prices", "Property type")
.filter(
pl.col("Postcode").is_not_null(),
pl.col("historical_prices").list.len() >= 2,
)
.with_columns(
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
type_group_expr(),
# Last sale (ground truth)
pl.col("historical_prices").list.last().struct.field("year").alias("actual_year"),
pl.col("historical_prices").list.last().struct.field("price").alias("actual_price"),
@ -49,21 +62,60 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
"""Index-based prediction: adjust input price by sector index change."""
# Join index at input year
test = test.join(
index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
left_on=["sector", "input_year"],
right_on=["sector", "year"],
how="left",
)
# Join index at actual year
test = test.join(
index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
left_on=["sector", "actual_year"],
right_on=["sector", "year"],
how="left",
)
"""Index-based prediction with type-stratified fallback."""
has_type_group = "type_group" in index.columns
if has_type_group:
idx_typed = index.filter(pl.col("type_group") != "All")
idx_all = index.filter(pl.col("type_group") == "All")
# Join type-specific index at input year
test = test.join(
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_in_typed")),
left_on=["sector", "type_group", "input_year"],
right_on=["sector", "type_group", "year"],
how="left",
)
# Join "All" index at input year
test = test.join(
idx_all.select("sector", "year", pl.col("log_index").alias("li_in_all")),
left_on=["sector", "input_year"],
right_on=["sector", "year"],
how="left",
)
# Join type-specific index at actual year
test = test.join(
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_act_typed")),
left_on=["sector", "type_group", "actual_year"],
right_on=["sector", "type_group", "year"],
how="left",
)
# Join "All" index at actual year
test = test.join(
idx_all.select("sector", "year", pl.col("log_index").alias("li_act_all")),
left_on=["sector", "actual_year"],
right_on=["sector", "year"],
how="left",
)
test = test.with_columns(
pl.col("li_in_typed").fill_null(pl.col("li_in_all")).alias("log_index_input"),
pl.col("li_act_typed").fill_null(pl.col("li_act_all")).alias("log_index_actual"),
)
else:
# Unstratified index
test = test.join(
index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
left_on=["sector", "input_year"],
right_on=["sector", "year"],
how="left",
)
test = test.join(
index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
left_on=["sector", "actual_year"],
right_on=["sector", "year"],
how="left",
)
test = test.with_columns(
(
@ -75,7 +127,6 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
"""Compute error metrics."""
valid = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0)
actual = actual[valid]
predicted = predicted[valid]
@ -95,7 +146,6 @@ def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
def print_metrics_table(metrics_by_stage: dict):
"""Print a comparison table of metrics."""
print("\n" + "=" * 55)
print("BACKTEST RESULTS")
print("=" * 55)
@ -103,7 +153,6 @@ def print_metrics_table(metrics_by_stage: dict):
metric_names = ["MdAPE (%)", "% within 10%", "% within 20%", "% within 30%", "MAE (£)", "Mean signed error (£)", "n"]
stages = list(metrics_by_stage.keys())
# Header
header = f"{'Metric':<25s}"
for stage in stages:
header += f" {stage:>14s}"
@ -133,7 +182,12 @@ def main():
args = parser.parse_args()
index = pl.read_parquet(args.index)
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
has_type_group = "type_group" in index.columns
if has_type_group:
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
f"{index['type_group'].n_unique()} type groups")
else:
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
test = extract_test_set(args.input)