Changes
This commit is contained in:
parent
6c90cf3c0f
commit
5b68c8da04
14 changed files with 687 additions and 551 deletions
|
|
@ -3,6 +3,7 @@
|
|||
Test set: properties with 2+ sales where the last sale is 2022-2025.
|
||||
Uses the second-to-last sale as input, predicts the last sale price.
|
||||
Compares index-based prediction against a naive baseline (raw input price).
|
||||
Uses type-stratified index when available, falling back to "All" type.
|
||||
|
||||
Output: backtest_results.parquet with predictions vs actuals.
|
||||
"""
|
||||
|
|
@ -15,6 +16,17 @@ import polars as pl
|
|||
|
||||
CURRENT_YEAR = 2025
|
||||
TEST_YEAR_MIN = 2022
|
||||
TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
|
||||
|
||||
|
||||
def type_group_expr():
    """Build the polars expression that derives the ``type_group`` column.

    The expression reads the "Property type" column and maps it as follows:

    * any value listed in ``TERRACE_TYPES`` -> "Terraced"
    * "Flats/Maisonettes"                   -> "Flats"
    * "Detached" / "Semi-Detached"          -> kept unchanged
    * anything else                         -> null
    """
    property_type = pl.col("Property type")

    # Name each predicate so the when/then chain below reads as a mapping table.
    is_terrace = property_type.is_in(TERRACE_TYPES)
    is_flat = property_type == "Flats/Maisonettes"
    is_standalone_house = property_type.is_in(["Detached", "Semi-Detached"])

    grouped = (
        pl.when(is_terrace).then(pl.lit("Terraced"))
        .when(is_flat).then(pl.lit("Flats"))
        .when(is_standalone_house).then(property_type)
        .otherwise(pl.lit(None))
    )
    return grouped.alias("type_group")
|
||||
|
||||
|
||||
def extract_test_set(input_path: Path) -> pl.DataFrame:
|
||||
|
|
@ -22,13 +34,14 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
|
|||
print("Loading test set...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select("Postcode", "historical_prices")
|
||||
.select("Postcode", "historical_prices", "Property type")
|
||||
.filter(
|
||||
pl.col("Postcode").is_not_null(),
|
||||
pl.col("historical_prices").list.len() >= 2,
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
|
||||
type_group_expr(),
|
||||
# Last sale (ground truth)
|
||||
pl.col("historical_prices").list.last().struct.field("year").alias("actual_year"),
|
||||
pl.col("historical_prices").list.last().struct.field("price").alias("actual_price"),
|
||||
|
|
@ -49,21 +62,60 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
|
|||
|
||||
|
||||
def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
||||
"""Index-based prediction: adjust input price by sector index change."""
|
||||
# Join index at input year
|
||||
test = test.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
|
||||
left_on=["sector", "input_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join index at actual year
|
||||
test = test.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
|
||||
left_on=["sector", "actual_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
"""Index-based prediction with type-stratified fallback."""
|
||||
has_type_group = "type_group" in index.columns
|
||||
|
||||
if has_type_group:
|
||||
idx_typed = index.filter(pl.col("type_group") != "All")
|
||||
idx_all = index.filter(pl.col("type_group") == "All")
|
||||
|
||||
# Join type-specific index at input year
|
||||
test = test.join(
|
||||
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_in_typed")),
|
||||
left_on=["sector", "type_group", "input_year"],
|
||||
right_on=["sector", "type_group", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join "All" index at input year
|
||||
test = test.join(
|
||||
idx_all.select("sector", "year", pl.col("log_index").alias("li_in_all")),
|
||||
left_on=["sector", "input_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join type-specific index at actual year
|
||||
test = test.join(
|
||||
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_act_typed")),
|
||||
left_on=["sector", "type_group", "actual_year"],
|
||||
right_on=["sector", "type_group", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join "All" index at actual year
|
||||
test = test.join(
|
||||
idx_all.select("sector", "year", pl.col("log_index").alias("li_act_all")),
|
||||
left_on=["sector", "actual_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
test = test.with_columns(
|
||||
pl.col("li_in_typed").fill_null(pl.col("li_in_all")).alias("log_index_input"),
|
||||
pl.col("li_act_typed").fill_null(pl.col("li_act_all")).alias("log_index_actual"),
|
||||
)
|
||||
else:
|
||||
# Unstratified index
|
||||
test = test.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
|
||||
left_on=["sector", "input_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
test = test.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
|
||||
left_on=["sector", "actual_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
test = test.with_columns(
|
||||
(
|
||||
|
|
@ -75,7 +127,6 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
|||
|
||||
|
||||
def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
|
||||
"""Compute error metrics."""
|
||||
valid = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0)
|
||||
actual = actual[valid]
|
||||
predicted = predicted[valid]
|
||||
|
|
@ -95,7 +146,6 @@ def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
|
|||
|
||||
|
||||
def print_metrics_table(metrics_by_stage: dict):
|
||||
"""Print a comparison table of metrics."""
|
||||
print("\n" + "=" * 55)
|
||||
print("BACKTEST RESULTS")
|
||||
print("=" * 55)
|
||||
|
|
@ -103,7 +153,6 @@ def print_metrics_table(metrics_by_stage: dict):
|
|||
metric_names = ["MdAPE (%)", "% within 10%", "% within 20%", "% within 30%", "MAE (£)", "Mean signed error (£)", "n"]
|
||||
stages = list(metrics_by_stage.keys())
|
||||
|
||||
# Header
|
||||
header = f"{'Metric':<25s}"
|
||||
for stage in stages:
|
||||
header += f" {stage:>14s}"
|
||||
|
|
@ -133,7 +182,12 @@ def main():
|
|||
args = parser.parse_args()
|
||||
|
||||
index = pl.read_parquet(args.index)
|
||||
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
|
||||
has_type_group = "type_group" in index.columns
|
||||
if has_type_group:
|
||||
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
|
||||
f"{index['type_group'].n_unique()} type groups")
|
||||
else:
|
||||
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
|
||||
|
||||
test = extract_test_set(args.input)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue