Changes

2026-02-14 12:53:29 +00:00 · 2026-02-14 12:53:29 +00:00 · 128b3191e7
commit 128b3191e7
parent 3a3f899ea2
68 changed files with 28060 additions and 1152 deletions
--- a/pipeline/transform/price_backtest.py
+++ b/pipeline/transform/price_backtest.py
@@ -9,45 +9,60 @@ Output: backtest_results.parquet with predictions vs actuals.
 """

 import argparse
+import json
 from pathlib import Path

 import numpy as np
 import polars as pl

-CURRENT_YEAR = 2025
+from pipeline.transform._price_utils import (
+    CURRENT_YEAR,
+    HEDONIC_COLUMNS,
+    sector_expr,
+    type_group_expr,
+)
+
 TEST_YEAR_MIN = 2022
-TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]


-def type_group_expr():
-    return (
-        pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
-        .when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
-        .when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
-        .otherwise(pl.lit(None))
-        .alias("type_group")
-    )
-
-
-def extract_test_set(input_path: Path) -> pl.DataFrame:
+def extract_test_set(
+    input_path: Path, include_hedonic_cols: bool = False
+) -> pl.DataFrame:
    """Extract test pairs: second-to-last sale as input, last sale as ground truth."""
    print("Loading test set...")
+    cols = ["Postcode", "historical_prices", "Property type"]
+    if include_hedonic_cols:
+        for c in HEDONIC_COLUMNS:
+            if c not in cols:
+                cols.append(c)
    df = (
        pl.scan_parquet(input_path)
-        .select("Postcode", "historical_prices", "Property type")
+        .select(cols)
        .filter(
            pl.col("Postcode").is_not_null(),
            pl.col("historical_prices").list.len() >= 2,
        )
        .with_columns(
-            pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
+            sector_expr(),
            type_group_expr(),
            # Last sale (ground truth)
-            pl.col("historical_prices").list.last().struct.field("year").alias("actual_year"),
-            pl.col("historical_prices").list.last().struct.field("price").alias("actual_price"),
+            pl.col("historical_prices")
+            .list.last()
+            .struct.field("year")
+            .alias("actual_year"),
+            pl.col("historical_prices")
+            .list.last()
+            .struct.field("price")
+            .alias("actual_price"),
            # Second-to-last sale (input)
-            pl.col("historical_prices").list.get(-2).struct.field("year").alias("input_year"),
-            pl.col("historical_prices").list.get(-2).struct.field("price").alias("input_price"),
+            pl.col("historical_prices")
+            .list.get(-2)
+            .struct.field("year")
+            .alias("input_year"),
+            pl.col("historical_prices")
+            .list.get(-2)
+            .struct.field("price")
+            .alias("input_price"),
        )
        .filter(
            pl.col("actual_year") >= TEST_YEAR_MIN,
@@ -71,7 +86,9 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:

        # Join type-specific index at input year
        test = test.join(
-            idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_in_typed")),
+            idx_typed.select(
+                "sector", "type_group", "year", pl.col("log_index").alias("li_in_typed")
+            ),
            left_on=["sector", "type_group", "input_year"],
            right_on=["sector", "type_group", "year"],
            how="left",
@@ -85,7 +102,12 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
        )
        # Join type-specific index at actual year
        test = test.join(
-            idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_act_typed")),
+            idx_typed.select(
+                "sector",
+                "type_group",
+                "year",
+                pl.col("log_index").alias("li_act_typed"),
+            ),
            left_on=["sector", "type_group", "actual_year"],
            right_on=["sector", "type_group", "year"],
            how="left",
@@ -99,19 +121,27 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
        )

        test = test.with_columns(
-            pl.col("li_in_typed").fill_null(pl.col("li_in_all")).alias("log_index_input"),
-            pl.col("li_act_typed").fill_null(pl.col("li_act_all")).alias("log_index_actual"),
+            pl.col("li_in_typed")
+            .fill_null(pl.col("li_in_all"))
+            .alias("log_index_input"),
+            pl.col("li_act_typed")
+            .fill_null(pl.col("li_act_all"))
+            .alias("log_index_actual"),
        )
    else:
        # Unstratified index
        test = test.join(
-            index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
+            index.select(
+                "sector", "year", pl.col("log_index").alias("log_index_input")
+            ),
            left_on=["sector", "input_year"],
            right_on=["sector", "year"],
            how="left",
        )
        test = test.join(
-            index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
+            index.select(
+                "sector", "year", pl.col("log_index").alias("log_index_actual")
+            ),
            left_on=["sector", "actual_year"],
            right_on=["sector", "year"],
            how="left",
@@ -121,7 +151,9 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
        (
            pl.col("input_price").cast(pl.Float64)
            * (pl.col("log_index_actual") - pl.col("log_index_input")).exp()
-        ).fill_null(pl.col("input_price").cast(pl.Float64)).alias("predicted"),
+        )
+        .fill_null(pl.col("input_price").cast(pl.Float64))
+        .alias("predicted"),
    )
    return test

@@ -150,7 +182,15 @@ def print_metrics_table(metrics_by_stage: dict):
    print("BACKTEST RESULTS")
    print("=" * 55)

-    metric_names = ["MdAPE (%)", "% within 10%", "% within 20%", "% within 30%", "MAE (£)", "Mean signed error (£)", "n"]
+    metric_names = [
+        "MdAPE (%)",
+        "% within 10%",
+        "% within 20%",
+        "% within 30%",
+        "MAE (£)",
+        "Mean signed error (£)",
+        "n",
+    ]
    stages = list(metrics_by_stage.keys())

    header = f"{'Metric':<25s}"
@@ -176,20 +216,37 @@ def print_metrics_table(metrics_by_stage: dict):

 def main():
    parser = argparse.ArgumentParser(description="Backtest price estimation model")
-    parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
-    parser.add_argument("--index", type=Path, required=True, help="Path to price_index.parquet")
-    parser.add_argument("--output", type=Path, required=True, help="Output backtest_results.parquet")
+    parser.add_argument(
+        "--input", type=Path, required=True, help="Path to wide.parquet"
+    )
+    parser.add_argument(
+        "--index", type=Path, required=True, help="Path to price_index.parquet"
+    )
+    parser.add_argument(
+        "--output", type=Path, required=True, help="Output backtest_results.parquet"
+    )
+    parser.add_argument(
+        "--hedonic-model",
+        type=Path,
+        default=None,
+        help="Path to hedonic_model.json (optional)",
+    )
    args = parser.parse_args()

    index = pl.read_parquet(args.index)
    has_type_group = "type_group" in index.columns
    if has_type_group:
-        print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
-              f"{index['type_group'].n_unique()} type groups")
+        print(
+            f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
+            f"{index['type_group'].n_unique()} type groups"
+        )
    else:
-        print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
+        print(
+            f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors"
+        )

-    test = extract_test_set(args.input)
+    has_hedonic = args.hedonic_model is not None
+    test = extract_test_set(args.input, include_hedonic_cols=has_hedonic)

    print("\nPredicting with price index...")
    test = predict(test, index)
@@ -197,19 +254,126 @@ def main():
    # Compute and print metrics
    actual = test["actual_price"].to_numpy().astype(np.float64)
    metrics = {
-        "Naive": compute_metrics(actual, test["input_price"].to_numpy().astype(np.float64)),
-        "Index": compute_metrics(actual, test["predicted"].to_numpy().astype(np.float64)),
+        "Naive": compute_metrics(
+            actual, test["input_price"].to_numpy().astype(np.float64)
+        ),
+        "Index": compute_metrics(
+            actual, test["predicted"].to_numpy().astype(np.float64)
+        ),
    }

+    # Hedonic blending
+    if has_hedonic:
+        print("\nApplying hedonic blending...")
+        with open(args.hedonic_model) as f:
+            model = json.load(f)
+        type_models = model["type_models"]
+
+        # Identify eligible rows for hedonic estimate
+        hedonic_mask = (
+            pl.col("Total floor area (sqm)").is_not_null()
+            & (pl.col("Total floor area (sqm)") > 0)
+            & pl.col("type_group").is_not_null()
+        )
+        eligible_mask = test.select(hedonic_mask).to_series()
+        eligible = test.filter(eligible_mask)
+
+        if len(eligible) > 0:
+            log_fa = np.log(
+                np.maximum(
+                    eligible["Total floor area (sqm)"].to_numpy().astype(np.float64),
+                    1.0,
+                )
+            )
+            sectors = eligible["sector"].to_list()
+            types = eligible["type_group"].to_list()
+
+            # Per-type hedonic prediction
+            log_hedonic = np.empty(len(eligible))
+            for i in range(len(eligible)):
+                tm = type_models.get(types[i])
+                if tm is None:
+                    log_hedonic[i] = np.nan
+                    continue
+                alpha = tm["sector_intercepts"].get(
+                    sectors[i], tm["national_intercept"]
+                )
+                log_hedonic[i] = tm["beta_fa"] * log_fa[i] + alpha
+
+            valid = np.isfinite(log_hedonic)
+
+            # Hold years: input_year to actual_year (simulating real prediction)
+            input_years = eligible["input_year"].to_numpy().astype(np.float64)
+            actual_years = eligible["actual_year"].to_numpy().astype(np.float64)
+            hold_years = np.maximum(actual_years - input_years, 0.0)
+
+            log_index_pred = np.log(
+                np.maximum(eligible["predicted"].to_numpy().astype(np.float64), 1.0)
+            )
+
+            # Sweep tau values (only on valid hedonic rows)
+            tau_values = [5.0, 10.0, 15.0, 20.0, 30.0]
+            actual_eligible = eligible["actual_price"].to_numpy().astype(np.float64)
+            best_tau = 15.0
+            best_mdape = float("inf")
+
+            print(f"\n  tau sweep ({valid.sum():,} eligible properties):")
+            for tau in tau_values:
+                blend_w = hold_years / (hold_years + tau)
+                log_blended = np.where(
+                    valid,
+                    (1 - blend_w) * log_index_pred + blend_w * log_hedonic,
+                    log_index_pred,
+                )
+                blended = np.exp(log_blended)
+                m = compute_metrics(actual_eligible, blended)
+                marker = ""
+                if m["MdAPE (%)"] < best_mdape:
+                    best_mdape = m["MdAPE (%)"]
+                    best_tau = tau
+                    marker = " <-- best"
+                print(
+                    f"    tau={tau:>4.0f}: MdAPE={m['MdAPE (%)']:>5.1f}%, "
+                    f"within 10%={m['% within 10%']:>5.1f}%{marker}"
+                )
+
+            print(f"\n  Best tau = {best_tau}")
+
+            # Compute blended predictions with best tau for full test set
+            blend_w = hold_years / (hold_years + best_tau)
+            log_blended = np.where(
+                valid,
+                (1 - blend_w) * log_index_pred + blend_w * log_hedonic,
+                log_index_pred,
+            )
+            blended_eligible = np.exp(log_blended)
+
+            # Merge back: for non-eligible rows, use index prediction
+            blended_all = test["predicted"].to_numpy().astype(np.float64).copy()
+            eligible_indices = eligible_mask.arg_true()
+            for i, idx in enumerate(eligible_indices):
+                blended_all[idx] = blended_eligible[i]
+
+            test = test.with_columns(
+                pl.Series("blended", blended_all, dtype=pl.Float64),
+            )
+            metrics["Blended"] = compute_metrics(actual, blended_all)
+
    print_metrics_table(metrics)

    # Save results
-    result = test.select(
-        "Postcode", "sector",
-        "input_year", "input_price",
-        "actual_year", "actual_price",
+    result_cols = [
+        "Postcode",
+        "sector",
+        "input_year",
+        "input_price",
+        "actual_year",
+        "actual_price",
        "predicted",
-    )
+    ]
+    if "blended" in test.columns:
+        result_cols.append("blended")
+    result = test.select(result_cols)

    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)