Changes

2026-02-08 18:40:17 +00:00 · 2026-02-08 18:40:17 +00:00 · 5b68c8da04
commit 5b68c8da04
parent 6c90cf3c0f
14 changed files with 687 additions and 551 deletions
--- a/pipeline/transform/price_backtest.py
+++ b/pipeline/transform/price_backtest.py
@@ -3,6 +3,7 @@
 Test set: properties with 2+ sales where the last sale is 2022-2025.
 Uses the second-to-last sale as input, predicts the last sale price.
 Compares index-based prediction against a naive baseline (raw input price).
+Uses type-stratified index when available, falling back to "All" type.

 Output: backtest_results.parquet with predictions vs actuals.
 """
@@ -15,6 +16,17 @@ import polars as pl

 CURRENT_YEAR = 2025
 TEST_YEAR_MIN = 2022
+TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
+
+
+def type_group_expr():
+    return (
+        pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
+        .when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
+        .when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
+        .otherwise(pl.lit(None))
+        .alias("type_group")
+    )


 def extract_test_set(input_path: Path) -> pl.DataFrame:
@@ -22,13 +34,14 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
    print("Loading test set...")
    df = (
        pl.scan_parquet(input_path)
-        .select("Postcode", "historical_prices")
+        .select("Postcode", "historical_prices", "Property type")
        .filter(
            pl.col("Postcode").is_not_null(),
            pl.col("historical_prices").list.len() >= 2,
        )
        .with_columns(
            pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
+            type_group_expr(),
            # Last sale (ground truth)
            pl.col("historical_prices").list.last().struct.field("year").alias("actual_year"),
            pl.col("historical_prices").list.last().struct.field("price").alias("actual_price"),
@@ -49,21 +62,60 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:


 def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
-    """Index-based prediction: adjust input price by sector index change."""
-    # Join index at input year
-    test = test.join(
-        index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
-        left_on=["sector", "input_year"],
-        right_on=["sector", "year"],
-        how="left",
-    )
-    # Join index at actual year
-    test = test.join(
-        index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
-        left_on=["sector", "actual_year"],
-        right_on=["sector", "year"],
-        how="left",
-    )
+    """Index-based prediction with type-stratified fallback."""
+    has_type_group = "type_group" in index.columns
+
+    if has_type_group:
+        idx_typed = index.filter(pl.col("type_group") != "All")
+        idx_all = index.filter(pl.col("type_group") == "All")
+
+        # Join type-specific index at input year
+        test = test.join(
+            idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_in_typed")),
+            left_on=["sector", "type_group", "input_year"],
+            right_on=["sector", "type_group", "year"],
+            how="left",
+        )
+        # Join "All" index at input year
+        test = test.join(
+            idx_all.select("sector", "year", pl.col("log_index").alias("li_in_all")),
+            left_on=["sector", "input_year"],
+            right_on=["sector", "year"],
+            how="left",
+        )
+        # Join type-specific index at actual year
+        test = test.join(
+            idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_act_typed")),
+            left_on=["sector", "type_group", "actual_year"],
+            right_on=["sector", "type_group", "year"],
+            how="left",
+        )
+        # Join "All" index at actual year
+        test = test.join(
+            idx_all.select("sector", "year", pl.col("log_index").alias("li_act_all")),
+            left_on=["sector", "actual_year"],
+            right_on=["sector", "year"],
+            how="left",
+        )
+
+        test = test.with_columns(
+            pl.col("li_in_typed").fill_null(pl.col("li_in_all")).alias("log_index_input"),
+            pl.col("li_act_typed").fill_null(pl.col("li_act_all")).alias("log_index_actual"),
+        )
+    else:
+        # Unstratified index
+        test = test.join(
+            index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
+            left_on=["sector", "input_year"],
+            right_on=["sector", "year"],
+            how="left",
+        )
+        test = test.join(
+            index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
+            left_on=["sector", "actual_year"],
+            right_on=["sector", "year"],
+            how="left",
+        )

    test = test.with_columns(
        (
@@ -75,7 +127,6 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:


 def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
-    """Compute error metrics."""
    valid = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0)
    actual = actual[valid]
    predicted = predicted[valid]
@@ -95,7 +146,6 @@ def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:


 def print_metrics_table(metrics_by_stage: dict):
-    """Print a comparison table of metrics."""
    print("\n" + "=" * 55)
    print("BACKTEST RESULTS")
    print("=" * 55)
@@ -103,7 +153,6 @@ def print_metrics_table(metrics_by_stage: dict):
    metric_names = ["MdAPE (%)", "% within 10%", "% within 20%", "% within 30%", "MAE (£)", "Mean signed error (£)", "n"]
    stages = list(metrics_by_stage.keys())

-    # Header
    header = f"{'Metric':<25s}"
    for stage in stages:
        header += f" {stage:>14s}"
@@ -133,7 +182,12 @@ def main():
    args = parser.parse_args()

    index = pl.read_parquet(args.index)
-    print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
+    has_type_group = "type_group" in index.columns
+    if has_type_group:
+        print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
+              f"{index['type_group'].n_unique()} type groups")
+    else:
+        print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")

    test = extract_test_set(args.input)