# %% [markdown]
# # Price Estimation Model Evaluation
#
# Evaluates the repeat-sales price index model that adjusts each property's
# last known sale price to the current year using a hierarchical postcode-sector index.

# %%
import polars as pl
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Display options for polars tables rendered in this notebook.
pl.Config.set_tbl_rows(20)
pl.Config.set_fmt_str_lengths(50)

DATA_DIR = "../property-data"

# Load the three parquet inputs in one pass: the price index, the estimated
# prices, and the backtest results.
index_df, estimates_df, backtest_df = (
    pl.read_parquet(parquet_path)
    for parquet_path in (
        f"{DATA_DIR}/price_index.parquet",
        f"{DATA_DIR}/estimated_prices.parquet",
        f"{DATA_DIR}/backtest_results.parquet",
    )
)

# One summary line per input so a reader can sanity-check the row counts.
row_summary = [
    f"Index: {len(index_df):,} rows, {index_df['sector'].n_unique():,} sectors",
    f"Estimates: {len(estimates_df):,} rows",
    f"Backtest: {len(backtest_df):,} rows",
]
print("\n".join(row_summary))
# %% [markdown]
# ## 1. Summary Metrics Table

# %%
def compute_metrics(actual, predicted):
    """Summarise prediction accuracy as display-ready strings.

    Only rows where both values are finite and ``actual`` is positive are used.

    Args:
        actual: NumPy array of observed values.
        predicted: NumPy array of estimates, aligned element-wise with ``actual``.

    Returns:
        Dict mapping a metric label to a formatted string: median absolute
        percentage error, share of rows with APE within 10/20/30%, mean
        absolute error, mean signed error (predicted minus actual), and the
        number of rows used.
    """
    usable = (actual > 0) & np.isfinite(actual) & np.isfinite(predicted)
    truth = actual[usable]
    signed_err = predicted[usable] - truth
    abs_err = np.abs(signed_err)
    rel_err = abs_err / truth

    def share_within(limit):
        # Percentage of usable rows whose APE is at most `limit`.
        return np.mean(rel_err <= limit) * 100

    return {
        "MdAPE (%)": f"{np.median(rel_err)*100:.1f}",
        "% within 10%": f"{share_within(0.10):.1f}",
        "% within 20%": f"{share_within(0.20):.1f}",
        "% within 30%": f"{share_within(0.30):.1f}",
        "MAE (\u00a3)": f"{np.mean(abs_err):,.0f}",
        "Mean signed error (\u00a3)": f"{np.mean(signed_err):+,.0f}",
        "n": f"{len(truth):,}",
    }


actual = backtest_df["actual_price"].to_numpy().astype(np.float64)

# "Naive" carries the input price forward unchanged; "Index" is the model output.
metrics = {}
for stage, column in (("Naive", "input_price"), ("Index", "predicted")):
    stage_pred = backtest_df[column].to_numpy().astype(np.float64)
    metrics[stage] = compute_metrics(actual, stage_pred)

# Pivot to one row per metric with one column per stage.
table_rows = []
for metric_name in metrics["Naive"]:
    table_row = {"Metric": metric_name}
    for stage, stage_metrics in metrics.items():
        table_row[stage] = stage_metrics[metric_name]
    table_rows.append(table_row)

metrics_table = pl.DataFrame(table_rows)
metrics_table
# %% [markdown]
# ## 2. National + Sample Sector Index Curves

# %%
# National index: for each year, the n_pairs-weighted mean of log_index across
# all sectors. The result is aliased explicitly so the later
# select("log_index") does not depend on polars' implicit output-name rule.
national = (
    index_df
    .group_by("year")
    .agg(
        (
            (pl.col("log_index") * pl.col("n_pairs")).sum() / pl.col("n_pairs").sum()
        ).alias("log_index"),
    )
    .sort("year")
    .with_columns(pl.lit("National").alias("sector"))
)

# Sample sectors: London, Manchester, rural, sparse
sample_sectors = ["EC1A 1", "SW1A 1", "M1 1", "LL55 4"]
available_sectors = index_df["sector"].unique().to_list()
available_lookup = set(available_sectors)  # O(1) membership tests
sample_sectors = [s for s in sample_sectors if s in available_lookup]

# If not enough of the preferred sectors exist, pad with the two sectors with
# the most pairs and the two with the fewest (but non-zero) pairs.
if len(sample_sectors) < 3:
    sector_counts = (
        index_df
        .group_by("sector")
        .agg(pl.col("n_pairs").first())
        # The secondary key on sector makes ties deterministic: group_by output
        # order is not guaranteed, so sorting on n_pairs alone is not stable
        # across runs.
        .sort(["n_pairs", "sector"], descending=[True, False])
    )
    top = sector_counts.head(2)["sector"].to_list()
    bottom = sector_counts.filter(pl.col("n_pairs") > 0).tail(2)["sector"].to_list()
    # dict.fromkeys de-duplicates while preserving order. The previous
    # list(set(...))[:5] truncated an unordered set, so which sectors were
    # plotted could change between kernel restarts.
    sample_sectors = list(dict.fromkeys(sample_sectors + top + bottom))[:5]

samples = index_df.filter(pl.col("sector").is_in(sample_sectors))

combined = pl.concat([
    national.select("sector", "year", "log_index"),
    samples.select("sector", "year", "log_index"),
])

# Convert the log index to an index level: 100 * exp(log_index), i.e. exactly
# 100 wherever log_index == 0. No rebasing is applied here.
# NOTE(review): the chart title assumes log_index is already 0 in the base
# year for every series - confirm against the index builder.
combined = combined.with_columns(
    (pl.col("log_index").exp() * 100).alias("index_100"),
)

fig = px.line(
    combined.to_pandas(), x="year", y="index_100", color="sector",
    title="Repeat-Sales Price Index (base year = 100)",
    labels={"index_100": "Index (base=100)", "year": "Year"},
)
fig.update_layout(height=500)
fig.show()
# %% [markdown]
# ## 3. APE Distribution: Naive vs Index

# %%
actual = backtest_df["actual_price"].to_numpy().astype(np.float64)

fig = go.Figure()
for label, col in [("Naive", "input_price"), ("Index", "predicted")]:
    pred = backtest_df[col].to_numpy().astype(np.float64)
    # Same validity rule as compute_metrics: both values finite, actual positive.
    valid = np.isfinite(pred) & np.isfinite(actual) & (actual > 0)
    ape = np.abs(pred[valid] - actual[valid]) / actual[valid]
    # Drop (not clip) rows with APE above 100% so the long tail does not
    # stretch the x-axis.
    ape = ape[ape <= 1.0]
    # A shared bingroup gives both overlaid traces identical bin edges, so bar
    # heights are directly comparable.
    fig.add_trace(go.Histogram(
        x=ape * 100, name=label, opacity=0.6, nbinsx=100, bingroup="ape",
    ))

fig.update_layout(
    title="Absolute Percentage Error Distribution",
    xaxis_title="APE (%)", yaxis_title="Count",
    barmode="overlay", height=500,
)
fig.show()

# %% [markdown]
# ## 4. Predicted vs Actual Scatter (log-log)

# %%
np.random.seed(42)  # fixed seed so the plotted sample is reproducible
n_sample = min(10_000, len(backtest_df))
idx = np.random.choice(len(backtest_df), n_sample, replace=False)
sample = backtest_df[idx.tolist()]

actual_sample = sample["actual_price"].to_numpy().astype(np.float64)
pred = sample["predicted"].to_numpy().astype(np.float64)

fig = go.Figure()
fig.add_trace(go.Scattergl(
    x=actual_sample, y=pred, mode="markers",
    marker=dict(size=2, opacity=0.3), name="Index",
))

# 45-degree reference line. Its extent comes only from finite, positive values
# (the only ones a log axis can draw), clamped to 10K-5M so a few extreme
# prices do not stretch the line.
plottable = np.concatenate([actual_sample, pred])
plottable = plottable[np.isfinite(plottable) & (plottable > 0)]
if plottable.size:
    min_val = max(10_000, float(plottable.min()))
    max_val = min(5_000_000, float(plottable.max()))
    if min_val < max_val:
        fig.add_trace(go.Scatter(
            x=[min_val, max_val], y=[min_val, max_val],
            mode="lines", line=dict(color="red", dash="dash"), showlegend=False,
        ))

fig.update_xaxes(type="log", title_text="Actual (\u00a3)")
fig.update_yaxes(type="log", title_text="Predicted (\u00a3)")
# Report the real sample size: it is smaller than 10,000 when the backtest has
# fewer rows.
fig.update_layout(
    title=f"Predicted vs Actual Price (log scale, {n_sample:,}-point sample)",
    height=500,
)
fig.show()

# %% [markdown]
# ## 5. MdAPE by Price Band

# %%
# (lower bound inclusive, upper bound exclusive, label), applied to the actual price.
bands = [
    (0, 100_000, "<100K"),
    (100_000, 200_000, "100-200K"),
    (200_000, 300_000, "200-300K"),
    (300_000, 500_000, "300-500K"),
    (500_000, 1_000_000, "500K-1M"),
    (1_000_000, float("inf"), "1M+"),
]

actual = backtest_df["actual_price"].to_numpy().astype(np.float64)
pred = backtest_df["predicted"].to_numpy().astype(np.float64)
naive = backtest_df["input_price"].to_numpy().astype(np.float64)

rows = []
for lo, hi, label in bands:
    in_band = (actual >= lo) & (actual < hi)
    for name, arr in [("Naive", naive), ("Index", pred)]:
        # Filter before dividing: the lowest band starts at 0, so dividing
        # first would hit actual == 0 and emit divide-by-zero warnings.
        valid = in_band & (actual > 0) & np.isfinite(arr)
        if not valid.any():
            # Nothing usable for this band/method; skip rather than plot NaN.
            continue
        ape = np.abs(arr[valid] - actual[valid]) / actual[valid]
        rows.append({"Price Band": label, "Method": name, "MdAPE (%)": float(np.median(ape) * 100)})

band_df = pl.DataFrame(rows)
fig = px.bar(
    band_df.to_pandas(), x="Price Band", y="MdAPE (%)", color="Method",
    barmode="group", title="MdAPE by Price Band",
    category_orders={"Price Band": [b[2] for b in bands]},
)
fig.update_layout(height=450)
fig.show()
# %% [markdown]
# ## 6. MdAPE by Region (Top 20 Postcode Areas)

# %%
# Postcode area = the sector with everything from its first digit onward removed.
bt = backtest_df.with_columns(
    area=pl.col("sector").str.replace(r"\d.*$", ""),
)

# Top 20 areas by volume
area_volume = bt.group_by("area").len().sort("len", descending=True)
top_areas = area_volume.head(20)["area"].to_list()

actual_np = bt["actual_price"].to_numpy().astype(np.float64)
pred_np = bt["predicted"].to_numpy().astype(np.float64)
naive_np = bt["input_price"].to_numpy().astype(np.float64)
area_np = bt["area"].to_numpy()

method_arrays = (("Naive", naive_np), ("Index", pred_np))

rows = []
for area_code in top_areas:
    in_area = area_np == area_code
    area_actual = actual_np[in_area]
    positive_actual = area_actual > 0
    for method, values in method_arrays:
        area_pred = values[in_area]
        # Use only rows with a finite prediction and a positive actual price.
        usable = positive_actual & np.isfinite(area_pred)
        abs_pct_err = np.abs(area_pred[usable] - area_actual[usable]) / area_actual[usable]
        rows.append({
            "Area": area_code,
            "Method": method,
            "MdAPE (%)": float(np.median(abs_pct_err) * 100),
        })

area_df = pl.DataFrame(rows)
fig = px.bar(
    area_df.to_pandas(), x="Area", y="MdAPE (%)", color="Method",
    barmode="group", title="MdAPE by Postcode Area (Top 20 by Volume)",
    category_orders={"Area": top_areas},
)
fig.update_layout(height=500)
fig.show()
# %% [markdown]
# ## 7. MdAPE by Holding Period

# %%
MAX_GAP_YEARS = 15       # longest holding period shown on the chart
MIN_ROWS_PER_GAP = 100   # skip gap buckets with fewer backtest rows than this

bt = backtest_df.with_columns(
    (pl.col("actual_year") - pl.col("input_year")).alias("gap_years"),
)

actual = bt["actual_price"].to_numpy().astype(np.float64)
pred = bt["predicted"].to_numpy().astype(np.float64)
naive = bt["input_price"].to_numpy().astype(np.float64)
# Cast to float so a null year (which becomes NaN) can be filtered out below.
gaps = bt["gap_years"].to_numpy().astype(np.float64)

# int(gaps.max()) raised when any gap was NaN or the frame was empty, so take
# the maximum over the known gaps only.
known_gaps = gaps[np.isfinite(gaps)]
max_gap = min(MAX_GAP_YEARS, int(known_gaps.max())) if known_gaps.size else 0

rows = []
for gap in range(1, max_gap + 1):
    mask = gaps == gap
    if mask.sum() < MIN_ROWS_PER_GAP:
        continue
    a = actual[mask]
    for name, arr in [("Naive", naive), ("Index", pred)]:
        p = arr[mask]
        valid = np.isfinite(p) & (a > 0)
        if not valid.any():
            # Nothing usable for this gap/method; skip rather than plot NaN.
            continue
        ape = np.abs(p[valid] - a[valid]) / a[valid]
        rows.append({"Gap (years)": gap, "Method": name, "MdAPE (%)": float(np.median(ape) * 100)})

# The explicit schema keeps the expected columns even when no gap bucket
# qualified, so the plot call below still receives the columns it references.
gap_df = pl.DataFrame(
    rows,
    schema={"Gap (years)": pl.Int64, "Method": pl.Utf8, "MdAPE (%)": pl.Float64},
)
fig = px.line(
    gap_df.to_pandas(), x="Gap (years)", y="MdAPE (%)", color="Method",
    title="MdAPE by Holding Period (years between input and actual sale)",
    markers=True,
)
fig.update_layout(height=450)
fig.show()