perfect-postcode/analyses/price_model_evaluation.ipynb
2026-03-15 21:22:28 +00:00

410 lines
13 KiB
Text

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Price Estimation Model Evaluation\n",
"\n",
"Evaluates the repeat-sales price index model that adjusts each property's\n",
"last known sale price to the current year using a hierarchical postcode-sector index."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import polars as pl\n",
"import numpy as np\n",
"import plotly.express as px\n",
"import plotly.graph_objects as go\n",
"\n",
"pl.Config.set_tbl_rows(20)\n",
"pl.Config.set_fmt_str_lengths(50)\n",
"\n",
"DATA_DIR = \"../property-data\"\n",
"\n",
"index_df = pl.read_parquet(f\"{DATA_DIR}/price_index.parquet\")\n",
"estimates_df = pl.read_parquet(f\"{DATA_DIR}/estimated_prices.parquet\")\n",
"backtest_df = pl.read_parquet(f\"{DATA_DIR}/backtest_results.parquet\")\n",
"\n",
"print(f\"Index: {len(index_df):,} rows, {index_df['sector'].n_unique():,} sectors\")\n",
"print(f\"Estimates: {len(estimates_df):,} rows\")\n",
"print(f\"Backtest: {len(backtest_df):,} rows\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Summary Metrics Table"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def compute_metrics(actual, predicted):\n",
" valid = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0)\n",
" a, p = actual[valid], predicted[valid]\n",
" ape = np.abs(p - a) / a\n",
" err = p - a\n",
" return {\n",
" \"MdAPE (%)\": f\"{np.median(ape) * 100:.1f}\",\n",
" \"% within 10%\": f\"{np.mean(ape <= 0.10) * 100:.1f}\",\n",
" \"% within 20%\": f\"{np.mean(ape <= 0.20) * 100:.1f}\",\n",
" \"% within 30%\": f\"{np.mean(ape <= 0.30) * 100:.1f}\",\n",
" \"MAE (\\u00a3)\": f\"{np.mean(np.abs(err)):,.0f}\",\n",
" \"Mean signed error (\\u00a3)\": f\"{np.mean(err):+,.0f}\",\n",
" \"n\": f\"{len(a):,}\",\n",
" }\n",
"\n",
"\n",
"actual = backtest_df[\"actual_price\"].to_numpy().astype(np.float64)\n",
"metrics = {\n",
" \"Naive\": compute_metrics(\n",
" actual, backtest_df[\"input_price\"].to_numpy().astype(np.float64)\n",
" ),\n",
" \"Index\": compute_metrics(\n",
" actual, backtest_df[\"predicted\"].to_numpy().astype(np.float64)\n",
" ),\n",
"}\n",
"\n",
"metrics_table = pl.DataFrame(\n",
" [\n",
" {\"Metric\": k, **{stage: v[k] for stage, v in metrics.items()}}\n",
" for k in list(metrics[\"Naive\"].keys())\n",
" ]\n",
")\n",
"metrics_table"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. National + Sample Sector Index Curves"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# National index (average across all sectors weighted by n_pairs)\n",
"national = (\n",
" index_df.group_by(\"year\")\n",
" .agg(\n",
" (pl.col(\"log_index\") * pl.col(\"n_pairs\")).sum() / pl.col(\"n_pairs\").sum(),\n",
" )\n",
" .sort(\"year\")\n",
" .with_columns(pl.lit(\"National\").alias(\"sector\"))\n",
")\n",
"\n",
"# Sample sectors: London, Manchester, rural, sparse\n",
"sample_sectors = [\"EC1A 1\", \"SW1A 1\", \"M1 1\", \"LL55 4\"]\n",
"available_sectors = index_df[\"sector\"].unique().to_list()\n",
"sample_sectors = [s for s in sample_sectors if s in available_sectors]\n",
"\n",
"# If not enough, pick some with high/low n_pairs\n",
"if len(sample_sectors) < 3:\n",
" sector_counts = (\n",
" index_df.group_by(\"sector\")\n",
" .agg(pl.col(\"n_pairs\").first())\n",
" .sort(\"n_pairs\", descending=True)\n",
" )\n",
" top = sector_counts.head(2)[\"sector\"].to_list()\n",
" bottom = sector_counts.filter(pl.col(\"n_pairs\") > 0).tail(2)[\"sector\"].to_list()\n",
" sample_sectors = list(set(sample_sectors + top + bottom))[:5]\n",
"\n",
"samples = index_df.filter(pl.col(\"sector\").is_in(sample_sectors))\n",
"\n",
"combined = pl.concat(\n",
" [\n",
" national.select(\"sector\", \"year\", \"log_index\"),\n",
" samples.select(\"sector\", \"year\", \"log_index\"),\n",
" ]\n",
")\n",
"\n",
"# Normalize: index = 100 at base year (earliest available)\n",
"combined = combined.with_columns(\n",
" (pl.col(\"log_index\").exp() * 100).alias(\"index_100\"),\n",
")\n",
"\n",
"fig = px.line(\n",
" combined.to_pandas(),\n",
" x=\"year\",\n",
" y=\"index_100\",\n",
" color=\"sector\",\n",
" title=\"Repeat-Sales Price Index (base year = 100)\",\n",
" labels={\"index_100\": \"Index (base=100)\", \"year\": \"Year\"},\n",
")\n",
"fig.update_layout(height=500)\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. APE Distribution: Naive vs Index"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"actual = backtest_df[\"actual_price\"].to_numpy().astype(np.float64)\n",
"\n",
"fig = go.Figure()\n",
"for label, col in [(\"Naive\", \"input_price\"), (\"Index\", \"predicted\")]:\n",
" pred = backtest_df[col].to_numpy().astype(np.float64)\n",
" valid = np.isfinite(pred) & (actual > 0)\n",
" ape = np.abs(pred[valid] - actual[valid]) / actual[valid]\n",
" ape = ape[ape <= 1.0] # clip for display\n",
" fig.add_trace(go.Histogram(x=ape * 100, name=label, opacity=0.6, nbinsx=100))\n",
"\n",
"fig.update_layout(\n",
" title=\"Absolute Percentage Error Distribution\",\n",
" xaxis_title=\"APE (%)\",\n",
" yaxis_title=\"Count\",\n",
" barmode=\"overlay\",\n",
" height=500,\n",
")\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Predicted vs Actual Scatter (log-log)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"np.random.seed(42)\n",
"n_sample = min(10_000, len(backtest_df))\n",
"idx = np.random.choice(len(backtest_df), n_sample, replace=False)\n",
"sample = backtest_df[idx.tolist()]\n",
"\n",
"actual_sample = sample[\"actual_price\"].to_numpy().astype(np.float64)\n",
"pred = sample[\"predicted\"].to_numpy().astype(np.float64)\n",
"\n",
"fig = go.Figure()\n",
"fig.add_trace(\n",
" go.Scattergl(\n",
" x=actual_sample,\n",
" y=pred,\n",
" mode=\"markers\",\n",
" marker=dict(size=2, opacity=0.3),\n",
" name=\"Index\",\n",
" )\n",
")\n",
"# 45-degree reference line\n",
"min_val = max(10_000, min(actual_sample.min(), np.nanmin(pred)))\n",
"max_val = min(5_000_000, max(actual_sample.max(), np.nanmax(pred)))\n",
"fig.add_trace(\n",
" go.Scatter(\n",
" x=[min_val, max_val],\n",
" y=[min_val, max_val],\n",
" mode=\"lines\",\n",
" line=dict(color=\"red\", dash=\"dash\"),\n",
" showlegend=False,\n",
" )\n",
")\n",
"fig.update_xaxes(type=\"log\", title_text=\"Actual (\\u00a3)\")\n",
"fig.update_yaxes(type=\"log\", title_text=\"Predicted (\\u00a3)\")\n",
"fig.update_layout(title=\"Predicted vs Actual Price (log scale, 10K sample)\", height=500)\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. MdAPE by Price Band"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bands = [\n",
" (0, 100_000, \"<100K\"),\n",
" (100_000, 200_000, \"100-200K\"),\n",
" (200_000, 300_000, \"200-300K\"),\n",
" (300_000, 500_000, \"300-500K\"),\n",
" (500_000, 1_000_000, \"500K-1M\"),\n",
" (1_000_000, float(\"inf\"), \"1M+\"),\n",
"]\n",
"\n",
"actual = backtest_df[\"actual_price\"].to_numpy().astype(np.float64)\n",
"pred = backtest_df[\"predicted\"].to_numpy().astype(np.float64)\n",
"naive = backtest_df[\"input_price\"].to_numpy().astype(np.float64)\n",
"\n",
"rows = []\n",
"for lo, hi, label in bands:\n",
" mask = (actual >= lo) & (actual < hi)\n",
" if mask.sum() == 0:\n",
" continue\n",
" for name, arr in [(\"Naive\", naive), (\"Index\", pred)]:\n",
" ape = np.abs(arr[mask] - actual[mask]) / actual[mask]\n",
" valid = np.isfinite(ape)\n",
" rows.append(\n",
" {\n",
" \"Price Band\": label,\n",
" \"Method\": name,\n",
" \"MdAPE (%)\": float(np.median(ape[valid]) * 100),\n",
" }\n",
" )\n",
"\n",
"band_df = pl.DataFrame(rows)\n",
"fig = px.bar(\n",
" band_df.to_pandas(),\n",
" x=\"Price Band\",\n",
" y=\"MdAPE (%)\",\n",
" color=\"Method\",\n",
" barmode=\"group\",\n",
" title=\"MdAPE by Price Band\",\n",
" category_orders={\"Price Band\": [b[2] for b in bands]},\n",
")\n",
"fig.update_layout(height=450)\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. MdAPE by Region (Top 20 Postcode Areas)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bt = backtest_df.with_columns(\n",
" pl.col(\"sector\").str.replace(r\"\\d.*$\", \"\").alias(\"area\"),\n",
")\n",
"\n",
"# Top 20 areas by volume\n",
"top_areas = (\n",
" bt.group_by(\"area\").len().sort(\"len\", descending=True).head(20)[\"area\"].to_list()\n",
")\n",
"\n",
"actual_np = bt[\"actual_price\"].to_numpy().astype(np.float64)\n",
"pred_np = bt[\"predicted\"].to_numpy().astype(np.float64)\n",
"naive_np = bt[\"input_price\"].to_numpy().astype(np.float64)\n",
"area_np = bt[\"area\"].to_numpy()\n",
"\n",
"rows = []\n",
"for area in top_areas:\n",
" mask = area_np == area\n",
" a = actual_np[mask]\n",
" for name, arr in [(\"Naive\", naive_np), (\"Index\", pred_np)]:\n",
" p = arr[mask]\n",
" valid = np.isfinite(p) & (a > 0)\n",
" ape = np.abs(p[valid] - a[valid]) / a[valid]\n",
" rows.append(\n",
" {\"Area\": area, \"Method\": name, \"MdAPE (%)\": float(np.median(ape) * 100)}\n",
" )\n",
"\n",
"area_df = pl.DataFrame(rows)\n",
"fig = px.bar(\n",
" area_df.to_pandas(),\n",
" x=\"Area\",\n",
" y=\"MdAPE (%)\",\n",
" color=\"Method\",\n",
" barmode=\"group\",\n",
" title=\"MdAPE by Postcode Area (Top 20 by Volume)\",\n",
" category_orders={\"Area\": top_areas},\n",
")\n",
"fig.update_layout(height=500)\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. MdAPE by Holding Period"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bt = backtest_df.with_columns(\n",
" (pl.col(\"actual_year\") - pl.col(\"input_year\")).alias(\"gap_years\"),\n",
")\n",
"\n",
"actual = bt[\"actual_price\"].to_numpy().astype(np.float64)\n",
"pred = bt[\"predicted\"].to_numpy().astype(np.float64)\n",
"naive = bt[\"input_price\"].to_numpy().astype(np.float64)\n",
"gaps = bt[\"gap_years\"].to_numpy()\n",
"max_gap = min(15, int(gaps.max()))\n",
"\n",
"rows = []\n",
"for gap in range(1, max_gap + 1):\n",
" mask = gaps == gap\n",
" if mask.sum() < 100:\n",
" continue\n",
" a = actual[mask]\n",
" for name, arr in [(\"Naive\", naive), (\"Index\", pred)]:\n",
" p = arr[mask]\n",
" valid = np.isfinite(p) & (a > 0)\n",
" ape = np.abs(p[valid] - a[valid]) / a[valid]\n",
" rows.append(\n",
" {\n",
" \"Gap (years)\": gap,\n",
" \"Method\": name,\n",
" \"MdAPE (%)\": float(np.median(ape) * 100),\n",
" }\n",
" )\n",
"\n",
"gap_df = pl.DataFrame(rows)\n",
"fig = px.line(\n",
" gap_df.to_pandas(),\n",
" x=\"Gap (years)\",\n",
" y=\"MdAPE (%)\",\n",
" color=\"Method\",\n",
" title=\"MdAPE by Holding Period (years between input and actual sale)\",\n",
" markers=True,\n",
")\n",
"fig.update_layout(height=450)\n",
"fig.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}