Fmt
This commit is contained in:
parent
479ef92236
commit
c38d654ac7
44 changed files with 2526 additions and 701 deletions
|
|
@ -54,25 +54,32 @@
|
|||
" ape = np.abs(p - a) / a\n",
|
||||
" err = p - a\n",
|
||||
" return {\n",
|
||||
" \"MdAPE (%)\": f\"{np.median(ape)*100:.1f}\",\n",
|
||||
" \"% within 10%\": f\"{np.mean(ape <= 0.10)*100:.1f}\",\n",
|
||||
" \"% within 20%\": f\"{np.mean(ape <= 0.20)*100:.1f}\",\n",
|
||||
" \"% within 30%\": f\"{np.mean(ape <= 0.30)*100:.1f}\",\n",
|
||||
" \"MdAPE (%)\": f\"{np.median(ape) * 100:.1f}\",\n",
|
||||
" \"% within 10%\": f\"{np.mean(ape <= 0.10) * 100:.1f}\",\n",
|
||||
" \"% within 20%\": f\"{np.mean(ape <= 0.20) * 100:.1f}\",\n",
|
||||
" \"% within 30%\": f\"{np.mean(ape <= 0.30) * 100:.1f}\",\n",
|
||||
" \"MAE (\\u00a3)\": f\"{np.mean(np.abs(err)):,.0f}\",\n",
|
||||
" \"Mean signed error (\\u00a3)\": f\"{np.mean(err):+,.0f}\",\n",
|
||||
" \"n\": f\"{len(a):,}\",\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"actual = backtest_df[\"actual_price\"].to_numpy().astype(np.float64)\n",
|
||||
"metrics = {\n",
|
||||
" \"Naive\": compute_metrics(actual, backtest_df[\"input_price\"].to_numpy().astype(np.float64)),\n",
|
||||
" \"Index\": compute_metrics(actual, backtest_df[\"predicted\"].to_numpy().astype(np.float64)),\n",
|
||||
" \"Naive\": compute_metrics(\n",
|
||||
" actual, backtest_df[\"input_price\"].to_numpy().astype(np.float64)\n",
|
||||
" ),\n",
|
||||
" \"Index\": compute_metrics(\n",
|
||||
" actual, backtest_df[\"predicted\"].to_numpy().astype(np.float64)\n",
|
||||
" ),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"metrics_table = pl.DataFrame([\n",
|
||||
" {\"Metric\": k, **{stage: v[k] for stage, v in metrics.items()}}\n",
|
||||
" for k in list(metrics[\"Naive\"].keys())\n",
|
||||
"])\n",
|
||||
"metrics_table = pl.DataFrame(\n",
|
||||
" [\n",
|
||||
" {\"Metric\": k, **{stage: v[k] for stage, v in metrics.items()}}\n",
|
||||
" for k in list(metrics[\"Naive\"].keys())\n",
|
||||
" ]\n",
|
||||
")\n",
|
||||
"metrics_table"
|
||||
]
|
||||
},
|
||||
|
|
@ -91,8 +98,7 @@
|
|||
"source": [
|
||||
"# National index (average across all sectors weighted by n_pairs)\n",
|
||||
"national = (\n",
|
||||
" index_df\n",
|
||||
" .group_by(\"year\")\n",
|
||||
" index_df.group_by(\"year\")\n",
|
||||
" .agg(\n",
|
||||
" (pl.col(\"log_index\") * pl.col(\"n_pairs\")).sum() / pl.col(\"n_pairs\").sum(),\n",
|
||||
" )\n",
|
||||
|
|
@ -107,14 +113,23 @@
|
|||
"\n",
|
||||
"# If not enough, pick some with high/low n_pairs\n",
|
||||
"if len(sample_sectors) < 3:\n",
|
||||
" sector_counts = index_df.group_by(\"sector\").agg(pl.col(\"n_pairs\").first()).sort(\"n_pairs\", descending=True)\n",
|
||||
" sector_counts = (\n",
|
||||
" index_df.group_by(\"sector\")\n",
|
||||
" .agg(pl.col(\"n_pairs\").first())\n",
|
||||
" .sort(\"n_pairs\", descending=True)\n",
|
||||
" )\n",
|
||||
" top = sector_counts.head(2)[\"sector\"].to_list()\n",
|
||||
" bottom = sector_counts.filter(pl.col(\"n_pairs\") > 0).tail(2)[\"sector\"].to_list()\n",
|
||||
" sample_sectors = list(set(sample_sectors + top + bottom))[:5]\n",
|
||||
"\n",
|
||||
"samples = index_df.filter(pl.col(\"sector\").is_in(sample_sectors))\n",
|
||||
"\n",
|
||||
"combined = pl.concat([national.select(\"sector\", \"year\", \"log_index\"), samples.select(\"sector\", \"year\", \"log_index\")])\n",
|
||||
"combined = pl.concat(\n",
|
||||
" [\n",
|
||||
" national.select(\"sector\", \"year\", \"log_index\"),\n",
|
||||
" samples.select(\"sector\", \"year\", \"log_index\"),\n",
|
||||
" ]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Normalize: index = 100 at base year (earliest available)\n",
|
||||
"combined = combined.with_columns(\n",
|
||||
|
|
@ -122,7 +137,10 @@
|
|||
")\n",
|
||||
"\n",
|
||||
"fig = px.line(\n",
|
||||
" combined.to_pandas(), x=\"year\", y=\"index_100\", color=\"sector\",\n",
|
||||
" combined.to_pandas(),\n",
|
||||
" x=\"year\",\n",
|
||||
" y=\"index_100\",\n",
|
||||
" color=\"sector\",\n",
|
||||
" title=\"Repeat-Sales Price Index (base year = 100)\",\n",
|
||||
" labels={\"index_100\": \"Index (base=100)\", \"year\": \"Year\"},\n",
|
||||
")\n",
|
||||
|
|
@ -155,8 +173,10 @@
|
|||
"\n",
|
||||
"fig.update_layout(\n",
|
||||
" title=\"Absolute Percentage Error Distribution\",\n",
|
||||
" xaxis_title=\"APE (%)\", yaxis_title=\"Count\",\n",
|
||||
" barmode=\"overlay\", height=500,\n",
|
||||
" xaxis_title=\"APE (%)\",\n",
|
||||
" yaxis_title=\"Count\",\n",
|
||||
" barmode=\"overlay\",\n",
|
||||
" height=500,\n",
|
||||
")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -183,17 +203,27 @@
|
|||
"pred = sample[\"predicted\"].to_numpy().astype(np.float64)\n",
|
||||
"\n",
|
||||
"fig = go.Figure()\n",
|
||||
"fig.add_trace(go.Scattergl(\n",
|
||||
" x=actual_sample, y=pred, mode=\"markers\",\n",
|
||||
" marker=dict(size=2, opacity=0.3), name=\"Index\",\n",
|
||||
"))\n",
|
||||
"fig.add_trace(\n",
|
||||
" go.Scattergl(\n",
|
||||
" x=actual_sample,\n",
|
||||
" y=pred,\n",
|
||||
" mode=\"markers\",\n",
|
||||
" marker=dict(size=2, opacity=0.3),\n",
|
||||
" name=\"Index\",\n",
|
||||
" )\n",
|
||||
")\n",
|
||||
"# 45-degree reference line\n",
|
||||
"min_val = max(10_000, min(actual_sample.min(), np.nanmin(pred)))\n",
|
||||
"max_val = min(5_000_000, max(actual_sample.max(), np.nanmax(pred)))\n",
|
||||
"fig.add_trace(go.Scatter(\n",
|
||||
" x=[min_val, max_val], y=[min_val, max_val],\n",
|
||||
" mode=\"lines\", line=dict(color=\"red\", dash=\"dash\"), showlegend=False,\n",
|
||||
"))\n",
|
||||
"fig.add_trace(\n",
|
||||
" go.Scatter(\n",
|
||||
" x=[min_val, max_val],\n",
|
||||
" y=[min_val, max_val],\n",
|
||||
" mode=\"lines\",\n",
|
||||
" line=dict(color=\"red\", dash=\"dash\"),\n",
|
||||
" showlegend=False,\n",
|
||||
" )\n",
|
||||
")\n",
|
||||
"fig.update_xaxes(type=\"log\", title_text=\"Actual (\\u00a3)\")\n",
|
||||
"fig.update_yaxes(type=\"log\", title_text=\"Predicted (\\u00a3)\")\n",
|
||||
"fig.update_layout(title=\"Predicted vs Actual Price (log scale, 10K sample)\", height=500)\n",
|
||||
|
|
@ -234,12 +264,22 @@
|
|||
" for name, arr in [(\"Naive\", naive), (\"Index\", pred)]:\n",
|
||||
" ape = np.abs(arr[mask] - actual[mask]) / actual[mask]\n",
|
||||
" valid = np.isfinite(ape)\n",
|
||||
" rows.append({\"Price Band\": label, \"Method\": name, \"MdAPE (%)\": float(np.median(ape[valid]) * 100)})\n",
|
||||
" rows.append(\n",
|
||||
" {\n",
|
||||
" \"Price Band\": label,\n",
|
||||
" \"Method\": name,\n",
|
||||
" \"MdAPE (%)\": float(np.median(ape[valid]) * 100),\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"band_df = pl.DataFrame(rows)\n",
|
||||
"fig = px.bar(\n",
|
||||
" band_df.to_pandas(), x=\"Price Band\", y=\"MdAPE (%)\", color=\"Method\",\n",
|
||||
" barmode=\"group\", title=\"MdAPE by Price Band\",\n",
|
||||
" band_df.to_pandas(),\n",
|
||||
" x=\"Price Band\",\n",
|
||||
" y=\"MdAPE (%)\",\n",
|
||||
" color=\"Method\",\n",
|
||||
" barmode=\"group\",\n",
|
||||
" title=\"MdAPE by Price Band\",\n",
|
||||
" category_orders={\"Price Band\": [b[2] for b in bands]},\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=450)\n",
|
||||
|
|
@ -264,7 +304,9 @@
|
|||
")\n",
|
||||
"\n",
|
||||
"# Top 20 areas by volume\n",
|
||||
"top_areas = bt.group_by(\"area\").len().sort(\"len\", descending=True).head(20)[\"area\"].to_list()\n",
|
||||
"top_areas = (\n",
|
||||
" bt.group_by(\"area\").len().sort(\"len\", descending=True).head(20)[\"area\"].to_list()\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"actual_np = bt[\"actual_price\"].to_numpy().astype(np.float64)\n",
|
||||
"pred_np = bt[\"predicted\"].to_numpy().astype(np.float64)\n",
|
||||
|
|
@ -279,12 +321,18 @@
|
|||
" p = arr[mask]\n",
|
||||
" valid = np.isfinite(p) & (a > 0)\n",
|
||||
" ape = np.abs(p[valid] - a[valid]) / a[valid]\n",
|
||||
" rows.append({\"Area\": area, \"Method\": name, \"MdAPE (%)\": float(np.median(ape) * 100)})\n",
|
||||
" rows.append(\n",
|
||||
" {\"Area\": area, \"Method\": name, \"MdAPE (%)\": float(np.median(ape) * 100)}\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"area_df = pl.DataFrame(rows)\n",
|
||||
"fig = px.bar(\n",
|
||||
" area_df.to_pandas(), x=\"Area\", y=\"MdAPE (%)\", color=\"Method\",\n",
|
||||
" barmode=\"group\", title=\"MdAPE by Postcode Area (Top 20 by Volume)\",\n",
|
||||
" area_df.to_pandas(),\n",
|
||||
" x=\"Area\",\n",
|
||||
" y=\"MdAPE (%)\",\n",
|
||||
" color=\"Method\",\n",
|
||||
" barmode=\"group\",\n",
|
||||
" title=\"MdAPE by Postcode Area (Top 20 by Volume)\",\n",
|
||||
" category_orders={\"Area\": top_areas},\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=500)\n",
|
||||
|
|
@ -324,11 +372,20 @@
|
|||
" p = arr[mask]\n",
|
||||
" valid = np.isfinite(p) & (a > 0)\n",
|
||||
" ape = np.abs(p[valid] - a[valid]) / a[valid]\n",
|
||||
" rows.append({\"Gap (years)\": gap, \"Method\": name, \"MdAPE (%)\": float(np.median(ape) * 100)})\n",
|
||||
" rows.append(\n",
|
||||
" {\n",
|
||||
" \"Gap (years)\": gap,\n",
|
||||
" \"Method\": name,\n",
|
||||
" \"MdAPE (%)\": float(np.median(ape) * 100),\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"gap_df = pl.DataFrame(rows)\n",
|
||||
"fig = px.line(\n",
|
||||
" gap_df.to_pandas(), x=\"Gap (years)\", y=\"MdAPE (%)\", color=\"Method\",\n",
|
||||
" gap_df.to_pandas(),\n",
|
||||
" x=\"Gap (years)\",\n",
|
||||
" y=\"MdAPE (%)\",\n",
|
||||
" color=\"Method\",\n",
|
||||
" title=\"MdAPE by Holding Period (years between input and actual sale)\",\n",
|
||||
" markers=True,\n",
|
||||
")\n",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue