This commit is contained in:
Andras Schmelczer 2026-03-15 21:22:28 +00:00
parent 479ef92236
commit c38d654ac7
44 changed files with 2526 additions and 701 deletions

View file

@ -54,11 +54,15 @@
}
],
"source": [
"r5_bank = pl.read_parquet(\"../property-data/travel-times/transit/000000-bank-tube-station.parquet\")\n",
"r5_bank = pl.read_parquet(\n",
" \"../property-data/travel-times/transit/000000-bank-tube-station.parquet\"\n",
")\n",
"manual_bank = pl.read_parquet(\"../manual-data/journey_times_bank.parquet\")\n",
"\n",
"print(f\"R5 Bank: {r5_bank.shape[0]:,} postcodes\")\n",
"print(f\"Manual Bank: {manual_bank.shape[0]:,} postcodes ({manual_bank['public_transport_easy_minutes'].null_count():,} null easy)\")"
"print(\n",
" f\"Manual Bank: {manual_bank.shape[0]:,} postcodes ({manual_bank['public_transport_easy_minutes'].null_count():,} null easy)\"\n",
")"
]
},
{
@ -116,25 +120,49 @@
"source": [
"# Join on postcode, keep only rows where both sources have values\n",
"bank = (\n",
" r5_bank\n",
" .join(manual_bank, left_on=\"pcds\", right_on=\"postcode\", how=\"inner\")\n",
" r5_bank.join(manual_bank, left_on=\"pcds\", right_on=\"postcode\", how=\"inner\")\n",
" .filter(\n",
" pl.col(\"public_transport_easy_minutes\").is_not_null()\n",
" & pl.col(\"public_transport_quick_minutes\").is_not_null()\n",
" )\n",
" .with_columns([\n",
" # Signed error: R5 - Manual (positive = R5 is slower)\n",
" (pl.col(\"travel_minutes\").cast(pl.Float64) - pl.col(\"public_transport_easy_minutes\").cast(pl.Float64)).alias(\"error_easy\"),\n",
" (pl.col(\"best_minutes\").cast(pl.Float64) - pl.col(\"public_transport_quick_minutes\").cast(pl.Float64)).alias(\"error_quick\"),\n",
" # Absolute error\n",
" (pl.col(\"travel_minutes\").cast(pl.Float64) - pl.col(\"public_transport_easy_minutes\").cast(pl.Float64)).abs().alias(\"abs_error_easy\"),\n",
" (pl.col(\"best_minutes\").cast(pl.Float64) - pl.col(\"public_transport_quick_minutes\").cast(pl.Float64)).abs().alias(\"abs_error_quick\"),\n",
" ])\n",
" .with_columns(\n",
" [\n",
" # Signed error: R5 - Manual (positive = R5 is slower)\n",
" (\n",
" pl.col(\"travel_minutes\").cast(pl.Float64)\n",
" - pl.col(\"public_transport_easy_minutes\").cast(pl.Float64)\n",
" ).alias(\"error_easy\"),\n",
" (\n",
" pl.col(\"best_minutes\").cast(pl.Float64)\n",
" - pl.col(\"public_transport_quick_minutes\").cast(pl.Float64)\n",
" ).alias(\"error_quick\"),\n",
" # Absolute error\n",
" (\n",
" pl.col(\"travel_minutes\").cast(pl.Float64)\n",
" - pl.col(\"public_transport_easy_minutes\").cast(pl.Float64)\n",
" )\n",
" .abs()\n",
" .alias(\"abs_error_easy\"),\n",
" (\n",
" pl.col(\"best_minutes\").cast(pl.Float64)\n",
" - pl.col(\"public_transport_quick_minutes\").cast(pl.Float64)\n",
" )\n",
" .abs()\n",
" .alias(\"abs_error_quick\"),\n",
" ]\n",
" )\n",
")\n",
"\n",
"print(f\"Joined (non-null): {bank.shape[0]:,} postcodes\")\n",
"bank.select(\"pcds\", \"travel_minutes\", \"public_transport_easy_minutes\", \"error_easy\",\n",
" \"best_minutes\", \"public_transport_quick_minutes\", \"error_quick\").head(10)"
"bank.select(\n",
" \"pcds\",\n",
" \"travel_minutes\",\n",
" \"public_transport_easy_minutes\",\n",
" \"error_easy\",\n",
" \"best_minutes\",\n",
" \"public_transport_quick_minutes\",\n",
" \"error_quick\",\n",
").head(10)"
]
},
{
@ -196,18 +224,23 @@
" percentiles = [5, 25, 50, 80, 90, 95, 99]\n",
" rows = []\n",
" for p in percentiles:\n",
" rows.append({\n",
" \"percentile\": f\"p{p}\",\n",
" f\"{label} signed error\": round(float(np.percentile(col, p)), 1),\n",
" f\"{label} absolute error\": round(float(np.percentile(abs_col, p)), 1),\n",
" })\n",
" rows.append({\n",
" \"percentile\": \"mean\",\n",
" f\"{label} signed error\": round(float(np.mean(col)), 1),\n",
" f\"{label} absolute error\": round(float(np.mean(abs_col)), 1),\n",
" })\n",
" rows.append(\n",
" {\n",
" \"percentile\": f\"p{p}\",\n",
" f\"{label} signed error\": round(float(np.percentile(col, p)), 1),\n",
" f\"{label} absolute error\": round(float(np.percentile(abs_col, p)), 1),\n",
" }\n",
" )\n",
" rows.append(\n",
" {\n",
" \"percentile\": \"mean\",\n",
" f\"{label} signed error\": round(float(np.mean(col)), 1),\n",
" f\"{label} absolute error\": round(float(np.mean(abs_col)), 1),\n",
" }\n",
" )\n",
" return pl.DataFrame(rows)\n",
"\n",
"\n",
"stats_easy = percentile_stats(\"error_easy\", \"Median (easy)\")\n",
"stats_quick = percentile_stats(\"error_quick\", \"Best (quick)\")\n",
"\n",
@ -1120,24 +1153,42 @@
}
],
"source": [
"fig = make_subplots(rows=1, cols=2, subplot_titles=[\n",
" \"Median transit time error (R5 TfL)\",\n",
" \"Best transit time error (R5 TfL)\"\n",
"])\n",
"fig = make_subplots(\n",
" rows=1,\n",
" cols=2,\n",
" subplot_titles=[\n",
" \"Median transit time error (R5 TfL)\",\n",
" \"Best transit time error (R5 TfL)\",\n",
" ],\n",
")\n",
"\n",
"# Clip for readability\n",
"easy_clipped = bank[\"error_easy\"].clip(-60, 60).to_numpy()\n",
"quick_clipped = bank[\"error_quick\"].clip(-60, 60).to_numpy()\n",
"\n",
"fig.add_trace(go.Histogram(x=easy_clipped, nbinsx=120, name=\"Median (easy)\",\n",
" marker_color=\"#0d9488\"), row=1, col=1)\n",
"fig.add_trace(go.Histogram(x=quick_clipped, nbinsx=120, name=\"Best (quick)\",\n",
" marker_color=\"#f59e0b\"), row=1, col=2)\n",
"fig.add_trace(\n",
" go.Histogram(\n",
" x=easy_clipped, nbinsx=120, name=\"Median (easy)\", marker_color=\"#0d9488\"\n",
" ),\n",
" row=1,\n",
" col=1,\n",
")\n",
"fig.add_trace(\n",
" go.Histogram(\n",
" x=quick_clipped, nbinsx=120, name=\"Best (quick)\", marker_color=\"#f59e0b\"\n",
" ),\n",
" row=1,\n",
" col=2,\n",
")\n",
"\n",
"fig.update_xaxes(title_text=\"Error (minutes)\", row=1, col=1)\n",
"fig.update_xaxes(title_text=\"Error (minutes)\", row=1, col=2)\n",
"fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n",
"fig.update_layout(height=400, showlegend=False, title_text=\"Bank: Error Distribution (clipped ±60 min)\")\n",
"fig.update_layout(\n",
" height=400,\n",
" showlegend=False,\n",
" title_text=\"Bank: Error Distribution (clipped ±60 min)\",\n",
")\n",
"fig.show()"
]
},
@ -2104,34 +2155,55 @@
"# Sample for scatter plot performance\n",
"sample = bank.sample(n=min(20_000, bank.shape[0]), seed=42)\n",
"\n",
"fig = make_subplots(rows=1, cols=2, subplot_titles=[\n",
" \"Median: R5 vs TfL (easy)\",\n",
" \"Best: R5 vs TfL (quick)\"\n",
"])\n",
"fig = make_subplots(\n",
" rows=1,\n",
" cols=2,\n",
" subplot_titles=[\"Median: R5 vs TfL (easy)\", \"Best: R5 vs TfL (quick)\"],\n",
")\n",
"\n",
"fig.add_trace(go.Scattergl(\n",
" x=sample[\"public_transport_easy_minutes\"].to_numpy(),\n",
" y=sample[\"travel_minutes\"].cast(pl.Float64).to_numpy(),\n",
" mode=\"markers\", marker=dict(size=2, opacity=0.3, color=\"#0d9488\"),\n",
" name=\"Median\"\n",
"), row=1, col=1)\n",
"fig.add_trace(\n",
" go.Scattergl(\n",
" x=sample[\"public_transport_easy_minutes\"].to_numpy(),\n",
" y=sample[\"travel_minutes\"].cast(pl.Float64).to_numpy(),\n",
" mode=\"markers\",\n",
" marker=dict(size=2, opacity=0.3, color=\"#0d9488\"),\n",
" name=\"Median\",\n",
" ),\n",
" row=1,\n",
" col=1,\n",
")\n",
"\n",
"fig.add_trace(go.Scattergl(\n",
" x=sample[\"public_transport_quick_minutes\"].to_numpy(),\n",
" y=sample[\"best_minutes\"].cast(pl.Float64).to_numpy(),\n",
" mode=\"markers\", marker=dict(size=2, opacity=0.3, color=\"#f59e0b\"),\n",
" name=\"Best\"\n",
"), row=1, col=2)\n",
"fig.add_trace(\n",
" go.Scattergl(\n",
" x=sample[\"public_transport_quick_minutes\"].to_numpy(),\n",
" y=sample[\"best_minutes\"].cast(pl.Float64).to_numpy(),\n",
" mode=\"markers\",\n",
" marker=dict(size=2, opacity=0.3, color=\"#f59e0b\"),\n",
" name=\"Best\",\n",
" ),\n",
" row=1,\n",
" col=2,\n",
")\n",
"\n",
"# Perfect agreement line\n",
"for col in [1, 2]:\n",
" fig.add_trace(go.Scatter(x=[0, 200], y=[0, 200], mode=\"lines\",\n",
" line=dict(color=\"red\", dash=\"dash\", width=1),\n",
" showlegend=False), row=1, col=col)\n",
" fig.add_trace(\n",
" go.Scatter(\n",
" x=[0, 200],\n",
" y=[0, 200],\n",
" mode=\"lines\",\n",
" line=dict(color=\"red\", dash=\"dash\", width=1),\n",
" showlegend=False,\n",
" ),\n",
" row=1,\n",
" col=col,\n",
" )\n",
" fig.update_xaxes(title_text=\"TfL API (minutes)\", row=1, col=col)\n",
" fig.update_yaxes(title_text=\"R5 (minutes)\", row=1, col=col)\n",
"\n",
"fig.update_layout(height=500, showlegend=False, title_text=\"Bank: R5 vs TfL API (20k sample)\")\n",
"fig.update_layout(\n",
" height=500, showlegend=False, title_text=\"Bank: R5 vs TfL API (20k sample)\"\n",
")\n",
"fig.show()"
]
},
@ -403063,7 +403135,8 @@
"\n",
"fig = px.scatter_map(\n",
" map_sample.to_pandas(),\n",
" lat=\"lat\", lon=\"long\",\n",
" lat=\"lat\",\n",
" lon=\"long\",\n",
" color=\"error_easy\",\n",
" color_continuous_scale=\"RdBu_r\", # red=positive (R5 slower), blue=negative (R5 faster)\n",
" range_color=[-30, 30],\n",
@ -403071,8 +403144,14 @@
" center={\"lat\": 51.5, \"lon\": -0.1},\n",
" opacity=0.5,\n",
" title=\"Bank — Median transit error (R5 TfL easy), minutes\",\n",
" hover_data={\"pcds\": True, \"travel_minutes\": True, \"public_transport_easy_minutes\": True,\n",
" \"error_easy\": \":.0f\", \"lat\": False, \"long\": False},\n",
" hover_data={\n",
" \"pcds\": True,\n",
" \"travel_minutes\": True,\n",
" \"public_transport_easy_minutes\": True,\n",
" \"error_easy\": \":.0f\",\n",
" \"lat\": False,\n",
" \"long\": False,\n",
" },\n",
" height=700,\n",
")\n",
"fig.update_layout(map_style=\"carto-positron\")\n",
@ -803994,7 +804073,8 @@
"source": [
"fig = px.scatter_map(\n",
" map_sample.to_pandas(),\n",
" lat=\"lat\", lon=\"long\",\n",
" lat=\"lat\",\n",
" lon=\"long\",\n",
" color=\"error_quick\",\n",
" color_continuous_scale=\"RdBu_r\",\n",
" range_color=[-30, 30],\n",
@ -804002,8 +804082,14 @@
" center={\"lat\": 51.5, \"lon\": -0.1},\n",
" opacity=0.5,\n",
" title=\"Bank — Best transit error (R5 TfL quick), minutes\",\n",
" hover_data={\"pcds\": True, \"best_minutes\": True, \"public_transport_quick_minutes\": True,\n",
" \"error_quick\": \":.0f\", \"lat\": False, \"long\": False},\n",
" hover_data={\n",
" \"pcds\": True,\n",
" \"best_minutes\": True,\n",
" \"public_transport_quick_minutes\": True,\n",
" \"error_quick\": \":.0f\",\n",
" \"lat\": False,\n",
" \"long\": False,\n",
" },\n",
" height=700,\n",
")\n",
"fig.update_layout(map_style=\"carto-positron\")\n",
@ -1204925,7 +1205011,8 @@
"source": [
"fig = px.scatter_map(\n",
" map_sample.to_pandas(),\n",
" lat=\"lat\", lon=\"long\",\n",
" lat=\"lat\",\n",
" lon=\"long\",\n",
" color=\"abs_error_easy\",\n",
" color_continuous_scale=\"YlOrRd\",\n",
" range_color=[0, 30],\n",
@ -1204933,8 +1205020,14 @@
" center={\"lat\": 51.5, \"lon\": -0.1},\n",
" opacity=0.5,\n",
" title=\"Bank — Absolute median transit error |R5 TfL easy|, minutes\",\n",
" hover_data={\"pcds\": True, \"travel_minutes\": True, \"public_transport_easy_minutes\": True,\n",
" \"abs_error_easy\": \":.0f\", \"lat\": False, \"long\": False},\n",
" hover_data={\n",
" \"pcds\": True,\n",
" \"travel_minutes\": True,\n",
" \"public_transport_easy_minutes\": True,\n",
" \"abs_error_easy\": \":.0f\",\n",
" \"lat\": False,\n",
" \"long\": False,\n",
" },\n",
" height=700,\n",
")\n",
"fig.update_layout(map_style=\"carto-positron\")\n",
@ -1204998,9 +1205091,15 @@
],
"source": [
"bank.sort(\"abs_error_easy\", descending=True).select(\n",
" \"pcds\", \"lat\", \"long\",\n",
" \"travel_minutes\", \"public_transport_easy_minutes\", \"error_easy\",\n",
" \"best_minutes\", \"public_transport_quick_minutes\", \"error_quick\",\n",
" \"pcds\",\n",
" \"lat\",\n",
" \"long\",\n",
" \"travel_minutes\",\n",
" \"public_transport_easy_minutes\",\n",
" \"error_easy\",\n",
" \"best_minutes\",\n",
" \"public_transport_quick_minutes\",\n",
" \"error_quick\",\n",
").head(30)"
]
},
@ -1205945,45 +1206044,75 @@
"\n",
"dist_df = bank.with_columns(\n",
"    # Rough km distance using an equirectangular (flat-earth) approximation, not true Haversine\n",
" ((((pl.col(\"lat\") - BANK_LAT) * 111.32) ** 2 +\n",
" ((pl.col(\"long\") - BANK_LON) * 111.32 * np.cos(np.radians(BANK_LAT))) ** 2) ** 0.5\n",
" (\n",
" (\n",
" ((pl.col(\"lat\") - BANK_LAT) * 111.32) ** 2\n",
" + ((pl.col(\"long\") - BANK_LON) * 111.32 * np.cos(np.radians(BANK_LAT))) ** 2\n",
" )\n",
" ** 0.5\n",
" ).alias(\"dist_km\")\n",
")\n",
"\n",
"# Bin by 5km\n",
"binned = (\n",
" dist_df\n",
" .with_columns((pl.col(\"dist_km\") / 5).floor() * 5)\n",
" dist_df.with_columns((pl.col(\"dist_km\") / 5).floor() * 5)\n",
" .group_by(\"dist_km\")\n",
" .agg([\n",
" pl.col(\"error_easy\").median().alias(\"median_error_easy\"),\n",
" pl.col(\"error_quick\").median().alias(\"median_error_quick\"),\n",
" pl.col(\"abs_error_easy\").median().alias(\"median_abs_error_easy\"),\n",
" pl.len().alias(\"count\"),\n",
" ])\n",
" .agg(\n",
" [\n",
" pl.col(\"error_easy\").median().alias(\"median_error_easy\"),\n",
" pl.col(\"error_quick\").median().alias(\"median_error_quick\"),\n",
" pl.col(\"abs_error_easy\").median().alias(\"median_abs_error_easy\"),\n",
" pl.len().alias(\"count\"),\n",
" ]\n",
" )\n",
" .sort(\"dist_km\")\n",
" .filter(pl.col(\"count\") > 50)\n",
")\n",
"\n",
"fig = make_subplots(rows=1, cols=2, subplot_titles=[\n",
" \"Median signed error by distance\",\n",
" \"Median absolute error by distance\"\n",
"])\n",
"fig = make_subplots(\n",
" rows=1,\n",
" cols=2,\n",
" subplot_titles=[\n",
" \"Median signed error by distance\",\n",
" \"Median absolute error by distance\",\n",
" ],\n",
")\n",
"\n",
"fig.add_trace(go.Scatter(\n",
" x=binned[\"dist_km\"].to_numpy(), y=binned[\"median_error_easy\"].to_numpy(),\n",
" mode=\"lines+markers\", name=\"Easy\", line=dict(color=\"#0d9488\")\n",
"), row=1, col=1)\n",
"fig.add_trace(go.Scatter(\n",
" x=binned[\"dist_km\"].to_numpy(), y=binned[\"median_error_quick\"].to_numpy(),\n",
" mode=\"lines+markers\", name=\"Quick\", line=dict(color=\"#f59e0b\")\n",
"), row=1, col=1)\n",
"fig.add_trace(\n",
" go.Scatter(\n",
" x=binned[\"dist_km\"].to_numpy(),\n",
" y=binned[\"median_error_easy\"].to_numpy(),\n",
" mode=\"lines+markers\",\n",
" name=\"Easy\",\n",
" line=dict(color=\"#0d9488\"),\n",
" ),\n",
" row=1,\n",
" col=1,\n",
")\n",
"fig.add_trace(\n",
" go.Scatter(\n",
" x=binned[\"dist_km\"].to_numpy(),\n",
" y=binned[\"median_error_quick\"].to_numpy(),\n",
" mode=\"lines+markers\",\n",
" name=\"Quick\",\n",
" line=dict(color=\"#f59e0b\"),\n",
" ),\n",
" row=1,\n",
" col=1,\n",
")\n",
"\n",
"fig.add_trace(go.Scatter(\n",
" x=binned[\"dist_km\"].to_numpy(), y=binned[\"median_abs_error_easy\"].to_numpy(),\n",
" mode=\"lines+markers\", name=\"|Easy|\", line=dict(color=\"#0d9488\"),\n",
" showlegend=False\n",
"), row=1, col=2)\n",
"fig.add_trace(\n",
" go.Scatter(\n",
" x=binned[\"dist_km\"].to_numpy(),\n",
" y=binned[\"median_abs_error_easy\"].to_numpy(),\n",
" mode=\"lines+markers\",\n",
" name=\"|Easy|\",\n",
" line=dict(color=\"#0d9488\"),\n",
" showlegend=False,\n",
" ),\n",
" row=1,\n",
" col=2,\n",
")\n",
"\n",
"for col in [1, 2]:\n",
" fig.update_xaxes(title_text=\"Distance from Bank (km)\", row=1, col=col)\n",