From c38d654ac7951e2efa649a38ef1f37e61ca64d05 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Sun, 15 Mar 2026 21:22:28 +0000 Subject: [PATCH] Fmt --- analyses/bank_postcode_boundaries.ipynb | 22 +- analyses/price_model_evaluation.ipynb | 125 ++- analyses/rightmove_buy.ipynb | 390 +++++-- analyses/source_overlap.ipynb | 255 +++-- analyses/travel_time_comparison.ipynb | 313 ++++-- finder/constants.py | 18 +- finder/homecouk.py | 44 +- finder/http_client.py | 48 +- finder/main.py | 46 +- finder/openrent.py | 78 +- finder/rightmove.py | 11 +- finder/scraper.py | 142 ++- finder/spatial.py | 8 +- finder/storage.py | 4 +- finder/transform.py | 28 +- pipeline/download/lsoa_population.py | 8 +- pipeline/download/places.py | 10 +- pipeline/download/pois.py | 4 +- pipeline/download/rental_prices.py | 12 +- pipeline/download/rightmove_outcodes.py | 16 +- pipeline/download/tiles.py | 4 +- pipeline/download/transit_network.py | 114 +- pipeline/transform/merge.py | 49 +- .../test_postcode_boundaries.py | 4 +- .../transform/price_estimation/backtest.py | 17 +- .../transform/price_estimation/estimate.py | 4 +- pipeline/transform/price_estimation/index.py | 12 +- pipeline/transform/price_estimation/knn.py | 63 +- pipeline/transform/transform_poi.py | 4 +- pipeline/utils/england_geometry.py | 4 +- pipeline/utils/poi_counts.py | 8 +- pipeline/utils/postcode_mapping.py | 45 +- pipeline/utils/test_poi_counts.py | 8 +- price_model.ipynb | 288 ++--- scripts/remove_bg.py | 2 + server-rs/Cargo.toml | 2 +- server-rs/logs/server.log.2026-03-15 | 981 ++++++++++++++++++ server-rs/src/data/postcodes.rs | 7 +- server-rs/src/data/property.rs | 7 +- server-rs/src/pocketbase.rs | 3 +- server-rs/src/routes/ai_filters.rs | 2 +- server-rs/src/routes/hexagons.rs | 11 +- server-rs/src/routes/postcodes.rs | 2 +- server-rs/src/routes/stats.rs | 4 +- 44 files changed, 2526 insertions(+), 701 deletions(-) create mode 100644 server-rs/logs/server.log.2026-03-15 diff --git a/analyses/bank_postcode_boundaries.ipynb b/analyses/bank_postcode_boundaries.ipynb index b4197f3..abd5a95 100644 --- a/analyses/bank_postcode_boundaries.ipynb +++ b/analyses/bank_postcode_boundaries.ipynb @@ -813,8 +813,14 @@ ], "source": [ "# Build area lookup from both sets\n", - "areas_before = {f[\"properties\"][\"postcode\"]: f[\"properties\"][\"area_sqm\"] for f in no_green[\"features\"]}\n", - "areas_after = {f[\"properties\"][\"postcode\"]: f[\"properties\"][\"area_sqm\"] for f in with_green[\"features\"]}\n", + "areas_before = {\n", + " f[\"properties\"][\"postcode\"]: f[\"properties\"][\"area_sqm\"]\n", + " for f in no_green[\"features\"]\n", + "}\n", + "areas_after = {\n", + " f[\"properties\"][\"postcode\"]: f[\"properties\"][\"area_sqm\"]\n", + " for f in with_green[\"features\"]\n", + "}\n", "\n", "# Compute percentage removed\n", "diffs = []\n", @@ -1161,16 +1167,23 @@ "\n", "colormap = cm.LinearColormap(\n", " colors=[\"#ffffcc\", \"#fd8d3c\", \"#e31a1c\", \"#800026\"],\n", - " vmin=0, vmax=min(max_pct, 90),\n", + " vmin=0,\n", + " vmax=min(max_pct, 90),\n", " caption=\"% area removed by greenspace\",\n", ")\n", "\n", + "\n", "# Show original boundaries, colored by how much was removed\n", "def style_by_removal(feature):\n", " pc = feature[\"properties\"][\"postcode\"]\n", " pct = diff_lookup.get(pc, 0)\n", " if pct <= 1:\n", - " return {\"fillColor\": \"#cccccc\", \"color\": \"#999\", \"weight\": 0.5, \"fillOpacity\": 0.15}\n", + " return {\n", + " \"fillColor\": \"#cccccc\",\n", + " \"color\": \"#999\",\n", + " \"weight\": 0.5,\n", + " \"fillOpacity\": 0.15,\n", + " }\n", " return {\n", " \"fillColor\": colormap(min(pct, 90)),\n", " \"color\": \"white\",\n", @@ -1178,6 +1191,7 @@ " \"fillOpacity\": 0.6,\n", " }\n", "\n", + "\n", "folium.GeoJson(\n", " no_green,\n", " name=\"Greenspace removal %\",\n", diff --git a/analyses/price_model_evaluation.ipynb b/analyses/price_model_evaluation.ipynb index d8cf9cc..2bce6b3 100644 --- a/analyses/price_model_evaluation.ipynb +++ b/analyses/price_model_evaluation.ipynb @@ -54,25 +54,32 @@ " ape = np.abs(p - a) / a\n", " err = p - a\n", " return {\n", - " \"MdAPE (%)\": f\"{np.median(ape)*100:.1f}\",\n", - " \"% within 10%\": f\"{np.mean(ape <= 0.10)*100:.1f}\",\n", - " \"% within 20%\": f\"{np.mean(ape <= 0.20)*100:.1f}\",\n", - " \"% within 30%\": f\"{np.mean(ape <= 0.30)*100:.1f}\",\n", + " \"MdAPE (%)\": f\"{np.median(ape) * 100:.1f}\",\n", + " \"% within 10%\": f\"{np.mean(ape <= 0.10) * 100:.1f}\",\n", + " \"% within 20%\": f\"{np.mean(ape <= 0.20) * 100:.1f}\",\n", + " \"% within 30%\": f\"{np.mean(ape <= 0.30) * 100:.1f}\",\n", " \"MAE (\\u00a3)\": f\"{np.mean(np.abs(err)):,.0f}\",\n", " \"Mean signed error (\\u00a3)\": f\"{np.mean(err):+,.0f}\",\n", " \"n\": f\"{len(a):,}\",\n", " }\n", "\n", + "\n", "actual = backtest_df[\"actual_price\"].to_numpy().astype(np.float64)\n", "metrics = {\n", - " \"Naive\": compute_metrics(actual, backtest_df[\"input_price\"].to_numpy().astype(np.float64)),\n", - " \"Index\": compute_metrics(actual, backtest_df[\"predicted\"].to_numpy().astype(np.float64)),\n", + " \"Naive\": compute_metrics(\n", + " actual, backtest_df[\"input_price\"].to_numpy().astype(np.float64)\n", + " ),\n", + " \"Index\": compute_metrics(\n", + " actual, backtest_df[\"predicted\"].to_numpy().astype(np.float64)\n", + " ),\n", "}\n", "\n", - "metrics_table = pl.DataFrame([\n", - " {\"Metric\": k, **{stage: v[k] for stage, v in metrics.items()}}\n", - " for k in list(metrics[\"Naive\"].keys())\n", - "])\n", + "metrics_table = pl.DataFrame(\n", + " [\n", + " {\"Metric\": k, **{stage: v[k] for stage, v in metrics.items()}}\n", + " for k in list(metrics[\"Naive\"].keys())\n", + " ]\n", + ")\n", "metrics_table" ] }, @@ -91,8 +98,7 @@ "source": [ "# National index (average across all sectors weighted by n_pairs)\n", "national = (\n", - " index_df\n", - " .group_by(\"year\")\n", + " index_df.group_by(\"year\")\n", " .agg(\n", " (pl.col(\"log_index\") * pl.col(\"n_pairs\")).sum() / pl.col(\"n_pairs\").sum(),\n", " )\n", @@ -107,14 +113,23 @@ "\n", "# If not enough, pick some with high/low n_pairs\n", "if len(sample_sectors) < 3:\n", - " sector_counts = index_df.group_by(\"sector\").agg(pl.col(\"n_pairs\").first()).sort(\"n_pairs\", descending=True)\n", + " sector_counts = (\n", + " index_df.group_by(\"sector\")\n", + " .agg(pl.col(\"n_pairs\").first())\n", + " .sort(\"n_pairs\", descending=True)\n", + " )\n", " top = sector_counts.head(2)[\"sector\"].to_list()\n", " bottom = sector_counts.filter(pl.col(\"n_pairs\") > 0).tail(2)[\"sector\"].to_list()\n", " sample_sectors = list(set(sample_sectors + top + bottom))[:5]\n", "\n", "samples = index_df.filter(pl.col(\"sector\").is_in(sample_sectors))\n", "\n", - "combined = pl.concat([national.select(\"sector\", \"year\", \"log_index\"), samples.select(\"sector\", \"year\", \"log_index\")])\n", + "combined = pl.concat(\n", + " [\n", + " national.select(\"sector\", \"year\", \"log_index\"),\n", + " samples.select(\"sector\", \"year\", \"log_index\"),\n", + " ]\n", + ")\n", "\n", "# Normalize: index = 100 at base year (earliest available)\n", "combined = combined.with_columns(\n", @@ -122,7 +137,10 @@ ")\n", "\n", "fig = px.line(\n", - " combined.to_pandas(), x=\"year\", y=\"index_100\", color=\"sector\",\n", + " combined.to_pandas(),\n", + " x=\"year\",\n", + " y=\"index_100\",\n", + " color=\"sector\",\n", " title=\"Repeat-Sales Price Index (base year = 100)\",\n", " labels={\"index_100\": \"Index (base=100)\", \"year\": \"Year\"},\n", ")\n", @@ -155,8 +173,10 @@ "\n", "fig.update_layout(\n", " title=\"Absolute Percentage Error Distribution\",\n", - " xaxis_title=\"APE (%)\", yaxis_title=\"Count\",\n", - " barmode=\"overlay\", height=500,\n", + " xaxis_title=\"APE (%)\",\n", + " yaxis_title=\"Count\",\n", + " barmode=\"overlay\",\n", + " height=500,\n", ")\n", "fig.show()" ] @@ -183,17 +203,27 @@ "pred = sample[\"predicted\"].to_numpy().astype(np.float64)\n", "\n", "fig = go.Figure()\n", - "fig.add_trace(go.Scattergl(\n", - " x=actual_sample, y=pred, mode=\"markers\",\n", - " marker=dict(size=2, opacity=0.3), name=\"Index\",\n", - "))\n", + "fig.add_trace(\n", + " go.Scattergl(\n", + " x=actual_sample,\n", + " y=pred,\n", + " mode=\"markers\",\n", + " marker=dict(size=2, opacity=0.3),\n", + " name=\"Index\",\n", + " )\n", + ")\n", "# 45-degree reference line\n", "min_val = max(10_000, min(actual_sample.min(), np.nanmin(pred)))\n", "max_val = min(5_000_000, max(actual_sample.max(), np.nanmax(pred)))\n", - "fig.add_trace(go.Scatter(\n", - " x=[min_val, max_val], y=[min_val, max_val],\n", - " mode=\"lines\", line=dict(color=\"red\", dash=\"dash\"), showlegend=False,\n", - "))\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=[min_val, max_val],\n", + " y=[min_val, max_val],\n", + " mode=\"lines\",\n", + " line=dict(color=\"red\", dash=\"dash\"),\n", + " showlegend=False,\n", + " )\n", + ")\n", "fig.update_xaxes(type=\"log\", title_text=\"Actual (\\u00a3)\")\n", "fig.update_yaxes(type=\"log\", title_text=\"Predicted (\\u00a3)\")\n", "fig.update_layout(title=\"Predicted vs Actual Price (log scale, 10K sample)\", height=500)\n", @@ -234,12 +264,22 @@ " for name, arr in [(\"Naive\", naive), (\"Index\", pred)]:\n", " ape = np.abs(arr[mask] - actual[mask]) / actual[mask]\n", " valid = np.isfinite(ape)\n", - " rows.append({\"Price Band\": label, \"Method\": name, \"MdAPE (%)\": float(np.median(ape[valid]) * 100)})\n", + " rows.append(\n", + " {\n", + " \"Price Band\": label,\n", + " \"Method\": name,\n", + " \"MdAPE (%)\": float(np.median(ape[valid]) * 100),\n", + " }\n", + " )\n", "\n", "band_df = pl.DataFrame(rows)\n", "fig = px.bar(\n", - " band_df.to_pandas(), x=\"Price Band\", y=\"MdAPE (%)\", color=\"Method\",\n", - " barmode=\"group\", title=\"MdAPE by Price Band\",\n", + " band_df.to_pandas(),\n", + " x=\"Price Band\",\n", + " y=\"MdAPE (%)\",\n", + " color=\"Method\",\n", + " barmode=\"group\",\n", + " title=\"MdAPE by Price Band\",\n", " category_orders={\"Price Band\": [b[2] for b in bands]},\n", ")\n", "fig.update_layout(height=450)\n", @@ -264,7 +304,9 @@ ")\n", "\n", "# Top 20 areas by volume\n", - "top_areas = bt.group_by(\"area\").len().sort(\"len\", descending=True).head(20)[\"area\"].to_list()\n", + "top_areas = (\n", + " bt.group_by(\"area\").len().sort(\"len\", descending=True).head(20)[\"area\"].to_list()\n", + ")\n", "\n", "actual_np = bt[\"actual_price\"].to_numpy().astype(np.float64)\n", "pred_np = bt[\"predicted\"].to_numpy().astype(np.float64)\n", @@ -279,12 +321,18 @@ " p = arr[mask]\n", " valid = np.isfinite(p) & (a > 0)\n", " ape = np.abs(p[valid] - a[valid]) / a[valid]\n", - " rows.append({\"Area\": area, \"Method\": name, \"MdAPE (%)\": float(np.median(ape) * 100)})\n", + " rows.append(\n", + " {\"Area\": area, \"Method\": name, \"MdAPE (%)\": float(np.median(ape) * 100)}\n", + " )\n", "\n", "area_df = pl.DataFrame(rows)\n", "fig = px.bar(\n", - " area_df.to_pandas(), x=\"Area\", y=\"MdAPE (%)\", color=\"Method\",\n", - " barmode=\"group\", title=\"MdAPE by Postcode Area (Top 20 by Volume)\",\n", + " area_df.to_pandas(),\n", + " x=\"Area\",\n", + " y=\"MdAPE (%)\",\n", + " color=\"Method\",\n", + " barmode=\"group\",\n", + " title=\"MdAPE by Postcode Area (Top 20 by Volume)\",\n", " category_orders={\"Area\": top_areas},\n", ")\n", "fig.update_layout(height=500)\n", @@ -324,11 +372,20 @@ " p = arr[mask]\n", " valid = np.isfinite(p) & (a > 0)\n", " ape = np.abs(p[valid] - a[valid]) / a[valid]\n", - " rows.append({\"Gap (years)\": gap, \"Method\": name, \"MdAPE (%)\": float(np.median(ape) * 100)})\n", + " rows.append(\n", + " {\n", + " \"Gap (years)\": gap,\n", + " \"Method\": name,\n", + " \"MdAPE (%)\": float(np.median(ape) * 100),\n", + " }\n", + " )\n", "\n", "gap_df = pl.DataFrame(rows)\n", "fig = px.line(\n", - " gap_df.to_pandas(), x=\"Gap (years)\", y=\"MdAPE (%)\", color=\"Method\",\n", + " gap_df.to_pandas(),\n", + " x=\"Gap (years)\",\n", + " y=\"MdAPE (%)\",\n", + " color=\"Method\",\n", " title=\"MdAPE by Holding Period (years between input and actual sale)\",\n", " markers=True,\n", ")\n", diff --git a/analyses/rightmove_buy.ipynb b/analyses/rightmove_buy.ipynb index cf806cf..97a0839 100644 --- a/analyses/rightmove_buy.ipynb +++ b/analyses/rightmove_buy.ipynb @@ -52,7 +52,9 @@ "pl.Config.set_tbl_rows(20)\n", "pl.Config.set_fmt_str_lengths(80)\n", "\n", - "df = pl.read_parquet(\"/volumes/syncthing/Projects/property-map/property-data/rightmove_buy.parquet\")\n", + "df = pl.read_parquet(\n", + " \"/volumes/syncthing/Projects/property-map/property-data/rightmove_buy.parquet\"\n", + ")\n", "schema = df.schema\n", "print(f\"Total rows: {len(df):,}\")\n", "print(f\"Columns ({len(schema)}):\")\n", @@ -150,11 +152,13 @@ ], "source": [ "# Null counts\n", - "null_df = pl.DataFrame({\n", - " \"column\": df.columns,\n", - " \"nulls\": [df[c].null_count() for c in df.columns],\n", - " \"pct\": [f\"{df[c].null_count()/len(df)*100:.1f}%\" for c in df.columns],\n", - "})\n", + "null_df = pl.DataFrame(\n", + " {\n", + " \"column\": df.columns,\n", + " \"nulls\": [df[c].null_count() for c in df.columns],\n", + " \"pct\": [f\"{df[c].null_count() / len(df) * 100:.1f}%\" for c in df.columns],\n", + " }\n", + ")\n", "null_df.filter(pl.col(\"nulls\") > 0)" ] }, @@ -197,13 +201,17 @@ " \"price = 0\": len(df.filter(pl.col(\"price\") == 0)),\n", " \"price > 50M\": len(df.filter(pl.col(\"price\") > 50_000_000)),\n", " \"floorspace > 10,000 sqm\": len(df.filter(pl.col(\"floorspace_sqm\") > 10_000)),\n", - " \"latitude outside UK (< 49 or > 61)\": len(df.filter((pl.col(\"latitude\") < 49) | (pl.col(\"latitude\") > 61))),\n", - " \"longitude outside UK (< -8 or > 2)\": len(df.filter((pl.col(\"longitude\") < -8) | (pl.col(\"longitude\") > 2))),\n", + " \"latitude outside UK (< 49 or > 61)\": len(\n", + " df.filter((pl.col(\"latitude\") < 49) | (pl.col(\"latitude\") > 61))\n", + " ),\n", + " \"longitude outside UK (< -8 or > 2)\": len(\n", + " df.filter((pl.col(\"longitude\") < -8) | (pl.col(\"longitude\") > 2))\n", + " ),\n", " \"house_share = true\": len(df.filter(pl.col(\"house_share\"))),\n", "}\n", "print(\"Data quality issues:\")\n", "for desc, count in issues.items():\n", - " print(f\" {desc}: {count:,} ({count/len(df)*100:.2f}%)\")" + " print(f\" {desc}: {count:,} ({count / len(df) * 100:.2f}%)\")" ] }, { @@ -230,7 +238,7 @@ " & (pl.col(\"longitude\") >= -8)\n", " & (pl.col(\"longitude\") <= 2)\n", ")\n", - "print(f\"Clean rows: {len(clean):,} ({len(clean)/len(df)*100:.1f}% of original)\")" + "print(f\"Clean rows: {len(clean):,} ({len(clean) / len(df) * 100:.1f}% of original)\")" ] }, { @@ -1126,8 +1134,12 @@ "# Price histogram (clipped to 2nd-98th percentile)\n", "lo, hi = price.quantile(0.02), price.quantile(0.98)\n", "clipped = clean.filter((pl.col(\"price\") >= lo) & (pl.col(\"price\") <= hi))\n", - "fig = px.histogram(clipped.to_pandas(), x=\"price\", nbins=80,\n", - " title=f\"Asking Price Distribution (£{lo:,.0f} - £{hi:,.0f}, 2nd-98th pctl)\")\n", + "fig = px.histogram(\n", + " clipped.to_pandas(),\n", + " x=\"price\",\n", + " nbins=80,\n", + " title=f\"Asking Price Distribution (£{lo:,.0f} - £{hi:,.0f}, 2nd-98th pctl)\",\n", + ")\n", "fig.update_layout(height=400, xaxis_title=\"Asking Price (£)\", yaxis_title=\"Count\")\n", "fig.show()" ] @@ -439978,9 +439990,13 @@ ], "source": [ "# Price by property type\n", - "fig = px.box(clean.filter(pl.col(\"price\") <= 2_000_000).to_pandas(),\n", - " x=\"property_type\", y=\"price\", color=\"property_type\",\n", - " title=\"Price by Property Type (capped at £2M for readability)\")\n", + "fig = px.box(\n", + " clean.filter(pl.col(\"price\") <= 2_000_000).to_pandas(),\n", + " x=\"property_type\",\n", + " y=\"price\",\n", + " color=\"property_type\",\n", + " title=\"Price by Property Type (capped at £2M for readability)\",\n", + ")\n", "fig.update_layout(height=500, showlegend=False, yaxis_title=\"Price (£)\")\n", "fig.show()" ] @@ -440079,9 +440095,7 @@ "source": [ "# Price qualifier breakdown\n", "pq = clean[\"price_qualifier\"].value_counts().sort(\"count\", descending=True)\n", - "pq = pq.with_columns(\n", - " (pl.col(\"count\") / pl.col(\"count\").sum() * 100).alias(\"pct\")\n", - ")\n", + "pq = pq.with_columns((pl.col(\"count\") / pl.col(\"count\").sum() * 100).alias(\"pct\"))\n", "pq" ] }, @@ -440928,8 +440942,12 @@ "source": [ "# Property type distribution\n", "type_counts = clean[\"property_type\"].value_counts().sort(\"count\", descending=True)\n", - "fig = px.pie(type_counts.to_pandas(), names=\"property_type\", values=\"count\",\n", - " title=\"Property Type Distribution\")\n", + "fig = px.pie(\n", + " type_counts.to_pandas(),\n", + " names=\"property_type\",\n", + " values=\"count\",\n", + " title=\"Property Type Distribution\",\n", + ")\n", "fig.update_layout(height=400)\n", "fig.show()" ] @@ -441805,9 +441823,16 @@ ], "source": [ "# Top 20 sub-types\n", - "sub_counts = clean[\"property_sub_type\"].value_counts().sort(\"count\", descending=True).head(20)\n", - "fig = px.bar(sub_counts.to_pandas(), x=\"count\", y=\"property_sub_type\", orientation=\"h\",\n", - " title=\"Top 20 Property Sub-types\")\n", + "sub_counts = (\n", + " clean[\"property_sub_type\"].value_counts().sort(\"count\", descending=True).head(20)\n", + ")\n", + "fig = px.bar(\n", + " sub_counts.to_pandas(),\n", + " x=\"count\",\n", + " y=\"property_sub_type\",\n", + " orientation=\"h\",\n", + " title=\"Top 20 Property Sub-types\",\n", + ")\n", "fig.update_layout(height=600, yaxis={\"categoryorder\": \"total ascending\"})\n", "fig.show()" ] @@ -442643,9 +442668,15 @@ ], "source": [ "# Tenure split\n", - "tenure_counts = clean[\"tenure\"].drop_nulls().value_counts().sort(\"count\", descending=True)\n", - "fig = px.pie(tenure_counts.to_pandas(), names=\"tenure\", values=\"count\",\n", - " title=f\"Tenure Split ({clean['tenure'].null_count():,} unknown / {clean['tenure'].null_count()/len(clean)*100:.1f}% missing)\")\n", + "tenure_counts = (\n", + " clean[\"tenure\"].drop_nulls().value_counts().sort(\"count\", descending=True)\n", + ")\n", + "fig = px.pie(\n", + " tenure_counts.to_pandas(),\n", + " names=\"tenure\",\n", + " values=\"count\",\n", + " title=f\"Tenure Split ({clean['tenure'].null_count():,} unknown / {clean['tenure'].null_count() / len(clean) * 100:.1f}% missing)\",\n", + ")\n", "fig.update_layout(height=400)\n", "fig.show()" ] @@ -443546,8 +443577,14 @@ " .agg(pl.len().alias(\"count\"))\n", " .sort(\"property_type\")\n", ")\n", - "fig = px.bar(tenure_by_type.to_pandas(), x=\"property_type\", y=\"count\", color=\"tenure\",\n", - " barmode=\"group\", title=\"Tenure by Property Type\")\n", + "fig = px.bar(\n", + " tenure_by_type.to_pandas(),\n", + " x=\"property_type\",\n", + " y=\"count\",\n", + " color=\"tenure\",\n", + " barmode=\"group\",\n", + " title=\"Tenure by Property Type\",\n", + ")\n", "fig.update_layout(height=400)\n", "fig.show()" ] @@ -444412,9 +444449,12 @@ ], "source": [ "# Bedroom distribution\n", - "bed_counts = clean.filter(pl.col(\"bedrooms\") <= 10)[\"bedrooms\"].value_counts().sort(\"bedrooms\")\n", - "fig = px.bar(bed_counts.to_pandas(), x=\"bedrooms\", y=\"count\",\n", - " title=\"Bedroom Count Distribution\")\n", + "bed_counts = (\n", + " clean.filter(pl.col(\"bedrooms\") <= 10)[\"bedrooms\"].value_counts().sort(\"bedrooms\")\n", + ")\n", + "fig = px.bar(\n", + " bed_counts.to_pandas(), x=\"bedrooms\", y=\"count\", title=\"Bedroom Count Distribution\"\n", + ")\n", "fig.update_layout(height=400)\n", "fig.show()" ] @@ -445279,16 +445319,25 @@ ")\n", "\n", "fig = go.Figure()\n", - "fig.add_trace(go.Bar(\n", - " x=price_by_beds[\"bedrooms\"], y=price_by_beds[\"median_price\"],\n", - " name=\"Median\", error_y=dict(type=\"data\",\n", - " symmetric=False,\n", - " array=(price_by_beds[\"p75\"] - price_by_beds[\"median_price\"]).to_list(),\n", - " arrayminus=(price_by_beds[\"median_price\"] - price_by_beds[\"p25\"]).to_list()\n", + "fig.add_trace(\n", + " go.Bar(\n", + " x=price_by_beds[\"bedrooms\"],\n", + " y=price_by_beds[\"median_price\"],\n", + " name=\"Median\",\n", + " error_y=dict(\n", + " type=\"data\",\n", + " symmetric=False,\n", + " array=(price_by_beds[\"p75\"] - price_by_beds[\"median_price\"]).to_list(),\n", + " arrayminus=(price_by_beds[\"median_price\"] - price_by_beds[\"p25\"]).to_list(),\n", + " ),\n", " )\n", - "))\n", - "fig.update_layout(title=\"Median Price by Bedrooms (with IQR)\", height=400,\n", - " xaxis_title=\"Bedrooms\", yaxis_title=\"Price (£)\")\n", + ")\n", + "fig.update_layout(\n", + " title=\"Median Price by Bedrooms (with IQR)\",\n", + " height=400,\n", + " xaxis_title=\"Bedrooms\",\n", + " yaxis_title=\"Price (£)\",\n", + ")\n", "fig.show()" ] }, @@ -446263,8 +446312,14 @@ " .agg(pl.len().alias(\"count\"))\n", " .sort(\"property_type\", \"bedrooms\")\n", ")\n", - "fig = px.bar(beds_by_type.to_pandas(), x=\"bedrooms\", y=\"count\", color=\"property_type\",\n", - " barmode=\"group\", title=\"Bedroom Distribution by Property Type\")\n", + "fig = px.bar(\n", + " beds_by_type.to_pandas(),\n", + " x=\"bedrooms\",\n", + " y=\"count\",\n", + " color=\"property_type\",\n", + " barmode=\"group\",\n", + " title=\"Bedroom Distribution by Property Type\",\n", + ")\n", "fig.update_layout(height=450)\n", "fig.show()" ] @@ -446323,19 +446378,26 @@ ], "source": [ "# Floorspace availability by property type\n", - "has_floor = clean.with_columns(pl.col(\"floorspace_sqm\").is_not_null().alias(\"has_floorspace\"))\n", - "floor_by_type = (\n", - " has_floor.group_by(\"property_type\", \"has_floorspace\")\n", - " .agg(pl.len().alias(\"count\"))\n", + "has_floor = clean.with_columns(\n", + " pl.col(\"floorspace_sqm\").is_not_null().alias(\"has_floorspace\")\n", + ")\n", + "floor_by_type = has_floor.group_by(\"property_type\", \"has_floorspace\").agg(\n", + " pl.len().alias(\"count\")\n", + ")\n", + "totals = floor_by_type.group_by(\"property_type\").agg(\n", + " pl.col(\"count\").sum().alias(\"total\")\n", ")\n", - "totals = floor_by_type.group_by(\"property_type\").agg(pl.col(\"count\").sum().alias(\"total\"))\n", "floor_pct = (\n", " floor_by_type.filter(pl.col(\"has_floorspace\"))\n", " .join(totals, on=\"property_type\")\n", - " .with_columns((pl.col(\"count\") / pl.col(\"total\") * 100).alias(\"pct_with_floorspace\"))\n", + " .with_columns(\n", + " (pl.col(\"count\") / pl.col(\"total\") * 100).alias(\"pct_with_floorspace\")\n", + " )\n", " .sort(\"pct_with_floorspace\", descending=True)\n", ")\n", - "print(f\"Overall floorspace availability: {clean['floorspace_sqm'].drop_nulls().len():,} / {len(clean):,} ({clean['floorspace_sqm'].drop_nulls().len()/len(clean)*100:.1f}%)\")\n", + "print(\n", + " f\"Overall floorspace availability: {clean['floorspace_sqm'].drop_nulls().len():,} / {len(clean):,} ({clean['floorspace_sqm'].drop_nulls().len() / len(clean) * 100:.1f}%)\"\n", + ")\n", "floor_pct.select(\"property_type\", \"count\", \"total\", \"pct_with_floorspace\")" ] }, @@ -447298,8 +447360,13 @@ ")\n", "print(f\"Properties with reasonable floorspace (10-1000 sqm): {len(with_floor):,}\")\n", "\n", - "fig = px.histogram(with_floor.to_pandas(), x=\"floorspace_sqm\", nbins=80, color=\"property_type\",\n", - " title=\"Floorspace Distribution by Property Type\")\n", + "fig = px.histogram(\n", + " with_floor.to_pandas(),\n", + " x=\"floorspace_sqm\",\n", + " nbins=80,\n", + " color=\"property_type\",\n", + " title=\"Floorspace Distribution by Property Type\",\n", + ")\n", "fig.update_layout(height=450, xaxis_title=\"Floorspace (sqm)\", barmode=\"overlay\")\n", "fig.update_traces(opacity=0.6)\n", "fig.show()" @@ -448176,8 +448243,12 @@ "print(f\" P25: £{s.quantile(0.25):,.0f}/sqm\")\n", "print(f\" P75: £{s.quantile(0.75):,.0f}/sqm\")\n", "\n", - "fig = px.histogram(ppsqm.to_pandas(), x=\"price_per_sqm\", nbins=80,\n", - " title=\"Price per Square Metre Distribution\")\n", + "fig = px.histogram(\n", + " ppsqm.to_pandas(),\n", + " x=\"price_per_sqm\",\n", + " nbins=80,\n", + " title=\"Price per Square Metre Distribution\",\n", + ")\n", "fig.update_layout(height=400, xaxis_title=\"Price per sqm (£)\")\n", "fig.show()" ] @@ -584906,8 +584977,13 @@ } ], "source": [ - "fig = px.box(ppsqm.to_pandas(), x=\"property_type\", y=\"price_per_sqm\", color=\"property_type\",\n", - " title=\"Price per sqm by Property Type\")\n", + "fig = px.box(\n", + " ppsqm.to_pandas(),\n", + " x=\"property_type\",\n", + " y=\"price_per_sqm\",\n", + " color=\"property_type\",\n", + " title=\"Price per sqm by Property Type\",\n", + ")\n", "fig.update_layout(height=450, showlegend=False, yaxis_title=\"£ per sqm\")\n", "fig.show()" ] @@ -585865,9 +585941,15 @@ ")\n", "\n", "top30 = outcode_stats.head(30)\n", - "fig = px.bar(top30.to_pandas(), x=\"count\", y=\"outcode\", orientation=\"h\",\n", - " color=\"median_price\", color_continuous_scale=\"Viridis\",\n", - " title=\"Top 30 Outcodes by Listing Volume\")\n", + "fig = px.bar(\n", + " top30.to_pandas(),\n", + " x=\"count\",\n", + " y=\"outcode\",\n", + " orientation=\"h\",\n", + " color=\"median_price\",\n", + " color_continuous_scale=\"Viridis\",\n", + " title=\"Top 30 Outcodes by Listing Volume\",\n", + ")\n", "fig.update_layout(height=700, yaxis={\"categoryorder\": \"total ascending\"})\n", "fig.show()" ] @@ -587400,11 +587482,25 @@ ], "source": [ "# Most expensive outcodes (min 50 listings)\n", - "expensive = outcode_stats.filter(pl.col(\"count\") >= 50).sort(\"median_price\", descending=True).head(30)\n", - "fig = px.bar(expensive.to_pandas(), x=\"median_price\", y=\"outcode\", orientation=\"h\",\n", - " color=\"count\", color_continuous_scale=\"Blues\",\n", - " title=\"Top 30 Most Expensive Outcodes (min 50 listings, by median price)\")\n", - "fig.update_layout(height=700, yaxis={\"categoryorder\": \"total ascending\"}, xaxis_title=\"Median Price (£)\")\n", + "expensive = (\n", + " outcode_stats.filter(pl.col(\"count\") >= 50)\n", + " .sort(\"median_price\", descending=True)\n", + " .head(30)\n", + ")\n", + "fig = px.bar(\n", + " expensive.to_pandas(),\n", + " x=\"median_price\",\n", + " y=\"outcode\",\n", + " orientation=\"h\",\n", + " color=\"count\",\n", + " color_continuous_scale=\"Blues\",\n", + " title=\"Top 30 Most Expensive Outcodes (min 50 listings, by median price)\",\n", + ")\n", + "fig.update_layout(\n", + " height=700,\n", + " yaxis={\"categoryorder\": \"total ascending\"},\n", + " xaxis_title=\"Median Price (£)\",\n", + ")\n", "fig.show()" ] }, @@ -588914,10 +589010,20 @@ "source": [ "# Cheapest outcodes (min 50 listings)\n", "cheapest = outcode_stats.filter(pl.col(\"count\") >= 50).sort(\"median_price\").head(30)\n", - "fig = px.bar(cheapest.to_pandas(), x=\"median_price\", y=\"outcode\", orientation=\"h\",\n", - " color=\"count\", color_continuous_scale=\"Blues\",\n", - " title=\"Top 30 Cheapest Outcodes (min 50 listings, by median price)\")\n", - "fig.update_layout(height=700, yaxis={\"categoryorder\": \"total descending\"}, xaxis_title=\"Median Price (£)\")\n", + "fig = px.bar(\n", + " cheapest.to_pandas(),\n", + " x=\"median_price\",\n", + " y=\"outcode\",\n", + " orientation=\"h\",\n", + " color=\"count\",\n", + " color_continuous_scale=\"Blues\",\n", + " title=\"Top 30 Cheapest Outcodes (min 50 listings, by median price)\",\n", + ")\n", + "fig.update_layout(\n", + " height=700,\n", + " yaxis={\"categoryorder\": \"total descending\"},\n", + " xaxis_title=\"Median Price (£)\",\n", + ")\n", "fig.show()" ] }, @@ -589828,14 +589934,19 @@ "source": [ "# Geographic scatter of listings (sample for performance)\n", "sample = clean.sample(n=min(20_000, len(clean)), seed=42)\n", - "fig = px.scatter_map(sample.to_pandas(),\n", - " lat=\"latitude\", lon=\"longitude\",\n", - " color=\"price\", size_max=4,\n", - " color_continuous_scale=\"Viridis\",\n", - " range_color=[100_000, 1_500_000],\n", - " zoom=5, center={\"lat\": 52.5, \"lon\": -1.5},\n", - " title=\"Listing Locations (20k sample, colored by price)\",\n", - " opacity=0.4)\n", + "fig = px.scatter_map(\n", + " sample.to_pandas(),\n", + " lat=\"latitude\",\n", + " lon=\"longitude\",\n", + " color=\"price\",\n", + " size_max=4,\n", + " color_continuous_scale=\"Viridis\",\n", + " range_color=[100_000, 1_500_000],\n", + " zoom=5,\n", + " center={\"lat\": 52.5, \"lon\": -1.5},\n", + " title=\"Listing Locations (20k sample, colored by price)\",\n", + " opacity=0.4,\n", + ")\n", "fig.update_layout(height=700)\n", "fig.show()" ] @@ -589864,7 +589975,9 @@ "source": [ "# Parse dates and look at listing age\n", "with_dates = clean.with_columns(\n", - " pl.col(\"first_visible_date\").str.to_datetime(\"%Y-%m-%dT%H:%M:%SZ\").alias(\"listed_at\"),\n", + " pl.col(\"first_visible_date\")\n", + " .str.to_datetime(\"%Y-%m-%dT%H:%M:%SZ\")\n", + " .alias(\"listed_at\"),\n", ")\n", "\n", "print(f\"Date range: {with_dates['listed_at'].min()} to {with_dates['listed_at'].max()}\")" @@ -590856,8 +590969,9 @@ " .sort(\"month\")\n", ")\n", "\n", - "fig = px.bar(monthly.to_pandas(), x=\"month\", y=\"count\",\n", - " title=\"Listings by Month Listed\")\n", + "fig = px.bar(\n", + " monthly.to_pandas(), x=\"month\", y=\"count\", title=\"Listings by Month Listed\"\n", + ")\n", "fig.update_layout(height=400, xaxis_title=\"Month\", yaxis_title=\"Listings\")\n", "fig.show()" ] @@ -590884,6 +590998,7 @@ "source": [ "# How old are current listings? (days since first visible)\n", "import datetime\n", + "\n", "now = datetime.datetime(2026, 2, 14)\n", "with_age = with_dates.with_columns(\n", " ((pl.lit(now) - pl.col(\"listed_at\")).dt.total_days()).alias(\"days_on_market\")\n", @@ -590896,7 +591011,7 @@ "print(f\" P25: {age.quantile(0.25):.0f} days\")\n", "print(f\" P75: {age.quantile(0.75):.0f} days\")\n", "print(f\" P95: {age.quantile(0.95):.0f} days\")\n", - "print(f\" Max: {age.max():.0f} days ({age.max()/365:.1f} years)\")" + "print(f\" Max: {age.max():.0f} days ({age.max() / 365:.1f} years)\")" ] }, { @@ -591749,8 +591864,12 @@ "source": [ "# Days on market distribution (cap at 2 years for readability)\n", "capped = with_age.filter(pl.col(\"days_on_market\") <= 730)\n", - "fig = px.histogram(capped.to_pandas(), x=\"days_on_market\", nbins=100,\n", - " title=\"Days on Market Distribution (capped at 2 years)\")\n", + "fig = px.histogram(\n", + " capped.to_pandas(),\n", + " x=\"days_on_market\",\n", + " nbins=100,\n", + " title=\"Days on Market Distribution (capped at 2 years)\",\n", + ")\n", "fig.update_layout(height=400, xaxis_title=\"Days on Market\", yaxis_title=\"Count\")\n", "fig.show()" ] @@ -591883,11 +592002,13 @@ "# Explode features list and count most common\n", "features_exploded = clean.select(\"features\").explode(\"features\").drop_nulls()\n", "print(f\"Total feature entries: {len(features_exploded):,}\")\n", - "print(f\"Features per listing: {len(features_exploded)/len(clean):.1f} avg\")\n", + "print(f\"Features per listing: {len(features_exploded) / len(clean):.1f} avg\")\n", "\n", "# Most common features (lowercased for grouping)\n", "feature_counts = (\n", - " features_exploded.with_columns(pl.col(\"features\").str.to_lowercase().str.strip_chars().alias(\"feature_lower\"))\n", + " features_exploded.with_columns(\n", + " pl.col(\"features\").str.to_lowercase().str.strip_chars().alias(\"feature_lower\")\n", + " )\n", " .group_by(\"feature_lower\")\n", " .agg(pl.len().alias(\"count\"))\n", " .sort(\"count\", descending=True)\n", @@ -592794,16 +592915,64 @@ "all_features = features_exploded[\"features\"].to_list()\n", "word_counter = Counter()\n", "for feat in all_features:\n", - " words = re.findall(r'[a-z]+', feat.lower())\n", + " words = re.findall(r\"[a-z]+\", feat.lower())\n", " word_counter.update(words)\n", "\n", "# Filter out very short/common words\n", - "stop_words = {'the', 'a', 'an', 'and', 'or', 'of', 'to', 'in', 'with', 'for', 'on', 'at', 'by', 'is', 'it', 'from', 'as', 'be', 'this', 'that', 'are', 'was', 'has', 'have', 'not', 'but', 'all', 'can', 'had', 'her', 'his', 'one', 'our', 'out', 'you', 'will'}\n", - "keywords = [(w, c) for w, c in word_counter.most_common(100) if w not in stop_words and len(w) > 2]\n", - "kw_df = pl.DataFrame({\"word\": [w for w,c in keywords[:40]], \"count\": [c for w,c in keywords[:40]]})\n", + "stop_words = {\n", + " \"the\",\n", + " \"a\",\n", + " \"an\",\n", + " \"and\",\n", + " \"or\",\n", + " \"of\",\n", + " \"to\",\n", + " \"in\",\n", + " \"with\",\n", + " \"for\",\n", + " \"on\",\n", + " \"at\",\n", + " \"by\",\n", + " \"is\",\n", + " \"it\",\n", + " \"from\",\n", + " \"as\",\n", + " \"be\",\n", + " \"this\",\n", + " \"that\",\n", + " \"are\",\n", + " \"was\",\n", + " \"has\",\n", + " \"have\",\n", + " \"not\",\n", + " \"but\",\n", + " \"all\",\n", + " \"can\",\n", + " \"had\",\n", + " \"her\",\n", + " \"his\",\n", + " \"one\",\n", + " \"our\",\n", + " \"out\",\n", + " \"you\",\n", + " \"will\",\n", + "}\n", + "keywords = [\n", + " (w, c)\n", + " for w, c in word_counter.most_common(100)\n", + " if w not in stop_words and len(w) > 2\n", + "]\n", + "kw_df = pl.DataFrame(\n", + " {\"word\": [w for w, c in keywords[:40]], \"count\": [c for w, c in keywords[:40]]}\n", + ")\n", "\n", - "fig = px.bar(kw_df.to_pandas(), x=\"count\", y=\"word\", orientation=\"h\",\n", - " title=\"Most Common Words in Feature Descriptions\")\n", + "fig = px.bar(\n", + " kw_df.to_pandas(),\n", + " x=\"count\",\n", + " y=\"word\",\n", + " orientation=\"h\",\n", + " title=\"Most Common Words in Feature Descriptions\",\n", + ")\n", "fig.update_layout(height=800, yaxis={\"categoryorder\": \"total ascending\"})\n", "fig.show()" ] @@ -593767,9 +593936,14 @@ " & (pl.col(\"price\") < 3_000_000)\n", ").sample(n=min(15_000, len(with_floor)), seed=42)\n", "\n", - "fig = px.scatter(scatter_df.to_pandas(), x=\"floorspace_sqm\", y=\"price\",\n", - " color=\"property_type\", opacity=0.3,\n", - " title=\"Price vs Floorspace (sample, capped at £3M / 500sqm)\")\n", + "fig = px.scatter(\n", + " scatter_df.to_pandas(),\n", + " x=\"floorspace_sqm\",\n", + " y=\"price\",\n", + " color=\"property_type\",\n", + " opacity=0.3,\n", + " title=\"Price vs Floorspace (sample, capped at £3M / 500sqm)\",\n", + ")\n", "fig.update_layout(height=600, xaxis_title=\"Floorspace (sqm)\", yaxis_title=\"Price (£)\")\n", "fig.show()" ] @@ -594739,8 +594913,14 @@ " .agg(pl.col(\"price\").median().alias(\"median_price\"), pl.len().alias(\"count\"))\n", " .sort(\"property_type\", \"bedrooms\")\n", ")\n", - "fig = px.line(bp.to_pandas(), x=\"bedrooms\", y=\"median_price\", color=\"property_type\",\n", - " markers=True, title=\"Median Price by Bedrooms and Property Type\")\n", + "fig = px.line(\n", + " bp.to_pandas(),\n", + " x=\"bedrooms\",\n", + " y=\"median_price\",\n", + " color=\"property_type\",\n", + " markers=True,\n", + " title=\"Median Price by Bedrooms and Property Type\",\n", + ")\n", "fig.update_layout(height=450, xaxis_title=\"Bedrooms\", yaxis_title=\"Median Price (£)\")\n", "fig.show()" ] @@ -594789,18 +594969,28 @@ "print(f\"Total listings: {len(clean):,}\")\n", "print(f\"Outcodes covered: {clean['outcode'].n_unique():,}\")\n", "print(\"\")\n", - "print(f\"Price: median £{clean['price'].median():,.0f}, mean £{clean['price'].mean():,.0f}\")\n", - "print(f\"Bedrooms: median {clean['bedrooms'].median():.0f}, mean {clean['bedrooms'].mean():.1f}\")\n", + "print(\n", + " f\"Price: median £{clean['price'].median():,.0f}, mean £{clean['price'].mean():,.0f}\"\n", + ")\n", + "print(\n", + " f\"Bedrooms: median {clean['bedrooms'].median():.0f}, mean {clean['bedrooms'].mean():.1f}\"\n", + ")\n", "print(\"\")\n", - "print(f\"Tenure known: {(len(clean) - clean['tenure'].null_count())/len(clean)*100:.1f}%\")\n", + "print(\n", + " f\"Tenure known: {(len(clean) - clean['tenure'].null_count()) / len(clean) * 100:.1f}%\"\n", + ")\n", "print(f\" Freehold: {len(clean.filter(pl.col('tenure') == 'Freehold')):,}\")\n", "print(f\" Leasehold: {len(clean.filter(pl.col('tenure') == 'Leasehold')):,}\")\n", "print(\"\")\n", - "print(f\"Floorspace available: {clean['floorspace_sqm'].drop_nulls().len()/len(clean)*100:.1f}%\")\n", + "print(\n", + " f\"Floorspace available: {clean['floorspace_sqm'].drop_nulls().len() / len(clean) * 100:.1f}%\"\n", + ")\n", "print(\"\")\n", "print(\"Property types:\")\n", - "for row in clean['property_type'].value_counts().sort('count', descending=True).iter_rows():\n", - " print(f\" {row[0]}: {row[1]:,} ({row[1]/len(clean)*100:.1f}%)\")" + "for row in (\n", + " clean[\"property_type\"].value_counts().sort(\"count\", descending=True).iter_rows()\n", + "):\n", + " print(f\" {row[0]}: {row[1]:,} ({row[1] / len(clean) * 100:.1f}%)\")" ] } ], diff --git a/analyses/source_overlap.ipynb b/analyses/source_overlap.ipynb index 399ab3a..45621b9 100644 --- a/analyses/source_overlap.ipynb +++ b/analyses/source_overlap.ipynb @@ -52,6 +52,7 @@ "buy = pl.read_parquet(f\"{DATA}/online_listings_buy.parquet\")\n", "rent = pl.read_parquet(f\"{DATA}/online_listings_rent.parquet\")\n", "\n", + "\n", "def tag_source(df: pl.DataFrame) -> pl.DataFrame:\n", " return df.with_columns(\n", " pl.when(pl.col(\"Listing URL\").str.contains(\"rightmove\"))\n", @@ -62,6 +63,7 @@ " .alias(\"source\")\n", " )\n", "\n", + "\n", "buy = tag_source(buy)\n", "rent = tag_source(rent)\n", "\n", @@ -122,7 +124,7 @@ " print(f\"\\n=== {label} ===\")\n", " for row in counts.iter_rows():\n", " src, cnt = row\n", - " print(f\" {src}: {cnt:,} ({cnt/len(df)*100:.1f}%)\")\n", + " print(f\" {src}: {cnt:,} ({cnt / len(df) * 100:.1f}%)\")\n", "\n", "# Known dedup count from scraper logs\n", "CROSS_DEDUP_BUY = 2_220\n", @@ -132,7 +134,7 @@ "print(f\"Home.co.uk scraped (before dedup): {hk_buy_total:,}\")\n", "print(f\"Home.co.uk unique (after dedup): {hk_buy_unique:,}\")\n", "print(f\"Cross-source duplicates removed: {CROSS_DEDUP_BUY:,}\")\n", - "print(f\"Overlap rate: {CROSS_DEDUP_BUY/hk_buy_total*100:.1f}%\")" + "print(f\"Overlap rate: {CROSS_DEDUP_BUY / hk_buy_total * 100:.1f}%\")" ] }, { @@ -987,23 +989,29 @@ "# Venn-style summary\n", "rm_buy = len(buy.filter(pl.col(\"source\") == \"Rightmove\"))\n", "\n", - "fig = go.Figure(go.Sankey(\n", - " node=dict(\n", - " label=[\n", - " f\"Rightmove\\n{rm_buy:,}\",\n", - " f\"Home.co.uk\\n{hk_buy_total:,} scraped\",\n", - " f\"Merged BUY\\n{len(buy):,}\",\n", - " f\"Deduped\\n{CROSS_DEDUP_BUY:,}\",\n", - " ],\n", - " color=[\"#2563eb\", \"#10b981\", \"#6366f1\", \"#ef4444\"],\n", - " ),\n", - " link=dict(\n", - " source=[0, 1, 1],\n", - " target=[2, 2, 3],\n", - " value=[rm_buy, hk_buy_unique, CROSS_DEDUP_BUY],\n", - " color=[\"rgba(37,99,235,0.3)\", \"rgba(16,185,129,0.3)\", \"rgba(239,68,68,0.3)\"],\n", - " ),\n", - "))\n", + "fig = go.Figure(\n", + " go.Sankey(\n", + " node=dict(\n", + " label=[\n", + " f\"Rightmove\\n{rm_buy:,}\",\n", + " f\"Home.co.uk\\n{hk_buy_total:,} scraped\",\n", + " f\"Merged BUY\\n{len(buy):,}\",\n", + " f\"Deduped\\n{CROSS_DEDUP_BUY:,}\",\n", + " ],\n", + " color=[\"#2563eb\", \"#10b981\", \"#6366f1\", \"#ef4444\"],\n", + " ),\n", + " link=dict(\n", + " source=[0, 1, 1],\n", + " target=[2, 2, 3],\n", + " value=[rm_buy, hk_buy_unique, CROSS_DEDUP_BUY],\n", + " color=[\n", + " \"rgba(37,99,235,0.3)\",\n", + " \"rgba(16,185,129,0.3)\",\n", + " \"rgba(239,68,68,0.3)\",\n", + " ],\n", + " ),\n", + " )\n", + ")\n", "fig.update_layout(title=\"BUY Channel: Source Contribution Flow\", height=350)\n", "fig.show()" ] @@ -1106,8 +1114,11 @@ "oc_comparison = (\n", " hk_by_oc.join(rm_by_oc, on=\"outcode\", how=\"left\")\n", " .with_columns(\n", - " (pl.col(\"hk_count\") / (pl.col(\"hk_count\") + pl.col(\"rm_count\").fill_null(0)) * 100)\n", - " .alias(\"hk_pct_of_total\")\n", + " (\n", + " pl.col(\"hk_count\")\n", + " / (pl.col(\"hk_count\") + pl.col(\"rm_count\").fill_null(0))\n", + " * 100\n", + " ).alias(\"hk_pct_of_total\")\n", " )\n", " .sort(\"hk_count\", descending=True)\n", ")\n", @@ -2215,18 +2226,28 @@ "source": [ "# Bar chart: home.co.uk vs Rightmove counts per outcode\n", "fig = go.Figure()\n", - "fig.add_trace(go.Bar(\n", - " x=oc_comparison[\"outcode\"], y=oc_comparison[\"rm_count\"],\n", - " name=\"Rightmove\", marker_color=\"#2563eb\",\n", - "))\n", - "fig.add_trace(go.Bar(\n", - " x=oc_comparison[\"outcode\"], y=oc_comparison[\"hk_count\"],\n", - " name=\"Home.co.uk\", marker_color=\"#10b981\",\n", - "))\n", + "fig.add_trace(\n", + " go.Bar(\n", + " x=oc_comparison[\"outcode\"],\n", + " y=oc_comparison[\"rm_count\"],\n", + " name=\"Rightmove\",\n", + " marker_color=\"#2563eb\",\n", + " )\n", + ")\n", + "fig.add_trace(\n", + " go.Bar(\n", + " x=oc_comparison[\"outcode\"],\n", + " y=oc_comparison[\"hk_count\"],\n", + " name=\"Home.co.uk\",\n", + " marker_color=\"#10b981\",\n", + " )\n", + ")\n", "fig.update_layout(\n", - " barmode=\"group\", height=400,\n", + " barmode=\"group\",\n", + " height=400,\n", " title=\"Listings per Outcode: Rightmove vs Home.co.uk (outcodes with HK coverage)\",\n", - " xaxis_title=\"Outcode\", yaxis_title=\"Listings\",\n", + " xaxis_title=\"Outcode\",\n", + " yaxis_title=\"Listings\",\n", ")\n", "fig.show()" ] @@ -3121,10 +3142,14 @@ "sample = covered.sample(n=min(30_000, len(covered)), seed=42)\n", "\n", "fig = px.scatter_map(\n", - " sample.to_pandas(), lat=\"lat\", lon=\"lon\",\n", + " sample.to_pandas(),\n", + " lat=\"lat\",\n", + " lon=\"lon\",\n", " color=\"source\",\n", " color_discrete_map={\"Rightmove\": \"#2563eb\", \"Home.co.uk\": \"#10b981\"},\n", - " zoom=7, opacity=0.4, size_max=4,\n", + " zoom=7,\n", + " opacity=0.4,\n", + " size_max=4,\n", " title=\"Listing Locations in Covered Outcodes (by source)\",\n", ")\n", "fig.update_layout(height=600)\n", @@ -3188,15 +3213,41 @@ "# For covered outcodes, compare home.co.uk listings against Rightmove\n", "# to find near-matches (same postcode, same beds, price within 5%)\n", "\n", - "hk = buy_oc.filter(pl.col(\"source\") == \"Home.co.uk\").select(\n", - " \"Postcode\", \"Bedrooms\", \"Asking price\", \"Property type\", \"Address per Property Register\"\n", - ").rename({\"Asking price\": \"hk_price\", \"Property type\": \"hk_type\", \"Address per Property Register\": \"hk_addr\"})\n", + "hk = (\n", + " buy_oc.filter(pl.col(\"source\") == \"Home.co.uk\")\n", + " .select(\n", + " \"Postcode\",\n", + " \"Bedrooms\",\n", + " \"Asking price\",\n", + " \"Property type\",\n", + " \"Address per Property Register\",\n", + " )\n", + " .rename(\n", + " {\n", + " \"Asking price\": \"hk_price\",\n", + " \"Property type\": \"hk_type\",\n", + " \"Address per Property Register\": \"hk_addr\",\n", + " }\n", + " )\n", + ")\n", "\n", - "rm = buy_oc.filter(\n", - " pl.col(\"source\") == \"Rightmove\"\n", - ").select(\n", - " \"Postcode\", \"Bedrooms\", \"Asking price\", \"Property type\", \"Address per Property Register\"\n", - ").rename({\"Asking price\": \"rm_price\", \"Property type\": \"rm_type\", \"Address per Property Register\": \"rm_addr\"})\n", + "rm = (\n", + " buy_oc.filter(pl.col(\"source\") == \"Rightmove\")\n", + " .select(\n", + " \"Postcode\",\n", + " \"Bedrooms\",\n", + " \"Asking price\",\n", + " \"Property type\",\n", + " \"Address per Property Register\",\n", + " )\n", + " .rename(\n", + " {\n", + " \"Asking price\": \"rm_price\",\n", + " \"Property type\": \"rm_type\",\n", + " \"Address per Property Register\": \"rm_addr\",\n", + " }\n", + " )\n", + ")\n", "\n", "# Join on postcode + bedrooms\n", "joined = hk.join(rm, on=[\"Postcode\", \"Bedrooms\"], how=\"inner\")\n", @@ -3213,16 +3264,24 @@ "exact = joined.filter(pl.col(\"hk_price\") == pl.col(\"rm_price\"))\n", "\n", "print(f\"Home.co.uk listings (unique, in file): {len(hk):,}\")\n", - "print(f\"Rightmove listings in covered outcodes: {len(rm.filter(pl.col('Postcode').is_in(hk['Postcode']))):,}\")\n", + "print(\n", + " f\"Rightmove listings in covered outcodes: {len(rm.filter(pl.col('Postcode').is_in(hk['Postcode']))):,}\"\n", + ")\n", "print()\n", "print(f\"Joined on (postcode, bedrooms): {len(joined):,} candidate pairs\")\n", - "print(f\" Exact price match: {len(exact):,} pairs (likely same property, different beds or already deduped)\")\n", - "print(f\" Price within 5%: {len(near):,} pairs (probable duplicates with price rounding)\")\n", + "print(\n", + " f\" Exact price match: {len(exact):,} pairs (likely same property, different beds or already deduped)\"\n", + ")\n", + "print(\n", + " f\" Price within 5%: {len(near):,} pairs (probable duplicates with price rounding)\"\n", + ")\n", "print()\n", "# Unique hk listings that have at least one near-match\n", "hk_with_near = near.select(\"hk_price\", \"hk_addr\", \"Postcode\").unique()\n", "print(f\"Home.co.uk listings with a near-match in RM: ~{len(hk_with_near):,}\")\n", - "print(f\"Estimated additional overlap: ~{len(hk_with_near)/len(hk)*100:.1f}% of unique HK listings\")" + "print(\n", + " f\"Estimated additional overlap: ~{len(hk_with_near) / len(hk) * 100:.1f}% of unique HK listings\"\n", + ")" ] }, { @@ -4178,9 +4237,13 @@ ")\n", "\n", "fig = px.histogram(\n", - " clipped.to_pandas(), x=\"Asking price\", color=\"source\", nbins=80,\n", + " clipped.to_pandas(),\n", + " x=\"Asking price\",\n", + " color=\"source\",\n", + " nbins=80,\n", " color_discrete_map={\"Rightmove\": \"#2563eb\", \"Home.co.uk\": \"#10b981\"},\n", - " barmode=\"overlay\", histnorm=\"probability density\",\n", + " barmode=\"overlay\",\n", + " histnorm=\"probability density\",\n", " title=\"Price Distribution by Source (normalised, £50k–£2M)\",\n", ")\n", "fig.update_traces(opacity=0.6)\n", @@ -5095,10 +5158,7 @@ ], "source": [ "# Property type distribution by source\n", - "type_by_src = (\n", - " buy.group_by(\"source\", \"Property type\")\n", - " .agg(pl.len().alias(\"count\"))\n", - ")\n", + "type_by_src = buy.group_by(\"source\", \"Property type\").agg(pl.len().alias(\"count\"))\n", "# Normalise within each source\n", "totals = type_by_src.group_by(\"source\").agg(pl.col(\"count\").sum().alias(\"total\"))\n", "type_by_src = type_by_src.join(totals, on=\"source\").with_columns(\n", @@ -5107,7 +5167,10 @@ "\n", "fig = px.bar(\n", " type_by_src.sort(\"Property type\").to_pandas(),\n", - " x=\"Property type\", y=\"pct\", color=\"source\", barmode=\"group\",\n", + " x=\"Property type\",\n", + " y=\"pct\",\n", + " color=\"source\",\n", + " barmode=\"group\",\n", " color_discrete_map={\"Rightmove\": \"#2563eb\", \"Home.co.uk\": \"#10b981\"},\n", " title=\"Property Type Distribution by Source (%)\",\n", ")\n", @@ -5186,7 +5249,9 @@ "# Property sub-type comparison — top home.co.uk sub-types\n", "hk_subtypes = (\n", " buy.filter(pl.col(\"source\") == \"Home.co.uk\")[\"Property sub-type\"]\n", - " .value_counts().sort(\"count\", descending=True).head(20)\n", + " .value_counts()\n", + " .sort(\"count\", descending=True)\n", + " .head(20)\n", ")\n", "print(\"Top 20 Home.co.uk property sub-types:\")\n", "hk_subtypes" @@ -5263,9 +5328,16 @@ "source": [ "# Field completeness by source\n", "fields = [\n", - " \"Bedrooms\", \"Bathrooms\", \"Postcode\", \"Address per Property Register\",\n", - " \"Leasehold/Freehold\", \"Property type\", \"Total floor area (sqm)\",\n", - " \"Listing date\", \"Asking price\", \"Price qualifier\",\n", + " \"Bedrooms\",\n", + " \"Bathrooms\",\n", + " \"Postcode\",\n", + " \"Address per Property Register\",\n", + " \"Leasehold/Freehold\",\n", + " \"Property type\",\n", + " \"Total floor area (sqm)\",\n", + " \"Listing date\",\n", + " \"Asking price\",\n", + " \"Price qualifier\",\n", "]\n", "\n", "rows = []\n", @@ -5276,17 +5348,19 @@ " non_null = n - subset[f].null_count()\n", " # Also count empty strings as missing for string fields\n", " if subset[f].dtype == pl.Utf8:\n", - " non_null = len(subset.filter(\n", - " pl.col(f).is_not_null() & (pl.col(f).str.len_chars() > 0)\n", - " ))\n", + " non_null = len(\n", + " subset.filter(pl.col(f).is_not_null() & (pl.col(f).str.len_chars() > 0))\n", + " )\n", " rows.append({\"source\": src, \"field\": f, \"pct_available\": non_null / n * 100})\n", "\n", "completeness = pl.DataFrame(rows)\n", "pivot = completeness.pivot(on=\"source\", index=\"field\", values=\"pct_available\")\n", - "pivot = pivot.with_columns([\n", - " pl.col(\"Rightmove\").round(1),\n", - " pl.col(\"Home.co.uk\").round(1),\n", - "])\n", + "pivot = pivot.with_columns(\n", + " [\n", + " pl.col(\"Rightmove\").round(1),\n", + " pl.col(\"Home.co.uk\").round(1),\n", + " ]\n", + ")\n", "print(\"Field completeness (% non-null/non-empty):\")\n", "pivot" ] @@ -6198,19 +6272,26 @@ "# Bedroom distribution comparison\n", "fig = make_subplots(rows=1, cols=2, subplot_titles=(\"Rightmove\", \"Home.co.uk\"))\n", "for i, src in enumerate([\"Rightmove\", \"Home.co.uk\"], 1):\n", - " beds = buy.filter(\n", - " (pl.col(\"source\") == src) & (pl.col(\"Bedrooms\") <= 8)\n", - " )[\"Bedrooms\"].value_counts().sort(\"Bedrooms\")\n", + " beds = (\n", + " buy.filter((pl.col(\"source\") == src) & (pl.col(\"Bedrooms\") <= 8))[\"Bedrooms\"]\n", + " .value_counts()\n", + " .sort(\"Bedrooms\")\n", + " )\n", " # Normalise\n", " total = beds[\"count\"].sum()\n", " fig.add_trace(\n", " go.Bar(\n", - " x=beds[\"Bedrooms\"], y=beds[\"count\"] / total * 100,\n", + " x=beds[\"Bedrooms\"],\n", + " y=beds[\"count\"] / total * 100,\n", " name=src,\n", " marker_color=\"#2563eb\" if src == \"Rightmove\" else \"#10b981\",\n", - " ), row=1, col=i,\n", + " ),\n", + " row=1,\n", + " col=i,\n", " )\n", - "fig.update_layout(height=350, title=\"Bedroom Distribution by Source (%)\", showlegend=False)\n", + "fig.update_layout(\n", + " height=350, title=\"Bedroom Distribution by Source (%)\", showlegend=False\n", + ")\n", "fig.update_yaxes(title_text=\"%\", row=1, col=1)\n", "fig.show()" ] @@ -6287,17 +6368,23 @@ "\n", "comparison_rows = []\n", "for ptype in [\"Detached\", \"Semi-Detached\", \"Terraced\", \"Flats/Maisonettes\", \"Other\"]:\n", - " rm_p = rm_covered.filter(pl.col(\"Property type\") == ptype)[\"Asking price\"].drop_nulls()\n", + " rm_p = rm_covered.filter(pl.col(\"Property type\") == ptype)[\n", + " \"Asking price\"\n", + " ].drop_nulls()\n", " hk_p = hk_only.filter(pl.col(\"Property type\") == ptype)[\"Asking price\"].drop_nulls()\n", " if len(rm_p) > 0 and len(hk_p) > 0:\n", - " comparison_rows.append({\n", - " \"Property type\": ptype,\n", - " \"RM count\": len(rm_p),\n", - " \"RM median £\": int(rm_p.median()),\n", - " \"HK count\": len(hk_p),\n", - " \"HK median £\": int(hk_p.median()),\n", - " \"HK premium %\": round((hk_p.median() - rm_p.median()) / rm_p.median() * 100, 1),\n", - " })\n", + " comparison_rows.append(\n", + " {\n", + " \"Property type\": ptype,\n", + " \"RM count\": len(rm_p),\n", + " \"RM median £\": int(rm_p.median()),\n", + " \"HK count\": len(hk_p),\n", + " \"HK median £\": int(hk_p.median()),\n", + " \"HK premium %\": round(\n", + " (hk_p.median() - rm_p.median()) / rm_p.median() * 100, 1\n", + " ),\n", + " }\n", + " )\n", "\n", "comp = pl.DataFrame(comparison_rows)\n", "print(\"Price comparison in covered outcodes (Home.co.uk unique listings vs Rightmove):\")\n", @@ -7245,9 +7332,13 @@ "# Listing age histogram comparison\n", "age_plot = with_age.filter(pl.col(\"days_on_market\") <= 730) # cap at 2 years\n", "fig = px.histogram(\n", - " age_plot.to_pandas(), x=\"days_on_market\", color=\"source\", nbins=60,\n", + " age_plot.to_pandas(),\n", + " x=\"days_on_market\",\n", + " color=\"source\",\n", + " nbins=60,\n", " color_discrete_map={\"Rightmove\": \"#2563eb\", \"Home.co.uk\": \"#10b981\"},\n", - " barmode=\"overlay\", histnorm=\"probability density\",\n", + " barmode=\"overlay\",\n", + " histnorm=\"probability density\",\n", " title=\"Days on Market Distribution by Source (normalised, capped at 2 years)\",\n", ")\n", "fig.update_traces(opacity=0.6)\n", @@ -7330,7 +7421,9 @@ "print(f\" Projected home.co.uk total: ~{projected_hk:,}\")\n", "print(f\" Projected cross-dedup: ~{projected_dedup:,}\")\n", "print(f\" Projected unique additions: ~{projected_unique:,}\")\n", - "print(f\" Projected merged dataset: ~{rm_buy + projected_unique:,} ({projected_unique/rm_buy*100:.1f}% increase)\")\n", + "print(\n", + " f\" Projected merged dataset: ~{rm_buy + projected_unique:,} ({projected_unique / rm_buy * 100:.1f}% increase)\"\n", + ")\n", "print()\n", "print(\"⚠️ These are rough estimates — the covered outcodes may not be representative\")" ] diff --git a/analyses/travel_time_comparison.ipynb b/analyses/travel_time_comparison.ipynb index 7fae5a2..1b2ead8 100644 --- a/analyses/travel_time_comparison.ipynb +++ b/analyses/travel_time_comparison.ipynb @@ -54,11 +54,15 @@ } ], "source": [ - "r5_bank = pl.read_parquet(\"../property-data/travel-times/transit/000000-bank-tube-station.parquet\")\n", + "r5_bank = pl.read_parquet(\n", + " \"../property-data/travel-times/transit/000000-bank-tube-station.parquet\"\n", + ")\n", "manual_bank = pl.read_parquet(\"../manual-data/journey_times_bank.parquet\")\n", "\n", "print(f\"R5 Bank: {r5_bank.shape[0]:,} postcodes\")\n", - "print(f\"Manual Bank: {manual_bank.shape[0]:,} postcodes ({manual_bank['public_transport_easy_minutes'].null_count():,} null easy)\")" + "print(\n", + " f\"Manual Bank: {manual_bank.shape[0]:,} postcodes ({manual_bank['public_transport_easy_minutes'].null_count():,} null easy)\"\n", + ")" ] }, { @@ -116,25 +120,49 @@ "source": [ "# Join on postcode, keep only rows where both sources have values\n", "bank = (\n", - " r5_bank\n", - " .join(manual_bank, left_on=\"pcds\", right_on=\"postcode\", how=\"inner\")\n", + " r5_bank.join(manual_bank, left_on=\"pcds\", right_on=\"postcode\", how=\"inner\")\n", " .filter(\n", " pl.col(\"public_transport_easy_minutes\").is_not_null()\n", " & pl.col(\"public_transport_quick_minutes\").is_not_null()\n", " )\n", - " .with_columns([\n", - " # Signed error: R5 - Manual (positive = R5 is slower)\n", - " (pl.col(\"travel_minutes\").cast(pl.Float64) - pl.col(\"public_transport_easy_minutes\").cast(pl.Float64)).alias(\"error_easy\"),\n", - " (pl.col(\"best_minutes\").cast(pl.Float64) - pl.col(\"public_transport_quick_minutes\").cast(pl.Float64)).alias(\"error_quick\"),\n", - " # Absolute error\n", - " (pl.col(\"travel_minutes\").cast(pl.Float64) - pl.col(\"public_transport_easy_minutes\").cast(pl.Float64)).abs().alias(\"abs_error_easy\"),\n", - " (pl.col(\"best_minutes\").cast(pl.Float64) - pl.col(\"public_transport_quick_minutes\").cast(pl.Float64)).abs().alias(\"abs_error_quick\"),\n", - " ])\n", + " .with_columns(\n", + " [\n", + " # Signed error: R5 - Manual (positive = R5 is slower)\n", + " (\n", + " pl.col(\"travel_minutes\").cast(pl.Float64)\n", + " - pl.col(\"public_transport_easy_minutes\").cast(pl.Float64)\n", + " ).alias(\"error_easy\"),\n", + " (\n", + " pl.col(\"best_minutes\").cast(pl.Float64)\n", + " - pl.col(\"public_transport_quick_minutes\").cast(pl.Float64)\n", + " ).alias(\"error_quick\"),\n", + " # Absolute error\n", + " (\n", + " pl.col(\"travel_minutes\").cast(pl.Float64)\n", + " - pl.col(\"public_transport_easy_minutes\").cast(pl.Float64)\n", + " )\n", + " .abs()\n", + " .alias(\"abs_error_easy\"),\n", + " (\n", + " pl.col(\"best_minutes\").cast(pl.Float64)\n", + " - pl.col(\"public_transport_quick_minutes\").cast(pl.Float64)\n", + " )\n", + " .abs()\n", + " .alias(\"abs_error_quick\"),\n", + " ]\n", + " )\n", ")\n", "\n", "print(f\"Joined (non-null): {bank.shape[0]:,} postcodes\")\n", - "bank.select(\"pcds\", \"travel_minutes\", \"public_transport_easy_minutes\", \"error_easy\",\n", - " \"best_minutes\", \"public_transport_quick_minutes\", \"error_quick\").head(10)" + "bank.select(\n", + " \"pcds\",\n", + " \"travel_minutes\",\n", + " \"public_transport_easy_minutes\",\n", + " \"error_easy\",\n", + " \"best_minutes\",\n", + " \"public_transport_quick_minutes\",\n", + " \"error_quick\",\n", + ").head(10)" ] }, { @@ -196,18 +224,23 @@ " percentiles = [5, 25, 50, 80, 90, 95, 99]\n", " rows = []\n", " for p in percentiles:\n", - " rows.append({\n", - " \"percentile\": f\"p{p}\",\n", - " f\"{label} signed error\": round(float(np.percentile(col, p)), 1),\n", - " f\"{label} absolute error\": round(float(np.percentile(abs_col, p)), 1),\n", - " })\n", - " rows.append({\n", - " \"percentile\": \"mean\",\n", - " f\"{label} signed error\": round(float(np.mean(col)), 1),\n", - " f\"{label} absolute error\": round(float(np.mean(abs_col)), 1),\n", - " })\n", + " rows.append(\n", + " {\n", + " \"percentile\": f\"p{p}\",\n", + " f\"{label} signed error\": round(float(np.percentile(col, p)), 1),\n", + " f\"{label} absolute error\": round(float(np.percentile(abs_col, p)), 1),\n", + " }\n", + " )\n", + " rows.append(\n", + " {\n", + " \"percentile\": \"mean\",\n", + " f\"{label} signed error\": round(float(np.mean(col)), 1),\n", + " f\"{label} absolute error\": round(float(np.mean(abs_col)), 1),\n", + " }\n", + " )\n", " return pl.DataFrame(rows)\n", "\n", + "\n", "stats_easy = percentile_stats(\"error_easy\", \"Median (easy)\")\n", "stats_quick = percentile_stats(\"error_quick\", \"Best (quick)\")\n", "\n", @@ -1120,24 +1153,42 @@ } ], "source": [ - "fig = make_subplots(rows=1, cols=2, subplot_titles=[\n", - " \"Median transit time error (R5 − TfL)\",\n", - " \"Best transit time error (R5 − TfL)\"\n", - "])\n", + "fig = make_subplots(\n", + " rows=1,\n", + " cols=2,\n", + " subplot_titles=[\n", + " \"Median transit time error (R5 − TfL)\",\n", + " \"Best transit time error (R5 − TfL)\",\n", + " ],\n", + ")\n", "\n", "# Clip for readability\n", "easy_clipped = bank[\"error_easy\"].clip(-60, 60).to_numpy()\n", "quick_clipped = bank[\"error_quick\"].clip(-60, 60).to_numpy()\n", "\n", - "fig.add_trace(go.Histogram(x=easy_clipped, nbinsx=120, name=\"Median (easy)\",\n", - " marker_color=\"#0d9488\"), row=1, col=1)\n", - "fig.add_trace(go.Histogram(x=quick_clipped, nbinsx=120, name=\"Best (quick)\",\n", - " marker_color=\"#f59e0b\"), row=1, col=2)\n", + "fig.add_trace(\n", + " go.Histogram(\n", + " x=easy_clipped, nbinsx=120, name=\"Median (easy)\", marker_color=\"#0d9488\"\n", + " ),\n", + " row=1,\n", + " col=1,\n", + ")\n", + "fig.add_trace(\n", + " go.Histogram(\n", + " x=quick_clipped, nbinsx=120, name=\"Best (quick)\", marker_color=\"#f59e0b\"\n", + " ),\n", + " row=1,\n", + " col=2,\n", + ")\n", "\n", "fig.update_xaxes(title_text=\"Error (minutes)\", row=1, col=1)\n", "fig.update_xaxes(title_text=\"Error (minutes)\", row=1, col=2)\n", "fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n", - "fig.update_layout(height=400, showlegend=False, title_text=\"Bank: Error Distribution (clipped ±60 min)\")\n", + "fig.update_layout(\n", + " height=400,\n", + " showlegend=False,\n", + " title_text=\"Bank: Error Distribution (clipped ±60 min)\",\n", + ")\n", "fig.show()" ] }, @@ -2104,34 +2155,55 @@ "# Sample for scatter plot performance\n", "sample = bank.sample(n=min(20_000, bank.shape[0]), seed=42)\n", "\n", - "fig = make_subplots(rows=1, cols=2, subplot_titles=[\n", - " \"Median: R5 vs TfL (easy)\",\n", - " \"Best: R5 vs TfL (quick)\"\n", - "])\n", + "fig = make_subplots(\n", + " rows=1,\n", + " cols=2,\n", + " subplot_titles=[\"Median: R5 vs TfL (easy)\", \"Best: R5 vs TfL (quick)\"],\n", + ")\n", "\n", - "fig.add_trace(go.Scattergl(\n", - " x=sample[\"public_transport_easy_minutes\"].to_numpy(),\n", - " y=sample[\"travel_minutes\"].cast(pl.Float64).to_numpy(),\n", - " mode=\"markers\", marker=dict(size=2, opacity=0.3, color=\"#0d9488\"),\n", - " name=\"Median\"\n", - "), row=1, col=1)\n", + "fig.add_trace(\n", + " go.Scattergl(\n", + " x=sample[\"public_transport_easy_minutes\"].to_numpy(),\n", + " y=sample[\"travel_minutes\"].cast(pl.Float64).to_numpy(),\n", + " mode=\"markers\",\n", + " marker=dict(size=2, opacity=0.3, color=\"#0d9488\"),\n", + " name=\"Median\",\n", + " ),\n", + " row=1,\n", + " col=1,\n", + ")\n", "\n", - "fig.add_trace(go.Scattergl(\n", - " x=sample[\"public_transport_quick_minutes\"].to_numpy(),\n", - " y=sample[\"best_minutes\"].cast(pl.Float64).to_numpy(),\n", - " mode=\"markers\", marker=dict(size=2, opacity=0.3, color=\"#f59e0b\"),\n", - " name=\"Best\"\n", - "), row=1, col=2)\n", + "fig.add_trace(\n", + " go.Scattergl(\n", + " x=sample[\"public_transport_quick_minutes\"].to_numpy(),\n", + " y=sample[\"best_minutes\"].cast(pl.Float64).to_numpy(),\n", + " mode=\"markers\",\n", + " marker=dict(size=2, opacity=0.3, color=\"#f59e0b\"),\n", + " name=\"Best\",\n", + " ),\n", + " row=1,\n", + " col=2,\n", + ")\n", "\n", "# Perfect agreement line\n", "for col in [1, 2]:\n", - " fig.add_trace(go.Scatter(x=[0, 200], y=[0, 200], mode=\"lines\",\n", - " line=dict(color=\"red\", dash=\"dash\", width=1),\n", - " showlegend=False), row=1, col=col)\n", + " fig.add_trace(\n", + " go.Scatter(\n", + " x=[0, 200],\n", + " y=[0, 200],\n", + " mode=\"lines\",\n", + " line=dict(color=\"red\", dash=\"dash\", width=1),\n", + " showlegend=False,\n", + " ),\n", + " row=1,\n", + " col=col,\n", + " )\n", " fig.update_xaxes(title_text=\"TfL API (minutes)\", row=1, col=col)\n", " fig.update_yaxes(title_text=\"R5 (minutes)\", row=1, col=col)\n", "\n", - "fig.update_layout(height=500, showlegend=False, title_text=\"Bank: R5 vs TfL API (20k sample)\")\n", + "fig.update_layout(\n", + " height=500, showlegend=False, title_text=\"Bank: R5 vs TfL API (20k sample)\"\n", + ")\n", "fig.show()" ] }, @@ -403063,7 +403135,8 @@ "\n", "fig = px.scatter_map(\n", " map_sample.to_pandas(),\n", - " lat=\"lat\", lon=\"long\",\n", + " lat=\"lat\",\n", + " lon=\"long\",\n", " color=\"error_easy\",\n", " color_continuous_scale=\"RdBu_r\", # red=positive (R5 slower), blue=negative (R5 faster)\n", " range_color=[-30, 30],\n", @@ -403071,8 +403144,14 @@ " center={\"lat\": 51.5, \"lon\": -0.1},\n", " opacity=0.5,\n", " title=\"Bank — Median transit error (R5 − TfL easy), minutes\",\n", - " hover_data={\"pcds\": True, \"travel_minutes\": True, \"public_transport_easy_minutes\": True,\n", - " \"error_easy\": \":.0f\", \"lat\": False, \"long\": False},\n", + " hover_data={\n", + " \"pcds\": True,\n", + " \"travel_minutes\": True,\n", + " \"public_transport_easy_minutes\": True,\n", + " \"error_easy\": \":.0f\",\n", + " \"lat\": False,\n", + " \"long\": False,\n", + " },\n", " height=700,\n", ")\n", "fig.update_layout(map_style=\"carto-positron\")\n", @@ -803994,7 +804073,8 @@ "source": [ "fig = px.scatter_map(\n", " map_sample.to_pandas(),\n", - " lat=\"lat\", lon=\"long\",\n", + " lat=\"lat\",\n", + " lon=\"long\",\n", " color=\"error_quick\",\n", " color_continuous_scale=\"RdBu_r\",\n", " range_color=[-30, 30],\n", @@ -804002,8 +804082,14 @@ " center={\"lat\": 51.5, \"lon\": -0.1},\n", " opacity=0.5,\n", " title=\"Bank — Best transit error (R5 − TfL quick), minutes\",\n", - " hover_data={\"pcds\": True, \"best_minutes\": True, \"public_transport_quick_minutes\": True,\n", - " \"error_quick\": \":.0f\", \"lat\": False, \"long\": False},\n", + " hover_data={\n", + " \"pcds\": True,\n", + " \"best_minutes\": True,\n", + " \"public_transport_quick_minutes\": True,\n", + " \"error_quick\": \":.0f\",\n", + " \"lat\": False,\n", + " \"long\": False,\n", + " },\n", " height=700,\n", ")\n", "fig.update_layout(map_style=\"carto-positron\")\n", @@ -1204925,7 +1205011,8 @@ "source": [ "fig = px.scatter_map(\n", " map_sample.to_pandas(),\n", - " lat=\"lat\", lon=\"long\",\n", + " lat=\"lat\",\n", + " lon=\"long\",\n", " color=\"abs_error_easy\",\n", " color_continuous_scale=\"YlOrRd\",\n", " range_color=[0, 30],\n", @@ -1204933,8 +1205020,14 @@ " center={\"lat\": 51.5, \"lon\": -0.1},\n", " opacity=0.5,\n", " title=\"Bank — Absolute median transit error |R5 − TfL easy|, minutes\",\n", - " hover_data={\"pcds\": True, \"travel_minutes\": True, \"public_transport_easy_minutes\": True,\n", - " \"abs_error_easy\": \":.0f\", \"lat\": False, \"long\": False},\n", + " hover_data={\n", + " \"pcds\": True,\n", + " \"travel_minutes\": True,\n", + " \"public_transport_easy_minutes\": True,\n", + " \"abs_error_easy\": \":.0f\",\n", + " \"lat\": False,\n", + " \"long\": False,\n", + " },\n", " height=700,\n", ")\n", "fig.update_layout(map_style=\"carto-positron\")\n", @@ -1204998,9 +1205091,15 @@ ], "source": [ "bank.sort(\"abs_error_easy\", descending=True).select(\n", - " \"pcds\", \"lat\", \"long\",\n", - " \"travel_minutes\", \"public_transport_easy_minutes\", \"error_easy\",\n", - " \"best_minutes\", \"public_transport_quick_minutes\", \"error_quick\",\n", + " \"pcds\",\n", + " \"lat\",\n", + " \"long\",\n", + " \"travel_minutes\",\n", + " \"public_transport_easy_minutes\",\n", + " \"error_easy\",\n", + " \"best_minutes\",\n", + " \"public_transport_quick_minutes\",\n", + " \"error_quick\",\n", ").head(30)" ] }, @@ -1205945,45 +1206044,75 @@ "\n", "dist_df = bank.with_columns(\n", " # Rough km distance using Haversine approximation\n", - " ((((pl.col(\"lat\") - BANK_LAT) * 111.32) ** 2 +\n", - " ((pl.col(\"long\") - BANK_LON) * 111.32 * np.cos(np.radians(BANK_LAT))) ** 2) ** 0.5\n", + " (\n", + " (\n", + " ((pl.col(\"lat\") - BANK_LAT) * 111.32) ** 2\n", + " + ((pl.col(\"long\") - BANK_LON) * 111.32 * np.cos(np.radians(BANK_LAT))) ** 2\n", + " )\n", + " ** 0.5\n", " ).alias(\"dist_km\")\n", ")\n", "\n", "# Bin by 5km\n", "binned = (\n", - " dist_df\n", - " .with_columns((pl.col(\"dist_km\") / 5).floor() * 5)\n", + " dist_df.with_columns((pl.col(\"dist_km\") / 5).floor() * 5)\n", " .group_by(\"dist_km\")\n", - " .agg([\n", - " pl.col(\"error_easy\").median().alias(\"median_error_easy\"),\n", - " pl.col(\"error_quick\").median().alias(\"median_error_quick\"),\n", - " pl.col(\"abs_error_easy\").median().alias(\"median_abs_error_easy\"),\n", - " pl.len().alias(\"count\"),\n", - " ])\n", + " .agg(\n", + " [\n", + " pl.col(\"error_easy\").median().alias(\"median_error_easy\"),\n", + " pl.col(\"error_quick\").median().alias(\"median_error_quick\"),\n", + " pl.col(\"abs_error_easy\").median().alias(\"median_abs_error_easy\"),\n", + " pl.len().alias(\"count\"),\n", + " ]\n", + " )\n", " .sort(\"dist_km\")\n", " .filter(pl.col(\"count\") > 50)\n", ")\n", "\n", - "fig = make_subplots(rows=1, cols=2, subplot_titles=[\n", - " \"Median signed error by distance\",\n", - " \"Median absolute error by distance\"\n", - "])\n", + "fig = make_subplots(\n", + " rows=1,\n", + " cols=2,\n", + " subplot_titles=[\n", + " \"Median signed error by distance\",\n", + " \"Median absolute error by distance\",\n", + " ],\n", + ")\n", "\n", - "fig.add_trace(go.Scatter(\n", - " x=binned[\"dist_km\"].to_numpy(), y=binned[\"median_error_easy\"].to_numpy(),\n", - " mode=\"lines+markers\", name=\"Easy\", line=dict(color=\"#0d9488\")\n", - "), row=1, col=1)\n", - "fig.add_trace(go.Scatter(\n", - " x=binned[\"dist_km\"].to_numpy(), y=binned[\"median_error_quick\"].to_numpy(),\n", - " mode=\"lines+markers\", name=\"Quick\", line=dict(color=\"#f59e0b\")\n", - "), row=1, col=1)\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=binned[\"dist_km\"].to_numpy(),\n", + " y=binned[\"median_error_easy\"].to_numpy(),\n", + " mode=\"lines+markers\",\n", + " name=\"Easy\",\n", + " line=dict(color=\"#0d9488\"),\n", + " ),\n", + " row=1,\n", + " col=1,\n", + ")\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=binned[\"dist_km\"].to_numpy(),\n", + " y=binned[\"median_error_quick\"].to_numpy(),\n", + " mode=\"lines+markers\",\n", + " name=\"Quick\",\n", + " line=dict(color=\"#f59e0b\"),\n", + " ),\n", + " row=1,\n", + " col=1,\n", + ")\n", "\n", - "fig.add_trace(go.Scatter(\n", - " x=binned[\"dist_km\"].to_numpy(), y=binned[\"median_abs_error_easy\"].to_numpy(),\n", - " mode=\"lines+markers\", name=\"|Easy|\", line=dict(color=\"#0d9488\"),\n", - " showlegend=False\n", - "), row=1, col=2)\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=binned[\"dist_km\"].to_numpy(),\n", + " y=binned[\"median_abs_error_easy\"].to_numpy(),\n", + " mode=\"lines+markers\",\n", + " name=\"|Easy|\",\n", + " line=dict(color=\"#0d9488\"),\n", + " showlegend=False,\n", + " ),\n", + " row=1,\n", + " col=2,\n", + ")\n", "\n", "for col in [1, 2]:\n", " fig.update_xaxes(title_text=\"Distance from Bank (km)\", row=1, col=col)\n", diff --git a/finder/constants.py b/finder/constants.py index b42a961..5628c10 100644 --- a/finder/constants.py +++ b/finder/constants.py @@ -16,9 +16,21 @@ SCHEDULE_HOUR = int(os.environ.get("SCHEDULE_HOUR", "3")) # Whether to run a scrape immediately on startup RUN_ON_STARTUP = os.environ.get("RUN_ON_STARTUP", "").lower() in ("1", "true", "yes") # Enable/disable individual sources -SCRAPE_RIGHTMOVE = os.environ.get("SCRAPE_RIGHTMOVE", "true").lower() in ("1", "true", "yes") -SCRAPE_HOMECOUK = os.environ.get("SCRAPE_HOMECOUK", "true").lower() in ("1", "true", "yes") -SCRAPE_OPENRENT = os.environ.get("SCRAPE_OPENRENT", "true").lower() in ("1", "true", "yes") +SCRAPE_RIGHTMOVE = os.environ.get("SCRAPE_RIGHTMOVE", "true").lower() in ( + "1", + "true", + "yes", +) +SCRAPE_HOMECOUK = os.environ.get("SCRAPE_HOMECOUK", "true").lower() in ( + "1", + "true", + "yes", +) +SCRAPE_OPENRENT = os.environ.get("SCRAPE_OPENRENT", "true").lower() in ( + "1", + "true", + "yes", +) TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead" SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search" diff --git a/finder/homecouk.py b/finder/homecouk.py index f6005fd..fc18fdf 100644 --- a/finder/homecouk.py +++ b/finder/homecouk.py @@ -86,7 +86,8 @@ def solve_cloudflare() -> tuple[dict[str, str], str] | None: log.info( "Cloudflare solved — got %d cookies, UA: %s", - len(cookies), user_agent[:60], + len(cookies), + user_agent[:60], ) flaresolverr_attempts_total.labels(result="success").inc() return cookies, user_agent @@ -129,11 +130,13 @@ def make_client(cookies: dict[str, str], user_agent: str) -> Session: Uses Chrome TLS impersonation so cf_clearance cookies (which are bound to Chrome's JA3 fingerprint from FlareSolverr) remain valid.""" session = Session(impersonate="chrome") - session.headers.update({ - "User-Agent": user_agent, - "Accept": "application/json, text/plain, */*", - "x-requested-with": "XMLHttpRequest", - }) + session.headers.update( + { + "User-Agent": user_agent, + "Accept": "application/json, text/plain, */*", + "x-requested-with": "XMLHttpRequest", + } + ) # Laravel CSRF: the XSRF-TOKEN cookie value must also be sent as the # X-XSRF-TOKEN request header (URL-decoded). Without this header, the # server rejects every request with 419/403. @@ -165,7 +168,11 @@ def fetch_page( return resp.json() except json.JSONDecodeError: homecouk_errors_total.labels(type="json_decode").inc() - log.error("Non-JSON response from %s (got %s)", url, resp.headers.get("content-type", "?")) + log.error( + "Non-JSON response from %s (got %s)", + url, + resp.headers.get("content-type", "?"), + ) return None if resp.status_code == 403: raise CookiesExpiredError("HTTP 403 — cookies likely expired") @@ -173,7 +180,11 @@ def fetch_page( delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) log.warning( "HTTP %d from %s, retry %d/%d in %.1fs", - resp.status_code, url, attempt + 1, max_retries, delay, + resp.status_code, + url, + attempt + 1, + max_retries, + delay, ) time.sleep(delay) continue @@ -186,7 +197,11 @@ def fetch_page( delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) log.warning( "%s from %s, retry %d/%d in %.1fs", - type(e).__name__, url, attempt + 1, max_retries, delay, + type(e).__name__, + url, + attempt + 1, + max_retries, + delay, ) time.sleep(delay) homecouk_errors_total.labels(type="retry_exhausted").inc() @@ -218,7 +233,12 @@ def map_property_type(raw_type: str | None) -> str: # Home.co.uk uses types like "House", "Flat", "Apartment", "Detached", etc. # Try common patterns lower = raw_type.lower() - if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower: + if ( + "flat" in lower + or "apartment" in lower + or "maisonette" in lower + or "studio" in lower + ): return "Flats/Maisonettes" if "detached" in lower and "semi" not in lower: return "Detached" @@ -231,7 +251,9 @@ def map_property_type(raw_type: str | None) -> str: def transform_property( - prop: dict, channel: str, pc_index: PostcodeSpatialIndex, + prop: dict, + channel: str, + pc_index: PostcodeSpatialIndex, ) -> dict | None: """Transform a raw home.co.uk property dict into our output schema.""" lat = prop.get("latitude") diff --git a/finder/http_client.py b/finder/http_client.py index ecc993f..64be33b 100644 --- a/finder/http_client.py +++ b/finder/http_client.py @@ -11,7 +11,9 @@ from metrics import http_errors_total, http_requests_total, ip_rotations_total log = logging.getLogger("rightmove") -_ua = UserAgent(browsers=["Chrome", "Edge"], os=["Windows", "Mac OS X"], min_version=120.0) +_ua = UserAgent( + browsers=["Chrome", "Edge"], os=["Windows", "Mac OS X"], min_version=120.0 +) def _endpoint_label(url: str) -> str: @@ -27,6 +29,7 @@ def _status_label(code: int) -> str: return "5xx" return str(code) + # Gluetun control API — runs on port 8000 inside the gluetun container. # Since finder uses network_mode: service:gluetun, localhost IS gluetun. GLUETUN_API = "http://127.0.0.1:8000" @@ -42,17 +45,25 @@ def rotate_ip() -> bool: # Get current IP with httpx.Client(timeout=10) as ctl: old_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip") - old_ip = old_ip_resp.json().get("public_ip", "unknown") if old_ip_resp.status_code == 200 else "unknown" + old_ip = ( + old_ip_resp.json().get("public_ip", "unknown") + if old_ip_resp.status_code == 200 + else "unknown" + ) log.info("Current IP: %s", old_ip) # Trigger server change — PUT with empty JSON body picks a random server - resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "stopped"}) + resp = ctl.put( + f"{GLUETUN_API}/v1/vpn/status", json={"status": "stopped"} + ) if resp.status_code != 200: log.error("Failed to stop VPN: %d %s", resp.status_code, resp.text) return False time.sleep(2) - resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "running"}) + resp = ctl.put( + f"{GLUETUN_API}/v1/vpn/status", json={"status": "running"} + ) if resp.status_code != 200: log.error("Failed to start VPN: %d %s", resp.status_code, resp.text) return False @@ -99,7 +110,9 @@ def fetch_with_retry( for attempt in range(MAX_RETRIES): try: resp = client.get(url, params=params) - http_requests_total.labels(status=_status_label(resp.status_code), endpoint=endpoint).inc() + http_requests_total.labels( + status=_status_label(resp.status_code), endpoint=endpoint + ).inc() if resp.status_code == 200: return resp.json() if resp.status_code == 403 and on_403: @@ -111,15 +124,34 @@ def fetch_with_retry( return None if resp.status_code in (429, 500, 502, 503, 504): delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) - log.warning("HTTP %d from %s, retry %d/%d in %.1fs", resp.status_code, url, attempt + 1, MAX_RETRIES, delay) + log.warning( + "HTTP %d from %s, retry %d/%d in %.1fs", + resp.status_code, + url, + attempt + 1, + MAX_RETRIES, + delay, + ) time.sleep(delay) continue log.error("HTTP %d from %s (non-retryable)", resp.status_code, url) return None - except (httpx.ConnectError, httpx.ReadTimeout, httpx.WriteTimeout, httpx.PoolTimeout) as e: + except ( + httpx.ConnectError, + httpx.ReadTimeout, + httpx.WriteTimeout, + httpx.PoolTimeout, + ) as e: http_errors_total.labels(type=type(e).__name__).inc() delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) - log.warning("%s from %s, retry %d/%d in %.1fs", type(e).__name__, url, attempt + 1, MAX_RETRIES, delay) + log.warning( + "%s from %s, retry %d/%d in %.1fs", + type(e).__name__, + url, + attempt + 1, + MAX_RETRIES, + delay, + ) time.sleep(delay) http_errors_total.labels(type="retry_exhausted").inc() log.error("All %d retries exhausted for %s", MAX_RETRIES, url) diff --git a/finder/main.py b/finder/main.py index 666033b..3174d00 100644 --- a/finder/main.py +++ b/finder/main.py @@ -7,7 +7,14 @@ from pathlib import Path from flask import Flask, Response, jsonify, send_from_directory from prometheus_client import generate_latest, CONTENT_TYPE_LATEST -from constants import DATA_DIR, RUN_ON_STARTUP, SCHEDULE_HOUR, SCRAPE_HOMECOUK, SCRAPE_OPENRENT, SCRAPE_RIGHTMOVE +from constants import ( + DATA_DIR, + RUN_ON_STARTUP, + SCHEDULE_HOUR, + SCRAPE_HOMECOUK, + SCRAPE_OPENRENT, + SCRAPE_RIGHTMOVE, +) from homecouk import load_cookies as load_homecouk_cookies from openrent import load_cookies as load_openrent_cookies from rightmove import outcode_cache @@ -49,8 +56,13 @@ log.info("Loading arcgis data...") OUTCODES = load_outcodes() PC_INDEX = build_postcode_index() PC_COORDS = build_postcode_coords() if SCRAPE_OPENRENT else None -log.info("Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s)", - len(OUTCODES), SCRAPE_RIGHTMOVE, SCRAPE_HOMECOUK, SCRAPE_OPENRENT) +log.info( + "Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s)", + len(OUTCODES), + SCRAPE_RIGHTMOVE, + SCRAPE_HOMECOUK, + SCRAPE_OPENRENT, +) # --------------------------------------------------------------------------- # Scheduler @@ -63,7 +75,9 @@ def _start_scrape() -> bool: if status.state == "running": return False status.state = "running" - thread = threading.Thread(target=run_scrape, args=(OUTCODES, PC_INDEX, PC_COORDS), daemon=True) + thread = threading.Thread( + target=run_scrape, args=(OUTCODES, PC_INDEX, PC_COORDS), daemon=True + ) thread.start() return True @@ -82,7 +96,9 @@ def _scheduler_loop() -> None: log.info("Scheduler active — will run daily at %02d:00 UTC", SCHEDULE_HOUR) while True: wait = _seconds_until(SCHEDULE_HOUR) - log.info("Next scheduled scrape in %.0f seconds (%.1f hours)", wait, wait / 3600) + log.info( + "Next scheduled scrape in %.0f seconds (%.1f hours)", wait, wait / 3600 + ) time.sleep(wait) log.info("Scheduled scrape triggered") if not _start_scrape(): @@ -144,15 +160,17 @@ def get_status(): def get_debug(): hk_cookies = load_homecouk_cookies() if SCRAPE_HOMECOUK else None or_cookies = load_openrent_cookies() if SCRAPE_OPENRENT else None - return jsonify({ - "outcode_cache_size": len(outcode_cache), - "outcode_cache_sample": dict(list(outcode_cache.items())[:20]), - "scrape_rightmove": SCRAPE_RIGHTMOVE, - "scrape_homecouk": SCRAPE_HOMECOUK, - "scrape_openrent": SCRAPE_OPENRENT, - "homecouk_cookies_available": hk_cookies is not None, - "openrent_cookies_available": or_cookies is not None, - }) + return jsonify( + { + "outcode_cache_size": len(outcode_cache), + "outcode_cache_sample": dict(list(outcode_cache.items())[:20]), + "scrape_rightmove": SCRAPE_RIGHTMOVE, + "scrape_homecouk": SCRAPE_HOMECOUK, + "scrape_openrent": SCRAPE_OPENRENT, + "homecouk_cookies_available": hk_cookies is not None, + "openrent_cookies_available": or_cookies is not None, + } + ) @app.route("/metrics") diff --git a/finder/openrent.py b/finder/openrent.py index d66d0f6..f7da645 100644 --- a/finder/openrent.py +++ b/finder/openrent.py @@ -79,7 +79,8 @@ def solve_waf() -> tuple[dict[str, str], str] | None: if "AwsWafIntegration" in content: log.info("Got WAF challenge page, waiting for resolution...") page.wait_for_selector( - "a.pli, .pli, .search-property-card", timeout=30000, + "a.pli, .pli, .search-property-card", + timeout=30000, ) raw_cookies = context.cookies() @@ -94,7 +95,8 @@ def solve_waf() -> tuple[dict[str, str], str] | None: log.info( "AWS WAF solved — got %d cookies, UA: %s", - len(cookies), user_agent[:60], + len(cookies), + user_agent[:60], ) flaresolverr_attempts_total.labels(result="success").inc() return cookies, user_agent @@ -130,11 +132,13 @@ def make_client(cookies: dict[str, str], user_agent: str) -> Session: """Create a curl_cffi Session configured for OpenRent. Uses Chrome TLS impersonation so AWS WAF cookies remain valid.""" session = Session(impersonate="chrome") - session.headers.update({ - "User-Agent": user_agent, - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Accept-Language": "en-GB,en;q=0.9", - }) + session.headers.update( + { + "User-Agent": user_agent, + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-GB,en;q=0.9", + } + ) for name, value in cookies.items(): session.cookies.set(name, value, domain="openrent.co.uk") return session @@ -152,7 +156,9 @@ def _status_label(code: int) -> str: def fetch_page( - client: Session, url: str, max_retries: int = 3, + client: Session, + url: str, + max_retries: int = 3, ) -> str | None: """GET HTML with retries on 429/5xx. Returns None on permanent failure. WAF challenge (202 or 403 with challenge JS) raises WafChallengeError.""" @@ -165,17 +171,25 @@ def fetch_page( html = resp.text # Detect WAF challenge page masquerading as 200 if "AwsWafIntegration" in html and "challenge.js" in html: - raise WafChallengeError("Got AWS WAF challenge page — cookies expired") + raise WafChallengeError( + "Got AWS WAF challenge page — cookies expired" + ) return html if resp.status_code in (202, 403): - raise WafChallengeError(f"HTTP {resp.status_code} — cookies likely expired") + raise WafChallengeError( + f"HTTP {resp.status_code} — cookies likely expired" + ) if resp.status_code in (429, 500, 502, 503, 504): - delay = RETRY_BASE_DELAY * (2 ** attempt) + delay = RETRY_BASE_DELAY * (2**attempt) log.warning( "HTTP %d from %s, retry %d/%d in %.1fs", - resp.status_code, url, attempt + 1, max_retries, delay, + resp.status_code, + url, + attempt + 1, + max_retries, + delay, ) time.sleep(delay) continue @@ -187,10 +201,14 @@ def fetch_page( raise except RequestsError as e: openrent_errors_total.labels(type=type(e).__name__).inc() - delay = RETRY_BASE_DELAY * (2 ** attempt) + delay = RETRY_BASE_DELAY * (2**attempt) log.warning( "%s from %s, retry %d/%d in %.1fs", - type(e).__name__, url, attempt + 1, max_retries, delay, + type(e).__name__, + url, + attempt + 1, + max_retries, + delay, ) time.sleep(delay) @@ -247,7 +265,9 @@ def _extract_bedrooms_from_title(title: str) -> int | None: return None -def _extract_beds_baths_from_features(feature_items: list) -> tuple[int | None, int | None]: +def _extract_beds_baths_from_features( + feature_items: list, +) -> tuple[int | None, int | None]: """Extract bedrooms and bathrooms from feature list items. OpenRent search cards have