diff --git a/Dockerfile b/Dockerfile index 481d385..4f6c9a1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Stage 1: Build frontend -FROM node:22-slim AS frontend +FROM node:20-slim AS frontend WORKDIR /app/frontend COPY frontend/package.json frontend/package-lock.json ./ RUN npm ci @@ -7,7 +7,7 @@ COPY frontend/ ./ RUN npm run build:no-prerender # Stage 2: Build Rust server -FROM rust:1.84-bookworm AS server +FROM rust:1.83-bookworm AS server WORKDIR /app COPY server-rs/ server-rs/ WORKDIR /app/server-rs diff --git a/analyses/bank_postcode_boundaries.ipynb b/analyses/bank_postcode_boundaries.ipynb index abd5a95..b4197f3 100644 --- a/analyses/bank_postcode_boundaries.ipynb +++ b/analyses/bank_postcode_boundaries.ipynb @@ -813,14 +813,8 @@ ], "source": [ "# Build area lookup from both sets\n", - "areas_before = {\n", - " f[\"properties\"][\"postcode\"]: f[\"properties\"][\"area_sqm\"]\n", - " for f in no_green[\"features\"]\n", - "}\n", - "areas_after = {\n", - " f[\"properties\"][\"postcode\"]: f[\"properties\"][\"area_sqm\"]\n", - " for f in with_green[\"features\"]\n", - "}\n", + "areas_before = {f[\"properties\"][\"postcode\"]: f[\"properties\"][\"area_sqm\"] for f in no_green[\"features\"]}\n", + "areas_after = {f[\"properties\"][\"postcode\"]: f[\"properties\"][\"area_sqm\"] for f in with_green[\"features\"]}\n", "\n", "# Compute percentage removed\n", "diffs = []\n", @@ -1167,23 +1161,16 @@ "\n", "colormap = cm.LinearColormap(\n", " colors=[\"#ffffcc\", \"#fd8d3c\", \"#e31a1c\", \"#800026\"],\n", - " vmin=0,\n", - " vmax=min(max_pct, 90),\n", + " vmin=0, vmax=min(max_pct, 90),\n", " caption=\"% area removed by greenspace\",\n", ")\n", "\n", - "\n", "# Show original boundaries, colored by how much was removed\n", "def style_by_removal(feature):\n", " pc = feature[\"properties\"][\"postcode\"]\n", " pct = diff_lookup.get(pc, 0)\n", " if pct <= 1:\n", - " return {\n", - " \"fillColor\": \"#cccccc\",\n", - " \"color\": \"#999\",\n", - " \"weight\": 0.5,\n", - " \"fillOpacity\": 0.15,\n", - " }\n", + " return {\"fillColor\": \"#cccccc\", \"color\": \"#999\", \"weight\": 0.5, \"fillOpacity\": 0.15}\n", " return {\n", " \"fillColor\": colormap(min(pct, 90)),\n", " \"color\": \"white\",\n", @@ -1191,7 +1178,6 @@ " \"fillOpacity\": 0.6,\n", " }\n", "\n", - "\n", "folium.GeoJson(\n", " no_green,\n", " name=\"Greenspace removal %\",\n", diff --git a/analyses/price_model_evaluation.ipynb b/analyses/price_model_evaluation.ipynb index 2bce6b3..d8cf9cc 100644 --- a/analyses/price_model_evaluation.ipynb +++ b/analyses/price_model_evaluation.ipynb @@ -54,32 +54,25 @@ " ape = np.abs(p - a) / a\n", " err = p - a\n", " return {\n", - " \"MdAPE (%)\": f\"{np.median(ape) * 100:.1f}\",\n", - " \"% within 10%\": f\"{np.mean(ape <= 0.10) * 100:.1f}\",\n", - " \"% within 20%\": f\"{np.mean(ape <= 0.20) * 100:.1f}\",\n", - " \"% within 30%\": f\"{np.mean(ape <= 0.30) * 100:.1f}\",\n", + " \"MdAPE (%)\": f\"{np.median(ape)*100:.1f}\",\n", + " \"% within 10%\": f\"{np.mean(ape <= 0.10)*100:.1f}\",\n", + " \"% within 20%\": f\"{np.mean(ape <= 0.20)*100:.1f}\",\n", + " \"% within 30%\": f\"{np.mean(ape <= 0.30)*100:.1f}\",\n", " \"MAE (\\u00a3)\": f\"{np.mean(np.abs(err)):,.0f}\",\n", " \"Mean signed error (\\u00a3)\": f\"{np.mean(err):+,.0f}\",\n", " \"n\": f\"{len(a):,}\",\n", " }\n", "\n", - "\n", "actual = backtest_df[\"actual_price\"].to_numpy().astype(np.float64)\n", "metrics = {\n", - " \"Naive\": compute_metrics(\n", - " actual, backtest_df[\"input_price\"].to_numpy().astype(np.float64)\n", - " ),\n", - " \"Index\": compute_metrics(\n", - " actual, backtest_df[\"predicted\"].to_numpy().astype(np.float64)\n", - " ),\n", + " \"Naive\": compute_metrics(actual, backtest_df[\"input_price\"].to_numpy().astype(np.float64)),\n", + " \"Index\": compute_metrics(actual, backtest_df[\"predicted\"].to_numpy().astype(np.float64)),\n", "}\n", "\n", - "metrics_table = pl.DataFrame(\n", - " [\n", - " {\"Metric\": k, **{stage: v[k] for stage, v in metrics.items()}}\n", - " for k in list(metrics[\"Naive\"].keys())\n", - " ]\n", - ")\n", + "metrics_table = pl.DataFrame([\n", + " {\"Metric\": k, **{stage: v[k] for stage, v in metrics.items()}}\n", + " for k in list(metrics[\"Naive\"].keys())\n", + "])\n", "metrics_table" ] }, @@ -98,7 +91,8 @@ "source": [ "# National index (average across all sectors weighted by n_pairs)\n", "national = (\n", - " index_df.group_by(\"year\")\n", + " index_df\n", + " .group_by(\"year\")\n", " .agg(\n", " (pl.col(\"log_index\") * pl.col(\"n_pairs\")).sum() / pl.col(\"n_pairs\").sum(),\n", " )\n", @@ -113,23 +107,14 @@ "\n", "# If not enough, pick some with high/low n_pairs\n", "if len(sample_sectors) < 3:\n", - " sector_counts = (\n", - " index_df.group_by(\"sector\")\n", - " .agg(pl.col(\"n_pairs\").first())\n", - " .sort(\"n_pairs\", descending=True)\n", - " )\n", + " sector_counts = index_df.group_by(\"sector\").agg(pl.col(\"n_pairs\").first()).sort(\"n_pairs\", descending=True)\n", " top = sector_counts.head(2)[\"sector\"].to_list()\n", " bottom = sector_counts.filter(pl.col(\"n_pairs\") > 0).tail(2)[\"sector\"].to_list()\n", " sample_sectors = list(set(sample_sectors + top + bottom))[:5]\n", "\n", "samples = index_df.filter(pl.col(\"sector\").is_in(sample_sectors))\n", "\n", - "combined = pl.concat(\n", - " [\n", - " national.select(\"sector\", \"year\", \"log_index\"),\n", - " samples.select(\"sector\", \"year\", \"log_index\"),\n", - " ]\n", - ")\n", + "combined = pl.concat([national.select(\"sector\", \"year\", \"log_index\"), samples.select(\"sector\", \"year\", \"log_index\")])\n", "\n", "# Normalize: index = 100 at base year (earliest available)\n", "combined = combined.with_columns(\n", @@ -137,10 +122,7 @@ ")\n", "\n", "fig = px.line(\n", - " combined.to_pandas(),\n", - " x=\"year\",\n", - " y=\"index_100\",\n", - " color=\"sector\",\n", + " combined.to_pandas(), x=\"year\", y=\"index_100\", color=\"sector\",\n", " title=\"Repeat-Sales Price Index (base year = 100)\",\n", " labels={\"index_100\": \"Index (base=100)\", \"year\": \"Year\"},\n", ")\n", @@ -173,10 +155,8 @@ "\n", "fig.update_layout(\n", " title=\"Absolute Percentage Error Distribution\",\n", - " xaxis_title=\"APE (%)\",\n", - " yaxis_title=\"Count\",\n", - " barmode=\"overlay\",\n", - " height=500,\n", + " xaxis_title=\"APE (%)\", yaxis_title=\"Count\",\n", + " barmode=\"overlay\", height=500,\n", ")\n", "fig.show()" ] @@ -203,27 +183,17 @@ "pred = sample[\"predicted\"].to_numpy().astype(np.float64)\n", "\n", "fig = go.Figure()\n", - "fig.add_trace(\n", - " go.Scattergl(\n", - " x=actual_sample,\n", - " y=pred,\n", - " mode=\"markers\",\n", - " marker=dict(size=2, opacity=0.3),\n", - " name=\"Index\",\n", - " )\n", - ")\n", + "fig.add_trace(go.Scattergl(\n", + " x=actual_sample, y=pred, mode=\"markers\",\n", + " marker=dict(size=2, opacity=0.3), name=\"Index\",\n", + "))\n", "# 45-degree reference line\n", "min_val = max(10_000, min(actual_sample.min(), np.nanmin(pred)))\n", "max_val = min(5_000_000, max(actual_sample.max(), np.nanmax(pred)))\n", - "fig.add_trace(\n", - " go.Scatter(\n", - " x=[min_val, max_val],\n", - " y=[min_val, max_val],\n", - " mode=\"lines\",\n", - " line=dict(color=\"red\", dash=\"dash\"),\n", - " showlegend=False,\n", - " )\n", - ")\n", + "fig.add_trace(go.Scatter(\n", + " x=[min_val, max_val], y=[min_val, max_val],\n", + " mode=\"lines\", line=dict(color=\"red\", dash=\"dash\"), showlegend=False,\n", + "))\n", "fig.update_xaxes(type=\"log\", title_text=\"Actual (\\u00a3)\")\n", "fig.update_yaxes(type=\"log\", title_text=\"Predicted (\\u00a3)\")\n", "fig.update_layout(title=\"Predicted vs Actual Price (log scale, 10K sample)\", height=500)\n", @@ -264,22 +234,12 @@ " for name, arr in [(\"Naive\", naive), (\"Index\", pred)]:\n", " ape = np.abs(arr[mask] - actual[mask]) / actual[mask]\n", " valid = np.isfinite(ape)\n", - " rows.append(\n", - " {\n", - " \"Price Band\": label,\n", - " \"Method\": name,\n", - " \"MdAPE (%)\": float(np.median(ape[valid]) * 100),\n", - " }\n", - " )\n", + " rows.append({\"Price Band\": label, \"Method\": name, \"MdAPE (%)\": float(np.median(ape[valid]) * 100)})\n", "\n", "band_df = pl.DataFrame(rows)\n", "fig = px.bar(\n", - " band_df.to_pandas(),\n", - " x=\"Price Band\",\n", - " y=\"MdAPE (%)\",\n", - " color=\"Method\",\n", - " barmode=\"group\",\n", - " title=\"MdAPE by Price Band\",\n", + " band_df.to_pandas(), x=\"Price Band\", y=\"MdAPE (%)\", color=\"Method\",\n", + " barmode=\"group\", title=\"MdAPE by Price Band\",\n", " category_orders={\"Price Band\": [b[2] for b in bands]},\n", ")\n", "fig.update_layout(height=450)\n", @@ -304,9 +264,7 @@ ")\n", "\n", "# Top 20 areas by volume\n", - "top_areas = (\n", - " bt.group_by(\"area\").len().sort(\"len\", descending=True).head(20)[\"area\"].to_list()\n", - ")\n", + "top_areas = bt.group_by(\"area\").len().sort(\"len\", descending=True).head(20)[\"area\"].to_list()\n", "\n", "actual_np = bt[\"actual_price\"].to_numpy().astype(np.float64)\n", "pred_np = bt[\"predicted\"].to_numpy().astype(np.float64)\n", @@ -321,18 +279,12 @@ " p = arr[mask]\n", " valid = np.isfinite(p) & (a > 0)\n", " ape = np.abs(p[valid] - a[valid]) / a[valid]\n", - " rows.append(\n", - " {\"Area\": area, \"Method\": name, \"MdAPE (%)\": float(np.median(ape) * 100)}\n", - " )\n", + " rows.append({\"Area\": area, \"Method\": name, \"MdAPE (%)\": float(np.median(ape) * 100)})\n", "\n", "area_df = pl.DataFrame(rows)\n", "fig = px.bar(\n", - " area_df.to_pandas(),\n", - " x=\"Area\",\n", - " y=\"MdAPE (%)\",\n", - " color=\"Method\",\n", - " barmode=\"group\",\n", - " title=\"MdAPE by Postcode Area (Top 20 by Volume)\",\n", + " area_df.to_pandas(), x=\"Area\", y=\"MdAPE (%)\", color=\"Method\",\n", + " barmode=\"group\", title=\"MdAPE by Postcode Area (Top 20 by Volume)\",\n", " category_orders={\"Area\": top_areas},\n", ")\n", "fig.update_layout(height=500)\n", @@ -372,20 +324,11 @@ " p = arr[mask]\n", " valid = np.isfinite(p) & (a > 0)\n", " ape = np.abs(p[valid] - a[valid]) / a[valid]\n", - " rows.append(\n", - " {\n", - " \"Gap (years)\": gap,\n", - " \"Method\": name,\n", - " \"MdAPE (%)\": float(np.median(ape) * 100),\n", - " }\n", - " )\n", + " rows.append({\"Gap (years)\": gap, \"Method\": name, \"MdAPE (%)\": float(np.median(ape) * 100)})\n", "\n", "gap_df = pl.DataFrame(rows)\n", "fig = px.line(\n", - " gap_df.to_pandas(),\n", - " x=\"Gap (years)\",\n", - " y=\"MdAPE (%)\",\n", - " color=\"Method\",\n", + " gap_df.to_pandas(), x=\"Gap (years)\", y=\"MdAPE (%)\", color=\"Method\",\n", " title=\"MdAPE by Holding Period (years between input and actual sale)\",\n", " markers=True,\n", ")\n", diff --git a/analyses/rightmove_buy.ipynb b/analyses/rightmove_buy.ipynb index 97a0839..cf806cf 100644 --- a/analyses/rightmove_buy.ipynb +++ b/analyses/rightmove_buy.ipynb @@ -52,9 +52,7 @@ "pl.Config.set_tbl_rows(20)\n", "pl.Config.set_fmt_str_lengths(80)\n", "\n", - "df = pl.read_parquet(\n", - " \"/volumes/syncthing/Projects/property-map/property-data/rightmove_buy.parquet\"\n", - ")\n", + "df = pl.read_parquet(\"/volumes/syncthing/Projects/property-map/property-data/rightmove_buy.parquet\")\n", "schema = df.schema\n", "print(f\"Total rows: {len(df):,}\")\n", "print(f\"Columns ({len(schema)}):\")\n", @@ -152,13 +150,11 @@ ], "source": [ "# Null counts\n", - "null_df = pl.DataFrame(\n", - " {\n", - " \"column\": df.columns,\n", - " \"nulls\": [df[c].null_count() for c in df.columns],\n", - " \"pct\": [f\"{df[c].null_count() / len(df) * 100:.1f}%\" for c in df.columns],\n", - " }\n", - ")\n", + "null_df = pl.DataFrame({\n", + " \"column\": df.columns,\n", + " \"nulls\": [df[c].null_count() for c in df.columns],\n", + " \"pct\": [f\"{df[c].null_count()/len(df)*100:.1f}%\" for c in df.columns],\n", + "})\n", "null_df.filter(pl.col(\"nulls\") > 0)" ] }, @@ -201,17 +197,13 @@ " \"price = 0\": len(df.filter(pl.col(\"price\") == 0)),\n", " \"price > 50M\": len(df.filter(pl.col(\"price\") > 50_000_000)),\n", " \"floorspace > 10,000 sqm\": len(df.filter(pl.col(\"floorspace_sqm\") > 10_000)),\n", - " \"latitude outside UK (< 49 or > 61)\": len(\n", - " df.filter((pl.col(\"latitude\") < 49) | (pl.col(\"latitude\") > 61))\n", - " ),\n", - " \"longitude outside UK (< -8 or > 2)\": len(\n", - " df.filter((pl.col(\"longitude\") < -8) | (pl.col(\"longitude\") > 2))\n", - " ),\n", + " \"latitude outside UK (< 49 or > 61)\": len(df.filter((pl.col(\"latitude\") < 49) | (pl.col(\"latitude\") > 61))),\n", + " \"longitude outside UK (< -8 or > 2)\": len(df.filter((pl.col(\"longitude\") < -8) | (pl.col(\"longitude\") > 2))),\n", " \"house_share = true\": len(df.filter(pl.col(\"house_share\"))),\n", "}\n", "print(\"Data quality issues:\")\n", "for desc, count in issues.items():\n", - " print(f\" {desc}: {count:,} ({count / len(df) * 100:.2f}%)\")" + " print(f\" {desc}: {count:,} ({count/len(df)*100:.2f}%)\")" ] }, { @@ -238,7 +230,7 @@ " & (pl.col(\"longitude\") >= -8)\n", " & (pl.col(\"longitude\") <= 2)\n", ")\n", - "print(f\"Clean rows: {len(clean):,} ({len(clean) / len(df) * 100:.1f}% of original)\")" + "print(f\"Clean rows: {len(clean):,} ({len(clean)/len(df)*100:.1f}% of original)\")" ] }, { @@ -1134,12 +1126,8 @@ "# Price histogram (clipped to 2nd-98th percentile)\n", "lo, hi = price.quantile(0.02), price.quantile(0.98)\n", "clipped = clean.filter((pl.col(\"price\") >= lo) & (pl.col(\"price\") <= hi))\n", - "fig = px.histogram(\n", - " clipped.to_pandas(),\n", - " x=\"price\",\n", - " nbins=80,\n", - " title=f\"Asking Price Distribution (£{lo:,.0f} - £{hi:,.0f}, 2nd-98th pctl)\",\n", - ")\n", + "fig = px.histogram(clipped.to_pandas(), x=\"price\", nbins=80,\n", + " title=f\"Asking Price Distribution (£{lo:,.0f} - £{hi:,.0f}, 2nd-98th pctl)\")\n", "fig.update_layout(height=400, xaxis_title=\"Asking Price (£)\", yaxis_title=\"Count\")\n", "fig.show()" ] @@ -439990,13 +439978,9 @@ ], "source": [ "# Price by property type\n", - "fig = px.box(\n", - " clean.filter(pl.col(\"price\") <= 2_000_000).to_pandas(),\n", - " x=\"property_type\",\n", - " y=\"price\",\n", - " color=\"property_type\",\n", - " title=\"Price by Property Type (capped at £2M for readability)\",\n", - ")\n", + "fig = px.box(clean.filter(pl.col(\"price\") <= 2_000_000).to_pandas(),\n", + " x=\"property_type\", y=\"price\", color=\"property_type\",\n", + " title=\"Price by Property Type (capped at £2M for readability)\")\n", "fig.update_layout(height=500, showlegend=False, yaxis_title=\"Price (£)\")\n", "fig.show()" ] @@ -440095,7 +440079,9 @@ "source": [ "# Price qualifier breakdown\n", "pq = clean[\"price_qualifier\"].value_counts().sort(\"count\", descending=True)\n", - "pq = pq.with_columns((pl.col(\"count\") / pl.col(\"count\").sum() * 100).alias(\"pct\"))\n", + "pq = pq.with_columns(\n", + " (pl.col(\"count\") / pl.col(\"count\").sum() * 100).alias(\"pct\")\n", + ")\n", "pq" ] }, @@ -440942,12 +440928,8 @@ "source": [ "# Property type distribution\n", "type_counts = clean[\"property_type\"].value_counts().sort(\"count\", descending=True)\n", - "fig = px.pie(\n", - " type_counts.to_pandas(),\n", - " names=\"property_type\",\n", - " values=\"count\",\n", - " title=\"Property Type Distribution\",\n", - ")\n", + "fig = px.pie(type_counts.to_pandas(), names=\"property_type\", values=\"count\",\n", + " title=\"Property Type Distribution\")\n", "fig.update_layout(height=400)\n", "fig.show()" ] @@ -441823,16 +441805,9 @@ ], "source": [ "# Top 20 sub-types\n", - "sub_counts = (\n", - " clean[\"property_sub_type\"].value_counts().sort(\"count\", descending=True).head(20)\n", - ")\n", - "fig = px.bar(\n", - " sub_counts.to_pandas(),\n", - " x=\"count\",\n", - " y=\"property_sub_type\",\n", - " orientation=\"h\",\n", - " title=\"Top 20 Property Sub-types\",\n", - ")\n", + "sub_counts = clean[\"property_sub_type\"].value_counts().sort(\"count\", descending=True).head(20)\n", + "fig = px.bar(sub_counts.to_pandas(), x=\"count\", y=\"property_sub_type\", orientation=\"h\",\n", + " title=\"Top 20 Property Sub-types\")\n", "fig.update_layout(height=600, yaxis={\"categoryorder\": \"total ascending\"})\n", "fig.show()" ] @@ -442668,15 +442643,9 @@ ], "source": [ "# Tenure split\n", - "tenure_counts = (\n", - " clean[\"tenure\"].drop_nulls().value_counts().sort(\"count\", descending=True)\n", - ")\n", - "fig = px.pie(\n", - " tenure_counts.to_pandas(),\n", - " names=\"tenure\",\n", - " values=\"count\",\n", - " title=f\"Tenure Split ({clean['tenure'].null_count():,} unknown / {clean['tenure'].null_count() / len(clean) * 100:.1f}% missing)\",\n", - ")\n", + "tenure_counts = clean[\"tenure\"].drop_nulls().value_counts().sort(\"count\", descending=True)\n", + "fig = px.pie(tenure_counts.to_pandas(), names=\"tenure\", values=\"count\",\n", + " title=f\"Tenure Split ({clean['tenure'].null_count():,} unknown / {clean['tenure'].null_count()/len(clean)*100:.1f}% missing)\")\n", "fig.update_layout(height=400)\n", "fig.show()" ] @@ -443577,14 +443546,8 @@ " .agg(pl.len().alias(\"count\"))\n", " .sort(\"property_type\")\n", ")\n", - "fig = px.bar(\n", - " tenure_by_type.to_pandas(),\n", - " x=\"property_type\",\n", - " y=\"count\",\n", - " color=\"tenure\",\n", - " barmode=\"group\",\n", - " title=\"Tenure by Property Type\",\n", - ")\n", + "fig = px.bar(tenure_by_type.to_pandas(), x=\"property_type\", y=\"count\", color=\"tenure\",\n", + " barmode=\"group\", title=\"Tenure by Property Type\")\n", "fig.update_layout(height=400)\n", "fig.show()" ] @@ -444449,12 +444412,9 @@ ], "source": [ "# Bedroom distribution\n", - "bed_counts = (\n", - " clean.filter(pl.col(\"bedrooms\") <= 10)[\"bedrooms\"].value_counts().sort(\"bedrooms\")\n", - ")\n", - "fig = px.bar(\n", - " bed_counts.to_pandas(), x=\"bedrooms\", y=\"count\", title=\"Bedroom Count Distribution\"\n", - ")\n", + "bed_counts = clean.filter(pl.col(\"bedrooms\") <= 10)[\"bedrooms\"].value_counts().sort(\"bedrooms\")\n", + "fig = px.bar(bed_counts.to_pandas(), x=\"bedrooms\", y=\"count\",\n", + " title=\"Bedroom Count Distribution\")\n", "fig.update_layout(height=400)\n", "fig.show()" ] @@ -445319,25 +445279,16 @@ ")\n", "\n", "fig = go.Figure()\n", - "fig.add_trace(\n", - " go.Bar(\n", - " x=price_by_beds[\"bedrooms\"],\n", - " y=price_by_beds[\"median_price\"],\n", - " name=\"Median\",\n", - " error_y=dict(\n", - " type=\"data\",\n", - " symmetric=False,\n", - " array=(price_by_beds[\"p75\"] - price_by_beds[\"median_price\"]).to_list(),\n", - " arrayminus=(price_by_beds[\"median_price\"] - price_by_beds[\"p25\"]).to_list(),\n", - " ),\n", + "fig.add_trace(go.Bar(\n", + " x=price_by_beds[\"bedrooms\"], y=price_by_beds[\"median_price\"],\n", + " name=\"Median\", error_y=dict(type=\"data\",\n", + " symmetric=False,\n", + " array=(price_by_beds[\"p75\"] - price_by_beds[\"median_price\"]).to_list(),\n", + " arrayminus=(price_by_beds[\"median_price\"] - price_by_beds[\"p25\"]).to_list()\n", " )\n", - ")\n", - "fig.update_layout(\n", - " title=\"Median Price by Bedrooms (with IQR)\",\n", - " height=400,\n", - " xaxis_title=\"Bedrooms\",\n", - " yaxis_title=\"Price (£)\",\n", - ")\n", + "))\n", + "fig.update_layout(title=\"Median Price by Bedrooms (with IQR)\", height=400,\n", + " xaxis_title=\"Bedrooms\", yaxis_title=\"Price (£)\")\n", "fig.show()" ] }, @@ -446312,14 +446263,8 @@ " .agg(pl.len().alias(\"count\"))\n", " .sort(\"property_type\", \"bedrooms\")\n", ")\n", - "fig = px.bar(\n", - " beds_by_type.to_pandas(),\n", - " x=\"bedrooms\",\n", - " y=\"count\",\n", - " color=\"property_type\",\n", - " barmode=\"group\",\n", - " title=\"Bedroom Distribution by Property Type\",\n", - ")\n", + "fig = px.bar(beds_by_type.to_pandas(), x=\"bedrooms\", y=\"count\", color=\"property_type\",\n", + " barmode=\"group\", title=\"Bedroom Distribution by Property Type\")\n", "fig.update_layout(height=450)\n", "fig.show()" ] @@ -446378,26 +446323,19 @@ ], "source": [ "# Floorspace availability by property type\n", - "has_floor = clean.with_columns(\n", - " pl.col(\"floorspace_sqm\").is_not_null().alias(\"has_floorspace\")\n", - ")\n", - "floor_by_type = has_floor.group_by(\"property_type\", \"has_floorspace\").agg(\n", - " pl.len().alias(\"count\")\n", - ")\n", - "totals = floor_by_type.group_by(\"property_type\").agg(\n", - " pl.col(\"count\").sum().alias(\"total\")\n", + "has_floor = clean.with_columns(pl.col(\"floorspace_sqm\").is_not_null().alias(\"has_floorspace\"))\n", + "floor_by_type = (\n", + " has_floor.group_by(\"property_type\", \"has_floorspace\")\n", + " .agg(pl.len().alias(\"count\"))\n", ")\n", + "totals = floor_by_type.group_by(\"property_type\").agg(pl.col(\"count\").sum().alias(\"total\"))\n", "floor_pct = (\n", " floor_by_type.filter(pl.col(\"has_floorspace\"))\n", " .join(totals, on=\"property_type\")\n", - " .with_columns(\n", - " (pl.col(\"count\") / pl.col(\"total\") * 100).alias(\"pct_with_floorspace\")\n", - " )\n", + " .with_columns((pl.col(\"count\") / pl.col(\"total\") * 100).alias(\"pct_with_floorspace\"))\n", " .sort(\"pct_with_floorspace\", descending=True)\n", ")\n", - "print(\n", - " f\"Overall floorspace availability: {clean['floorspace_sqm'].drop_nulls().len():,} / {len(clean):,} ({clean['floorspace_sqm'].drop_nulls().len() / len(clean) * 100:.1f}%)\"\n", - ")\n", + "print(f\"Overall floorspace availability: {clean['floorspace_sqm'].drop_nulls().len():,} / {len(clean):,} ({clean['floorspace_sqm'].drop_nulls().len()/len(clean)*100:.1f}%)\")\n", "floor_pct.select(\"property_type\", \"count\", \"total\", \"pct_with_floorspace\")" ] }, @@ -447360,13 +447298,8 @@ ")\n", "print(f\"Properties with reasonable floorspace (10-1000 sqm): {len(with_floor):,}\")\n", "\n", - "fig = px.histogram(\n", - " with_floor.to_pandas(),\n", - " x=\"floorspace_sqm\",\n", - " nbins=80,\n", - " color=\"property_type\",\n", - " title=\"Floorspace Distribution by Property Type\",\n", - ")\n", + "fig = px.histogram(with_floor.to_pandas(), x=\"floorspace_sqm\", nbins=80, color=\"property_type\",\n", + " title=\"Floorspace Distribution by Property Type\")\n", "fig.update_layout(height=450, xaxis_title=\"Floorspace (sqm)\", barmode=\"overlay\")\n", "fig.update_traces(opacity=0.6)\n", "fig.show()" @@ -448243,12 +448176,8 @@ "print(f\" P25: £{s.quantile(0.25):,.0f}/sqm\")\n", "print(f\" P75: £{s.quantile(0.75):,.0f}/sqm\")\n", "\n", - "fig = px.histogram(\n", - " ppsqm.to_pandas(),\n", - " x=\"price_per_sqm\",\n", - " nbins=80,\n", - " title=\"Price per Square Metre Distribution\",\n", - ")\n", + "fig = px.histogram(ppsqm.to_pandas(), x=\"price_per_sqm\", nbins=80,\n", + " title=\"Price per Square Metre Distribution\")\n", "fig.update_layout(height=400, xaxis_title=\"Price per sqm (£)\")\n", "fig.show()" ] @@ -584977,13 +584906,8 @@ } ], "source": [ - "fig = px.box(\n", - " ppsqm.to_pandas(),\n", - " x=\"property_type\",\n", - " y=\"price_per_sqm\",\n", - " color=\"property_type\",\n", - " title=\"Price per sqm by Property Type\",\n", - ")\n", + "fig = px.box(ppsqm.to_pandas(), x=\"property_type\", y=\"price_per_sqm\", color=\"property_type\",\n", + " title=\"Price per sqm by Property Type\")\n", "fig.update_layout(height=450, showlegend=False, yaxis_title=\"£ per sqm\")\n", "fig.show()" ] @@ -585941,15 +585865,9 @@ ")\n", "\n", "top30 = outcode_stats.head(30)\n", - "fig = px.bar(\n", - " top30.to_pandas(),\n", - " x=\"count\",\n", - " y=\"outcode\",\n", - " orientation=\"h\",\n", - " color=\"median_price\",\n", - " color_continuous_scale=\"Viridis\",\n", - " title=\"Top 30 Outcodes by Listing Volume\",\n", - ")\n", + "fig = px.bar(top30.to_pandas(), x=\"count\", y=\"outcode\", orientation=\"h\",\n", + " color=\"median_price\", color_continuous_scale=\"Viridis\",\n", + " title=\"Top 30 Outcodes by Listing Volume\")\n", "fig.update_layout(height=700, yaxis={\"categoryorder\": \"total ascending\"})\n", "fig.show()" ] @@ -587482,25 +587400,11 @@ ], "source": [ "# Most expensive outcodes (min 50 listings)\n", - "expensive = (\n", - " outcode_stats.filter(pl.col(\"count\") >= 50)\n", - " .sort(\"median_price\", descending=True)\n", - " .head(30)\n", - ")\n", - "fig = px.bar(\n", - " expensive.to_pandas(),\n", - " x=\"median_price\",\n", - " y=\"outcode\",\n", - " orientation=\"h\",\n", - " color=\"count\",\n", - " color_continuous_scale=\"Blues\",\n", - " title=\"Top 30 Most Expensive Outcodes (min 50 listings, by median price)\",\n", - ")\n", - "fig.update_layout(\n", - " height=700,\n", - " yaxis={\"categoryorder\": \"total ascending\"},\n", - " xaxis_title=\"Median Price (£)\",\n", - ")\n", + "expensive = outcode_stats.filter(pl.col(\"count\") >= 50).sort(\"median_price\", descending=True).head(30)\n", + "fig = px.bar(expensive.to_pandas(), x=\"median_price\", y=\"outcode\", orientation=\"h\",\n", + " color=\"count\", color_continuous_scale=\"Blues\",\n", + " title=\"Top 30 Most Expensive Outcodes (min 50 listings, by median price)\")\n", + "fig.update_layout(height=700, yaxis={\"categoryorder\": \"total ascending\"}, xaxis_title=\"Median Price (£)\")\n", "fig.show()" ] }, @@ -589010,20 +588914,10 @@ "source": [ "# Cheapest outcodes (min 50 listings)\n", "cheapest = outcode_stats.filter(pl.col(\"count\") >= 50).sort(\"median_price\").head(30)\n", - "fig = px.bar(\n", - " cheapest.to_pandas(),\n", - " x=\"median_price\",\n", - " y=\"outcode\",\n", - " orientation=\"h\",\n", - " color=\"count\",\n", - " color_continuous_scale=\"Blues\",\n", - " title=\"Top 30 Cheapest Outcodes (min 50 listings, by median price)\",\n", - ")\n", - "fig.update_layout(\n", - " height=700,\n", - " yaxis={\"categoryorder\": \"total descending\"},\n", - " xaxis_title=\"Median Price (£)\",\n", - ")\n", + "fig = px.bar(cheapest.to_pandas(), x=\"median_price\", y=\"outcode\", orientation=\"h\",\n", + " color=\"count\", color_continuous_scale=\"Blues\",\n", + " title=\"Top 30 Cheapest Outcodes (min 50 listings, by median price)\")\n", + "fig.update_layout(height=700, yaxis={\"categoryorder\": \"total descending\"}, xaxis_title=\"Median Price (£)\")\n", "fig.show()" ] }, @@ -589934,19 +589828,14 @@ "source": [ "# Geographic scatter of listings (sample for performance)\n", "sample = clean.sample(n=min(20_000, len(clean)), seed=42)\n", - "fig = px.scatter_map(\n", - " sample.to_pandas(),\n", - " lat=\"latitude\",\n", - " lon=\"longitude\",\n", - " color=\"price\",\n", - " size_max=4,\n", - " color_continuous_scale=\"Viridis\",\n", - " range_color=[100_000, 1_500_000],\n", - " zoom=5,\n", - " center={\"lat\": 52.5, \"lon\": -1.5},\n", - " title=\"Listing Locations (20k sample, colored by price)\",\n", - " opacity=0.4,\n", - ")\n", + "fig = px.scatter_map(sample.to_pandas(),\n", + " lat=\"latitude\", lon=\"longitude\",\n", + " color=\"price\", size_max=4,\n", + " color_continuous_scale=\"Viridis\",\n", + " range_color=[100_000, 1_500_000],\n", + " zoom=5, center={\"lat\": 52.5, \"lon\": -1.5},\n", + " title=\"Listing Locations (20k sample, colored by price)\",\n", + " opacity=0.4)\n", "fig.update_layout(height=700)\n", "fig.show()" ] @@ -589975,9 +589864,7 @@ "source": [ "# Parse dates and look at listing age\n", "with_dates = clean.with_columns(\n", - " pl.col(\"first_visible_date\")\n", - " .str.to_datetime(\"%Y-%m-%dT%H:%M:%SZ\")\n", - " .alias(\"listed_at\"),\n", + " pl.col(\"first_visible_date\").str.to_datetime(\"%Y-%m-%dT%H:%M:%SZ\").alias(\"listed_at\"),\n", ")\n", "\n", "print(f\"Date range: {with_dates['listed_at'].min()} to {with_dates['listed_at'].max()}\")" @@ -590969,9 +590856,8 @@ " .sort(\"month\")\n", ")\n", "\n", - "fig = px.bar(\n", - " monthly.to_pandas(), x=\"month\", y=\"count\", title=\"Listings by Month Listed\"\n", - ")\n", + "fig = px.bar(monthly.to_pandas(), x=\"month\", y=\"count\",\n", + " title=\"Listings by Month Listed\")\n", "fig.update_layout(height=400, xaxis_title=\"Month\", yaxis_title=\"Listings\")\n", "fig.show()" ] @@ -590998,7 +590884,6 @@ "source": [ "# How old are current listings? (days since first visible)\n", "import datetime\n", - "\n", "now = datetime.datetime(2026, 2, 14)\n", "with_age = with_dates.with_columns(\n", " ((pl.lit(now) - pl.col(\"listed_at\")).dt.total_days()).alias(\"days_on_market\")\n", @@ -591011,7 +590896,7 @@ "print(f\" P25: {age.quantile(0.25):.0f} days\")\n", "print(f\" P75: {age.quantile(0.75):.0f} days\")\n", "print(f\" P95: {age.quantile(0.95):.0f} days\")\n", - "print(f\" Max: {age.max():.0f} days ({age.max() / 365:.1f} years)\")" + "print(f\" Max: {age.max():.0f} days ({age.max()/365:.1f} years)\")" ] }, { @@ -591864,12 +591749,8 @@ "source": [ "# Days on market distribution (cap at 2 years for readability)\n", "capped = with_age.filter(pl.col(\"days_on_market\") <= 730)\n", - "fig = px.histogram(\n", - " capped.to_pandas(),\n", - " x=\"days_on_market\",\n", - " nbins=100,\n", - " title=\"Days on Market Distribution (capped at 2 years)\",\n", - ")\n", + "fig = px.histogram(capped.to_pandas(), x=\"days_on_market\", nbins=100,\n", + " title=\"Days on Market Distribution (capped at 2 years)\")\n", "fig.update_layout(height=400, xaxis_title=\"Days on Market\", yaxis_title=\"Count\")\n", "fig.show()" ] @@ -592002,13 +591883,11 @@ "# Explode features list and count most common\n", "features_exploded = clean.select(\"features\").explode(\"features\").drop_nulls()\n", "print(f\"Total feature entries: {len(features_exploded):,}\")\n", - "print(f\"Features per listing: {len(features_exploded) / len(clean):.1f} avg\")\n", + "print(f\"Features per listing: {len(features_exploded)/len(clean):.1f} avg\")\n", "\n", "# Most common features (lowercased for grouping)\n", "feature_counts = (\n", - " features_exploded.with_columns(\n", - " pl.col(\"features\").str.to_lowercase().str.strip_chars().alias(\"feature_lower\")\n", - " )\n", + " features_exploded.with_columns(pl.col(\"features\").str.to_lowercase().str.strip_chars().alias(\"feature_lower\"))\n", " .group_by(\"feature_lower\")\n", " .agg(pl.len().alias(\"count\"))\n", " .sort(\"count\", descending=True)\n", @@ -592915,64 +592794,16 @@ "all_features = features_exploded[\"features\"].to_list()\n", "word_counter = Counter()\n", "for feat in all_features:\n", - " words = re.findall(r\"[a-z]+\", feat.lower())\n", + " words = re.findall(r'[a-z]+', feat.lower())\n", " word_counter.update(words)\n", "\n", "# Filter out very short/common words\n", - "stop_words = {\n", - " \"the\",\n", - " \"a\",\n", - " \"an\",\n", - " \"and\",\n", - " \"or\",\n", - " \"of\",\n", - " \"to\",\n", - " \"in\",\n", - " \"with\",\n", - " \"for\",\n", - " \"on\",\n", - " \"at\",\n", - " \"by\",\n", - " \"is\",\n", - " \"it\",\n", - " \"from\",\n", - " \"as\",\n", - " \"be\",\n", - " \"this\",\n", - " \"that\",\n", - " \"are\",\n", - " \"was\",\n", - " \"has\",\n", - " \"have\",\n", - " \"not\",\n", - " \"but\",\n", - " \"all\",\n", - " \"can\",\n", - " \"had\",\n", - " \"her\",\n", - " \"his\",\n", - " \"one\",\n", - " \"our\",\n", - " \"out\",\n", - " \"you\",\n", - " \"will\",\n", - "}\n", - "keywords = [\n", - " (w, c)\n", - " for w, c in word_counter.most_common(100)\n", - " if w not in stop_words and len(w) > 2\n", - "]\n", - "kw_df = pl.DataFrame(\n", - " {\"word\": [w for w, c in keywords[:40]], \"count\": [c for w, c in keywords[:40]]}\n", - ")\n", + "stop_words = {'the', 'a', 'an', 'and', 'or', 'of', 'to', 'in', 'with', 'for', 'on', 'at', 'by', 'is', 'it', 'from', 'as', 'be', 'this', 'that', 'are', 'was', 'has', 'have', 'not', 'but', 'all', 'can', 'had', 'her', 'his', 'one', 'our', 'out', 'you', 'will'}\n", + "keywords = [(w, c) for w, c in word_counter.most_common(100) if w not in stop_words and len(w) > 2]\n", + "kw_df = pl.DataFrame({\"word\": [w for w,c in keywords[:40]], \"count\": [c for w,c in keywords[:40]]})\n", "\n", - "fig = px.bar(\n", - " kw_df.to_pandas(),\n", - " x=\"count\",\n", - " y=\"word\",\n", - " orientation=\"h\",\n", - " title=\"Most Common Words in Feature Descriptions\",\n", - ")\n", + "fig = px.bar(kw_df.to_pandas(), x=\"count\", y=\"word\", orientation=\"h\",\n", + " title=\"Most Common Words in Feature Descriptions\")\n", "fig.update_layout(height=800, yaxis={\"categoryorder\": \"total ascending\"})\n", "fig.show()" ] @@ -593936,14 +593767,9 @@ " & (pl.col(\"price\") < 3_000_000)\n", ").sample(n=min(15_000, len(with_floor)), seed=42)\n", "\n", - "fig = px.scatter(\n", - " scatter_df.to_pandas(),\n", - " x=\"floorspace_sqm\",\n", - " y=\"price\",\n", - " color=\"property_type\",\n", - " opacity=0.3,\n", - " title=\"Price vs Floorspace (sample, capped at £3M / 500sqm)\",\n", - ")\n", + "fig = px.scatter(scatter_df.to_pandas(), x=\"floorspace_sqm\", y=\"price\",\n", + " color=\"property_type\", opacity=0.3,\n", + " title=\"Price vs Floorspace (sample, capped at £3M / 500sqm)\")\n", "fig.update_layout(height=600, xaxis_title=\"Floorspace (sqm)\", yaxis_title=\"Price (£)\")\n", "fig.show()" ] @@ -594913,14 +594739,8 @@ " .agg(pl.col(\"price\").median().alias(\"median_price\"), pl.len().alias(\"count\"))\n", " .sort(\"property_type\", \"bedrooms\")\n", ")\n", - "fig = px.line(\n", - " bp.to_pandas(),\n", - " x=\"bedrooms\",\n", - " y=\"median_price\",\n", - " color=\"property_type\",\n", - " markers=True,\n", - " title=\"Median Price by Bedrooms and Property Type\",\n", - ")\n", + "fig = px.line(bp.to_pandas(), x=\"bedrooms\", y=\"median_price\", color=\"property_type\",\n", + " markers=True, title=\"Median Price by Bedrooms and Property Type\")\n", "fig.update_layout(height=450, xaxis_title=\"Bedrooms\", yaxis_title=\"Median Price (£)\")\n", "fig.show()" ] @@ -594969,28 +594789,18 @@ "print(f\"Total listings: {len(clean):,}\")\n", "print(f\"Outcodes covered: {clean['outcode'].n_unique():,}\")\n", "print(\"\")\n", - "print(\n", - " f\"Price: median £{clean['price'].median():,.0f}, mean £{clean['price'].mean():,.0f}\"\n", - ")\n", - "print(\n", - " f\"Bedrooms: median {clean['bedrooms'].median():.0f}, mean {clean['bedrooms'].mean():.1f}\"\n", - ")\n", + "print(f\"Price: median £{clean['price'].median():,.0f}, mean £{clean['price'].mean():,.0f}\")\n", + "print(f\"Bedrooms: median {clean['bedrooms'].median():.0f}, mean {clean['bedrooms'].mean():.1f}\")\n", "print(\"\")\n", - "print(\n", - " f\"Tenure known: {(len(clean) - clean['tenure'].null_count()) / len(clean) * 100:.1f}%\"\n", - ")\n", + "print(f\"Tenure known: {(len(clean) - clean['tenure'].null_count())/len(clean)*100:.1f}%\")\n", "print(f\" Freehold: {len(clean.filter(pl.col('tenure') == 'Freehold')):,}\")\n", "print(f\" Leasehold: {len(clean.filter(pl.col('tenure') == 'Leasehold')):,}\")\n", "print(\"\")\n", - "print(\n", - " f\"Floorspace available: {clean['floorspace_sqm'].drop_nulls().len() / len(clean) * 100:.1f}%\"\n", - ")\n", + "print(f\"Floorspace available: {clean['floorspace_sqm'].drop_nulls().len()/len(clean)*100:.1f}%\")\n", "print(\"\")\n", "print(\"Property types:\")\n", - "for row in (\n", - " clean[\"property_type\"].value_counts().sort(\"count\", descending=True).iter_rows()\n", - "):\n", - " print(f\" {row[0]}: {row[1]:,} ({row[1] / len(clean) * 100:.1f}%)\")" + "for row in clean['property_type'].value_counts().sort('count', descending=True).iter_rows():\n", + " print(f\" {row[0]}: {row[1]:,} ({row[1]/len(clean)*100:.1f}%)\")" ] } ], diff --git a/analyses/source_overlap.ipynb b/analyses/source_overlap.ipynb index 45621b9..399ab3a 100644 --- a/analyses/source_overlap.ipynb +++ b/analyses/source_overlap.ipynb @@ -52,7 +52,6 @@ "buy = pl.read_parquet(f\"{DATA}/online_listings_buy.parquet\")\n", "rent = pl.read_parquet(f\"{DATA}/online_listings_rent.parquet\")\n", "\n", - "\n", "def tag_source(df: pl.DataFrame) -> pl.DataFrame:\n", " return df.with_columns(\n", " pl.when(pl.col(\"Listing URL\").str.contains(\"rightmove\"))\n", @@ -63,7 +62,6 @@ " .alias(\"source\")\n", " )\n", "\n", - "\n", "buy = tag_source(buy)\n", "rent = tag_source(rent)\n", "\n", @@ -124,7 +122,7 @@ " print(f\"\\n=== {label} ===\")\n", " for row in counts.iter_rows():\n", " src, cnt = row\n", - " print(f\" {src}: {cnt:,} ({cnt / len(df) * 100:.1f}%)\")\n", + " print(f\" {src}: {cnt:,} ({cnt/len(df)*100:.1f}%)\")\n", "\n", "# Known dedup count from scraper logs\n", "CROSS_DEDUP_BUY = 2_220\n", @@ -134,7 +132,7 @@ "print(f\"Home.co.uk scraped (before dedup): {hk_buy_total:,}\")\n", "print(f\"Home.co.uk unique (after dedup): {hk_buy_unique:,}\")\n", "print(f\"Cross-source duplicates removed: {CROSS_DEDUP_BUY:,}\")\n", - "print(f\"Overlap rate: {CROSS_DEDUP_BUY / hk_buy_total * 100:.1f}%\")" + "print(f\"Overlap rate: {CROSS_DEDUP_BUY/hk_buy_total*100:.1f}%\")" ] }, { @@ -989,29 +987,23 @@ "# Venn-style summary\n", "rm_buy = len(buy.filter(pl.col(\"source\") == \"Rightmove\"))\n", "\n", - "fig = go.Figure(\n", - " go.Sankey(\n", - " node=dict(\n", - " label=[\n", - " f\"Rightmove\\n{rm_buy:,}\",\n", - " f\"Home.co.uk\\n{hk_buy_total:,} scraped\",\n", - " f\"Merged BUY\\n{len(buy):,}\",\n", - " f\"Deduped\\n{CROSS_DEDUP_BUY:,}\",\n", - " ],\n", - " color=[\"#2563eb\", \"#10b981\", \"#6366f1\", \"#ef4444\"],\n", - " ),\n", - " link=dict(\n", - " source=[0, 1, 1],\n", - " target=[2, 2, 3],\n", - " value=[rm_buy, hk_buy_unique, CROSS_DEDUP_BUY],\n", - " color=[\n", - " \"rgba(37,99,235,0.3)\",\n", - " \"rgba(16,185,129,0.3)\",\n", - " \"rgba(239,68,68,0.3)\",\n", - " ],\n", - " ),\n", - " )\n", - ")\n", + "fig = go.Figure(go.Sankey(\n", + " node=dict(\n", + " label=[\n", + " f\"Rightmove\\n{rm_buy:,}\",\n", + " f\"Home.co.uk\\n{hk_buy_total:,} scraped\",\n", + " f\"Merged BUY\\n{len(buy):,}\",\n", + " f\"Deduped\\n{CROSS_DEDUP_BUY:,}\",\n", + " ],\n", + " color=[\"#2563eb\", \"#10b981\", \"#6366f1\", \"#ef4444\"],\n", + " ),\n", + " link=dict(\n", + " source=[0, 1, 1],\n", + " target=[2, 2, 3],\n", + " value=[rm_buy, hk_buy_unique, CROSS_DEDUP_BUY],\n", + " color=[\"rgba(37,99,235,0.3)\", \"rgba(16,185,129,0.3)\", \"rgba(239,68,68,0.3)\"],\n", + " ),\n", + "))\n", "fig.update_layout(title=\"BUY Channel: Source Contribution Flow\", height=350)\n", "fig.show()" ] @@ -1114,11 +1106,8 @@ "oc_comparison = (\n", " hk_by_oc.join(rm_by_oc, on=\"outcode\", how=\"left\")\n", " .with_columns(\n", - " (\n", - " pl.col(\"hk_count\")\n", - " / (pl.col(\"hk_count\") + pl.col(\"rm_count\").fill_null(0))\n", - " * 100\n", - " ).alias(\"hk_pct_of_total\")\n", + " (pl.col(\"hk_count\") / (pl.col(\"hk_count\") + pl.col(\"rm_count\").fill_null(0)) * 100)\n", + " .alias(\"hk_pct_of_total\")\n", " )\n", " .sort(\"hk_count\", descending=True)\n", ")\n", @@ -2226,28 +2215,18 @@ "source": [ "# Bar chart: home.co.uk vs Rightmove counts per outcode\n", "fig = go.Figure()\n", - "fig.add_trace(\n", - " go.Bar(\n", - " x=oc_comparison[\"outcode\"],\n", - " y=oc_comparison[\"rm_count\"],\n", - " name=\"Rightmove\",\n", - " marker_color=\"#2563eb\",\n", - " )\n", - ")\n", - "fig.add_trace(\n", - " go.Bar(\n", - " x=oc_comparison[\"outcode\"],\n", - " y=oc_comparison[\"hk_count\"],\n", - " name=\"Home.co.uk\",\n", - " marker_color=\"#10b981\",\n", - " )\n", - ")\n", + "fig.add_trace(go.Bar(\n", + " x=oc_comparison[\"outcode\"], y=oc_comparison[\"rm_count\"],\n", + " name=\"Rightmove\", marker_color=\"#2563eb\",\n", + "))\n", + "fig.add_trace(go.Bar(\n", + " x=oc_comparison[\"outcode\"], y=oc_comparison[\"hk_count\"],\n", + " name=\"Home.co.uk\", marker_color=\"#10b981\",\n", + "))\n", "fig.update_layout(\n", - " barmode=\"group\",\n", - " height=400,\n", + " barmode=\"group\", height=400,\n", " title=\"Listings per Outcode: Rightmove vs Home.co.uk (outcodes with HK coverage)\",\n", - " xaxis_title=\"Outcode\",\n", - " yaxis_title=\"Listings\",\n", + " xaxis_title=\"Outcode\", yaxis_title=\"Listings\",\n", ")\n", "fig.show()" ] @@ -3142,14 +3121,10 @@ "sample = covered.sample(n=min(30_000, len(covered)), seed=42)\n", "\n", "fig = px.scatter_map(\n", - " sample.to_pandas(),\n", - " lat=\"lat\",\n", - " lon=\"lon\",\n", + " sample.to_pandas(), lat=\"lat\", lon=\"lon\",\n", " color=\"source\",\n", " color_discrete_map={\"Rightmove\": \"#2563eb\", \"Home.co.uk\": \"#10b981\"},\n", - " zoom=7,\n", - " opacity=0.4,\n", - " size_max=4,\n", + " zoom=7, opacity=0.4, size_max=4,\n", " title=\"Listing Locations in Covered Outcodes (by source)\",\n", ")\n", "fig.update_layout(height=600)\n", @@ -3213,41 +3188,15 @@ "# For covered outcodes, compare home.co.uk listings against Rightmove\n", "# to find near-matches (same postcode, same beds, price within 5%)\n", "\n", - "hk = (\n", - " buy_oc.filter(pl.col(\"source\") == \"Home.co.uk\")\n", - " .select(\n", - " \"Postcode\",\n", - " \"Bedrooms\",\n", - " \"Asking price\",\n", - " \"Property type\",\n", - " \"Address per Property Register\",\n", - " )\n", - " .rename(\n", - " {\n", - " \"Asking price\": \"hk_price\",\n", - " \"Property type\": \"hk_type\",\n", - " \"Address per Property Register\": \"hk_addr\",\n", - " }\n", - " )\n", - ")\n", + "hk = buy_oc.filter(pl.col(\"source\") == \"Home.co.uk\").select(\n", + " \"Postcode\", \"Bedrooms\", \"Asking price\", \"Property type\", \"Address per Property Register\"\n", + ").rename({\"Asking price\": \"hk_price\", \"Property type\": \"hk_type\", \"Address per Property Register\": \"hk_addr\"})\n", "\n", - "rm = (\n", - " buy_oc.filter(pl.col(\"source\") == \"Rightmove\")\n", - " .select(\n", - " \"Postcode\",\n", - " \"Bedrooms\",\n", - " \"Asking price\",\n", - " \"Property type\",\n", - " \"Address per Property Register\",\n", - " )\n", - " .rename(\n", - " {\n", - " \"Asking price\": \"rm_price\",\n", - " \"Property type\": \"rm_type\",\n", - " \"Address per Property Register\": \"rm_addr\",\n", - " }\n", - " )\n", - ")\n", + "rm = buy_oc.filter(\n", + " pl.col(\"source\") == \"Rightmove\"\n", + ").select(\n", + " \"Postcode\", \"Bedrooms\", \"Asking price\", \"Property type\", \"Address per Property Register\"\n", + ").rename({\"Asking price\": \"rm_price\", \"Property type\": \"rm_type\", \"Address per Property Register\": \"rm_addr\"})\n", "\n", "# Join on postcode + bedrooms\n", "joined = hk.join(rm, on=[\"Postcode\", \"Bedrooms\"], how=\"inner\")\n", @@ -3264,24 +3213,16 @@ "exact = joined.filter(pl.col(\"hk_price\") == pl.col(\"rm_price\"))\n", "\n", "print(f\"Home.co.uk listings (unique, in file): {len(hk):,}\")\n", - "print(\n", - " f\"Rightmove listings in covered outcodes: {len(rm.filter(pl.col('Postcode').is_in(hk['Postcode']))):,}\"\n", - ")\n", + "print(f\"Rightmove listings in covered outcodes: {len(rm.filter(pl.col('Postcode').is_in(hk['Postcode']))):,}\")\n", "print()\n", "print(f\"Joined on (postcode, bedrooms): {len(joined):,} candidate pairs\")\n", - "print(\n", - " f\" Exact price match: {len(exact):,} pairs (likely same property, different beds or already deduped)\"\n", - ")\n", - "print(\n", - " f\" Price within 5%: {len(near):,} pairs (probable duplicates with price rounding)\"\n", - ")\n", + "print(f\" Exact price match: {len(exact):,} pairs (likely same property, different beds or already deduped)\")\n", + "print(f\" Price within 5%: {len(near):,} pairs (probable duplicates with price rounding)\")\n", "print()\n", "# Unique hk listings that have at least one near-match\n", "hk_with_near = near.select(\"hk_price\", \"hk_addr\", \"Postcode\").unique()\n", "print(f\"Home.co.uk listings with a near-match in RM: ~{len(hk_with_near):,}\")\n", - "print(\n", - " f\"Estimated additional overlap: ~{len(hk_with_near) / len(hk) * 100:.1f}% of unique HK listings\"\n", - ")" + "print(f\"Estimated additional overlap: ~{len(hk_with_near)/len(hk)*100:.1f}% of unique HK listings\")" ] }, { @@ -4237,13 +4178,9 @@ ")\n", "\n", "fig = px.histogram(\n", - " clipped.to_pandas(),\n", - " x=\"Asking price\",\n", - " color=\"source\",\n", - " nbins=80,\n", + " clipped.to_pandas(), x=\"Asking price\", color=\"source\", nbins=80,\n", " color_discrete_map={\"Rightmove\": \"#2563eb\", \"Home.co.uk\": \"#10b981\"},\n", - " barmode=\"overlay\",\n", - " histnorm=\"probability density\",\n", + " barmode=\"overlay\", histnorm=\"probability density\",\n", " title=\"Price Distribution by Source (normalised, £50k–£2M)\",\n", ")\n", "fig.update_traces(opacity=0.6)\n", @@ -5158,7 +5095,10 @@ ], "source": [ "# Property type distribution by source\n", - "type_by_src = buy.group_by(\"source\", \"Property type\").agg(pl.len().alias(\"count\"))\n", + "type_by_src = (\n", + " buy.group_by(\"source\", \"Property type\")\n", + " .agg(pl.len().alias(\"count\"))\n", + ")\n", "# Normalise within each source\n", "totals = type_by_src.group_by(\"source\").agg(pl.col(\"count\").sum().alias(\"total\"))\n", "type_by_src = type_by_src.join(totals, on=\"source\").with_columns(\n", @@ -5167,10 +5107,7 @@ "\n", "fig = px.bar(\n", " type_by_src.sort(\"Property type\").to_pandas(),\n", - " x=\"Property type\",\n", - " y=\"pct\",\n", - " color=\"source\",\n", - " barmode=\"group\",\n", + " x=\"Property type\", y=\"pct\", color=\"source\", barmode=\"group\",\n", " color_discrete_map={\"Rightmove\": \"#2563eb\", \"Home.co.uk\": \"#10b981\"},\n", " title=\"Property Type Distribution by Source (%)\",\n", ")\n", @@ -5249,9 +5186,7 @@ "# Property sub-type comparison — top home.co.uk sub-types\n", "hk_subtypes = (\n", " buy.filter(pl.col(\"source\") == \"Home.co.uk\")[\"Property sub-type\"]\n", - " .value_counts()\n", - " .sort(\"count\", descending=True)\n", - " .head(20)\n", + " .value_counts().sort(\"count\", descending=True).head(20)\n", ")\n", "print(\"Top 20 Home.co.uk property sub-types:\")\n", "hk_subtypes" @@ -5328,16 +5263,9 @@ "source": [ "# Field completeness by source\n", "fields = [\n", - " \"Bedrooms\",\n", - " \"Bathrooms\",\n", - " \"Postcode\",\n", - " \"Address per Property Register\",\n", - " \"Leasehold/Freehold\",\n", - " \"Property type\",\n", - " \"Total floor area (sqm)\",\n", - " \"Listing date\",\n", - " \"Asking price\",\n", - " \"Price qualifier\",\n", + " \"Bedrooms\", \"Bathrooms\", \"Postcode\", \"Address per Property Register\",\n", + " \"Leasehold/Freehold\", \"Property type\", \"Total floor area (sqm)\",\n", + " \"Listing date\", \"Asking price\", \"Price qualifier\",\n", "]\n", "\n", "rows = []\n", @@ -5348,19 +5276,17 @@ " non_null = n - subset[f].null_count()\n", " # Also count empty strings as missing for string fields\n", " if subset[f].dtype == pl.Utf8:\n", - " non_null = len(\n", - " subset.filter(pl.col(f).is_not_null() & (pl.col(f).str.len_chars() > 0))\n", - " )\n", + " non_null = len(subset.filter(\n", + " pl.col(f).is_not_null() & (pl.col(f).str.len_chars() > 0)\n", + " ))\n", " rows.append({\"source\": src, \"field\": f, \"pct_available\": non_null / n * 100})\n", "\n", "completeness = pl.DataFrame(rows)\n", "pivot = completeness.pivot(on=\"source\", index=\"field\", values=\"pct_available\")\n", - "pivot = pivot.with_columns(\n", - " [\n", - " pl.col(\"Rightmove\").round(1),\n", - " pl.col(\"Home.co.uk\").round(1),\n", - " ]\n", - ")\n", + "pivot = pivot.with_columns([\n", + " pl.col(\"Rightmove\").round(1),\n", + " pl.col(\"Home.co.uk\").round(1),\n", + "])\n", "print(\"Field completeness (% non-null/non-empty):\")\n", "pivot" ] @@ -6272,26 +6198,19 @@ "# Bedroom distribution comparison\n", "fig = make_subplots(rows=1, cols=2, subplot_titles=(\"Rightmove\", \"Home.co.uk\"))\n", "for i, src in enumerate([\"Rightmove\", \"Home.co.uk\"], 1):\n", - " beds = (\n", - " buy.filter((pl.col(\"source\") == src) & (pl.col(\"Bedrooms\") <= 8))[\"Bedrooms\"]\n", - " .value_counts()\n", - " .sort(\"Bedrooms\")\n", - " )\n", + " beds = buy.filter(\n", + " (pl.col(\"source\") == src) & (pl.col(\"Bedrooms\") <= 8)\n", + " )[\"Bedrooms\"].value_counts().sort(\"Bedrooms\")\n", " # Normalise\n", " total = beds[\"count\"].sum()\n", " fig.add_trace(\n", " go.Bar(\n", - " x=beds[\"Bedrooms\"],\n", - " y=beds[\"count\"] / total * 100,\n", + " x=beds[\"Bedrooms\"], y=beds[\"count\"] / total * 100,\n", " name=src,\n", " marker_color=\"#2563eb\" if src == \"Rightmove\" else \"#10b981\",\n", - " ),\n", - " row=1,\n", - " col=i,\n", + " ), row=1, col=i,\n", " )\n", - "fig.update_layout(\n", - " height=350, title=\"Bedroom Distribution by Source (%)\", showlegend=False\n", - ")\n", + "fig.update_layout(height=350, title=\"Bedroom Distribution by Source (%)\", showlegend=False)\n", "fig.update_yaxes(title_text=\"%\", row=1, col=1)\n", "fig.show()" ] @@ -6368,23 +6287,17 @@ "\n", "comparison_rows = []\n", "for ptype in [\"Detached\", \"Semi-Detached\", \"Terraced\", \"Flats/Maisonettes\", \"Other\"]:\n", - " rm_p = rm_covered.filter(pl.col(\"Property type\") == ptype)[\n", - " \"Asking price\"\n", - " ].drop_nulls()\n", + " rm_p = rm_covered.filter(pl.col(\"Property type\") == ptype)[\"Asking price\"].drop_nulls()\n", " hk_p = hk_only.filter(pl.col(\"Property type\") == ptype)[\"Asking price\"].drop_nulls()\n", " if len(rm_p) > 0 and len(hk_p) > 0:\n", - " comparison_rows.append(\n", - " {\n", - " \"Property type\": ptype,\n", - " \"RM count\": len(rm_p),\n", - " \"RM median £\": int(rm_p.median()),\n", - " \"HK count\": len(hk_p),\n", - " \"HK median £\": int(hk_p.median()),\n", - " \"HK premium %\": round(\n", - " (hk_p.median() - rm_p.median()) / rm_p.median() * 100, 1\n", - " ),\n", - " }\n", - " )\n", + " comparison_rows.append({\n", + " \"Property type\": ptype,\n", + " \"RM count\": len(rm_p),\n", + " \"RM median £\": int(rm_p.median()),\n", + " \"HK count\": len(hk_p),\n", + " \"HK median £\": int(hk_p.median()),\n", + " \"HK premium %\": round((hk_p.median() - rm_p.median()) / rm_p.median() * 100, 1),\n", + " })\n", "\n", "comp = pl.DataFrame(comparison_rows)\n", "print(\"Price comparison in covered outcodes (Home.co.uk unique listings vs Rightmove):\")\n", @@ -7332,13 +7245,9 @@ "# Listing age histogram comparison\n", "age_plot = with_age.filter(pl.col(\"days_on_market\") <= 730) # cap at 2 years\n", "fig = px.histogram(\n", - " age_plot.to_pandas(),\n", - " x=\"days_on_market\",\n", - " color=\"source\",\n", - " nbins=60,\n", + " age_plot.to_pandas(), x=\"days_on_market\", color=\"source\", nbins=60,\n", " color_discrete_map={\"Rightmove\": \"#2563eb\", \"Home.co.uk\": \"#10b981\"},\n", - " barmode=\"overlay\",\n", - " histnorm=\"probability density\",\n", + " barmode=\"overlay\", histnorm=\"probability density\",\n", " title=\"Days on Market Distribution by Source (normalised, capped at 2 years)\",\n", ")\n", "fig.update_traces(opacity=0.6)\n", @@ -7421,9 +7330,7 @@ "print(f\" Projected home.co.uk total: ~{projected_hk:,}\")\n", "print(f\" Projected cross-dedup: ~{projected_dedup:,}\")\n", "print(f\" Projected unique additions: ~{projected_unique:,}\")\n", - "print(\n", - " f\" Projected merged dataset: ~{rm_buy + projected_unique:,} ({projected_unique / rm_buy * 100:.1f}% increase)\"\n", - ")\n", + "print(f\" Projected merged dataset: ~{rm_buy + projected_unique:,} ({projected_unique/rm_buy*100:.1f}% increase)\")\n", "print()\n", "print(\"⚠️ These are rough estimates — the covered outcodes may not be representative\")" ] diff --git a/analyses/travel_time_comparison.ipynb b/analyses/travel_time_comparison.ipynb index 1b2ead8..7fae5a2 100644 --- a/analyses/travel_time_comparison.ipynb +++ b/analyses/travel_time_comparison.ipynb @@ -54,15 +54,11 @@ } ], "source": [ - "r5_bank = pl.read_parquet(\n", - " \"../property-data/travel-times/transit/000000-bank-tube-station.parquet\"\n", - ")\n", + "r5_bank = pl.read_parquet(\"../property-data/travel-times/transit/000000-bank-tube-station.parquet\")\n", "manual_bank = pl.read_parquet(\"../manual-data/journey_times_bank.parquet\")\n", "\n", "print(f\"R5 Bank: {r5_bank.shape[0]:,} postcodes\")\n", - "print(\n", - " f\"Manual Bank: {manual_bank.shape[0]:,} postcodes ({manual_bank['public_transport_easy_minutes'].null_count():,} null easy)\"\n", - ")" + "print(f\"Manual Bank: {manual_bank.shape[0]:,} postcodes ({manual_bank['public_transport_easy_minutes'].null_count():,} null easy)\")" ] }, { @@ -120,49 +116,25 @@ "source": [ "# Join on postcode, keep only rows where both sources have values\n", "bank = (\n", - " r5_bank.join(manual_bank, left_on=\"pcds\", right_on=\"postcode\", how=\"inner\")\n", + " r5_bank\n", + " .join(manual_bank, left_on=\"pcds\", right_on=\"postcode\", how=\"inner\")\n", " .filter(\n", " pl.col(\"public_transport_easy_minutes\").is_not_null()\n", " & pl.col(\"public_transport_quick_minutes\").is_not_null()\n", " )\n", - " .with_columns(\n", - " [\n", - " # Signed error: R5 - Manual (positive = R5 is slower)\n", - " (\n", - " pl.col(\"travel_minutes\").cast(pl.Float64)\n", - " - pl.col(\"public_transport_easy_minutes\").cast(pl.Float64)\n", - " ).alias(\"error_easy\"),\n", - " (\n", - " pl.col(\"best_minutes\").cast(pl.Float64)\n", - " - pl.col(\"public_transport_quick_minutes\").cast(pl.Float64)\n", - " ).alias(\"error_quick\"),\n", - " # Absolute error\n", - " (\n", - " pl.col(\"travel_minutes\").cast(pl.Float64)\n", - " - pl.col(\"public_transport_easy_minutes\").cast(pl.Float64)\n", - " )\n", - " .abs()\n", - " .alias(\"abs_error_easy\"),\n", - " (\n", - " pl.col(\"best_minutes\").cast(pl.Float64)\n", - " - pl.col(\"public_transport_quick_minutes\").cast(pl.Float64)\n", - " )\n", - " .abs()\n", - " .alias(\"abs_error_quick\"),\n", - " ]\n", - " )\n", + " .with_columns([\n", + " # Signed error: R5 - Manual (positive = R5 is slower)\n", + " (pl.col(\"travel_minutes\").cast(pl.Float64) - pl.col(\"public_transport_easy_minutes\").cast(pl.Float64)).alias(\"error_easy\"),\n", + " (pl.col(\"best_minutes\").cast(pl.Float64) - pl.col(\"public_transport_quick_minutes\").cast(pl.Float64)).alias(\"error_quick\"),\n", + " # Absolute error\n", + " (pl.col(\"travel_minutes\").cast(pl.Float64) - pl.col(\"public_transport_easy_minutes\").cast(pl.Float64)).abs().alias(\"abs_error_easy\"),\n", + " (pl.col(\"best_minutes\").cast(pl.Float64) - pl.col(\"public_transport_quick_minutes\").cast(pl.Float64)).abs().alias(\"abs_error_quick\"),\n", + " ])\n", ")\n", "\n", "print(f\"Joined (non-null): {bank.shape[0]:,} postcodes\")\n", - "bank.select(\n", - " \"pcds\",\n", - " \"travel_minutes\",\n", - " \"public_transport_easy_minutes\",\n", - " \"error_easy\",\n", - " \"best_minutes\",\n", - " \"public_transport_quick_minutes\",\n", - " \"error_quick\",\n", - ").head(10)" + "bank.select(\"pcds\", \"travel_minutes\", \"public_transport_easy_minutes\", \"error_easy\",\n", + " \"best_minutes\", \"public_transport_quick_minutes\", \"error_quick\").head(10)" ] }, { @@ -224,23 +196,18 @@ " percentiles = [5, 25, 50, 80, 90, 95, 99]\n", " rows = []\n", " for p in percentiles:\n", - " rows.append(\n", - " {\n", - " \"percentile\": f\"p{p}\",\n", - " f\"{label} signed error\": round(float(np.percentile(col, p)), 1),\n", - " f\"{label} absolute error\": round(float(np.percentile(abs_col, p)), 1),\n", - " }\n", - " )\n", - " rows.append(\n", - " {\n", - " \"percentile\": \"mean\",\n", - " f\"{label} signed error\": round(float(np.mean(col)), 1),\n", - " f\"{label} absolute error\": round(float(np.mean(abs_col)), 1),\n", - " }\n", - " )\n", + " rows.append({\n", + " \"percentile\": f\"p{p}\",\n", + " f\"{label} signed error\": round(float(np.percentile(col, p)), 1),\n", + " f\"{label} absolute error\": round(float(np.percentile(abs_col, p)), 1),\n", + " })\n", + " rows.append({\n", + " \"percentile\": \"mean\",\n", + " f\"{label} signed error\": round(float(np.mean(col)), 1),\n", + " f\"{label} absolute error\": round(float(np.mean(abs_col)), 1),\n", + " })\n", " return pl.DataFrame(rows)\n", "\n", - "\n", "stats_easy = percentile_stats(\"error_easy\", \"Median (easy)\")\n", "stats_quick = percentile_stats(\"error_quick\", \"Best (quick)\")\n", "\n", @@ -1153,42 +1120,24 @@ } ], "source": [ - "fig = make_subplots(\n", - " rows=1,\n", - " cols=2,\n", - " subplot_titles=[\n", - " \"Median transit time error (R5 − TfL)\",\n", - " \"Best transit time error (R5 − TfL)\",\n", - " ],\n", - ")\n", + "fig = make_subplots(rows=1, cols=2, subplot_titles=[\n", + " \"Median transit time error (R5 − TfL)\",\n", + " \"Best transit time error (R5 − TfL)\"\n", + "])\n", "\n", "# Clip for readability\n", "easy_clipped = bank[\"error_easy\"].clip(-60, 60).to_numpy()\n", "quick_clipped = bank[\"error_quick\"].clip(-60, 60).to_numpy()\n", "\n", - "fig.add_trace(\n", - " go.Histogram(\n", - " x=easy_clipped, nbinsx=120, name=\"Median (easy)\", marker_color=\"#0d9488\"\n", - " ),\n", - " row=1,\n", - " col=1,\n", - ")\n", - "fig.add_trace(\n", - " go.Histogram(\n", - " x=quick_clipped, nbinsx=120, name=\"Best (quick)\", marker_color=\"#f59e0b\"\n", - " ),\n", - " row=1,\n", - " col=2,\n", - ")\n", + "fig.add_trace(go.Histogram(x=easy_clipped, nbinsx=120, name=\"Median (easy)\",\n", + " marker_color=\"#0d9488\"), row=1, col=1)\n", + "fig.add_trace(go.Histogram(x=quick_clipped, nbinsx=120, name=\"Best (quick)\",\n", + " marker_color=\"#f59e0b\"), row=1, col=2)\n", "\n", "fig.update_xaxes(title_text=\"Error (minutes)\", row=1, col=1)\n", "fig.update_xaxes(title_text=\"Error (minutes)\", row=1, col=2)\n", "fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n", - "fig.update_layout(\n", - " height=400,\n", - " showlegend=False,\n", - " title_text=\"Bank: Error Distribution (clipped ±60 min)\",\n", - ")\n", + "fig.update_layout(height=400, showlegend=False, title_text=\"Bank: Error Distribution (clipped ±60 min)\")\n", "fig.show()" ] }, @@ -2155,55 +2104,34 @@ "# Sample for scatter plot performance\n", "sample = bank.sample(n=min(20_000, bank.shape[0]), seed=42)\n", "\n", - "fig = make_subplots(\n", - " rows=1,\n", - " cols=2,\n", - " subplot_titles=[\"Median: R5 vs TfL (easy)\", \"Best: R5 vs TfL (quick)\"],\n", - ")\n", + "fig = make_subplots(rows=1, cols=2, subplot_titles=[\n", + " \"Median: R5 vs TfL (easy)\",\n", + " \"Best: R5 vs TfL (quick)\"\n", + "])\n", "\n", - "fig.add_trace(\n", - " go.Scattergl(\n", - " x=sample[\"public_transport_easy_minutes\"].to_numpy(),\n", - " y=sample[\"travel_minutes\"].cast(pl.Float64).to_numpy(),\n", - " mode=\"markers\",\n", - " marker=dict(size=2, opacity=0.3, color=\"#0d9488\"),\n", - " name=\"Median\",\n", - " ),\n", - " row=1,\n", - " col=1,\n", - ")\n", + "fig.add_trace(go.Scattergl(\n", + " x=sample[\"public_transport_easy_minutes\"].to_numpy(),\n", + " y=sample[\"travel_minutes\"].cast(pl.Float64).to_numpy(),\n", + " mode=\"markers\", marker=dict(size=2, opacity=0.3, color=\"#0d9488\"),\n", + " name=\"Median\"\n", + "), row=1, col=1)\n", "\n", - "fig.add_trace(\n", - " go.Scattergl(\n", - " x=sample[\"public_transport_quick_minutes\"].to_numpy(),\n", - " y=sample[\"best_minutes\"].cast(pl.Float64).to_numpy(),\n", - " mode=\"markers\",\n", - " marker=dict(size=2, opacity=0.3, color=\"#f59e0b\"),\n", - " name=\"Best\",\n", - " ),\n", - " row=1,\n", - " col=2,\n", - ")\n", + "fig.add_trace(go.Scattergl(\n", + " x=sample[\"public_transport_quick_minutes\"].to_numpy(),\n", + " y=sample[\"best_minutes\"].cast(pl.Float64).to_numpy(),\n", + " mode=\"markers\", marker=dict(size=2, opacity=0.3, color=\"#f59e0b\"),\n", + " name=\"Best\"\n", + "), row=1, col=2)\n", "\n", "# Perfect agreement line\n", "for col in [1, 2]:\n", - " fig.add_trace(\n", - " go.Scatter(\n", - " x=[0, 200],\n", - " y=[0, 200],\n", - " mode=\"lines\",\n", - " line=dict(color=\"red\", dash=\"dash\", width=1),\n", - " showlegend=False,\n", - " ),\n", - " row=1,\n", - " col=col,\n", - " )\n", + " fig.add_trace(go.Scatter(x=[0, 200], y=[0, 200], mode=\"lines\",\n", + " line=dict(color=\"red\", dash=\"dash\", width=1),\n", + " showlegend=False), row=1, col=col)\n", " fig.update_xaxes(title_text=\"TfL API (minutes)\", row=1, col=col)\n", " fig.update_yaxes(title_text=\"R5 (minutes)\", row=1, col=col)\n", "\n", - "fig.update_layout(\n", - " height=500, showlegend=False, title_text=\"Bank: R5 vs TfL API (20k sample)\"\n", - ")\n", + "fig.update_layout(height=500, showlegend=False, title_text=\"Bank: R5 vs TfL API (20k sample)\")\n", "fig.show()" ] }, @@ -403135,8 +403063,7 @@ "\n", "fig = px.scatter_map(\n", " map_sample.to_pandas(),\n", - " lat=\"lat\",\n", - " lon=\"long\",\n", + " lat=\"lat\", lon=\"long\",\n", " color=\"error_easy\",\n", " color_continuous_scale=\"RdBu_r\", # red=positive (R5 slower), blue=negative (R5 faster)\n", " range_color=[-30, 30],\n", @@ -403144,14 +403071,8 @@ " center={\"lat\": 51.5, \"lon\": -0.1},\n", " opacity=0.5,\n", " title=\"Bank — Median transit error (R5 − TfL easy), minutes\",\n", - " hover_data={\n", - " \"pcds\": True,\n", - " \"travel_minutes\": True,\n", - " \"public_transport_easy_minutes\": True,\n", - " \"error_easy\": \":.0f\",\n", - " \"lat\": False,\n", - " \"long\": False,\n", - " },\n", + " hover_data={\"pcds\": True, \"travel_minutes\": True, \"public_transport_easy_minutes\": True,\n", + " \"error_easy\": \":.0f\", \"lat\": False, \"long\": False},\n", " height=700,\n", ")\n", "fig.update_layout(map_style=\"carto-positron\")\n", @@ -804073,8 +803994,7 @@ "source": [ "fig = px.scatter_map(\n", " map_sample.to_pandas(),\n", - " lat=\"lat\",\n", - " lon=\"long\",\n", + " lat=\"lat\", lon=\"long\",\n", " color=\"error_quick\",\n", " color_continuous_scale=\"RdBu_r\",\n", " range_color=[-30, 30],\n", @@ -804082,14 +804002,8 @@ " center={\"lat\": 51.5, \"lon\": -0.1},\n", " opacity=0.5,\n", " title=\"Bank — Best transit error (R5 − TfL quick), minutes\",\n", - " hover_data={\n", - " \"pcds\": True,\n", - " \"best_minutes\": True,\n", - " \"public_transport_quick_minutes\": True,\n", - " \"error_quick\": \":.0f\",\n", - " \"lat\": False,\n", - " \"long\": False,\n", - " },\n", + " hover_data={\"pcds\": True, \"best_minutes\": True, \"public_transport_quick_minutes\": True,\n", + " \"error_quick\": \":.0f\", \"lat\": False, \"long\": False},\n", " height=700,\n", ")\n", "fig.update_layout(map_style=\"carto-positron\")\n", @@ -1205011,8 +1204925,7 @@ "source": [ "fig = px.scatter_map(\n", " map_sample.to_pandas(),\n", - " lat=\"lat\",\n", - " lon=\"long\",\n", + " lat=\"lat\", lon=\"long\",\n", " color=\"abs_error_easy\",\n", " color_continuous_scale=\"YlOrRd\",\n", " range_color=[0, 30],\n", @@ -1205020,14 +1204933,8 @@ " center={\"lat\": 51.5, \"lon\": -0.1},\n", " opacity=0.5,\n", " title=\"Bank — Absolute median transit error |R5 − TfL easy|, minutes\",\n", - " hover_data={\n", - " \"pcds\": True,\n", - " \"travel_minutes\": True,\n", - " \"public_transport_easy_minutes\": True,\n", - " \"abs_error_easy\": \":.0f\",\n", - " \"lat\": False,\n", - " \"long\": False,\n", - " },\n", + " hover_data={\"pcds\": True, \"travel_minutes\": True, \"public_transport_easy_minutes\": True,\n", + " \"abs_error_easy\": \":.0f\", \"lat\": False, \"long\": False},\n", " height=700,\n", ")\n", "fig.update_layout(map_style=\"carto-positron\")\n", @@ -1205091,15 +1204998,9 @@ ], "source": [ "bank.sort(\"abs_error_easy\", descending=True).select(\n", - " \"pcds\",\n", - " \"lat\",\n", - " \"long\",\n", - " \"travel_minutes\",\n", - " \"public_transport_easy_minutes\",\n", - " \"error_easy\",\n", - " \"best_minutes\",\n", - " \"public_transport_quick_minutes\",\n", - " \"error_quick\",\n", + " \"pcds\", \"lat\", \"long\",\n", + " \"travel_minutes\", \"public_transport_easy_minutes\", \"error_easy\",\n", + " \"best_minutes\", \"public_transport_quick_minutes\", \"error_quick\",\n", ").head(30)" ] }, @@ -1206044,75 +1205945,45 @@ "\n", "dist_df = bank.with_columns(\n", " # Rough km distance using Haversine approximation\n", - " (\n", - " (\n", - " ((pl.col(\"lat\") - BANK_LAT) * 111.32) ** 2\n", - " + ((pl.col(\"long\") - BANK_LON) * 111.32 * np.cos(np.radians(BANK_LAT))) ** 2\n", - " )\n", - " ** 0.5\n", + " ((((pl.col(\"lat\") - BANK_LAT) * 111.32) ** 2 +\n", + " ((pl.col(\"long\") - BANK_LON) * 111.32 * np.cos(np.radians(BANK_LAT))) ** 2) ** 0.5\n", " ).alias(\"dist_km\")\n", ")\n", "\n", "# Bin by 5km\n", "binned = (\n", - " dist_df.with_columns((pl.col(\"dist_km\") / 5).floor() * 5)\n", + " dist_df\n", + " .with_columns((pl.col(\"dist_km\") / 5).floor() * 5)\n", " .group_by(\"dist_km\")\n", - " .agg(\n", - " [\n", - " pl.col(\"error_easy\").median().alias(\"median_error_easy\"),\n", - " pl.col(\"error_quick\").median().alias(\"median_error_quick\"),\n", - " pl.col(\"abs_error_easy\").median().alias(\"median_abs_error_easy\"),\n", - " pl.len().alias(\"count\"),\n", - " ]\n", - " )\n", + " .agg([\n", + " pl.col(\"error_easy\").median().alias(\"median_error_easy\"),\n", + " pl.col(\"error_quick\").median().alias(\"median_error_quick\"),\n", + " pl.col(\"abs_error_easy\").median().alias(\"median_abs_error_easy\"),\n", + " pl.len().alias(\"count\"),\n", + " ])\n", " .sort(\"dist_km\")\n", " .filter(pl.col(\"count\") > 50)\n", ")\n", "\n", - "fig = make_subplots(\n", - " rows=1,\n", - " cols=2,\n", - " subplot_titles=[\n", - " \"Median signed error by distance\",\n", - " \"Median absolute error by distance\",\n", - " ],\n", - ")\n", + "fig = make_subplots(rows=1, cols=2, subplot_titles=[\n", + " \"Median signed error by distance\",\n", + " \"Median absolute error by distance\"\n", + "])\n", "\n", - "fig.add_trace(\n", - " go.Scatter(\n", - " x=binned[\"dist_km\"].to_numpy(),\n", - " y=binned[\"median_error_easy\"].to_numpy(),\n", - " mode=\"lines+markers\",\n", - " name=\"Easy\",\n", - " line=dict(color=\"#0d9488\"),\n", - " ),\n", - " row=1,\n", - " col=1,\n", - ")\n", - "fig.add_trace(\n", - " go.Scatter(\n", - " x=binned[\"dist_km\"].to_numpy(),\n", - " y=binned[\"median_error_quick\"].to_numpy(),\n", - " mode=\"lines+markers\",\n", - " name=\"Quick\",\n", - " line=dict(color=\"#f59e0b\"),\n", - " ),\n", - " row=1,\n", - " col=1,\n", - ")\n", + "fig.add_trace(go.Scatter(\n", + " x=binned[\"dist_km\"].to_numpy(), y=binned[\"median_error_easy\"].to_numpy(),\n", + " mode=\"lines+markers\", name=\"Easy\", line=dict(color=\"#0d9488\")\n", + "), row=1, col=1)\n", + "fig.add_trace(go.Scatter(\n", + " x=binned[\"dist_km\"].to_numpy(), y=binned[\"median_error_quick\"].to_numpy(),\n", + " mode=\"lines+markers\", name=\"Quick\", line=dict(color=\"#f59e0b\")\n", + "), row=1, col=1)\n", "\n", - "fig.add_trace(\n", - " go.Scatter(\n", - " x=binned[\"dist_km\"].to_numpy(),\n", - " y=binned[\"median_abs_error_easy\"].to_numpy(),\n", - " mode=\"lines+markers\",\n", - " name=\"|Easy|\",\n", - " line=dict(color=\"#0d9488\"),\n", - " showlegend=False,\n", - " ),\n", - " row=1,\n", - " col=2,\n", - ")\n", + "fig.add_trace(go.Scatter(\n", + " x=binned[\"dist_km\"].to_numpy(), y=binned[\"median_abs_error_easy\"].to_numpy(),\n", + " mode=\"lines+markers\", name=\"|Easy|\", line=dict(color=\"#0d9488\"),\n", + " showlegend=False\n", + "), row=1, col=2)\n", "\n", "for col in [1, 2]:\n", " fig.update_xaxes(title_text=\"Distance from Bank (km)\", row=1, col=col)\n", diff --git a/docker-compose.yml b/docker-compose.yml index 0b0f525..c8126ae 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -146,12 +146,6 @@ services: # networks: # - dev-network # restart: unless-stopped - # healthcheck: - # test: ["CMD", "curl", "-f", "http://localhost:8191/health"] - # interval: 30s - # timeout: 5s - # retries: 3 - # start_period: 30s # finder: # build: @@ -167,14 +161,8 @@ services: # gluetun: # condition: service_healthy # flaresolverr: - # condition: service_healthy + # condition: service_started # restart: unless-stopped - # healthcheck: - # test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:1234/health')"] - # interval: 30s - # timeout: 5s - # retries: 3 - # start_period: 60s volumes: diff --git a/finder/Dockerfile b/finder/Dockerfile index 00c0344..c975550 100644 --- a/finder/Dockerfile +++ b/finder/Dockerfile @@ -5,14 +5,9 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv WORKDIR /app COPY pyproject.toml ./ RUN uv pip install --system -r pyproject.toml -RUN playwright install-deps firefox -RUN camoufox fetch \ - && python -c "from camoufox.pkgman import camoufox_path; p = camoufox_path(download_if_missing=False); print('Camoufox verified at', p)" +RUN playwright install --with-deps chromium COPY *.py ./ COPY property-data/arcgis_data.parquet /data/arcgis_data.parquet -HEALTHCHECK --interval=30s --timeout=5s --retries=3 \ - CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:1234/health')" - CMD ["python3", "main.py"] diff --git a/finder/constants.py b/finder/constants.py index 2985486..b42a961 100644 --- a/finder/constants.py +++ b/finder/constants.py @@ -4,8 +4,8 @@ from pathlib import Path ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet") DATA_DIR = Path("/app/data") PAGE_SIZE = 24 -DELAY_BETWEEN_PAGES = 0.5 -DELAY_BETWEEN_OUTCODES = 1.0 +DELAY_BETWEEN_PAGES = 1.0 +DELAY_BETWEEN_OUTCODES = 2.0 MAX_RETRIES = 3 RETRY_BASE_DELAY = 2.0 GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index @@ -16,29 +16,9 @@ SCHEDULE_HOUR = int(os.environ.get("SCHEDULE_HOUR", "3")) # Whether to run a scrape immediately on startup RUN_ON_STARTUP = os.environ.get("RUN_ON_STARTUP", "").lower() in ("1", "true", "yes") # Enable/disable individual sources -SCRAPE_RIGHTMOVE = os.environ.get("SCRAPE_RIGHTMOVE", "true").lower() in ( - "1", - "true", - "yes", -) -SCRAPE_HOMECOUK = os.environ.get("SCRAPE_HOMECOUK", "true").lower() in ( - "1", - "true", - "yes", -) -SCRAPE_OPENRENT = os.environ.get("SCRAPE_OPENRENT", "true").lower() in ( - "1", - "true", - "yes", -) -SCRAPE_ZOOPLA = os.environ.get("SCRAPE_ZOOPLA", "true").lower() in ( - "1", - "true", - "yes", -) - -# URL to trigger server data reload after scrape (e.g. http://server:8001/api/reload) -RELOAD_URL = os.environ.get("RELOAD_URL", "") +SCRAPE_RIGHTMOVE = os.environ.get("SCRAPE_RIGHTMOVE", "true").lower() in ("1", "true", "yes") +SCRAPE_HOMECOUK = os.environ.get("SCRAPE_HOMECOUK", "true").lower() in ("1", "true", "yes") +SCRAPE_OPENRENT = os.environ.get("SCRAPE_OPENRENT", "true").lower() in ("1", "true", "yes") TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead" SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search" @@ -52,9 +32,6 @@ HOMECOUK_PER_PAGE = 30 # max supported by the API # OpenRent OPENRENT_BASE = "https://www.openrent.co.uk" -# Zoopla -ZOOPLA_BASE = "https://www.zoopla.co.uk" - PROPERTY_TYPE_MAP = { "Detached": "Detached", "Semi-Detached": "Semi-Detached", @@ -67,7 +44,6 @@ PROPERTY_TYPE_MAP = { "Apartment": "Flats/Maisonettes", "Penthouse": "Flats/Maisonettes", "Ground Flat": "Flats/Maisonettes", - "Duplex": "Flats/Maisonettes", "Detached Bungalow": "Detached", "Semi-Detached Bungalow": "Semi-Detached", "Town House": "Terraced", @@ -76,15 +52,9 @@ PROPERTY_TYPE_MAP = { "Bungalow": "Other", "Cottage": "Other", "Park Home": "Other", - "Mobile Home": "Other", - "Caravan": "Other", - "Lodge": "Other", "Land": "Other", "Farm / Barn": "Other", - "Farm House": "Other", "House": "Detached", - "House of Multiple Occupation": "Flats/Maisonettes", - "House Share": "Other", "Not Specified": "Other", "Chalet": "Other", "Barn Conversion": "Other", @@ -92,20 +62,9 @@ PROPERTY_TYPE_MAP = { "Character Property": "Other", "Cluster House": "Other", "Retirement Property": "Flats/Maisonettes", - "Parking": "Other", "Plot": "Other", "Garages": "Other", "Mews": "Terraced", - "Property": "Other", - # Lowercase variants (from home.co.uk / Rightmove APIs) - "house": "Detached", - "bungalow": "Other", - "townhouse": "Terraced", - "land": "Other", - "other": "Other", - "not-specified": "Other", - "retirement-property": "Flats/Maisonettes", - "equestrian-facility": "Other", } CHANNELS = [ diff --git a/finder/homecouk.py b/finder/homecouk.py index fc18fdf..f6005fd 100644 --- a/finder/homecouk.py +++ b/finder/homecouk.py @@ -86,8 +86,7 @@ def solve_cloudflare() -> tuple[dict[str, str], str] | None: log.info( "Cloudflare solved — got %d cookies, UA: %s", - len(cookies), - user_agent[:60], + len(cookies), user_agent[:60], ) flaresolverr_attempts_total.labels(result="success").inc() return cookies, user_agent @@ -130,13 +129,11 @@ def make_client(cookies: dict[str, str], user_agent: str) -> Session: Uses Chrome TLS impersonation so cf_clearance cookies (which are bound to Chrome's JA3 fingerprint from FlareSolverr) remain valid.""" session = Session(impersonate="chrome") - session.headers.update( - { - "User-Agent": user_agent, - "Accept": "application/json, text/plain, */*", - "x-requested-with": "XMLHttpRequest", - } - ) + session.headers.update({ + "User-Agent": user_agent, + "Accept": "application/json, text/plain, */*", + "x-requested-with": "XMLHttpRequest", + }) # Laravel CSRF: the XSRF-TOKEN cookie value must also be sent as the # X-XSRF-TOKEN request header (URL-decoded). Without this header, the # server rejects every request with 419/403. @@ -168,11 +165,7 @@ def fetch_page( return resp.json() except json.JSONDecodeError: homecouk_errors_total.labels(type="json_decode").inc() - log.error( - "Non-JSON response from %s (got %s)", - url, - resp.headers.get("content-type", "?"), - ) + log.error("Non-JSON response from %s (got %s)", url, resp.headers.get("content-type", "?")) return None if resp.status_code == 403: raise CookiesExpiredError("HTTP 403 — cookies likely expired") @@ -180,11 +173,7 @@ def fetch_page( delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) log.warning( "HTTP %d from %s, retry %d/%d in %.1fs", - resp.status_code, - url, - attempt + 1, - max_retries, - delay, + resp.status_code, url, attempt + 1, max_retries, delay, ) time.sleep(delay) continue @@ -197,11 +186,7 @@ def fetch_page( delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) log.warning( "%s from %s, retry %d/%d in %.1fs", - type(e).__name__, - url, - attempt + 1, - max_retries, - delay, + type(e).__name__, url, attempt + 1, max_retries, delay, ) time.sleep(delay) homecouk_errors_total.labels(type="retry_exhausted").inc() @@ -233,12 +218,7 @@ def map_property_type(raw_type: str | None) -> str: # Home.co.uk uses types like "House", "Flat", "Apartment", "Detached", etc. # Try common patterns lower = raw_type.lower() - if ( - "flat" in lower - or "apartment" in lower - or "maisonette" in lower - or "studio" in lower - ): + if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower: return "Flats/Maisonettes" if "detached" in lower and "semi" not in lower: return "Detached" @@ -251,9 +231,7 @@ def map_property_type(raw_type: str | None) -> str: def transform_property( - prop: dict, - channel: str, - pc_index: PostcodeSpatialIndex, + prop: dict, channel: str, pc_index: PostcodeSpatialIndex, ) -> dict | None: """Transform a raw home.co.uk property dict into our output schema.""" lat = prop.get("latitude") diff --git a/finder/http_client.py b/finder/http_client.py index 64be33b..ecc993f 100644 --- a/finder/http_client.py +++ b/finder/http_client.py @@ -11,9 +11,7 @@ from metrics import http_errors_total, http_requests_total, ip_rotations_total log = logging.getLogger("rightmove") -_ua = UserAgent( - browsers=["Chrome", "Edge"], os=["Windows", "Mac OS X"], min_version=120.0 -) +_ua = UserAgent(browsers=["Chrome", "Edge"], os=["Windows", "Mac OS X"], min_version=120.0) def _endpoint_label(url: str) -> str: @@ -29,7 +27,6 @@ def _status_label(code: int) -> str: return "5xx" return str(code) - # Gluetun control API — runs on port 8000 inside the gluetun container. # Since finder uses network_mode: service:gluetun, localhost IS gluetun. GLUETUN_API = "http://127.0.0.1:8000" @@ -45,25 +42,17 @@ def rotate_ip() -> bool: # Get current IP with httpx.Client(timeout=10) as ctl: old_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip") - old_ip = ( - old_ip_resp.json().get("public_ip", "unknown") - if old_ip_resp.status_code == 200 - else "unknown" - ) + old_ip = old_ip_resp.json().get("public_ip", "unknown") if old_ip_resp.status_code == 200 else "unknown" log.info("Current IP: %s", old_ip) # Trigger server change — PUT with empty JSON body picks a random server - resp = ctl.put( - f"{GLUETUN_API}/v1/vpn/status", json={"status": "stopped"} - ) + resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "stopped"}) if resp.status_code != 200: log.error("Failed to stop VPN: %d %s", resp.status_code, resp.text) return False time.sleep(2) - resp = ctl.put( - f"{GLUETUN_API}/v1/vpn/status", json={"status": "running"} - ) + resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "running"}) if resp.status_code != 200: log.error("Failed to start VPN: %d %s", resp.status_code, resp.text) return False @@ -110,9 +99,7 @@ def fetch_with_retry( for attempt in range(MAX_RETRIES): try: resp = client.get(url, params=params) - http_requests_total.labels( - status=_status_label(resp.status_code), endpoint=endpoint - ).inc() + http_requests_total.labels(status=_status_label(resp.status_code), endpoint=endpoint).inc() if resp.status_code == 200: return resp.json() if resp.status_code == 403 and on_403: @@ -124,34 +111,15 @@ def fetch_with_retry( return None if resp.status_code in (429, 500, 502, 503, 504): delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) - log.warning( - "HTTP %d from %s, retry %d/%d in %.1fs", - resp.status_code, - url, - attempt + 1, - MAX_RETRIES, - delay, - ) + log.warning("HTTP %d from %s, retry %d/%d in %.1fs", resp.status_code, url, attempt + 1, MAX_RETRIES, delay) time.sleep(delay) continue log.error("HTTP %d from %s (non-retryable)", resp.status_code, url) return None - except ( - httpx.ConnectError, - httpx.ReadTimeout, - httpx.WriteTimeout, - httpx.PoolTimeout, - ) as e: + except (httpx.ConnectError, httpx.ReadTimeout, httpx.WriteTimeout, httpx.PoolTimeout) as e: http_errors_total.labels(type=type(e).__name__).inc() delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) - log.warning( - "%s from %s, retry %d/%d in %.1fs", - type(e).__name__, - url, - attempt + 1, - MAX_RETRIES, - delay, - ) + log.warning("%s from %s, retry %d/%d in %.1fs", type(e).__name__, url, attempt + 1, MAX_RETRIES, delay) time.sleep(delay) http_errors_total.labels(type="retry_exhausted").inc() log.error("All %d retries exhausted for %s", MAX_RETRIES, url) diff --git a/finder/main.py b/finder/main.py index b68f824..666033b 100644 --- a/finder/main.py +++ b/finder/main.py @@ -7,15 +7,7 @@ from pathlib import Path from flask import Flask, Response, jsonify, send_from_directory from prometheus_client import generate_latest, CONTENT_TYPE_LATEST -from constants import ( - DATA_DIR, - RUN_ON_STARTUP, - SCHEDULE_HOUR, - SCRAPE_HOMECOUK, - SCRAPE_OPENRENT, - SCRAPE_RIGHTMOVE, - SCRAPE_ZOOPLA, -) +from constants import DATA_DIR, RUN_ON_STARTUP, SCHEDULE_HOUR, SCRAPE_HOMECOUK, SCRAPE_OPENRENT, SCRAPE_RIGHTMOVE from homecouk import load_cookies as load_homecouk_cookies from openrent import load_cookies as load_openrent_cookies from rightmove import outcode_cache @@ -49,16 +41,6 @@ log.setLevel(logging.DEBUG) logging.getLogger("httpx").setLevel(logging.WARNING) logging.getLogger("httpcore").setLevel(logging.WARNING) - -# Suppress noisy /metrics and /health request logs from werkzeug -class _NoiseFilter(logging.Filter): - def filter(self, record): - msg = record.getMessage() - return "GET /metrics" not in msg and "GET /health" not in msg - - -logging.getLogger("werkzeug").addFilter(_NoiseFilter()) - # --------------------------------------------------------------------------- # Startup: load data # --------------------------------------------------------------------------- @@ -66,15 +48,9 @@ logging.getLogger("werkzeug").addFilter(_NoiseFilter()) log.info("Loading arcgis data...") OUTCODES = load_outcodes() PC_INDEX = build_postcode_index() -PC_COORDS = build_postcode_coords() if (SCRAPE_OPENRENT or SCRAPE_ZOOPLA) else None -log.info( - "Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s, zoopla=%s)", - len(OUTCODES), - SCRAPE_RIGHTMOVE, - SCRAPE_HOMECOUK, - SCRAPE_OPENRENT, - SCRAPE_ZOOPLA, -) +PC_COORDS = build_postcode_coords() if SCRAPE_OPENRENT else None +log.info("Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s)", + len(OUTCODES), SCRAPE_RIGHTMOVE, SCRAPE_HOMECOUK, SCRAPE_OPENRENT) # --------------------------------------------------------------------------- # Scheduler @@ -87,9 +63,7 @@ def _start_scrape() -> bool: if status.state == "running": return False status.state = "running" - thread = threading.Thread( - target=run_scrape, args=(OUTCODES, PC_INDEX, PC_COORDS), daemon=True - ) + thread = threading.Thread(target=run_scrape, args=(OUTCODES, PC_INDEX, PC_COORDS), daemon=True) thread.start() return True @@ -108,9 +82,7 @@ def _scheduler_loop() -> None: log.info("Scheduler active — will run daily at %02d:00 UTC", SCHEDULE_HOUR) while True: wait = _seconds_until(SCHEDULE_HOUR) - log.info( - "Next scheduled scrape in %.0f seconds (%.1f hours)", wait, wait / 3600 - ) + log.info("Next scheduled scrape in %.0f seconds (%.1f hours)", wait, wait / 3600) time.sleep(wait) log.info("Scheduled scrape triggered") if not _start_scrape(): @@ -133,11 +105,6 @@ if SCHEDULE_HOUR >= 0: app = Flask(__name__) -@app.route("/health") -def health(): - return "ok", 200 - - @app.route("/run", methods=["POST"]) def trigger_run(): if _start_scrape(): @@ -164,7 +131,6 @@ def get_status(): "rightmove": status.rm_properties, "homecouk": status.hk_properties, "openrent": status.or_properties, - "zoopla": status.zp_properties, }, "errors": status.errors[-20:], # last 20 errors "elapsed_seconds": round(elapsed, 1), @@ -178,19 +144,15 @@ def get_status(): def get_debug(): hk_cookies = load_homecouk_cookies() if SCRAPE_HOMECOUK else None or_cookies = load_openrent_cookies() if SCRAPE_OPENRENT else None - return jsonify( - { - "outcode_cache_size": len(outcode_cache), - "outcode_cache_sample": dict(list(outcode_cache.items())[:20]), - "scrape_rightmove": SCRAPE_RIGHTMOVE, - "scrape_homecouk": SCRAPE_HOMECOUK, - "scrape_openrent": SCRAPE_OPENRENT, - "scrape_zoopla": SCRAPE_ZOOPLA, - "homecouk_cookies_available": hk_cookies is not None, - "openrent_cookies_available": or_cookies is not None, - "zoopla_note": "browser-based (Camoufox), no cookies needed", - } - ) + return jsonify({ + "outcode_cache_size": len(outcode_cache), + "outcode_cache_sample": dict(list(outcode_cache.items())[:20]), + "scrape_rightmove": SCRAPE_RIGHTMOVE, + "scrape_homecouk": SCRAPE_HOMECOUK, + "scrape_openrent": SCRAPE_OPENRENT, + "homecouk_cookies_available": hk_cookies is not None, + "openrent_cookies_available": or_cookies is not None, + }) @app.route("/metrics") diff --git a/finder/metrics.py b/finder/metrics.py index df8ae26..134cc7f 100644 --- a/finder/metrics.py +++ b/finder/metrics.py @@ -109,28 +109,6 @@ openrent_properties_scraped = Counter( ["channel"], ) -# --------------------------------------------------------------------------- -# Counters — Zoopla -# --------------------------------------------------------------------------- - -zoopla_pages_scraped = Counter( - "zoopla_pages_scraped", - "Search result pages scraped from Zoopla", - ["channel"], -) - -zoopla_errors_total = Counter( - "zoopla_errors_total", - "Zoopla scraping errors", - ["type"], -) - -zoopla_properties_scraped = Counter( - "zoopla_properties_scraped", - "Properties scraped from Zoopla (before dedup)", - ["channel"], -) - # --------------------------------------------------------------------------- # Counters — FlareSolverr / cookie management # --------------------------------------------------------------------------- @@ -160,8 +138,3 @@ openrent_enabled = Gauge( "openrent_enabled", "Whether OpenRent scraping is currently active (1=yes, 0=no)", ) - -zoopla_enabled = Gauge( - "zoopla_enabled", - "Whether Zoopla scraping is currently active (1=yes, 0=no)", -) diff --git a/finder/openrent.py b/finder/openrent.py index c96dd44..d66d0f6 100644 --- a/finder/openrent.py +++ b/finder/openrent.py @@ -79,8 +79,7 @@ def solve_waf() -> tuple[dict[str, str], str] | None: if "AwsWafIntegration" in content: log.info("Got WAF challenge page, waiting for resolution...") page.wait_for_selector( - "a.pli, .pli, .search-property-card", - timeout=30000, + "a.pli, .pli, .search-property-card", timeout=30000, ) raw_cookies = context.cookies() @@ -95,8 +94,7 @@ def solve_waf() -> tuple[dict[str, str], str] | None: log.info( "AWS WAF solved — got %d cookies, UA: %s", - len(cookies), - user_agent[:60], + len(cookies), user_agent[:60], ) flaresolverr_attempts_total.labels(result="success").inc() return cookies, user_agent @@ -132,13 +130,11 @@ def make_client(cookies: dict[str, str], user_agent: str) -> Session: """Create a curl_cffi Session configured for OpenRent. Uses Chrome TLS impersonation so AWS WAF cookies remain valid.""" session = Session(impersonate="chrome") - session.headers.update( - { - "User-Agent": user_agent, - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Accept-Language": "en-GB,en;q=0.9", - } - ) + session.headers.update({ + "User-Agent": user_agent, + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-GB,en;q=0.9", + }) for name, value in cookies.items(): session.cookies.set(name, value, domain="openrent.co.uk") return session @@ -156,9 +152,7 @@ def _status_label(code: int) -> str: def fetch_page( - client: Session, - url: str, - max_retries: int = 3, + client: Session, url: str, max_retries: int = 3, ) -> str | None: """GET HTML with retries on 429/5xx. Returns None on permanent failure. WAF challenge (202 or 403 with challenge JS) raises WafChallengeError.""" @@ -171,25 +165,17 @@ def fetch_page( html = resp.text # Detect WAF challenge page masquerading as 200 if "AwsWafIntegration" in html and "challenge.js" in html: - raise WafChallengeError( - "Got AWS WAF challenge page — cookies expired" - ) + raise WafChallengeError("Got AWS WAF challenge page — cookies expired") return html if resp.status_code in (202, 403): - raise WafChallengeError( - f"HTTP {resp.status_code} — cookies likely expired" - ) + raise WafChallengeError(f"HTTP {resp.status_code} — cookies likely expired") if resp.status_code in (429, 500, 502, 503, 504): - delay = RETRY_BASE_DELAY * (2**attempt) + delay = RETRY_BASE_DELAY * (2 ** attempt) log.warning( "HTTP %d from %s, retry %d/%d in %.1fs", - resp.status_code, - url, - attempt + 1, - max_retries, - delay, + resp.status_code, url, attempt + 1, max_retries, delay, ) time.sleep(delay) continue @@ -201,14 +187,10 @@ def fetch_page( raise except RequestsError as e: openrent_errors_total.labels(type=type(e).__name__).inc() - delay = RETRY_BASE_DELAY * (2**attempt) + delay = RETRY_BASE_DELAY * (2 ** attempt) log.warning( "%s from %s, retry %d/%d in %.1fs", - type(e).__name__, - url, - attempt + 1, - max_retries, - delay, + type(e).__name__, url, attempt + 1, max_retries, delay, ) time.sleep(delay) @@ -265,9 +247,7 @@ def _extract_bedrooms_from_title(title: str) -> int | None: return None -def _extract_beds_baths_from_features( - feature_items: list, -) -> tuple[int | None, int | None]: +def _extract_beds_baths_from_features(feature_items: list) -> tuple[int | None, int | None]: """Extract bedrooms and bathrooms from feature list items. OpenRent search cards have