This commit is contained in:
Andras Schmelczer 2026-03-15 21:22:28 +00:00
parent 479ef92236
commit c38d654ac7
44 changed files with 2526 additions and 701 deletions

View file

@ -52,6 +52,7 @@
"buy = pl.read_parquet(f\"{DATA}/online_listings_buy.parquet\")\n",
"rent = pl.read_parquet(f\"{DATA}/online_listings_rent.parquet\")\n",
"\n",
"\n",
"def tag_source(df: pl.DataFrame) -> pl.DataFrame:\n",
" return df.with_columns(\n",
" pl.when(pl.col(\"Listing URL\").str.contains(\"rightmove\"))\n",
@ -62,6 +63,7 @@
" .alias(\"source\")\n",
" )\n",
"\n",
"\n",
"buy = tag_source(buy)\n",
"rent = tag_source(rent)\n",
"\n",
@ -122,7 +124,7 @@
" print(f\"\\n=== {label} ===\")\n",
" for row in counts.iter_rows():\n",
" src, cnt = row\n",
" print(f\" {src}: {cnt:,} ({cnt/len(df)*100:.1f}%)\")\n",
" print(f\" {src}: {cnt:,} ({cnt / len(df) * 100:.1f}%)\")\n",
"\n",
"# Known dedup count from scraper logs\n",
"CROSS_DEDUP_BUY = 2_220\n",
@ -132,7 +134,7 @@
"print(f\"Home.co.uk scraped (before dedup): {hk_buy_total:,}\")\n",
"print(f\"Home.co.uk unique (after dedup): {hk_buy_unique:,}\")\n",
"print(f\"Cross-source duplicates removed: {CROSS_DEDUP_BUY:,}\")\n",
"print(f\"Overlap rate: {CROSS_DEDUP_BUY/hk_buy_total*100:.1f}%\")"
"print(f\"Overlap rate: {CROSS_DEDUP_BUY / hk_buy_total * 100:.1f}%\")"
]
},
{
@ -987,23 +989,29 @@
"# Venn-style summary\n",
"rm_buy = len(buy.filter(pl.col(\"source\") == \"Rightmove\"))\n",
"\n",
"fig = go.Figure(go.Sankey(\n",
" node=dict(\n",
" label=[\n",
" f\"Rightmove\\n{rm_buy:,}\",\n",
" f\"Home.co.uk\\n{hk_buy_total:,} scraped\",\n",
" f\"Merged BUY\\n{len(buy):,}\",\n",
" f\"Deduped\\n{CROSS_DEDUP_BUY:,}\",\n",
" ],\n",
" color=[\"#2563eb\", \"#10b981\", \"#6366f1\", \"#ef4444\"],\n",
" ),\n",
" link=dict(\n",
" source=[0, 1, 1],\n",
" target=[2, 2, 3],\n",
" value=[rm_buy, hk_buy_unique, CROSS_DEDUP_BUY],\n",
" color=[\"rgba(37,99,235,0.3)\", \"rgba(16,185,129,0.3)\", \"rgba(239,68,68,0.3)\"],\n",
" ),\n",
"))\n",
"fig = go.Figure(\n",
" go.Sankey(\n",
" node=dict(\n",
" label=[\n",
" f\"Rightmove\\n{rm_buy:,}\",\n",
" f\"Home.co.uk\\n{hk_buy_total:,} scraped\",\n",
" f\"Merged BUY\\n{len(buy):,}\",\n",
" f\"Deduped\\n{CROSS_DEDUP_BUY:,}\",\n",
" ],\n",
" color=[\"#2563eb\", \"#10b981\", \"#6366f1\", \"#ef4444\"],\n",
" ),\n",
" link=dict(\n",
" source=[0, 1, 1],\n",
" target=[2, 2, 3],\n",
" value=[rm_buy, hk_buy_unique, CROSS_DEDUP_BUY],\n",
" color=[\n",
" \"rgba(37,99,235,0.3)\",\n",
" \"rgba(16,185,129,0.3)\",\n",
" \"rgba(239,68,68,0.3)\",\n",
" ],\n",
" ),\n",
" )\n",
")\n",
"fig.update_layout(title=\"BUY Channel: Source Contribution Flow\", height=350)\n",
"fig.show()"
]
@ -1106,8 +1114,11 @@
"oc_comparison = (\n",
" hk_by_oc.join(rm_by_oc, on=\"outcode\", how=\"left\")\n",
" .with_columns(\n",
" (pl.col(\"hk_count\") / (pl.col(\"hk_count\") + pl.col(\"rm_count\").fill_null(0)) * 100)\n",
" .alias(\"hk_pct_of_total\")\n",
" (\n",
" pl.col(\"hk_count\")\n",
" / (pl.col(\"hk_count\") + pl.col(\"rm_count\").fill_null(0))\n",
" * 100\n",
" ).alias(\"hk_pct_of_total\")\n",
" )\n",
" .sort(\"hk_count\", descending=True)\n",
")\n",
@ -2215,18 +2226,28 @@
"source": [
"# Bar chart: home.co.uk vs Rightmove counts per outcode\n",
"fig = go.Figure()\n",
"fig.add_trace(go.Bar(\n",
" x=oc_comparison[\"outcode\"], y=oc_comparison[\"rm_count\"],\n",
" name=\"Rightmove\", marker_color=\"#2563eb\",\n",
"))\n",
"fig.add_trace(go.Bar(\n",
" x=oc_comparison[\"outcode\"], y=oc_comparison[\"hk_count\"],\n",
" name=\"Home.co.uk\", marker_color=\"#10b981\",\n",
"))\n",
"fig.add_trace(\n",
" go.Bar(\n",
" x=oc_comparison[\"outcode\"],\n",
" y=oc_comparison[\"rm_count\"],\n",
" name=\"Rightmove\",\n",
" marker_color=\"#2563eb\",\n",
" )\n",
")\n",
"fig.add_trace(\n",
" go.Bar(\n",
" x=oc_comparison[\"outcode\"],\n",
" y=oc_comparison[\"hk_count\"],\n",
" name=\"Home.co.uk\",\n",
" marker_color=\"#10b981\",\n",
" )\n",
")\n",
"fig.update_layout(\n",
" barmode=\"group\", height=400,\n",
" barmode=\"group\",\n",
" height=400,\n",
" title=\"Listings per Outcode: Rightmove vs Home.co.uk (outcodes with HK coverage)\",\n",
" xaxis_title=\"Outcode\", yaxis_title=\"Listings\",\n",
" xaxis_title=\"Outcode\",\n",
" yaxis_title=\"Listings\",\n",
")\n",
"fig.show()"
]
@ -3121,10 +3142,14 @@
"sample = covered.sample(n=min(30_000, len(covered)), seed=42)\n",
"\n",
"fig = px.scatter_map(\n",
" sample.to_pandas(), lat=\"lat\", lon=\"lon\",\n",
" sample.to_pandas(),\n",
" lat=\"lat\",\n",
" lon=\"lon\",\n",
" color=\"source\",\n",
" color_discrete_map={\"Rightmove\": \"#2563eb\", \"Home.co.uk\": \"#10b981\"},\n",
" zoom=7, opacity=0.4, size_max=4,\n",
" zoom=7,\n",
" opacity=0.4,\n",
" size_max=4,\n",
" title=\"Listing Locations in Covered Outcodes (by source)\",\n",
")\n",
"fig.update_layout(height=600)\n",
@ -3188,15 +3213,41 @@
"# For covered outcodes, compare home.co.uk listings against Rightmove\n",
"# to find near-matches (same postcode, same beds, price within 5%)\n",
"\n",
"hk = buy_oc.filter(pl.col(\"source\") == \"Home.co.uk\").select(\n",
" \"Postcode\", \"Bedrooms\", \"Asking price\", \"Property type\", \"Address per Property Register\"\n",
").rename({\"Asking price\": \"hk_price\", \"Property type\": \"hk_type\", \"Address per Property Register\": \"hk_addr\"})\n",
"hk = (\n",
" buy_oc.filter(pl.col(\"source\") == \"Home.co.uk\")\n",
" .select(\n",
" \"Postcode\",\n",
" \"Bedrooms\",\n",
" \"Asking price\",\n",
" \"Property type\",\n",
" \"Address per Property Register\",\n",
" )\n",
" .rename(\n",
" {\n",
" \"Asking price\": \"hk_price\",\n",
" \"Property type\": \"hk_type\",\n",
" \"Address per Property Register\": \"hk_addr\",\n",
" }\n",
" )\n",
")\n",
"\n",
"rm = buy_oc.filter(\n",
" pl.col(\"source\") == \"Rightmove\"\n",
").select(\n",
" \"Postcode\", \"Bedrooms\", \"Asking price\", \"Property type\", \"Address per Property Register\"\n",
").rename({\"Asking price\": \"rm_price\", \"Property type\": \"rm_type\", \"Address per Property Register\": \"rm_addr\"})\n",
"rm = (\n",
" buy_oc.filter(pl.col(\"source\") == \"Rightmove\")\n",
" .select(\n",
" \"Postcode\",\n",
" \"Bedrooms\",\n",
" \"Asking price\",\n",
" \"Property type\",\n",
" \"Address per Property Register\",\n",
" )\n",
" .rename(\n",
" {\n",
" \"Asking price\": \"rm_price\",\n",
" \"Property type\": \"rm_type\",\n",
" \"Address per Property Register\": \"rm_addr\",\n",
" }\n",
" )\n",
")\n",
"\n",
"# Join on postcode + bedrooms\n",
"joined = hk.join(rm, on=[\"Postcode\", \"Bedrooms\"], how=\"inner\")\n",
@ -3213,16 +3264,24 @@
"exact = joined.filter(pl.col(\"hk_price\") == pl.col(\"rm_price\"))\n",
"\n",
"print(f\"Home.co.uk listings (unique, in file): {len(hk):,}\")\n",
"print(f\"Rightmove listings in covered outcodes: {len(rm.filter(pl.col('Postcode').is_in(hk['Postcode']))):,}\")\n",
"print(\n",
" f\"Rightmove listings in covered outcodes: {len(rm.filter(pl.col('Postcode').is_in(hk['Postcode']))):,}\"\n",
")\n",
"print()\n",
"print(f\"Joined on (postcode, bedrooms): {len(joined):,} candidate pairs\")\n",
"print(f\" Exact price match: {len(exact):,} pairs (likely same property, different beds or already deduped)\")\n",
"print(f\" Price within 5%: {len(near):,} pairs (probable duplicates with price rounding)\")\n",
"print(\n",
" f\" Exact price match: {len(exact):,} pairs (likely same property, different beds or already deduped)\"\n",
")\n",
"print(\n",
" f\" Price within 5%: {len(near):,} pairs (probable duplicates with price rounding)\"\n",
")\n",
"print()\n",
"# Unique hk listings that have at least one near-match\n",
"hk_with_near = near.select(\"hk_price\", \"hk_addr\", \"Postcode\").unique()\n",
"print(f\"Home.co.uk listings with a near-match in RM: ~{len(hk_with_near):,}\")\n",
"print(f\"Estimated additional overlap: ~{len(hk_with_near)/len(hk)*100:.1f}% of unique HK listings\")"
"print(\n",
" f\"Estimated additional overlap: ~{len(hk_with_near) / len(hk) * 100:.1f}% of unique HK listings\"\n",
")"
]
},
{
@ -4178,9 +4237,13 @@
")\n",
"\n",
"fig = px.histogram(\n",
" clipped.to_pandas(), x=\"Asking price\", color=\"source\", nbins=80,\n",
" clipped.to_pandas(),\n",
" x=\"Asking price\",\n",
" color=\"source\",\n",
" nbins=80,\n",
" color_discrete_map={\"Rightmove\": \"#2563eb\", \"Home.co.uk\": \"#10b981\"},\n",
" barmode=\"overlay\", histnorm=\"probability density\",\n",
" barmode=\"overlay\",\n",
" histnorm=\"probability density\",\n",
" title=\"Price Distribution by Source (normalised, £50k£2M)\",\n",
")\n",
"fig.update_traces(opacity=0.6)\n",
@ -5095,10 +5158,7 @@
],
"source": [
"# Property type distribution by source\n",
"type_by_src = (\n",
" buy.group_by(\"source\", \"Property type\")\n",
" .agg(pl.len().alias(\"count\"))\n",
")\n",
"type_by_src = buy.group_by(\"source\", \"Property type\").agg(pl.len().alias(\"count\"))\n",
"# Normalise within each source\n",
"totals = type_by_src.group_by(\"source\").agg(pl.col(\"count\").sum().alias(\"total\"))\n",
"type_by_src = type_by_src.join(totals, on=\"source\").with_columns(\n",
@ -5107,7 +5167,10 @@
"\n",
"fig = px.bar(\n",
" type_by_src.sort(\"Property type\").to_pandas(),\n",
" x=\"Property type\", y=\"pct\", color=\"source\", barmode=\"group\",\n",
" x=\"Property type\",\n",
" y=\"pct\",\n",
" color=\"source\",\n",
" barmode=\"group\",\n",
" color_discrete_map={\"Rightmove\": \"#2563eb\", \"Home.co.uk\": \"#10b981\"},\n",
" title=\"Property Type Distribution by Source (%)\",\n",
")\n",
@ -5186,7 +5249,9 @@
"# Property sub-type comparison — top home.co.uk sub-types\n",
"hk_subtypes = (\n",
" buy.filter(pl.col(\"source\") == \"Home.co.uk\")[\"Property sub-type\"]\n",
" .value_counts().sort(\"count\", descending=True).head(20)\n",
" .value_counts()\n",
" .sort(\"count\", descending=True)\n",
" .head(20)\n",
")\n",
"print(\"Top 20 Home.co.uk property sub-types:\")\n",
"hk_subtypes"
@ -5263,9 +5328,16 @@
"source": [
"# Field completeness by source\n",
"fields = [\n",
" \"Bedrooms\", \"Bathrooms\", \"Postcode\", \"Address per Property Register\",\n",
" \"Leasehold/Freehold\", \"Property type\", \"Total floor area (sqm)\",\n",
" \"Listing date\", \"Asking price\", \"Price qualifier\",\n",
" \"Bedrooms\",\n",
" \"Bathrooms\",\n",
" \"Postcode\",\n",
" \"Address per Property Register\",\n",
" \"Leasehold/Freehold\",\n",
" \"Property type\",\n",
" \"Total floor area (sqm)\",\n",
" \"Listing date\",\n",
" \"Asking price\",\n",
" \"Price qualifier\",\n",
"]\n",
"\n",
"rows = []\n",
@ -5276,17 +5348,19 @@
" non_null = n - subset[f].null_count()\n",
" # Also count empty strings as missing for string fields\n",
" if subset[f].dtype == pl.Utf8:\n",
" non_null = len(subset.filter(\n",
" pl.col(f).is_not_null() & (pl.col(f).str.len_chars() > 0)\n",
" ))\n",
" non_null = len(\n",
" subset.filter(pl.col(f).is_not_null() & (pl.col(f).str.len_chars() > 0))\n",
" )\n",
" rows.append({\"source\": src, \"field\": f, \"pct_available\": non_null / n * 100})\n",
"\n",
"completeness = pl.DataFrame(rows)\n",
"pivot = completeness.pivot(on=\"source\", index=\"field\", values=\"pct_available\")\n",
"pivot = pivot.with_columns([\n",
" pl.col(\"Rightmove\").round(1),\n",
" pl.col(\"Home.co.uk\").round(1),\n",
"])\n",
"pivot = pivot.with_columns(\n",
" [\n",
" pl.col(\"Rightmove\").round(1),\n",
" pl.col(\"Home.co.uk\").round(1),\n",
" ]\n",
")\n",
"print(\"Field completeness (% non-null/non-empty):\")\n",
"pivot"
]
@ -6198,19 +6272,26 @@
"# Bedroom distribution comparison\n",
"fig = make_subplots(rows=1, cols=2, subplot_titles=(\"Rightmove\", \"Home.co.uk\"))\n",
"for i, src in enumerate([\"Rightmove\", \"Home.co.uk\"], 1):\n",
" beds = buy.filter(\n",
" (pl.col(\"source\") == src) & (pl.col(\"Bedrooms\") <= 8)\n",
" )[\"Bedrooms\"].value_counts().sort(\"Bedrooms\")\n",
" beds = (\n",
" buy.filter((pl.col(\"source\") == src) & (pl.col(\"Bedrooms\") <= 8))[\"Bedrooms\"]\n",
" .value_counts()\n",
" .sort(\"Bedrooms\")\n",
" )\n",
" # Normalise\n",
" total = beds[\"count\"].sum()\n",
" fig.add_trace(\n",
" go.Bar(\n",
" x=beds[\"Bedrooms\"], y=beds[\"count\"] / total * 100,\n",
" x=beds[\"Bedrooms\"],\n",
" y=beds[\"count\"] / total * 100,\n",
" name=src,\n",
" marker_color=\"#2563eb\" if src == \"Rightmove\" else \"#10b981\",\n",
" ), row=1, col=i,\n",
" ),\n",
" row=1,\n",
" col=i,\n",
" )\n",
"fig.update_layout(height=350, title=\"Bedroom Distribution by Source (%)\", showlegend=False)\n",
"fig.update_layout(\n",
" height=350, title=\"Bedroom Distribution by Source (%)\", showlegend=False\n",
")\n",
"fig.update_yaxes(title_text=\"%\", row=1, col=1)\n",
"fig.show()"
]
@ -6287,17 +6368,23 @@
"\n",
"comparison_rows = []\n",
"for ptype in [\"Detached\", \"Semi-Detached\", \"Terraced\", \"Flats/Maisonettes\", \"Other\"]:\n",
" rm_p = rm_covered.filter(pl.col(\"Property type\") == ptype)[\"Asking price\"].drop_nulls()\n",
" rm_p = rm_covered.filter(pl.col(\"Property type\") == ptype)[\n",
" \"Asking price\"\n",
" ].drop_nulls()\n",
" hk_p = hk_only.filter(pl.col(\"Property type\") == ptype)[\"Asking price\"].drop_nulls()\n",
" if len(rm_p) > 0 and len(hk_p) > 0:\n",
" comparison_rows.append({\n",
" \"Property type\": ptype,\n",
" \"RM count\": len(rm_p),\n",
" \"RM median £\": int(rm_p.median()),\n",
" \"HK count\": len(hk_p),\n",
" \"HK median £\": int(hk_p.median()),\n",
" \"HK premium %\": round((hk_p.median() - rm_p.median()) / rm_p.median() * 100, 1),\n",
" })\n",
" comparison_rows.append(\n",
" {\n",
" \"Property type\": ptype,\n",
" \"RM count\": len(rm_p),\n",
" \"RM median £\": int(rm_p.median()),\n",
" \"HK count\": len(hk_p),\n",
" \"HK median £\": int(hk_p.median()),\n",
" \"HK premium %\": round(\n",
" (hk_p.median() - rm_p.median()) / rm_p.median() * 100, 1\n",
" ),\n",
" }\n",
" )\n",
"\n",
"comp = pl.DataFrame(comparison_rows)\n",
"print(\"Price comparison in covered outcodes (Home.co.uk unique listings vs Rightmove):\")\n",
@ -7245,9 +7332,13 @@
"# Listing age histogram comparison\n",
"age_plot = with_age.filter(pl.col(\"days_on_market\") <= 730) # cap at 2 years\n",
"fig = px.histogram(\n",
" age_plot.to_pandas(), x=\"days_on_market\", color=\"source\", nbins=60,\n",
" age_plot.to_pandas(),\n",
" x=\"days_on_market\",\n",
" color=\"source\",\n",
" nbins=60,\n",
" color_discrete_map={\"Rightmove\": \"#2563eb\", \"Home.co.uk\": \"#10b981\"},\n",
" barmode=\"overlay\", histnorm=\"probability density\",\n",
" barmode=\"overlay\",\n",
" histnorm=\"probability density\",\n",
" title=\"Days on Market Distribution by Source (normalised, capped at 2 years)\",\n",
")\n",
"fig.update_traces(opacity=0.6)\n",
@ -7330,7 +7421,9 @@
"print(f\" Projected home.co.uk total: ~{projected_hk:,}\")\n",
"print(f\" Projected cross-dedup: ~{projected_dedup:,}\")\n",
"print(f\" Projected unique additions: ~{projected_unique:,}\")\n",
"print(f\" Projected merged dataset: ~{rm_buy + projected_unique:,} ({projected_unique/rm_buy*100:.1f}% increase)\")\n",
"print(\n",
" f\" Projected merged dataset: ~{rm_buy + projected_unique:,} ({projected_unique / rm_buy * 100:.1f}% increase)\"\n",
")\n",
"print()\n",
"print(\"⚠️ These are rough estimates — the covered outcodes may not be representative\")"
]