This commit is contained in:
Andras Schmelczer 2026-03-15 17:38:26 +00:00
parent 80c093b7ba
commit f72c43a9fa
101 changed files with 2168 additions and 1177 deletions

View file

@ -20,7 +20,6 @@
"import numpy as np\n",
"import plotly.express as px\n",
"import plotly.graph_objects as go\n",
"from plotly.subplots import make_subplots\n",
"\n",
"pl.Config.set_tbl_rows(20)\n",
"pl.Config.set_fmt_str_lengths(50)\n",

View file

@ -48,7 +48,6 @@
"import polars as pl\n",
"import plotly.express as px\n",
"import plotly.graph_objects as go\n",
"from plotly.subplots import make_subplots\n",
"\n",
"pl.Config.set_tbl_rows(20)\n",
"pl.Config.set_fmt_str_lengths(80)\n",
@ -265,7 +264,7 @@
],
"source": [
"price = clean[\"price\"]\n",
"print(f\"Price stats:\")\n",
"print(\"Price stats:\")\n",
"print(f\" Min: £{price.min():,}\")\n",
"print(f\" P5: £{price.quantile(0.05):,.0f}\")\n",
"print(f\" P25: £{price.quantile(0.25):,.0f}\")\n",
@ -590891,7 +590890,7 @@
")\n",
"\n",
"age = with_age[\"days_on_market\"].drop_nulls()\n",
"print(f\"Days on market stats:\")\n",
"print(\"Days on market stats:\")\n",
"print(f\" Median: {age.median():.0f} days\")\n",
"print(f\" Mean: {age.mean():.0f} days\")\n",
"print(f\" P25: {age.quantile(0.25):.0f} days\")\n",
@ -594786,20 +594785,20 @@
}
],
"source": [
"print(f\"=== Rightmove Buy Listings Summary ===\")\n",
"print(\"=== Rightmove Buy Listings Summary ===\")\n",
"print(f\"Total listings: {len(clean):,}\")\n",
"print(f\"Outcodes covered: {clean['outcode'].n_unique():,}\")\n",
"print(f\"\")\n",
"print(\"\")\n",
"print(f\"Price: median £{clean['price'].median():,.0f}, mean £{clean['price'].mean():,.0f}\")\n",
"print(f\"Bedrooms: median {clean['bedrooms'].median():.0f}, mean {clean['bedrooms'].mean():.1f}\")\n",
"print(f\"\")\n",
"print(\"\")\n",
"print(f\"Tenure known: {(len(clean) - clean['tenure'].null_count())/len(clean)*100:.1f}%\")\n",
"print(f\" Freehold: {len(clean.filter(pl.col('tenure') == 'Freehold')):,}\")\n",
"print(f\" Leasehold: {len(clean.filter(pl.col('tenure') == 'Leasehold')):,}\")\n",
"print(f\"\")\n",
"print(\"\")\n",
"print(f\"Floorspace available: {clean['floorspace_sqm'].drop_nulls().len()/len(clean)*100:.1f}%\")\n",
"print(f\"\")\n",
"print(f\"Property types:\")\n",
"print(\"\")\n",
"print(\"Property types:\")\n",
"for row in clean['property_type'].value_counts().sort('count', descending=True).iter_rows():\n",
" print(f\" {row[0]}: {row[1]:,} ({row[1]/len(clean)*100:.1f}%)\")"
]

View file

@ -128,7 +128,7 @@
"CROSS_DEDUP_BUY = 2_220\n",
"hk_buy_total = len(buy.filter(pl.col(\"source\") == \"Home.co.uk\")) + CROSS_DEDUP_BUY\n",
"hk_buy_unique = len(buy.filter(pl.col(\"source\") == \"Home.co.uk\"))\n",
"print(f\"\\n--- BUY overlap analysis ---\")\n",
"print(\"\\n--- BUY overlap analysis ---\")\n",
"print(f\"Home.co.uk scraped (before dedup): {hk_buy_total:,}\")\n",
"print(f\"Home.co.uk unique (after dedup): {hk_buy_unique:,}\")\n",
"print(f\"Cross-source duplicates removed: {CROSS_DEDUP_BUY:,}\")\n",
@ -1114,7 +1114,7 @@
"\n",
"print(f\"Outcodes with home.co.uk listings: {len(oc_comparison)}\")\n",
"print(f\"Total outcodes in dataset: {buy_oc['outcode'].drop_nulls().n_unique()}\")\n",
"print(f\"\\nHome.co.uk coverage by outcode:\")\n",
"print(\"\\nHome.co.uk coverage by outcode:\")\n",
"oc_comparison"
]
},
@ -7315,7 +7315,7 @@
"print(f\"Covered outcodes: {covered_count}\")\n",
"print(f\"Total outcodes: {total_outcodes}\")\n",
"print()\n",
"print(f\"In covered outcodes:\")\n",
"print(\"In covered outcodes:\")\n",
"print(f\" Rightmove: {rm_in_covered:,} listings\")\n",
"print(f\" Home.co.uk: {hk_buy_unique:,} unique listings\")\n",
"print(f\" HK/RM ratio: {ratio_in_covered:.2f}\")\n",
@ -7326,13 +7326,13 @@
"projected_dedup = int(projected_hk * CROSS_DEDUP_BUY / hk_buy_total)\n",
"projected_unique = projected_hk - projected_dedup\n",
"\n",
"print(f\"--- Projected full-coverage estimates ---\")\n",
"print(\"--- Projected full-coverage estimates ---\")\n",
"print(f\" Projected home.co.uk total: ~{projected_hk:,}\")\n",
"print(f\" Projected cross-dedup: ~{projected_dedup:,}\")\n",
"print(f\" Projected unique additions: ~{projected_unique:,}\")\n",
"print(f\" Projected merged dataset: ~{rm_buy + projected_unique:,} ({projected_unique/rm_buy*100:.1f}% increase)\")\n",
"print()\n",
"print(f\"⚠️ These are rough estimates — the covered outcodes may not be representative\")"
"print(\"⚠️ These are rough estimates — the covered outcodes may not be representative\")"
]
},
{