Fmt
This commit is contained in:
parent
479ef92236
commit
c38d654ac7
44 changed files with 2526 additions and 701 deletions
|
|
@ -52,7 +52,9 @@
|
|||
"pl.Config.set_tbl_rows(20)\n",
|
||||
"pl.Config.set_fmt_str_lengths(80)\n",
|
||||
"\n",
|
||||
"df = pl.read_parquet(\"/volumes/syncthing/Projects/property-map/property-data/rightmove_buy.parquet\")\n",
|
||||
"df = pl.read_parquet(\n",
|
||||
" \"/volumes/syncthing/Projects/property-map/property-data/rightmove_buy.parquet\"\n",
|
||||
")\n",
|
||||
"schema = df.schema\n",
|
||||
"print(f\"Total rows: {len(df):,}\")\n",
|
||||
"print(f\"Columns ({len(schema)}):\")\n",
|
||||
|
|
@ -150,11 +152,13 @@
|
|||
],
|
||||
"source": [
|
||||
"# Null counts\n",
|
||||
"null_df = pl.DataFrame({\n",
|
||||
" \"column\": df.columns,\n",
|
||||
" \"nulls\": [df[c].null_count() for c in df.columns],\n",
|
||||
" \"pct\": [f\"{df[c].null_count()/len(df)*100:.1f}%\" for c in df.columns],\n",
|
||||
"})\n",
|
||||
"null_df = pl.DataFrame(\n",
|
||||
" {\n",
|
||||
" \"column\": df.columns,\n",
|
||||
" \"nulls\": [df[c].null_count() for c in df.columns],\n",
|
||||
" \"pct\": [f\"{df[c].null_count() / len(df) * 100:.1f}%\" for c in df.columns],\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"null_df.filter(pl.col(\"nulls\") > 0)"
|
||||
]
|
||||
},
|
||||
|
|
@ -197,13 +201,17 @@
|
|||
" \"price = 0\": len(df.filter(pl.col(\"price\") == 0)),\n",
|
||||
" \"price > 50M\": len(df.filter(pl.col(\"price\") > 50_000_000)),\n",
|
||||
" \"floorspace > 10,000 sqm\": len(df.filter(pl.col(\"floorspace_sqm\") > 10_000)),\n",
|
||||
" \"latitude outside UK (< 49 or > 61)\": len(df.filter((pl.col(\"latitude\") < 49) | (pl.col(\"latitude\") > 61))),\n",
|
||||
" \"longitude outside UK (< -8 or > 2)\": len(df.filter((pl.col(\"longitude\") < -8) | (pl.col(\"longitude\") > 2))),\n",
|
||||
" \"latitude outside UK (< 49 or > 61)\": len(\n",
|
||||
" df.filter((pl.col(\"latitude\") < 49) | (pl.col(\"latitude\") > 61))\n",
|
||||
" ),\n",
|
||||
" \"longitude outside UK (< -8 or > 2)\": len(\n",
|
||||
" df.filter((pl.col(\"longitude\") < -8) | (pl.col(\"longitude\") > 2))\n",
|
||||
" ),\n",
|
||||
" \"house_share = true\": len(df.filter(pl.col(\"house_share\"))),\n",
|
||||
"}\n",
|
||||
"print(\"Data quality issues:\")\n",
|
||||
"for desc, count in issues.items():\n",
|
||||
" print(f\" {desc}: {count:,} ({count/len(df)*100:.2f}%)\")"
|
||||
" print(f\" {desc}: {count:,} ({count / len(df) * 100:.2f}%)\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -230,7 +238,7 @@
|
|||
" & (pl.col(\"longitude\") >= -8)\n",
|
||||
" & (pl.col(\"longitude\") <= 2)\n",
|
||||
")\n",
|
||||
"print(f\"Clean rows: {len(clean):,} ({len(clean)/len(df)*100:.1f}% of original)\")"
|
||||
"print(f\"Clean rows: {len(clean):,} ({len(clean) / len(df) * 100:.1f}% of original)\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -1126,8 +1134,12 @@
|
|||
"# Price histogram (clipped to 2nd-98th percentile)\n",
|
||||
"lo, hi = price.quantile(0.02), price.quantile(0.98)\n",
|
||||
"clipped = clean.filter((pl.col(\"price\") >= lo) & (pl.col(\"price\") <= hi))\n",
|
||||
"fig = px.histogram(clipped.to_pandas(), x=\"price\", nbins=80,\n",
|
||||
" title=f\"Asking Price Distribution (£{lo:,.0f} - £{hi:,.0f}, 2nd-98th pctl)\")\n",
|
||||
"fig = px.histogram(\n",
|
||||
" clipped.to_pandas(),\n",
|
||||
" x=\"price\",\n",
|
||||
" nbins=80,\n",
|
||||
" title=f\"Asking Price Distribution (£{lo:,.0f} - £{hi:,.0f}, 2nd-98th pctl)\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=400, xaxis_title=\"Asking Price (£)\", yaxis_title=\"Count\")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -439978,9 +439990,13 @@
|
|||
],
|
||||
"source": [
|
||||
"# Price by property type\n",
|
||||
"fig = px.box(clean.filter(pl.col(\"price\") <= 2_000_000).to_pandas(),\n",
|
||||
" x=\"property_type\", y=\"price\", color=\"property_type\",\n",
|
||||
" title=\"Price by Property Type (capped at £2M for readability)\")\n",
|
||||
"fig = px.box(\n",
|
||||
" clean.filter(pl.col(\"price\") <= 2_000_000).to_pandas(),\n",
|
||||
" x=\"property_type\",\n",
|
||||
" y=\"price\",\n",
|
||||
" color=\"property_type\",\n",
|
||||
" title=\"Price by Property Type (capped at £2M for readability)\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=500, showlegend=False, yaxis_title=\"Price (£)\")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -440079,9 +440095,7 @@
|
|||
"source": [
|
||||
"# Price qualifier breakdown\n",
|
||||
"pq = clean[\"price_qualifier\"].value_counts().sort(\"count\", descending=True)\n",
|
||||
"pq = pq.with_columns(\n",
|
||||
" (pl.col(\"count\") / pl.col(\"count\").sum() * 100).alias(\"pct\")\n",
|
||||
")\n",
|
||||
"pq = pq.with_columns((pl.col(\"count\") / pl.col(\"count\").sum() * 100).alias(\"pct\"))\n",
|
||||
"pq"
|
||||
]
|
||||
},
|
||||
|
|
@ -440928,8 +440942,12 @@
|
|||
"source": [
|
||||
"# Property type distribution\n",
|
||||
"type_counts = clean[\"property_type\"].value_counts().sort(\"count\", descending=True)\n",
|
||||
"fig = px.pie(type_counts.to_pandas(), names=\"property_type\", values=\"count\",\n",
|
||||
" title=\"Property Type Distribution\")\n",
|
||||
"fig = px.pie(\n",
|
||||
" type_counts.to_pandas(),\n",
|
||||
" names=\"property_type\",\n",
|
||||
" values=\"count\",\n",
|
||||
" title=\"Property Type Distribution\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=400)\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -441805,9 +441823,16 @@
|
|||
],
|
||||
"source": [
|
||||
"# Top 20 sub-types\n",
|
||||
"sub_counts = clean[\"property_sub_type\"].value_counts().sort(\"count\", descending=True).head(20)\n",
|
||||
"fig = px.bar(sub_counts.to_pandas(), x=\"count\", y=\"property_sub_type\", orientation=\"h\",\n",
|
||||
" title=\"Top 20 Property Sub-types\")\n",
|
||||
"sub_counts = (\n",
|
||||
" clean[\"property_sub_type\"].value_counts().sort(\"count\", descending=True).head(20)\n",
|
||||
")\n",
|
||||
"fig = px.bar(\n",
|
||||
" sub_counts.to_pandas(),\n",
|
||||
" x=\"count\",\n",
|
||||
" y=\"property_sub_type\",\n",
|
||||
" orientation=\"h\",\n",
|
||||
" title=\"Top 20 Property Sub-types\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=600, yaxis={\"categoryorder\": \"total ascending\"})\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -442643,9 +442668,15 @@
|
|||
],
|
||||
"source": [
|
||||
"# Tenure split\n",
|
||||
"tenure_counts = clean[\"tenure\"].drop_nulls().value_counts().sort(\"count\", descending=True)\n",
|
||||
"fig = px.pie(tenure_counts.to_pandas(), names=\"tenure\", values=\"count\",\n",
|
||||
" title=f\"Tenure Split ({clean['tenure'].null_count():,} unknown / {clean['tenure'].null_count()/len(clean)*100:.1f}% missing)\")\n",
|
||||
"tenure_counts = (\n",
|
||||
" clean[\"tenure\"].drop_nulls().value_counts().sort(\"count\", descending=True)\n",
|
||||
")\n",
|
||||
"fig = px.pie(\n",
|
||||
" tenure_counts.to_pandas(),\n",
|
||||
" names=\"tenure\",\n",
|
||||
" values=\"count\",\n",
|
||||
" title=f\"Tenure Split ({clean['tenure'].null_count():,} unknown / {clean['tenure'].null_count() / len(clean) * 100:.1f}% missing)\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=400)\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -443546,8 +443577,14 @@
|
|||
" .agg(pl.len().alias(\"count\"))\n",
|
||||
" .sort(\"property_type\")\n",
|
||||
")\n",
|
||||
"fig = px.bar(tenure_by_type.to_pandas(), x=\"property_type\", y=\"count\", color=\"tenure\",\n",
|
||||
" barmode=\"group\", title=\"Tenure by Property Type\")\n",
|
||||
"fig = px.bar(\n",
|
||||
" tenure_by_type.to_pandas(),\n",
|
||||
" x=\"property_type\",\n",
|
||||
" y=\"count\",\n",
|
||||
" color=\"tenure\",\n",
|
||||
" barmode=\"group\",\n",
|
||||
" title=\"Tenure by Property Type\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=400)\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -444412,9 +444449,12 @@
|
|||
],
|
||||
"source": [
|
||||
"# Bedroom distribution\n",
|
||||
"bed_counts = clean.filter(pl.col(\"bedrooms\") <= 10)[\"bedrooms\"].value_counts().sort(\"bedrooms\")\n",
|
||||
"fig = px.bar(bed_counts.to_pandas(), x=\"bedrooms\", y=\"count\",\n",
|
||||
" title=\"Bedroom Count Distribution\")\n",
|
||||
"bed_counts = (\n",
|
||||
" clean.filter(pl.col(\"bedrooms\") <= 10)[\"bedrooms\"].value_counts().sort(\"bedrooms\")\n",
|
||||
")\n",
|
||||
"fig = px.bar(\n",
|
||||
" bed_counts.to_pandas(), x=\"bedrooms\", y=\"count\", title=\"Bedroom Count Distribution\"\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=400)\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -445279,16 +445319,25 @@
|
|||
")\n",
|
||||
"\n",
|
||||
"fig = go.Figure()\n",
|
||||
"fig.add_trace(go.Bar(\n",
|
||||
" x=price_by_beds[\"bedrooms\"], y=price_by_beds[\"median_price\"],\n",
|
||||
" name=\"Median\", error_y=dict(type=\"data\",\n",
|
||||
" symmetric=False,\n",
|
||||
" array=(price_by_beds[\"p75\"] - price_by_beds[\"median_price\"]).to_list(),\n",
|
||||
" arrayminus=(price_by_beds[\"median_price\"] - price_by_beds[\"p25\"]).to_list()\n",
|
||||
"fig.add_trace(\n",
|
||||
" go.Bar(\n",
|
||||
" x=price_by_beds[\"bedrooms\"],\n",
|
||||
" y=price_by_beds[\"median_price\"],\n",
|
||||
" name=\"Median\",\n",
|
||||
" error_y=dict(\n",
|
||||
" type=\"data\",\n",
|
||||
" symmetric=False,\n",
|
||||
" array=(price_by_beds[\"p75\"] - price_by_beds[\"median_price\"]).to_list(),\n",
|
||||
" arrayminus=(price_by_beds[\"median_price\"] - price_by_beds[\"p25\"]).to_list(),\n",
|
||||
" ),\n",
|
||||
" )\n",
|
||||
"))\n",
|
||||
"fig.update_layout(title=\"Median Price by Bedrooms (with IQR)\", height=400,\n",
|
||||
" xaxis_title=\"Bedrooms\", yaxis_title=\"Price (£)\")\n",
|
||||
")\n",
|
||||
"fig.update_layout(\n",
|
||||
" title=\"Median Price by Bedrooms (with IQR)\",\n",
|
||||
" height=400,\n",
|
||||
" xaxis_title=\"Bedrooms\",\n",
|
||||
" yaxis_title=\"Price (£)\",\n",
|
||||
")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
},
|
||||
|
|
@ -446263,8 +446312,14 @@
|
|||
" .agg(pl.len().alias(\"count\"))\n",
|
||||
" .sort(\"property_type\", \"bedrooms\")\n",
|
||||
")\n",
|
||||
"fig = px.bar(beds_by_type.to_pandas(), x=\"bedrooms\", y=\"count\", color=\"property_type\",\n",
|
||||
" barmode=\"group\", title=\"Bedroom Distribution by Property Type\")\n",
|
||||
"fig = px.bar(\n",
|
||||
" beds_by_type.to_pandas(),\n",
|
||||
" x=\"bedrooms\",\n",
|
||||
" y=\"count\",\n",
|
||||
" color=\"property_type\",\n",
|
||||
" barmode=\"group\",\n",
|
||||
" title=\"Bedroom Distribution by Property Type\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=450)\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -446323,19 +446378,26 @@
|
|||
],
|
||||
"source": [
|
||||
"# Floorspace availability by property type\n",
|
||||
"has_floor = clean.with_columns(pl.col(\"floorspace_sqm\").is_not_null().alias(\"has_floorspace\"))\n",
|
||||
"floor_by_type = (\n",
|
||||
" has_floor.group_by(\"property_type\", \"has_floorspace\")\n",
|
||||
" .agg(pl.len().alias(\"count\"))\n",
|
||||
"has_floor = clean.with_columns(\n",
|
||||
" pl.col(\"floorspace_sqm\").is_not_null().alias(\"has_floorspace\")\n",
|
||||
")\n",
|
||||
"floor_by_type = has_floor.group_by(\"property_type\", \"has_floorspace\").agg(\n",
|
||||
" pl.len().alias(\"count\")\n",
|
||||
")\n",
|
||||
"totals = floor_by_type.group_by(\"property_type\").agg(\n",
|
||||
" pl.col(\"count\").sum().alias(\"total\")\n",
|
||||
")\n",
|
||||
"totals = floor_by_type.group_by(\"property_type\").agg(pl.col(\"count\").sum().alias(\"total\"))\n",
|
||||
"floor_pct = (\n",
|
||||
" floor_by_type.filter(pl.col(\"has_floorspace\"))\n",
|
||||
" .join(totals, on=\"property_type\")\n",
|
||||
" .with_columns((pl.col(\"count\") / pl.col(\"total\") * 100).alias(\"pct_with_floorspace\"))\n",
|
||||
" .with_columns(\n",
|
||||
" (pl.col(\"count\") / pl.col(\"total\") * 100).alias(\"pct_with_floorspace\")\n",
|
||||
" )\n",
|
||||
" .sort(\"pct_with_floorspace\", descending=True)\n",
|
||||
")\n",
|
||||
"print(f\"Overall floorspace availability: {clean['floorspace_sqm'].drop_nulls().len():,} / {len(clean):,} ({clean['floorspace_sqm'].drop_nulls().len()/len(clean)*100:.1f}%)\")\n",
|
||||
"print(\n",
|
||||
" f\"Overall floorspace availability: {clean['floorspace_sqm'].drop_nulls().len():,} / {len(clean):,} ({clean['floorspace_sqm'].drop_nulls().len() / len(clean) * 100:.1f}%)\"\n",
|
||||
")\n",
|
||||
"floor_pct.select(\"property_type\", \"count\", \"total\", \"pct_with_floorspace\")"
|
||||
]
|
||||
},
|
||||
|
|
@ -447298,8 +447360,13 @@
|
|||
")\n",
|
||||
"print(f\"Properties with reasonable floorspace (10-1000 sqm): {len(with_floor):,}\")\n",
|
||||
"\n",
|
||||
"fig = px.histogram(with_floor.to_pandas(), x=\"floorspace_sqm\", nbins=80, color=\"property_type\",\n",
|
||||
" title=\"Floorspace Distribution by Property Type\")\n",
|
||||
"fig = px.histogram(\n",
|
||||
" with_floor.to_pandas(),\n",
|
||||
" x=\"floorspace_sqm\",\n",
|
||||
" nbins=80,\n",
|
||||
" color=\"property_type\",\n",
|
||||
" title=\"Floorspace Distribution by Property Type\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=450, xaxis_title=\"Floorspace (sqm)\", barmode=\"overlay\")\n",
|
||||
"fig.update_traces(opacity=0.6)\n",
|
||||
"fig.show()"
|
||||
|
|
@ -448176,8 +448243,12 @@
|
|||
"print(f\" P25: £{s.quantile(0.25):,.0f}/sqm\")\n",
|
||||
"print(f\" P75: £{s.quantile(0.75):,.0f}/sqm\")\n",
|
||||
"\n",
|
||||
"fig = px.histogram(ppsqm.to_pandas(), x=\"price_per_sqm\", nbins=80,\n",
|
||||
" title=\"Price per Square Metre Distribution\")\n",
|
||||
"fig = px.histogram(\n",
|
||||
" ppsqm.to_pandas(),\n",
|
||||
" x=\"price_per_sqm\",\n",
|
||||
" nbins=80,\n",
|
||||
" title=\"Price per Square Metre Distribution\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=400, xaxis_title=\"Price per sqm (£)\")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -584906,8 +584977,13 @@
|
|||
}
|
||||
],
|
||||
"source": [
|
||||
"fig = px.box(ppsqm.to_pandas(), x=\"property_type\", y=\"price_per_sqm\", color=\"property_type\",\n",
|
||||
" title=\"Price per sqm by Property Type\")\n",
|
||||
"fig = px.box(\n",
|
||||
" ppsqm.to_pandas(),\n",
|
||||
" x=\"property_type\",\n",
|
||||
" y=\"price_per_sqm\",\n",
|
||||
" color=\"property_type\",\n",
|
||||
" title=\"Price per sqm by Property Type\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=450, showlegend=False, yaxis_title=\"£ per sqm\")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -585865,9 +585941,15 @@
|
|||
")\n",
|
||||
"\n",
|
||||
"top30 = outcode_stats.head(30)\n",
|
||||
"fig = px.bar(top30.to_pandas(), x=\"count\", y=\"outcode\", orientation=\"h\",\n",
|
||||
" color=\"median_price\", color_continuous_scale=\"Viridis\",\n",
|
||||
" title=\"Top 30 Outcodes by Listing Volume\")\n",
|
||||
"fig = px.bar(\n",
|
||||
" top30.to_pandas(),\n",
|
||||
" x=\"count\",\n",
|
||||
" y=\"outcode\",\n",
|
||||
" orientation=\"h\",\n",
|
||||
" color=\"median_price\",\n",
|
||||
" color_continuous_scale=\"Viridis\",\n",
|
||||
" title=\"Top 30 Outcodes by Listing Volume\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=700, yaxis={\"categoryorder\": \"total ascending\"})\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -587400,11 +587482,25 @@
|
|||
],
|
||||
"source": [
|
||||
"# Most expensive outcodes (min 50 listings)\n",
|
||||
"expensive = outcode_stats.filter(pl.col(\"count\") >= 50).sort(\"median_price\", descending=True).head(30)\n",
|
||||
"fig = px.bar(expensive.to_pandas(), x=\"median_price\", y=\"outcode\", orientation=\"h\",\n",
|
||||
" color=\"count\", color_continuous_scale=\"Blues\",\n",
|
||||
" title=\"Top 30 Most Expensive Outcodes (min 50 listings, by median price)\")\n",
|
||||
"fig.update_layout(height=700, yaxis={\"categoryorder\": \"total ascending\"}, xaxis_title=\"Median Price (£)\")\n",
|
||||
"expensive = (\n",
|
||||
" outcode_stats.filter(pl.col(\"count\") >= 50)\n",
|
||||
" .sort(\"median_price\", descending=True)\n",
|
||||
" .head(30)\n",
|
||||
")\n",
|
||||
"fig = px.bar(\n",
|
||||
" expensive.to_pandas(),\n",
|
||||
" x=\"median_price\",\n",
|
||||
" y=\"outcode\",\n",
|
||||
" orientation=\"h\",\n",
|
||||
" color=\"count\",\n",
|
||||
" color_continuous_scale=\"Blues\",\n",
|
||||
" title=\"Top 30 Most Expensive Outcodes (min 50 listings, by median price)\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(\n",
|
||||
" height=700,\n",
|
||||
" yaxis={\"categoryorder\": \"total ascending\"},\n",
|
||||
" xaxis_title=\"Median Price (£)\",\n",
|
||||
")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
},
|
||||
|
|
@ -588914,10 +589010,20 @@
|
|||
"source": [
|
||||
"# Cheapest outcodes (min 50 listings)\n",
|
||||
"cheapest = outcode_stats.filter(pl.col(\"count\") >= 50).sort(\"median_price\").head(30)\n",
|
||||
"fig = px.bar(cheapest.to_pandas(), x=\"median_price\", y=\"outcode\", orientation=\"h\",\n",
|
||||
" color=\"count\", color_continuous_scale=\"Blues\",\n",
|
||||
" title=\"Top 30 Cheapest Outcodes (min 50 listings, by median price)\")\n",
|
||||
"fig.update_layout(height=700, yaxis={\"categoryorder\": \"total descending\"}, xaxis_title=\"Median Price (£)\")\n",
|
||||
"fig = px.bar(\n",
|
||||
" cheapest.to_pandas(),\n",
|
||||
" x=\"median_price\",\n",
|
||||
" y=\"outcode\",\n",
|
||||
" orientation=\"h\",\n",
|
||||
" color=\"count\",\n",
|
||||
" color_continuous_scale=\"Blues\",\n",
|
||||
" title=\"Top 30 Cheapest Outcodes (min 50 listings, by median price)\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(\n",
|
||||
" height=700,\n",
|
||||
" yaxis={\"categoryorder\": \"total descending\"},\n",
|
||||
" xaxis_title=\"Median Price (£)\",\n",
|
||||
")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
},
|
||||
|
|
@ -589828,14 +589934,19 @@
|
|||
"source": [
|
||||
"# Geographic scatter of listings (sample for performance)\n",
|
||||
"sample = clean.sample(n=min(20_000, len(clean)), seed=42)\n",
|
||||
"fig = px.scatter_map(sample.to_pandas(),\n",
|
||||
" lat=\"latitude\", lon=\"longitude\",\n",
|
||||
" color=\"price\", size_max=4,\n",
|
||||
" color_continuous_scale=\"Viridis\",\n",
|
||||
" range_color=[100_000, 1_500_000],\n",
|
||||
" zoom=5, center={\"lat\": 52.5, \"lon\": -1.5},\n",
|
||||
" title=\"Listing Locations (20k sample, colored by price)\",\n",
|
||||
" opacity=0.4)\n",
|
||||
"fig = px.scatter_map(\n",
|
||||
" sample.to_pandas(),\n",
|
||||
" lat=\"latitude\",\n",
|
||||
" lon=\"longitude\",\n",
|
||||
" color=\"price\",\n",
|
||||
" size_max=4,\n",
|
||||
" color_continuous_scale=\"Viridis\",\n",
|
||||
" range_color=[100_000, 1_500_000],\n",
|
||||
" zoom=5,\n",
|
||||
" center={\"lat\": 52.5, \"lon\": -1.5},\n",
|
||||
" title=\"Listing Locations (20k sample, colored by price)\",\n",
|
||||
" opacity=0.4,\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=700)\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -589864,7 +589975,9 @@
|
|||
"source": [
|
||||
"# Parse dates and look at listing age\n",
|
||||
"with_dates = clean.with_columns(\n",
|
||||
" pl.col(\"first_visible_date\").str.to_datetime(\"%Y-%m-%dT%H:%M:%SZ\").alias(\"listed_at\"),\n",
|
||||
" pl.col(\"first_visible_date\")\n",
|
||||
" .str.to_datetime(\"%Y-%m-%dT%H:%M:%SZ\")\n",
|
||||
" .alias(\"listed_at\"),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(f\"Date range: {with_dates['listed_at'].min()} to {with_dates['listed_at'].max()}\")"
|
||||
|
|
@ -590856,8 +590969,9 @@
|
|||
" .sort(\"month\")\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"fig = px.bar(monthly.to_pandas(), x=\"month\", y=\"count\",\n",
|
||||
" title=\"Listings by Month Listed\")\n",
|
||||
"fig = px.bar(\n",
|
||||
" monthly.to_pandas(), x=\"month\", y=\"count\", title=\"Listings by Month Listed\"\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=400, xaxis_title=\"Month\", yaxis_title=\"Listings\")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -590884,6 +590998,7 @@
|
|||
"source": [
|
||||
"# How old are current listings? (days since first visible)\n",
|
||||
"import datetime\n",
|
||||
"\n",
|
||||
"now = datetime.datetime(2026, 2, 14)\n",
|
||||
"with_age = with_dates.with_columns(\n",
|
||||
" ((pl.lit(now) - pl.col(\"listed_at\")).dt.total_days()).alias(\"days_on_market\")\n",
|
||||
|
|
@ -590896,7 +591011,7 @@
|
|||
"print(f\" P25: {age.quantile(0.25):.0f} days\")\n",
|
||||
"print(f\" P75: {age.quantile(0.75):.0f} days\")\n",
|
||||
"print(f\" P95: {age.quantile(0.95):.0f} days\")\n",
|
||||
"print(f\" Max: {age.max():.0f} days ({age.max()/365:.1f} years)\")"
|
||||
"print(f\" Max: {age.max():.0f} days ({age.max() / 365:.1f} years)\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -591749,8 +591864,12 @@
|
|||
"source": [
|
||||
"# Days on market distribution (cap at 2 years for readability)\n",
|
||||
"capped = with_age.filter(pl.col(\"days_on_market\") <= 730)\n",
|
||||
"fig = px.histogram(capped.to_pandas(), x=\"days_on_market\", nbins=100,\n",
|
||||
" title=\"Days on Market Distribution (capped at 2 years)\")\n",
|
||||
"fig = px.histogram(\n",
|
||||
" capped.to_pandas(),\n",
|
||||
" x=\"days_on_market\",\n",
|
||||
" nbins=100,\n",
|
||||
" title=\"Days on Market Distribution (capped at 2 years)\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=400, xaxis_title=\"Days on Market\", yaxis_title=\"Count\")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -591883,11 +592002,13 @@
|
|||
"# Explode features list and count most common\n",
|
||||
"features_exploded = clean.select(\"features\").explode(\"features\").drop_nulls()\n",
|
||||
"print(f\"Total feature entries: {len(features_exploded):,}\")\n",
|
||||
"print(f\"Features per listing: {len(features_exploded)/len(clean):.1f} avg\")\n",
|
||||
"print(f\"Features per listing: {len(features_exploded) / len(clean):.1f} avg\")\n",
|
||||
"\n",
|
||||
"# Most common features (lowercased for grouping)\n",
|
||||
"feature_counts = (\n",
|
||||
" features_exploded.with_columns(pl.col(\"features\").str.to_lowercase().str.strip_chars().alias(\"feature_lower\"))\n",
|
||||
" features_exploded.with_columns(\n",
|
||||
" pl.col(\"features\").str.to_lowercase().str.strip_chars().alias(\"feature_lower\")\n",
|
||||
" )\n",
|
||||
" .group_by(\"feature_lower\")\n",
|
||||
" .agg(pl.len().alias(\"count\"))\n",
|
||||
" .sort(\"count\", descending=True)\n",
|
||||
|
|
@ -592794,16 +592915,64 @@
|
|||
"all_features = features_exploded[\"features\"].to_list()\n",
|
||||
"word_counter = Counter()\n",
|
||||
"for feat in all_features:\n",
|
||||
" words = re.findall(r'[a-z]+', feat.lower())\n",
|
||||
" words = re.findall(r\"[a-z]+\", feat.lower())\n",
|
||||
" word_counter.update(words)\n",
|
||||
"\n",
|
||||
"# Filter out very short/common words\n",
|
||||
"stop_words = {'the', 'a', 'an', 'and', 'or', 'of', 'to', 'in', 'with', 'for', 'on', 'at', 'by', 'is', 'it', 'from', 'as', 'be', 'this', 'that', 'are', 'was', 'has', 'have', 'not', 'but', 'all', 'can', 'had', 'her', 'his', 'one', 'our', 'out', 'you', 'will'}\n",
|
||||
"keywords = [(w, c) for w, c in word_counter.most_common(100) if w not in stop_words and len(w) > 2]\n",
|
||||
"kw_df = pl.DataFrame({\"word\": [w for w,c in keywords[:40]], \"count\": [c for w,c in keywords[:40]]})\n",
|
||||
"stop_words = {\n",
|
||||
" \"the\",\n",
|
||||
" \"a\",\n",
|
||||
" \"an\",\n",
|
||||
" \"and\",\n",
|
||||
" \"or\",\n",
|
||||
" \"of\",\n",
|
||||
" \"to\",\n",
|
||||
" \"in\",\n",
|
||||
" \"with\",\n",
|
||||
" \"for\",\n",
|
||||
" \"on\",\n",
|
||||
" \"at\",\n",
|
||||
" \"by\",\n",
|
||||
" \"is\",\n",
|
||||
" \"it\",\n",
|
||||
" \"from\",\n",
|
||||
" \"as\",\n",
|
||||
" \"be\",\n",
|
||||
" \"this\",\n",
|
||||
" \"that\",\n",
|
||||
" \"are\",\n",
|
||||
" \"was\",\n",
|
||||
" \"has\",\n",
|
||||
" \"have\",\n",
|
||||
" \"not\",\n",
|
||||
" \"but\",\n",
|
||||
" \"all\",\n",
|
||||
" \"can\",\n",
|
||||
" \"had\",\n",
|
||||
" \"her\",\n",
|
||||
" \"his\",\n",
|
||||
" \"one\",\n",
|
||||
" \"our\",\n",
|
||||
" \"out\",\n",
|
||||
" \"you\",\n",
|
||||
" \"will\",\n",
|
||||
"}\n",
|
||||
"keywords = [\n",
|
||||
" (w, c)\n",
|
||||
" for w, c in word_counter.most_common(100)\n",
|
||||
" if w not in stop_words and len(w) > 2\n",
|
||||
"]\n",
|
||||
"kw_df = pl.DataFrame(\n",
|
||||
" {\"word\": [w for w, c in keywords[:40]], \"count\": [c for w, c in keywords[:40]]}\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"fig = px.bar(kw_df.to_pandas(), x=\"count\", y=\"word\", orientation=\"h\",\n",
|
||||
" title=\"Most Common Words in Feature Descriptions\")\n",
|
||||
"fig = px.bar(\n",
|
||||
" kw_df.to_pandas(),\n",
|
||||
" x=\"count\",\n",
|
||||
" y=\"word\",\n",
|
||||
" orientation=\"h\",\n",
|
||||
" title=\"Most Common Words in Feature Descriptions\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=800, yaxis={\"categoryorder\": \"total ascending\"})\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -593767,9 +593936,14 @@
|
|||
" & (pl.col(\"price\") < 3_000_000)\n",
|
||||
").sample(n=min(15_000, len(with_floor)), seed=42)\n",
|
||||
"\n",
|
||||
"fig = px.scatter(scatter_df.to_pandas(), x=\"floorspace_sqm\", y=\"price\",\n",
|
||||
" color=\"property_type\", opacity=0.3,\n",
|
||||
" title=\"Price vs Floorspace (sample, capped at £3M / 500sqm)\")\n",
|
||||
"fig = px.scatter(\n",
|
||||
" scatter_df.to_pandas(),\n",
|
||||
" x=\"floorspace_sqm\",\n",
|
||||
" y=\"price\",\n",
|
||||
" color=\"property_type\",\n",
|
||||
" opacity=0.3,\n",
|
||||
" title=\"Price vs Floorspace (sample, capped at £3M / 500sqm)\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=600, xaxis_title=\"Floorspace (sqm)\", yaxis_title=\"Price (£)\")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -594739,8 +594913,14 @@
|
|||
" .agg(pl.col(\"price\").median().alias(\"median_price\"), pl.len().alias(\"count\"))\n",
|
||||
" .sort(\"property_type\", \"bedrooms\")\n",
|
||||
")\n",
|
||||
"fig = px.line(bp.to_pandas(), x=\"bedrooms\", y=\"median_price\", color=\"property_type\",\n",
|
||||
" markers=True, title=\"Median Price by Bedrooms and Property Type\")\n",
|
||||
"fig = px.line(\n",
|
||||
" bp.to_pandas(),\n",
|
||||
" x=\"bedrooms\",\n",
|
||||
" y=\"median_price\",\n",
|
||||
" color=\"property_type\",\n",
|
||||
" markers=True,\n",
|
||||
" title=\"Median Price by Bedrooms and Property Type\",\n",
|
||||
")\n",
|
||||
"fig.update_layout(height=450, xaxis_title=\"Bedrooms\", yaxis_title=\"Median Price (£)\")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
|
|
@ -594789,18 +594969,28 @@
|
|||
"print(f\"Total listings: {len(clean):,}\")\n",
|
||||
"print(f\"Outcodes covered: {clean['outcode'].n_unique():,}\")\n",
|
||||
"print(\"\")\n",
|
||||
"print(f\"Price: median £{clean['price'].median():,.0f}, mean £{clean['price'].mean():,.0f}\")\n",
|
||||
"print(f\"Bedrooms: median {clean['bedrooms'].median():.0f}, mean {clean['bedrooms'].mean():.1f}\")\n",
|
||||
"print(\n",
|
||||
" f\"Price: median £{clean['price'].median():,.0f}, mean £{clean['price'].mean():,.0f}\"\n",
|
||||
")\n",
|
||||
"print(\n",
|
||||
" f\"Bedrooms: median {clean['bedrooms'].median():.0f}, mean {clean['bedrooms'].mean():.1f}\"\n",
|
||||
")\n",
|
||||
"print(\"\")\n",
|
||||
"print(f\"Tenure known: {(len(clean) - clean['tenure'].null_count())/len(clean)*100:.1f}%\")\n",
|
||||
"print(\n",
|
||||
" f\"Tenure known: {(len(clean) - clean['tenure'].null_count()) / len(clean) * 100:.1f}%\"\n",
|
||||
")\n",
|
||||
"print(f\" Freehold: {len(clean.filter(pl.col('tenure') == 'Freehold')):,}\")\n",
|
||||
"print(f\" Leasehold: {len(clean.filter(pl.col('tenure') == 'Leasehold')):,}\")\n",
|
||||
"print(\"\")\n",
|
||||
"print(f\"Floorspace available: {clean['floorspace_sqm'].drop_nulls().len()/len(clean)*100:.1f}%\")\n",
|
||||
"print(\n",
|
||||
" f\"Floorspace available: {clean['floorspace_sqm'].drop_nulls().len() / len(clean) * 100:.1f}%\"\n",
|
||||
")\n",
|
||||
"print(\"\")\n",
|
||||
"print(\"Property types:\")\n",
|
||||
"for row in clean['property_type'].value_counts().sort('count', descending=True).iter_rows():\n",
|
||||
" print(f\" {row[0]}: {row[1]:,} ({row[1]/len(clean)*100:.1f}%)\")"
|
||||
"for row in (\n",
|
||||
" clean[\"property_type\"].value_counts().sort(\"count\", descending=True).iter_rows()\n",
|
||||
"):\n",
|
||||
" print(f\" {row[0]}: {row[1]:,} ({row[1] / len(clean) * 100:.1f}%)\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue