Fmt
This commit is contained in:
parent
479ef92236
commit
c38d654ac7
44 changed files with 2526 additions and 701 deletions
|
|
@ -28,8 +28,8 @@
|
|||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"pd.set_option('display.max_columns', None)\n",
|
||||
"pd.set_option('display.max_colwidth', 60)"
|
||||
"pd.set_option(\"display.max_columns\", None)\n",
|
||||
"pd.set_option(\"display.max_colwidth\", 60)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -47,7 +47,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"param_import_path = '/bulk/wide-2.parquet'\n",
|
||||
"param_import_path = \"/bulk/wide-2.parquet\"\n",
|
||||
"\n",
|
||||
"param_lookback = 3"
|
||||
]
|
||||
|
|
@ -128,7 +128,7 @@
|
|||
],
|
||||
"source": [
|
||||
"data = pl.scan_parquet(param_import_path).unique(subset=[\"Postcode\", \"Address per EPC\"])\n",
|
||||
"data = data.filter(pl.col('Total floor area (sqm)') > 10)\n",
|
||||
"data = data.filter(pl.col(\"Total floor area (sqm)\") > 10)\n",
|
||||
"\n",
|
||||
"# print(data.collect_schema()) # column names and types\n",
|
||||
"print(data.select(pl.len()).collect()) # row count\n",
|
||||
|
|
@ -145,22 +145,20 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"columns_required = [\n",
|
||||
"    # absolute necessity \n",
|
||||
" 'Postcode',\n",
|
||||
" 'Address per EPC',\n",
|
||||
" 'historical_prices',\n",
|
||||
" 'Price per sqm',\n",
|
||||
"\n",
|
||||
"    # absolute necessity\n",
|
||||
" \"Postcode\",\n",
|
||||
" \"Address per EPC\",\n",
|
||||
" \"historical_prices\",\n",
|
||||
" \"Price per sqm\",\n",
|
||||
"    # fairly fixed attributes\n",
|
||||
" 'Property type', # or 'epc_property_type' or 'built_form'\n",
|
||||
" 'Leashold/Freehold',\n",
|
||||
" 'Total floor area (sqm)',\n",
|
||||
" 'Rooms (including bedrooms & bathrooms)',\n",
|
||||
" 'Approximate construction age',\n",
|
||||
"\n",
|
||||
" \"Property type\", # or 'epc_property_type' or 'built_form'\n",
|
||||
" \"Leashold/Freehold\",\n",
|
||||
" \"Total floor area (sqm)\",\n",
|
||||
" \"Rooms (including bedrooms & bathrooms)\",\n",
|
||||
" \"Approximate construction age\",\n",
|
||||
" # latest\n",
|
||||
" # 'date_of_transfer'\n",
|
||||
" 'Last known price'\n",
|
||||
" \"Last known price\",\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
|
|
@ -440,8 +438,13 @@
|
|||
],
|
||||
"source": [
|
||||
"# temp_Postcodes = [\"LE5 4ED\", \"E14 9GU\", \"YO8 9PW\", \"SW1P 3AN\", \"BH3 7DX\", \"E14 2DG\"]\n",
|
||||
"temp_Postcodes = data.select('Postcode').collect().sample(10000)['Postcode'].to_list()\n",
|
||||
"data_small = data.filter(pl.col(\"Postcode\").is_in(temp_Postcodes)).select(columns_required).collect().to_pandas()\n",
|
||||
"temp_Postcodes = data.select(\"Postcode\").collect().sample(10000)[\"Postcode\"].to_list()\n",
|
||||
"data_small = (\n",
|
||||
" data.filter(pl.col(\"Postcode\").is_in(temp_Postcodes))\n",
|
||||
" .select(columns_required)\n",
|
||||
" .collect()\n",
|
||||
" .to_pandas()\n",
|
||||
")\n",
|
||||
"data_small = data_small.explode(\"historical_prices\")\n",
|
||||
"data_small[\"year\"] = data_small[\"historical_prices\"].apply(lambda x: x[\"year\"])\n",
|
||||
"data_small[\"price\"] = data_small[\"historical_prices\"].apply(lambda x: x[\"price\"])\n",
|
||||
|
|
@ -458,7 +461,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# data_small[\n",
|
||||
"# (data_small['Postcode'] == 'E14 2DG') \n",
|
||||
"# (data_small['Postcode'] == 'E14 2DG')\n",
|
||||
"# & data_small['epc_address'].str.contains('76')\n",
|
||||
"# ]"
|
||||
]
|
||||
|
|
@ -908,35 +911,45 @@
|
|||
"from typing import Any\n",
|
||||
"from pandas.core.frame import DataFrame\n",
|
||||
"\n",
|
||||
"print(f'rolling periods (relative): {[i for i in range(-param_lookback, 1)]}')\n",
|
||||
"print(f\"rolling periods (relative): {[i for i in range(-param_lookback, 1)]}\")\n",
|
||||
"\n",
|
||||
"# Rolling average over a window of param_lookback + 1 years, weighted by number of sales per year\n",
|
||||
"pc_avg_raw = data_small.groupby(['Postcode', 'year']).agg(\n",
|
||||
" ppsqm_sum=('Price per sqm', 'sum'),\n",
|
||||
" ppsqm_count=('Price per sqm', 'count')\n",
|
||||
").reset_index().sort_values(by=['Postcode', 'year'], ascending=False)\n",
|
||||
"pc_avg_raw = (\n",
|
||||
" data_small.groupby([\"Postcode\", \"year\"])\n",
|
||||
" .agg(ppsqm_sum=(\"Price per sqm\", \"sum\"), ppsqm_count=(\"Price per sqm\", \"count\"))\n",
|
||||
" .reset_index()\n",
|
||||
" .sort_values(by=[\"Postcode\", \"year\"], ascending=False)\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"display(pc_avg_raw)\n",
|
||||
"\n",
|
||||
"# Each year's totals contribute to year-param_lookback through year (offsets -param_lookback..0)\n",
|
||||
"pc_avg_expanded = pd.concat([\n",
|
||||
" pc_avg_raw.assign(year=pc_avg_raw['year'] + offset) for offset in range(-param_lookback, 1) # \n",
|
||||
"])\n",
|
||||
"pc_avg_expanded = pd.concat(\n",
|
||||
" [\n",
|
||||
" pc_avg_raw.assign(year=pc_avg_raw[\"year\"] + offset)\n",
|
||||
" for offset in range(-param_lookback, 1) #\n",
|
||||
" ]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"display(pc_avg_expanded)\n",
|
||||
"\n",
|
||||
"# Sum counts and sums, then divide to get weighted mean\n",
|
||||
"pc_avg_complex = pc_avg_expanded.groupby(['Postcode', 'year']).agg(\n",
|
||||
" ppsqm_sum=('ppsqm_sum', 'sum'),\n",
|
||||
" ppsqm_count=('ppsqm_count', 'sum')\n",
|
||||
").reset_index()\n",
|
||||
"pc_avg_complex['Price per sqm PC AVG'] = pc_avg_complex['ppsqm_sum'] / pc_avg_complex['ppsqm_count']\n",
|
||||
"pc_avg_complex: Any | DataFrame = pc_avg_complex[['Postcode', 'year', 'Price per sqm PC AVG']].sort_values(by=['Postcode', 'year'], ascending=False)\n",
|
||||
"pc_avg_complex = (\n",
|
||||
" pc_avg_expanded.groupby([\"Postcode\", \"year\"])\n",
|
||||
" .agg(ppsqm_sum=(\"ppsqm_sum\", \"sum\"), ppsqm_count=(\"ppsqm_count\", \"sum\"))\n",
|
||||
" .reset_index()\n",
|
||||
")\n",
|
||||
"pc_avg_complex[\"Price per sqm PC AVG\"] = (\n",
|
||||
" pc_avg_complex[\"ppsqm_sum\"] / pc_avg_complex[\"ppsqm_count\"]\n",
|
||||
")\n",
|
||||
"pc_avg_complex: Any | DataFrame = pc_avg_complex[\n",
|
||||
" [\"Postcode\", \"year\", \"Price per sqm PC AVG\"]\n",
|
||||
"].sort_values(by=[\"Postcode\", \"year\"], ascending=False)\n",
|
||||
"display(pc_avg_complex)\n",
|
||||
"\n",
|
||||
"temp_df = pc_avg_complex[pc_avg_complex['Postcode'] == data_small['Postcode'].iloc[0]]\n",
|
||||
"print(data_small['Postcode'].iloc[0])\n",
|
||||
"temp_df.plot.line(x='year', y='Price per sqm PC AVG')"
|
||||
"temp_df = pc_avg_complex[pc_avg_complex[\"Postcode\"] == data_small[\"Postcode\"].iloc[0]]\n",
|
||||
"print(data_small[\"Postcode\"].iloc[0])\n",
|
||||
"temp_df.plot.line(x=\"year\", y=\"Price per sqm PC AVG\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -1111,9 +1124,13 @@
|
|||
}
|
||||
],
|
||||
"source": [
|
||||
"data_small = data_small.merge(pc_avg_complex, on=['Postcode', 'year'], suffixes=('', ' pc_avg_complex'))\n",
|
||||
"data_small['c'] = data_small['Price per sqm'] / data_small['Price per sqm PC AVG']\n",
|
||||
"data_small[['Postcode', 'Address per EPC', 'Price per sqm', 'Price per sqm PC AVG', 'c']]"
|
||||
"data_small = data_small.merge(\n",
|
||||
" pc_avg_complex, on=[\"Postcode\", \"year\"], suffixes=(\"\", \" pc_avg_complex\")\n",
|
||||
")\n",
|
||||
"data_small[\"c\"] = data_small[\"Price per sqm\"] / data_small[\"Price per sqm PC AVG\"]\n",
|
||||
"data_small[\n",
|
||||
" [\"Postcode\", \"Address per EPC\", \"Price per sqm\", \"Price per sqm PC AVG\", \"c\"]\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -1445,17 +1462,21 @@
|
|||
],
|
||||
"source": [
|
||||
"# 1. Coefficient of Variation (std/mean) per property; rows with any NaN aggregate are dropped (the 3+ sales filter is currently commented out)\n",
|
||||
"c_stats = data_small.groupby(['Postcode', 'Address per EPC']).agg(\n",
|
||||
" n_sales=('c', 'count'),\n",
|
||||
" year_min=('year', 'min'),\n",
|
||||
" year_max=('year', 'max'),\n",
|
||||
" c_mean=('c', 'mean'),\n",
|
||||
" c_std=('c', 'std'),\n",
|
||||
").dropna()\n",
|
||||
"c_stats['c_cv'] = c_stats['c_std'] / c_stats['c_mean']\n",
|
||||
"c_stats = (\n",
|
||||
" data_small.groupby([\"Postcode\", \"Address per EPC\"])\n",
|
||||
" .agg(\n",
|
||||
" n_sales=(\"c\", \"count\"),\n",
|
||||
" year_min=(\"year\", \"min\"),\n",
|
||||
" year_max=(\"year\", \"max\"),\n",
|
||||
" c_mean=(\"c\", \"mean\"),\n",
|
||||
" c_std=(\"c\", \"std\"),\n",
|
||||
" )\n",
|
||||
" .dropna()\n",
|
||||
")\n",
|
||||
"c_stats[\"c_cv\"] = c_stats[\"c_std\"] / c_stats[\"c_mean\"]\n",
|
||||
"# c_stats_3plus = c_stats[c_stats['n_sales'] >= 3]\n",
|
||||
"# print(f\"Properties with 3+ sales: {len(c_stats_3plus)} / {len(c_stats)}\")\n",
|
||||
"c_stats.sort_values('c_cv', ascending=False).head(20)"
|
||||
"c_stats.sort_values(\"c_cv\", ascending=False).head(20)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -2265,42 +2286,44 @@
|
|||
"display(random_c)\n",
|
||||
"\n",
|
||||
"# pc avg trend\n",
|
||||
"temp_pc_avg = pc_avg_complex[pc_avg_complex['Postcode'] == random_c.index[0][0]].sort_values(by='year')\n",
|
||||
"temp_pc_avg = pc_avg_complex[\n",
|
||||
" pc_avg_complex[\"Postcode\"] == random_c.index[0][0]\n",
|
||||
"].sort_values(by=\"year\")\n",
|
||||
"display(temp_pc_avg)\n",
|
||||
"\n",
|
||||
"# c for specific postcode\n",
|
||||
"temp_postcode = data_small[\n",
|
||||
" (data_small['Postcode'] == random_c.index[0][0]) \n",
|
||||
" # & (data_small['Address per EPC'] == random_c.index[0][1]) \n",
|
||||
"].sort_values(by='year')\n",
|
||||
" (data_small[\"Postcode\"] == random_c.index[0][0])\n",
|
||||
" # & (data_small['Address per EPC'] == random_c.index[0][1])\n",
|
||||
"].sort_values(by=\"year\")\n",
|
||||
"display(temp_postcode)\n",
|
||||
"\n",
|
||||
"temp_address = data_small[\n",
|
||||
" (data_small['Postcode'] == random_c.index[0][0]) \n",
|
||||
" & (data_small['Address per EPC'] == random_c.index[0][1]) \n",
|
||||
"].sort_values(by='year')\n",
|
||||
" (data_small[\"Postcode\"] == random_c.index[0][0])\n",
|
||||
" & (data_small[\"Address per EPC\"] == random_c.index[0][1])\n",
|
||||
"].sort_values(by=\"year\")\n",
|
||||
"display(temp_address)\n",
|
||||
"\n",
|
||||
"# plot\n",
|
||||
"\n",
|
||||
"fig, ax1 = plt.subplots()\n",
|
||||
"\n",
|
||||
"temp_pc_avg.plot.line(x='year', y='Price per sqm PC AVG', ax=ax1, color='black')\n",
|
||||
"temp_address.plot.line(x='year', y='Price per sqm', ax=ax1, color='green') \n",
|
||||
"temp_pc_avg.plot.line(x=\"year\", y=\"Price per sqm PC AVG\", ax=ax1, color=\"black\")\n",
|
||||
"temp_address.plot.line(x=\"year\", y=\"Price per sqm\", ax=ax1, color=\"green\")\n",
|
||||
"\n",
|
||||
"ax2 = ax1.twinx()\n",
|
||||
"ax2.set_ylim(0, 3)\n",
|
||||
"\n",
|
||||
"for property in temp_postcode['Address per EPC'].unique():\n",
|
||||
" property_data = temp_postcode[temp_postcode['Address per EPC'] == property]\n",
|
||||
" property_data.plot.line(x='year', y='c', ax=ax2, color='orange', style=':')\n",
|
||||
"for property in temp_postcode[\"Address per EPC\"].unique():\n",
|
||||
" property_data = temp_postcode[temp_postcode[\"Address per EPC\"] == property]\n",
|
||||
" property_data.plot.line(x=\"year\", y=\"c\", ax=ax2, color=\"orange\", style=\":\")\n",
|
||||
"\n",
|
||||
"temp_address.plot.line(x='year', y='c', ax=ax2, color='red', style=':')\n",
|
||||
"temp_address.plot.line(x=\"year\", y=\"c\", ax=ax2, color=\"red\", style=\":\")\n",
|
||||
"\n",
|
||||
"ax1.set_ylabel('Price per sqm')\n",
|
||||
"ax2.set_ylabel('c')\n",
|
||||
"ax1.set_ylabel(\"Price per sqm\")\n",
|
||||
"ax2.set_ylabel(\"c\")\n",
|
||||
"\n",
|
||||
"plt.show()\n"
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -2640,17 +2663,21 @@
|
|||
],
|
||||
"source": [
|
||||
"# 1. Coefficient of Variation (std/mean) per property; rows with any NaN aggregate are dropped (the 3+ sales filter is currently commented out)\n",
|
||||
"c_stats = data_small.groupby(['Postcode', 'Address per EPC']).agg(\n",
|
||||
" n_sales=('c', 'count'),\n",
|
||||
" year_min=('year', 'min'),\n",
|
||||
" year_max=('year', 'max'),\n",
|
||||
" c_mean=('c', 'mean'),\n",
|
||||
" c_std=('c', 'std'),\n",
|
||||
").dropna()\n",
|
||||
"c_stats['c_cv'] = c_stats['c_std'] / c_stats['c_mean']\n",
|
||||
"c_stats = (\n",
|
||||
" data_small.groupby([\"Postcode\", \"Address per EPC\"])\n",
|
||||
" .agg(\n",
|
||||
" n_sales=(\"c\", \"count\"),\n",
|
||||
" year_min=(\"year\", \"min\"),\n",
|
||||
" year_max=(\"year\", \"max\"),\n",
|
||||
" c_mean=(\"c\", \"mean\"),\n",
|
||||
" c_std=(\"c\", \"std\"),\n",
|
||||
" )\n",
|
||||
" .dropna()\n",
|
||||
")\n",
|
||||
"c_stats[\"c_cv\"] = c_stats[\"c_std\"] / c_stats[\"c_mean\"]\n",
|
||||
"# c_stats_3plus = c_stats[c_stats['n_sales'] >= 3]\n",
|
||||
"# print(f\"Properties with 3+ sales: {len(c_stats_3plus)} / {len(c_stats)}\")\n",
|
||||
"c_stats.sort_values('c_cv', ascending=False).head(20)"
|
||||
"c_stats.sort_values(\"c_cv\", ascending=False).head(20)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -2685,31 +2712,41 @@
|
|||
"\n",
|
||||
"fig, axes = plt.subplots(1, 2, figsize=(12, 4))\n",
|
||||
"\n",
|
||||
"axes[0].hist(c_stats['c_std'], bins=100, edgecolor='black')\n",
|
||||
"axes[0].set_xlabel('Std of c')\n",
|
||||
"axes[0].set_ylabel('Number of properties')\n",
|
||||
"axes[0].set_title('Distribution of c stability (std)')\n",
|
||||
"axes[0].axvline(x=c_stats['c_std'].median(), color='red', linestyle='--', label=f'Median ({c_stats['c_std'].median()}) threshold')\n",
|
||||
"axes[0].hist(c_stats[\"c_std\"], bins=100, edgecolor=\"black\")\n",
|
||||
"axes[0].set_xlabel(\"Std of c\")\n",
|
||||
"axes[0].set_ylabel(\"Number of properties\")\n",
|
||||
"axes[0].set_title(\"Distribution of c stability (std)\")\n",
|
||||
"axes[0].axvline(\n",
|
||||
" x=c_stats[\"c_std\"].median(),\n",
|
||||
" color=\"red\",\n",
|
||||
" linestyle=\"--\",\n",
|
||||
" label=f\"Median ({c_stats['c_std'].median()}) threshold\",\n",
|
||||
")\n",
|
||||
"axes[0].legend()\n",
|
||||
"\n",
|
||||
"axes[1].hist(c_stats['c_cv'], bins=100, edgecolor='black')\n",
|
||||
"axes[1].set_xlabel('CV of c (std/mean)')\n",
|
||||
"axes[1].set_ylabel('Number of properties')\n",
|
||||
"axes[1].set_title('Distribution of c stability (CV)')\n",
|
||||
"axes[1].axvline(x=c_stats['c_cv'].median(), color='red', linestyle='--', label=f'Median ({c_stats['c_cv'].median()}) threshold')\n",
|
||||
"axes[1].hist(c_stats[\"c_cv\"], bins=100, edgecolor=\"black\")\n",
|
||||
"axes[1].set_xlabel(\"CV of c (std/mean)\")\n",
|
||||
"axes[1].set_ylabel(\"Number of properties\")\n",
|
||||
"axes[1].set_title(\"Distribution of c stability (CV)\")\n",
|
||||
"axes[1].axvline(\n",
|
||||
" x=c_stats[\"c_cv\"].median(),\n",
|
||||
" color=\"red\",\n",
|
||||
" linestyle=\"--\",\n",
|
||||
" label=f\"Median ({c_stats['c_cv'].median()}) threshold\",\n",
|
||||
")\n",
|
||||
"axes[1].legend()\n",
|
||||
"\n",
|
||||
"plt.tight_layout()\n",
|
||||
"plt.show()\n",
|
||||
"\n",
|
||||
"# output text\n",
|
||||
"pct_stable = (c_stats['c_cv'] < 0.1).mean() * 100\n",
|
||||
"pct_stable = (c_stats[\"c_cv\"] < 0.1).mean() * 100\n",
|
||||
"print(f\"{pct_stable:.1f}% of properties have CV < 0.1\")\n",
|
||||
"\n",
|
||||
"pct_stable = (c_stats['c_cv'] < 0.2).mean() * 100\n",
|
||||
"pct_stable = (c_stats[\"c_cv\"] < 0.2).mean() * 100\n",
|
||||
"print(f\"{pct_stable:.1f}% of properties have CV < 0.2\")\n",
|
||||
"\n",
|
||||
"pct_stable = (c_stats['c_cv'] < 0.3).mean() * 100\n",
|
||||
"pct_stable = (c_stats[\"c_cv\"] < 0.3).mean() * 100\n",
|
||||
"print(f\"{pct_stable:.1f}% of properties have CV < 0.3\")"
|
||||
]
|
||||
},
|
||||
|
|
@ -3299,7 +3336,7 @@
|
|||
}
|
||||
],
|
||||
"source": [
|
||||
"unstable_c = c_stats.sort_values('c_cv', ascending=False)['c_cv'][:20]\n",
|
||||
"unstable_c = c_stats.sort_values(\"c_cv\", ascending=False)[\"c_cv\"][:20]\n",
|
||||
"display(unstable_c)\n",
|
||||
"\n",
|
||||
"unstable_c_specific = random.randint(0, 20)\n",
|
||||
|
|
@ -3308,41 +3345,43 @@
|
|||
"print(unstable_c.index[unstable_c_specific][1])\n",
|
||||
"\n",
|
||||
"# pc avg trend\n",
|
||||
"temp_pc_avg = pc_avg_complex[pc_avg_complex['Postcode'] == unstable_c.index[unstable_c_specific][0]].sort_values(by='year')\n",
|
||||
"temp_pc_avg = pc_avg_complex[\n",
|
||||
" pc_avg_complex[\"Postcode\"] == unstable_c.index[unstable_c_specific][0]\n",
|
||||
"].sort_values(by=\"year\")\n",
|
||||
"display(temp_pc_avg)\n",
|
||||
"\n",
|
||||
"# c for specific postcode\n",
|
||||
"temp_postcode = data_small[\n",
|
||||
" (data_small['Postcode'] == unstable_c.index[unstable_c_specific][0]) \n",
|
||||
" # & (data_small['Address per EPC'] == unstable_c.index[unstable_c_specific][1]) \n",
|
||||
"].sort_values(by='year')\n",
|
||||
" (data_small[\"Postcode\"] == unstable_c.index[unstable_c_specific][0])\n",
|
||||
" # & (data_small['Address per EPC'] == unstable_c.index[unstable_c_specific][1])\n",
|
||||
"].sort_values(by=\"year\")\n",
|
||||
"display(temp_address)\n",
|
||||
"\n",
|
||||
"# c for specific address\n",
|
||||
"temp_address = data_small[\n",
|
||||
" (data_small['Postcode'] == unstable_c.index[unstable_c_specific][0]) \n",
|
||||
" & (data_small['Address per EPC'] == unstable_c.index[unstable_c_specific][1]) \n",
|
||||
"].sort_values(by='year')\n",
|
||||
" (data_small[\"Postcode\"] == unstable_c.index[unstable_c_specific][0])\n",
|
||||
" & (data_small[\"Address per EPC\"] == unstable_c.index[unstable_c_specific][1])\n",
|
||||
"].sort_values(by=\"year\")\n",
|
||||
"display(temp_address)\n",
|
||||
"\n",
|
||||
"# plot\n",
|
||||
"\n",
|
||||
"fig, ax1 = plt.subplots()\n",
|
||||
"\n",
|
||||
"temp_pc_avg.plot.line(x='year', y='Price per sqm PC AVG', ax=ax1, color='black')\n",
|
||||
"temp_address.plot.line(x='year', y='Price per sqm', ax=ax1, color='green') \n",
|
||||
"temp_pc_avg.plot.line(x=\"year\", y=\"Price per sqm PC AVG\", ax=ax1, color=\"black\")\n",
|
||||
"temp_address.plot.line(x=\"year\", y=\"Price per sqm\", ax=ax1, color=\"green\")\n",
|
||||
"\n",
|
||||
"ax2 = ax1.twinx()\n",
|
||||
"\n",
|
||||
"for property in temp_postcode['Address per EPC'].unique():\n",
|
||||
" property_data = temp_postcode[temp_postcode['Address per EPC'] == property]\n",
|
||||
" property_data.plot.line(x='year', y='c', ax=ax2, color='orange', style=':')\n",
|
||||
"temp_address.plot.line(x='year', y='c', ax=ax2, color='red', style=':')\n",
|
||||
"for property in temp_postcode[\"Address per EPC\"].unique():\n",
|
||||
" property_data = temp_postcode[temp_postcode[\"Address per EPC\"] == property]\n",
|
||||
" property_data.plot.line(x=\"year\", y=\"c\", ax=ax2, color=\"orange\", style=\":\")\n",
|
||||
"temp_address.plot.line(x=\"year\", y=\"c\", ax=ax2, color=\"red\", style=\":\")\n",
|
||||
"\n",
|
||||
"ax1.set_ylabel('Price per sqm')\n",
|
||||
"ax2.set_ylabel('c')\n",
|
||||
"ax1.set_ylabel(\"Price per sqm\")\n",
|
||||
"ax2.set_ylabel(\"c\")\n",
|
||||
"\n",
|
||||
"plt.show()\n"
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -3370,11 +3409,11 @@
|
|||
],
|
||||
"source": [
|
||||
"# select random address\n",
|
||||
"one_property = data_small.sample(1)[['Postcode', 'Address per EPC']].iloc[0]\n",
|
||||
"postcode = one_property['Postcode']\n",
|
||||
"address = one_property['Address per EPC']\n",
|
||||
"print(f'Postcode: {postcode}')\n",
|
||||
"print(f'Address: {address}')"
|
||||
"one_property = data_small.sample(1)[[\"Postcode\", \"Address per EPC\"]].iloc[0]\n",
|
||||
"postcode = one_property[\"Postcode\"]\n",
|
||||
"address = one_property[\"Address per EPC\"]\n",
|
||||
"print(f\"Postcode: {postcode}\")\n",
|
||||
"print(f\"Address: {address}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -3481,22 +3520,21 @@
|
|||
],
|
||||
"source": [
|
||||
"property_data = data_small[\n",
|
||||
" (data_small['Postcode'] == postcode) \n",
|
||||
" & (data_small['Address per EPC'] == address) \n",
|
||||
" (data_small[\"Postcode\"] == postcode) & (data_small[\"Address per EPC\"] == address)\n",
|
||||
"]\n",
|
||||
"latest_year = property_data['year'].max()\n",
|
||||
"print(f'Latest year of data: {latest_year}')\n",
|
||||
"latest_year = property_data[\"year\"].max()\n",
|
||||
"print(f\"Latest year of data: {latest_year}\")\n",
|
||||
"\n",
|
||||
"# Get only the latest year's data for this property (this is what we want to predict)\n",
|
||||
"data_small_test = property_data[property_data['year'] == latest_year]\n",
|
||||
"data_small_test = property_data[property_data[\"year\"] == latest_year]\n",
|
||||
"\n",
|
||||
"# Remove only the latest year's data from training (keep historical data for this property)\n",
|
||||
"data_small_train = data_small.drop(data_small_test.index)\n",
|
||||
"\n",
|
||||
"print()\n",
|
||||
"print(f'data_small.shape = {data_small.shape}')\n",
|
||||
"print(f'data_small_train.shape = {data_small_train.shape}')\n",
|
||||
"print(f'data_small_test.shape = {data_small_test.shape}')\n",
|
||||
"print(f\"data_small.shape = {data_small.shape}\")\n",
|
||||
"print(f\"data_small_train.shape = {data_small_train.shape}\")\n",
|
||||
"print(f\"data_small_test.shape = {data_small_test.shape}\")\n",
|
||||
"display(data_small_test)\n",
|
||||
"data_small.shape[0] == data_small_test.shape[0] + data_small_train.shape[0]"
|
||||
]
|
||||
|
|
@ -3607,9 +3645,9 @@
|
|||
"# get latest c in data_small_train\n",
|
||||
"\n",
|
||||
"latest_train_address = data_small_train[\n",
|
||||
" (data_small_train['Postcode'] == postcode) \n",
|
||||
" & (data_small_train['Address per EPC'] == address) \n",
|
||||
"].sort_values(by='year')\n",
|
||||
" (data_small_train[\"Postcode\"] == postcode)\n",
|
||||
" & (data_small_train[\"Address per EPC\"] == address)\n",
|
||||
"].sort_values(by=\"year\")\n",
|
||||
"\n",
|
||||
"latest_train_address"
|
||||
]
|
||||
|
|
@ -3630,10 +3668,10 @@
|
|||
}
|
||||
],
|
||||
"source": [
|
||||
"latest_train_c = latest_train_address['c'].iloc[-1]\n",
|
||||
"latest_train_pc_avg = latest_train_address['Price per sqm PC AVG'].iloc[-1]\n",
|
||||
"print(f'Latest c in training data: {latest_train_c:.3f}')\n",
|
||||
"print(f'Latest price per sqm in training data: {latest_train_pc_avg:.2f}') "
|
||||
"latest_train_c = latest_train_address[\"c\"].iloc[-1]\n",
|
||||
"latest_train_pc_avg = latest_train_address[\"Price per sqm PC AVG\"].iloc[-1]\n",
|
||||
"print(f\"Latest c in training data: {latest_train_c:.3f}\")\n",
|
||||
"print(f\"Latest price per sqm in training data: {latest_train_pc_avg:.2f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -3654,7 +3692,7 @@
|
|||
}
|
||||
],
|
||||
"source": [
|
||||
"latest_train_c * latest_train_pc_avg * data_small_test['Total floor area (sqm)'].iloc[0]"
|
||||
"latest_train_c * latest_train_pc_avg * data_small_test[\"Total floor area (sqm)\"].iloc[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue