414 lines
15 KiB
Python
414 lines
15 KiB
Python
"""Augment wide.parquet with an estimated current price column.
|
|
|
|
Joins the precomputed repeat-sales price index (from price_index.py) with each
|
|
property's last known sale to produce an inflation-adjusted current price estimate.
|
|
Uses type-stratified index when available, falling back to "All" type.
|
|
|
|
Optionally applies renovation premiums from renovation_premium.py: for properties
|
|
with post-sale renovation events, the estimated price is adjusted upward based on
|
|
data-driven per-area premiums with time decay.
|
|
|
|
Modifies wide.parquet in-place, adding the "Estimated current price" column.
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import math
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import polars as pl
|
|
|
|
from pipeline.transform._price_utils import (
|
|
CURRENT_YEAR,
|
|
sector_expr,
|
|
type_group_expr,
|
|
)
|
|
|
|
# Half-life, in years, of the exponential time decay applied to renovation premiums.
HALF_LIFE = 10.0
# Continuous decay rate chosen so that exp(-DECAY_RATE * HALF_LIFE) == 0.5.
DECAY_RATE = math.log(2) / HALF_LIFE
|
|
|
|
|
|
def _parse_args() -> argparse.Namespace:
    """Parse the command-line arguments of the augmentation script."""
    parser = argparse.ArgumentParser(
        description="Augment wide.parquet with estimated current prices"
    )
    parser.add_argument(
        "--input",
        type=Path,
        required=True,
        help="Path to wide.parquet (modified in-place)",
    )
    parser.add_argument(
        "--index", type=Path, required=True, help="Path to price_index.parquet"
    )
    parser.add_argument(
        "--renovation-premium",
        type=Path,
        default=None,
        help="Path to renovation_premium.parquet (optional)",
    )
    parser.add_argument(
        "--hedonic-model",
        type=Path,
        default=None,
        help="Path to hedonic_model.json (optional)",
    )
    return parser.parse_args()


def _join_price_index(df: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
    """Attach ``_log_index_sale`` and ``_log_index_current`` to every row of *df*.

    *df* must already carry the ``_sector``, ``_sale_year`` and ``_type_group``
    helper columns. When *index* has a ``type_group`` column, the type-specific
    index is preferred and the "All" rows are used as a fallback; otherwise the
    index is joined on sector (and year) only.
    """
    if "type_group" not in index.columns:
        # Unstratified index: one series per sector.
        df = df.join(
            index.select(
                "sector", "year", pl.col("log_index").alias("_log_index_sale")
            ),
            left_on=["_sector", "_sale_year"],
            right_on=["sector", "year"],
            how="left",
        )
        index_current = index.filter(pl.col("year") == CURRENT_YEAR).select(
            "sector", pl.col("log_index").alias("_log_index_current")
        )
        return df.join(index_current, left_on="_sector", right_on="sector", how="left")

    idx_typed = index.filter(pl.col("type_group") != "All")
    idx_all = index.filter(pl.col("type_group") == "All")

    # Join type-specific index at sale year
    df = df.join(
        idx_typed.select(
            "sector",
            "type_group",
            "year",
            pl.col("log_index").alias("log_idx_sale_typed"),
        ),
        left_on=["_sector", "_type_group", "_sale_year"],
        right_on=["sector", "type_group", "year"],
        how="left",
    )
    # Join "All" index at sale year
    df = df.join(
        idx_all.select(
            "sector", "year", pl.col("log_index").alias("log_idx_sale_all")
        ),
        left_on=["_sector", "_sale_year"],
        right_on=["sector", "year"],
        how="left",
    )
    # Join type-specific index at current year
    df = df.join(
        idx_typed.filter(pl.col("year") == CURRENT_YEAR).select(
            "sector", "type_group", pl.col("log_index").alias("log_idx_cur_typed")
        ),
        left_on=["_sector", "_type_group"],
        right_on=["sector", "type_group"],
        how="left",
    )
    # Join "All" index at current year
    df = df.join(
        idx_all.filter(pl.col("year") == CURRENT_YEAR).select(
            "sector", pl.col("log_index").alias("log_idx_cur_all")
        ),
        left_on="_sector",
        right_on="sector",
        how="left",
    )

    # Prefer the typed index, fall back to "All" where the typed one is missing.
    return df.with_columns(
        pl.col("log_idx_sale_typed")
        .fill_null(pl.col("log_idx_sale_all"))
        .alias("_log_index_sale"),
        pl.col("log_idx_cur_typed")
        .fill_null(pl.col("log_idx_cur_all"))
        .alias("_log_index_current"),
    )


def _apply_hedonic_blending(
    df: pl.DataFrame, has_price: pl.Expr, model_path: Path
) -> pl.DataFrame:
    """Blend the index-based estimate with a per-type hedonic prediction.

    The blend happens in log space with weight ``hold_years / (hold_years + tau)``
    on the hedonic prediction, so older sales lean more on the hedonic model.
    Rows that are not eligible (no price, no estimate, no positive floor area,
    no type group) keep their value untouched -- in particular nulls stay null.
    """
    with open(model_path, encoding="utf-8") as f:
        model = json.load(f)
    type_models = model["type_models"]
    tau = model.get("tau", 15.0)
    print(f" tau = {tau}, {len(type_models)} type models")

    # `_type_group` was derived earlier from the same type_group_expr(), so it is
    # reused here instead of materialising a second, identical column.
    hedonic_mask = (
        has_price
        & pl.col("Estimated current price").is_not_null()
        & pl.col("Total floor area (sqm)").is_not_null()
        & (pl.col("Total floor area (sqm)") > 0)
        & pl.col("_type_group").is_not_null()
    )
    is_eligible = df.select(hedonic_mask.alias("_hedonic_eligible")).to_series()
    eligible = df.filter(hedonic_mask)

    if eligible.is_empty():
        print(" No eligible properties for hedonic blending")
        return df

    log_fa = np.log(
        np.maximum(
            eligible["Total floor area (sqm)"].to_numpy().astype(np.float64),
            1.0,
        )
    )

    # Per-type hedonic prediction; NaN marks rows whose type has no model.
    log_hedonic = np.full(eligible.height, np.nan, dtype=np.float64)
    type_and_sector = zip(
        eligible["_type_group"].to_list(), eligible["_sector"].to_list()
    )
    for i, (type_group, sector) in enumerate(type_and_sector):
        tm = type_models.get(type_group)
        if tm is None:
            continue
        alpha = tm["sector_intercepts"].get(sector, tm["national_intercept"])
        log_hedonic[i] = tm["beta_fa"] * log_fa[i] + alpha

    valid = np.isfinite(log_hedonic)

    # Hold years and blend weight
    sale_years = eligible["_sale_year"].to_numpy().astype(np.float64)
    hold_years = np.maximum(CURRENT_YEAR - sale_years, 0.0)
    blend_w = hold_years / (hold_years + tau)

    # Blend in log space; rows without a usable hedonic prediction keep the
    # pure index estimate.
    log_index_est = np.log(
        eligible["Estimated current price"].to_numpy().astype(np.float64)
    )
    log_blended = np.where(
        valid,
        (1 - blend_w) * log_index_est + blend_w * log_hedonic,
        log_index_est,
    )

    # Scatter the blended prices back to their original row positions. Only
    # eligible rows are overwritten, so null estimates elsewhere are preserved
    # as nulls instead of being turned into NaN by a numpy round-trip.
    blended_full = np.zeros(df.height, dtype=np.float64)
    blended_full[is_eligible.arg_true().to_numpy()] = np.exp(log_blended)
    df = (
        df.with_columns(
            is_eligible,
            pl.Series("_hedonic_blended", blended_full, dtype=pl.Float64),
        )
        .with_columns(
            pl.when(pl.col("_hedonic_eligible"))
            .then(pl.col("_hedonic_blended"))
            .otherwise(pl.col("Estimated current price"))
            .alias("Estimated current price"),
        )
        .drop("_hedonic_eligible", "_hedonic_blended")
    )

    n_blended = int(valid.sum())
    # Avoid numpy's empty-slice warning when no row had a usable type model.
    avg_w = float(np.mean(blend_w[valid])) if n_blended else float("nan")
    print(
        f" {n_blended:,} properties with hedonic blending (avg blend weight: {avg_w:.3f})"
    )
    return df


def _apply_renovation_premiums(df: pl.DataFrame, premium_path: Path) -> pl.DataFrame:
    """Scale the estimate by time-decayed premiums for post-sale renovations.

    For every property, the most recent post-sale event of each event type is
    looked up in the premium table (type-specific first, then "All", else 0),
    decayed with ``exp(-DECAY_RATE * years_since_event)`` and summed; the
    estimate is multiplied by ``exp`` of that sum. A temporary
    ``_reno_log_premium`` column is left on the frame for the caller to drop.
    """
    reno_prem = pl.read_parquet(premium_path)
    print(f" Loaded {len(reno_prem):,} premium rows")

    # Find properties with renovation events and an estimate to adjust
    has_reno = (
        pl.col("renovation_history").is_not_null()
        & (pl.col("renovation_history").list.len() > 0)
        & pl.col("Estimated current price").is_not_null()
    )

    # Explode renovation events, filter to post-sale only. The row index is
    # assigned before filtering so it addresses rows of `df` directly.
    reno_rows = (
        df.lazy()
        .with_row_index("_row_idx")
        .filter(has_reno)
        .select("_row_idx", "_sector", "_type_group", "_sale_year", "renovation_history")
        .explode("renovation_history")
        .with_columns(
            pl.col("renovation_history").struct.field("year").alias("_event_year"),
            pl.col("renovation_history").struct.field("event").alias("_event_type"),
        )
        .filter(pl.col("_event_year") > pl.col("_sale_year"))
        .collect()
    )

    if reno_rows.is_empty():
        print(" No properties with post-sale renovation events")
        return df

    # Take most recent event per (row, event_type)
    latest = (
        reno_rows.lazy()
        .group_by("_row_idx", "_event_type", "_sector", "_type_group")
        .agg(pl.col("_event_year").max().alias("_event_year"))
        .collect()
    )

    # Compute time-decayed premium
    latest = latest.with_columns(
        (-DECAY_RATE * (CURRENT_YEAR - pl.col("_event_year")).cast(pl.Float64))
        .exp()
        .alias("_decay"),
    )

    # Join with renovation_premium.parquet — try typed first, fall back to "All"
    rp_typed = reno_prem.filter(pl.col("type_group") != "All")
    rp_all = reno_prem.filter(pl.col("type_group") == "All")

    latest = (
        latest.join(
            rp_typed.select(
                "sector",
                "type_group",
                "event_type",
                pl.col("log_premium").alias("_lp_typed"),
            ),
            left_on=["_sector", "_type_group", "_event_type"],
            right_on=["sector", "type_group", "event_type"],
            how="left",
        )
        .join(
            rp_all.select(
                "sector", "event_type", pl.col("log_premium").alias("_lp_all")
            ),
            left_on=["_sector", "_event_type"],
            right_on=["sector", "event_type"],
            how="left",
        )
        .with_columns(
            pl.col("_lp_typed")
            .fill_null(pl.col("_lp_all"))
            .fill_null(0.0)
            .alias("_log_premium"),
        )
    )

    # Compute total decayed log premium per property
    per_property = (
        latest.lazy()
        .with_columns(
            (pl.col("_log_premium") * pl.col("_decay")).alias("_decayed_lp"),
        )
        .group_by("_row_idx")
        .agg(pl.col("_decayed_lp").sum().alias("_reno_log_premium"))
        .collect()
    )

    # Full-length column of zeros with the per-property premiums scattered in.
    reno_log_prem = np.zeros(df.height, dtype=np.float64)
    reno_log_prem[per_property["_row_idx"].to_numpy()] = (
        per_property["_reno_log_premium"].fill_null(0.0).to_numpy()
    )
    df = df.with_columns(
        pl.Series("_reno_log_premium", reno_log_prem, dtype=pl.Float64),
    )

    # Apply: multiply estimated price by exp(reno_log_premium) where the
    # premium is non-zero (it may be negative).
    df = df.with_columns(
        pl.when(pl.col("_reno_log_premium") != 0.0)
        .then(
            pl.col("Estimated current price") * pl.col("_reno_log_premium").exp()
        )
        .otherwise(pl.col("Estimated current price"))
        .alias("Estimated current price"),
    )

    # Report only properties whose estimate actually changed; unmatched events
    # resolve to a zero premium and leave the price untouched.
    applied = per_property["_reno_log_premium"].filter(
        per_property["_reno_log_premium"] != 0.0
    )
    print(f" {applied.len():,} properties with renovation premium applied")
    if applied.len() > 0:
        # Guarded: mean() of an empty series is None and math.exp(None) raises.
        avg_multiplier = math.exp(applied.mean())
        print(
            f" Average premium multiplier: {avg_multiplier:.3f} ({avg_multiplier - 1:.1%} uplift)"
        )
    return df


def main():
    """Add estimated current price columns to wide.parquet, in place.

    Steps: load the input, drop estimate columns left by a previous run, join
    the repeat-sales index and scale each last known price by
    ``exp(log_index_current - log_index_sale)``, optionally blend with the
    hedonic model, optionally apply renovation premiums, derive the price per
    square metre, drop temporary columns and overwrite ``--input``.
    """
    args = _parse_args()

    print("Loading wide.parquet...")
    df = pl.read_parquet(args.input)
    print(f" {len(df):,} rows, {len(df.columns)} columns")

    # Drop existing estimated columns if re-running
    for col in ["Estimated current price", "Est. price per sqm"]:
        if col in df.columns:
            df = df.drop(col)

    # Rows for which an estimate can be attempted at all
    has_price = (
        pl.col("Last known price").is_not_null()
        & pl.col("Postcode").is_not_null()
        & pl.col("Date of last transaction").is_not_null()
    )

    # Derive helper columns for the join
    df = df.with_columns(
        sector_expr().alias("_sector"),
        pl.col("Date of last transaction").dt.year().alias("_sale_year"),
        type_group_expr().alias("_type_group"),
    )

    index = pl.read_parquet(args.index)
    if "type_group" in index.columns:
        print(
            f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
            f"{index['type_group'].n_unique()} type groups"
        )
    else:
        print(
            f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors (unstratified)"
        )

    print("\nApplying repeat-sales index...")
    df = _join_price_index(df, index)

    # Compute estimate — only for rows with a known price. Rows without a
    # matching index entry end up null because the log-index difference is null.
    df = df.with_columns(
        pl.when(has_price)
        .then(
            pl.col("Last known price").cast(pl.Float64)
            * (pl.col("_log_index_current") - pl.col("_log_index_sale")).exp()
        )
        .otherwise(pl.lit(None))
        .alias("Estimated current price"),
    )

    n_adjusted = df.filter(has_price & pl.col("_log_index_sale").is_not_null()).height
    n_with_price = df.filter(has_price).height
    print(
        f" {n_adjusted:,} of {n_with_price:,} properties adjusted by index ({n_adjusted / max(n_with_price, 1) * 100:.1f}%)"
    )

    # Apply hedonic blending if model provided
    if args.hedonic_model is not None:
        print("\nApplying hedonic blending...")
        df = _apply_hedonic_blending(df, has_price, args.hedonic_model)

    # Apply renovation premiums if provided
    if args.renovation_premium is not None:
        print("\nApplying renovation premiums...")
        df = _apply_renovation_premiums(df, args.renovation_premium)

    # Derive estimated price per sqm where both estimated price and floor area
    # exist. Zero/negative areas and non-finite prices become null so the
    # integer cast never sees inf or NaN.
    floor_area = pl.col("Total floor area (sqm)")
    est_price = pl.col("Estimated current price")
    df = df.with_columns(
        pl.when((floor_area > 0) & est_price.is_finite())
        .then((est_price / floor_area).round(0))
        .otherwise(None)
        .cast(pl.Int32)
        .alias("Est. price per sqm"),
    )

    # Drop all temporary columns
    temp_cols = [c for c in df.columns if c.startswith(("_", "log_idx_"))]
    # Also drop a `type_group` helper column if one is present
    if "type_group" in df.columns:
        temp_cols.append("type_group")
    df = df.drop(temp_cols)

    df.write_parquet(args.input)
    size_mb = args.input.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
    print(
        f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
    )
|
|
|
|
|
|
# Run the augmentation only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|