perfect-postcode/pipeline/transform/price_estimate.py
2026-02-15 22:39:53 +00:00

414 lines
15 KiB
Python

"""Augment wide.parquet with an estimated current price column.
Joins the precomputed repeat-sales price index (from price_index.py) with each
property's last known sale to produce an inflation-adjusted current price estimate.
Uses type-stratified index when available, falling back to "All" type.
Optionally applies renovation premiums from renovation_premium.py: for properties
with post-sale renovation events, the estimated price is adjusted upward based on
data-driven per-area premiums with time decay.
Modifies wide.parquet in-place, adding the "Estimated current price" and "Est. price per sqm" columns.
"""
import argparse
import json
import math
from pathlib import Path
import numpy as np
import polars as pl
from pipeline.transform._price_utils import (
CURRENT_YEAR,
sector_expr,
type_group_expr,
)
# Half-life, in years, of a renovation premium: the log premium is scaled by
# exp(-DECAY_RATE * years_since_event), so it halves every HALF_LIFE years.
HALF_LIFE = 10.0
# Exponential decay constant chosen so that exp(-DECAY_RATE * HALF_LIFE) == 0.5.
DECAY_RATE = math.log(2) / HALF_LIFE
def _parse_args() -> argparse.Namespace:
    """Parse the command-line options of the price-estimate step."""
    parser = argparse.ArgumentParser(
        description="Augment wide.parquet with estimated current prices"
    )
    parser.add_argument(
        "--input",
        type=Path,
        required=True,
        help="Path to wide.parquet (modified in-place)",
    )
    parser.add_argument(
        "--index", type=Path, required=True, help="Path to price_index.parquet"
    )
    parser.add_argument(
        "--renovation-premium",
        type=Path,
        default=None,
        help="Path to renovation_premium.parquet (optional)",
    )
    parser.add_argument(
        "--hedonic-model",
        type=Path,
        default=None,
        help="Path to hedonic_model.json (optional)",
    )
    return parser.parse_args()


def _apply_price_index(
    df: "pl.DataFrame", index: "pl.DataFrame", has_price: "pl.Expr"
) -> "pl.DataFrame":
    """Add "Estimated current price" by rolling the last sale forward with the index.

    Expects ``df`` to carry the helper columns ``_sector``, ``_sale_year`` and
    ``_type_group``. When ``index`` has a ``type_group`` column the
    type-specific index is used, falling back to the "All" rows wherever the
    typed value is missing. The estimate is
    ``last_price * exp(log_index_current - log_index_sale)`` and stays null for
    rows failing ``has_price`` or lacking either index value.
    """
    has_type_group = "type_group" in index.columns
    if has_type_group:
        print(
            f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
            f"{index['type_group'].n_unique()} type groups"
        )
    else:
        print(
            f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors (unstratified)"
        )

    print("\nApplying repeat-sales index...")
    if has_type_group:
        idx_typed = index.filter(pl.col("type_group") != "All")
        idx_all = index.filter(pl.col("type_group") == "All")
        # Type-specific index at sale year
        df = df.join(
            idx_typed.select(
                "sector",
                "type_group",
                "year",
                pl.col("log_index").alias("log_idx_sale_typed"),
            ),
            left_on=["_sector", "_type_group", "_sale_year"],
            right_on=["sector", "type_group", "year"],
            how="left",
        )
        # "All" index at sale year
        df = df.join(
            idx_all.select(
                "sector", "year", pl.col("log_index").alias("log_idx_sale_all")
            ),
            left_on=["_sector", "_sale_year"],
            right_on=["sector", "year"],
            how="left",
        )
        # Type-specific index at current year
        df = df.join(
            idx_typed.filter(pl.col("year") == CURRENT_YEAR).select(
                "sector", "type_group", pl.col("log_index").alias("log_idx_cur_typed")
            ),
            left_on=["_sector", "_type_group"],
            right_on=["sector", "type_group"],
            how="left",
        )
        # "All" index at current year
        df = df.join(
            idx_all.filter(pl.col("year") == CURRENT_YEAR).select(
                "sector", pl.col("log_index").alias("log_idx_cur_all")
            ),
            left_on="_sector",
            right_on="sector",
            how="left",
        )
        # Prefer the typed index, fall back to "All" where it is missing
        df = df.with_columns(
            pl.col("log_idx_sale_typed")
            .fill_null(pl.col("log_idx_sale_all"))
            .alias("_log_index_sale"),
            pl.col("log_idx_cur_typed")
            .fill_null(pl.col("log_idx_cur_all"))
            .alias("_log_index_current"),
        )
    else:
        df = df.join(
            index.select(
                "sector", "year", pl.col("log_index").alias("_log_index_sale")
            ),
            left_on=["_sector", "_sale_year"],
            right_on=["sector", "year"],
            how="left",
        )
        index_current = index.filter(pl.col("year") == CURRENT_YEAR).select(
            "sector", pl.col("log_index").alias("_log_index_current")
        )
        df = df.join(index_current, left_on="_sector", right_on="sector", how="left")

    # Compute estimate — only for rows with a known price
    df = df.with_columns(
        pl.when(has_price)
        .then(
            pl.col("Last known price").cast(pl.Float64)
            * (pl.col("_log_index_current") - pl.col("_log_index_sale")).exp()
        )
        .otherwise(pl.lit(None))
        .alias("Estimated current price"),
    )

    # Count rows that actually received an estimate: that needs BOTH the
    # sale-year and the current-year index value, not just the sale-year one.
    n_adjusted = df.height - df["Estimated current price"].null_count()
    n_with_price = df.filter(has_price).height
    print(
        f" {n_adjusted:,} of {n_with_price:,} properties adjusted by index ({n_adjusted / max(n_with_price, 1) * 100:.1f}%)"
    )
    return df


def _apply_hedonic_blend(
    df: "pl.DataFrame", model_path: Path, has_price: "pl.Expr"
) -> "pl.DataFrame":
    """Blend the index estimate with a per-type hedonic prediction in log space.

    The JSON model provides ``type_models`` (per type group: ``beta_fa``,
    ``sector_intercepts``, ``national_intercept``) and an optional ``tau``
    (default 15.0). For eligible rows the hedonic log price is
    ``beta_fa * log(max(floor_area, 1)) + intercept`` and the blend weight is
    ``hold_years / (hold_years + tau)``, so longer holds lean more on the
    hedonic prediction. Rows whose type group has no model keep the index
    estimate. Adds a ``type_group`` scratch column that the caller drops.
    """
    print("\nApplying hedonic blending...")
    with open(model_path, encoding="utf-8") as f:
        model = json.load(f)
    type_models = model["type_models"]
    tau = model.get("tau", 15.0)
    print(f" tau = {tau}, {len(type_models)} type models")

    # Add type_group for per-type lookup
    df = df.with_columns(type_group_expr())
    hedonic_mask = (
        has_price
        & pl.col("Estimated current price").is_not_null()
        & pl.col("Total floor area (sqm)").is_not_null()
        & (pl.col("Total floor area (sqm)") > 0)
        & pl.col("type_group").is_not_null()
    )
    eligible = df.filter(hedonic_mask)
    if eligible.height == 0:
        print(" No eligible properties for hedonic blending")
        return df

    log_fa = np.log(
        np.maximum(
            eligible["Total floor area (sqm)"].to_numpy().astype(np.float64),
            1.0,
        )
    )
    sectors = eligible["_sector"].to_list()
    types = eligible["type_group"].to_list()

    # Per-type hedonic prediction; NaN marks rows without a model for their type
    log_hedonic = np.full(eligible.height, np.nan)
    for i, (type_group, sector) in enumerate(zip(types, sectors)):
        tm = type_models.get(type_group)
        if tm is None:
            continue
        alpha = tm["sector_intercepts"].get(sector, tm["national_intercept"])
        log_hedonic[i] = tm["beta_fa"] * log_fa[i] + alpha
    valid = np.isfinite(log_hedonic)

    # Hold years and blend weight
    sale_years = eligible["_sale_year"].to_numpy().astype(np.float64)
    hold_years = np.maximum(CURRENT_YEAR - sale_years, 0.0)
    blend_w = hold_years / (hold_years + tau)

    # Blend in log space
    log_index_est = np.log(
        eligible["Estimated current price"].to_numpy().astype(np.float64)
    )
    log_blended = np.where(
        valid,
        (1 - blend_w) * log_index_est + blend_w * log_hedonic,
        log_index_est,
    )
    blended_prices = np.exp(log_blended)

    # Write back into df. The NumPy round-trip turns null estimates into NaN,
    # so convert NaN back to null afterwards; otherwise rows without an
    # estimate would look non-null to the later renovation / per-sqm steps.
    eligible_idx = df.select(hedonic_mask).to_series().arg_true().to_numpy()
    price_arr = df["Estimated current price"].to_numpy().astype(np.float64)
    price_arr[eligible_idx] = blended_prices
    df = df.with_columns(
        pl.Series("Estimated current price", price_arr, dtype=pl.Float64).fill_nan(
            None
        ),
    )

    n_blended = int(valid.sum())
    # Avoid the mean of an empty selection when no type model matched
    avg_w = float(np.mean(blend_w[valid])) if n_blended else float("nan")
    print(
        f" {n_blended:,} properties with hedonic blending (avg blend weight: {avg_w:.3f})"
    )
    return df


def _apply_renovation_premiums(
    df: "pl.DataFrame", premium_path: Path
) -> "pl.DataFrame":
    """Scale estimates by time-decayed premiums for post-sale renovation events.

    For each property only the most recent event of each type dated after the
    sale year is used. Its ``log_premium`` (type-specific row first, then the
    "All" row, else 0) is multiplied by ``exp(-DECAY_RATE * years_since_event)``
    and the results are summed per property; the estimate is multiplied by the
    exponential of that sum. Adds a ``_reno_log_premium`` scratch column.
    """
    print("\nApplying renovation premiums...")
    reno_prem = pl.read_parquet(premium_path)
    print(f" Loaded {len(reno_prem):,} premium rows")

    # Find properties with post-sale renovation events
    has_reno = (
        pl.col("renovation_history").is_not_null()
        & (pl.col("renovation_history").list.len() > 0)
        & pl.col("Estimated current price").is_not_null()
    )
    # Number the rows BEFORE filtering so _row_idx is the real position in df
    # and no filtered-index -> df-index remapping is needed later.
    reno_rows = (
        df.with_row_index("_row_idx")
        .lazy()
        .filter(has_reno)
        .select(
            "_row_idx", "_sector", "_type_group", "_sale_year", "renovation_history"
        )
        .explode("renovation_history")
        .with_columns(
            pl.col("renovation_history").struct.field("year").alias("_event_year"),
            pl.col("renovation_history").struct.field("event").alias("_event_type"),
        )
        .filter(pl.col("_event_year") > pl.col("_sale_year"))
        .collect()
    )
    if reno_rows.height == 0:
        print(" No properties with post-sale renovation events")
        return df

    # Take most recent event per (row, event_type), then its time decay.
    # Elapsed years are clamped at 0 so an event dated after CURRENT_YEAR
    # cannot produce a decay factor above 1 (mirrors the hold-years clamp
    # used for hedonic blending).
    latest = (
        reno_rows.group_by("_row_idx", "_event_type", "_sector", "_type_group")
        .agg(pl.col("_event_year").max().alias("_event_year"))
        .with_columns(
            (
                -DECAY_RATE
                * (CURRENT_YEAR - pl.col("_event_year").cast(pl.Float64)).clip(
                    lower_bound=0.0
                )
            )
            .exp()
            .alias("_decay"),
        )
    )

    # Join with renovation_premium.parquet — try typed first, fall back to "All"
    rp_typed = reno_prem.filter(pl.col("type_group") != "All")
    rp_all = reno_prem.filter(pl.col("type_group") == "All")
    latest = (
        latest.join(
            rp_typed.select(
                "sector",
                "type_group",
                "event_type",
                pl.col("log_premium").alias("_lp_typed"),
            ),
            left_on=["_sector", "_type_group", "_event_type"],
            right_on=["sector", "type_group", "event_type"],
            how="left",
        )
        .join(
            rp_all.select(
                "sector", "event_type", pl.col("log_premium").alias("_lp_all")
            ),
            left_on=["_sector", "_event_type"],
            right_on=["sector", "event_type"],
            how="left",
        )
        .with_columns(
            pl.col("_lp_typed")
            .fill_null(pl.col("_lp_all"))
            .fill_null(0.0)
            .alias("_log_premium"),
        )
    )

    # Total decayed log premium per property
    per_property = (
        latest.with_columns(
            (pl.col("_log_premium") * pl.col("_decay")).alias("_decayed_lp"),
        )
        .group_by("_row_idx")
        .agg(pl.col("_decayed_lp").sum().alias("_reno_log_premium"))
    )

    # Scatter the per-property premiums into a full-length column of zeros
    reno_log_prem = np.zeros(df.height, dtype=np.float64)
    reno_log_prem[per_property["_row_idx"].to_numpy()] = (
        per_property["_reno_log_premium"].fill_null(0.0).to_numpy()
    )
    df = df.with_columns(
        pl.Series("_reno_log_premium", reno_log_prem, dtype=pl.Float64),
    )

    # Apply: multiply estimated price by exp(reno_log_premium) where the
    # premium is non-zero (negative premiums are applied as well)
    df = df.with_columns(
        pl.when(pl.col("_reno_log_premium") != 0.0)
        .then(
            pl.col("Estimated current price") * pl.col("_reno_log_premium").exp()
        )
        .otherwise(pl.col("Estimated current price"))
        .alias("Estimated current price"),
    )

    # Report only properties whose estimate actually changed
    applied = per_property.filter(pl.col("_reno_log_premium") != 0.0)[
        "_reno_log_premium"
    ]
    print(f" {applied.len():,} properties with renovation premium applied")
    if applied.len() > 0:
        # Guarded: the mean of an empty series is None and math.exp(None) raises
        avg_multiplier = math.exp(applied.mean())
        print(
            f" Average premium multiplier: {avg_multiplier:.3f} ({avg_multiplier - 1:.1%} uplift)"
        )
    return df


def _add_price_per_sqm(df: "pl.DataFrame") -> "pl.DataFrame":
    """Add "Est. price per sqm" (Int32), null where it cannot be computed.

    The value is null when either input is null, when the floor area is not
    positive, or when the quotient is not finite, so the integer cast never
    sees inf/NaN.
    """
    floor_area = pl.col("Total floor area (sqm)")
    per_sqm = pl.col("Estimated current price") / floor_area
    return df.with_columns(
        pl.when((floor_area > 0) & per_sqm.is_finite())
        .then(per_sqm.round(0))
        .otherwise(pl.lit(None))
        .cast(pl.Int32)
        .alias("Est. price per sqm"),
    )


def main() -> None:
    """Augment wide.parquet in-place with estimated current prices.

    Steps: load the input, derive join helper columns, apply the repeat-sales
    index, optionally blend with the hedonic model (``--hedonic-model``),
    optionally apply renovation premiums (``--renovation-premium``), derive the
    price per sqm, drop all scratch columns and overwrite the input file.
    """
    args = _parse_args()

    print("Loading wide.parquet...")
    df = pl.read_parquet(args.input)
    print(f" {len(df):,} rows, {len(df.columns)} columns")

    # Drop existing estimated columns if re-running
    stale = [
        col
        for col in ("Estimated current price", "Est. price per sqm")
        if col in df.columns
    ]
    if stale:
        df = df.drop(stale)

    # Rows that can receive an estimate at all
    has_price = (
        pl.col("Last known price").is_not_null()
        & pl.col("Postcode").is_not_null()
        & pl.col("Date of last transaction").is_not_null()
    )
    # Derive helper columns for the joins
    df = df.with_columns(
        sector_expr().alias("_sector"),
        pl.col("Date of last transaction").dt.year().alias("_sale_year"),
        type_group_expr().alias("_type_group"),
    )

    index = pl.read_parquet(args.index)
    df = _apply_price_index(df, index, has_price)

    if args.hedonic_model is not None:
        df = _apply_hedonic_blend(df, args.hedonic_model, has_price)

    if args.renovation_premium is not None:
        df = _apply_renovation_premiums(df, args.renovation_premium)

    df = _add_price_per_sqm(df)

    # Drop all temporary columns
    temp_cols = [c for c in df.columns if c.startswith("_") or c.startswith("log_idx_")]
    # Also drop hedonic-derived column if it was added
    if "type_group" in df.columns:
        temp_cols.append("type_group")
    df = df.drop(temp_cols)

    df.write_parquet(args.input)
    size_mb = args.input.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
    print(
        f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
    )
# Run the pipeline step only when executed as a script, not when imported.
if __name__ == "__main__":
    main()