"""Augment wide.parquet with an estimated current price column.

Joins the precomputed repeat-sales price index (from price_index.py) with each
property's last known sale to produce an inflation-adjusted current price
estimate. Uses type-stratified index when available, falling back to "All" type.

Optionally blends the index estimate with a hedonic model prediction
(--hedonic-model) and applies renovation premiums from renovation_premium.py
(--renovation-premium): for properties with post-sale renovation events, the
estimated price is adjusted based on data-driven per-area premiums with time
decay.

Modifies wide.parquet in-place, adding the "Estimated current price" and
"Est. price per sqm" columns.
"""

import argparse
import json
import math
from pathlib import Path

import numpy as np
import polars as pl

from pipeline.transform._price_utils import (
    CURRENT_YEAR,
    sector_expr,
    type_group_expr,
)

# A renovation premium is multiplied by exp(-DECAY_RATE * years_since_event),
# i.e. it halves every HALF_LIFE years.
HALF_LIFE = 10.0
DECAY_RATE = math.log(2) / HALF_LIFE

# Column names shared by the helpers below.
_EST_PRICE = "Estimated current price"
_EST_PRICE_PER_SQM = "Est. price per sqm"
_FLOOR_AREA = "Total floor area (sqm)"


def _parse_args() -> argparse.Namespace:
    """Parse the command-line arguments of the script."""
    parser = argparse.ArgumentParser(
        description="Augment wide.parquet with estimated current prices"
    )
    parser.add_argument(
        "--input",
        type=Path,
        required=True,
        help="Path to wide.parquet (modified in-place)",
    )
    parser.add_argument(
        "--index", type=Path, required=True, help="Path to price_index.parquet"
    )
    parser.add_argument(
        "--renovation-premium",
        type=Path,
        default=None,
        help="Path to renovation_premium.parquet (optional)",
    )
    parser.add_argument(
        "--hedonic-model",
        type=Path,
        default=None,
        help="Path to hedonic_model.json (optional)",
    )
    return parser.parse_args()


def _has_price_expr() -> pl.Expr:
    """Return the row filter for properties that have a usable last sale.

    A row qualifies when its last known price, postcode and last transaction
    date are all non-null.
    """
    return (
        pl.col("Last known price").is_not_null()
        & pl.col("Postcode").is_not_null()
        & pl.col("Date of last transaction").is_not_null()
    )


def _apply_price_index(df: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
    """Add "Estimated current price" by rolling the last sale forward.

    Expects ``df`` to carry the helper columns ``_sector``, ``_sale_year`` and
    ``_type_group``. The estimate is
    ``last price * exp(log_index[CURRENT_YEAR] - log_index[sale year])`` and is
    null for rows without a usable sale or without both index values.
    """
    print("\nApplying repeat-sales index...")
    has_price = _has_price_expr()

    if "type_group" in index.columns:
        idx_typed = index.filter(pl.col("type_group") != "All")
        idx_all = index.filter(pl.col("type_group") == "All")

        # Type-specific index at sale year
        df = df.join(
            idx_typed.select(
                "sector",
                "type_group",
                "year",
                pl.col("log_index").alias("log_idx_sale_typed"),
            ),
            left_on=["_sector", "_type_group", "_sale_year"],
            right_on=["sector", "type_group", "year"],
            how="left",
        )
        # "All" index at sale year
        df = df.join(
            idx_all.select(
                "sector", "year", pl.col("log_index").alias("log_idx_sale_all")
            ),
            left_on=["_sector", "_sale_year"],
            right_on=["sector", "year"],
            how="left",
        )
        # Type-specific index at current year
        df = df.join(
            idx_typed.filter(pl.col("year") == CURRENT_YEAR).select(
                "sector",
                "type_group",
                pl.col("log_index").alias("log_idx_cur_typed"),
            ),
            left_on=["_sector", "_type_group"],
            right_on=["sector", "type_group"],
            how="left",
        )
        # "All" index at current year
        df = df.join(
            idx_all.filter(pl.col("year") == CURRENT_YEAR).select(
                "sector", pl.col("log_index").alias("log_idx_cur_all")
            ),
            left_on="_sector",
            right_on="sector",
            how="left",
        )
        # Prefer the type-specific value, fall back to "All" where it is missing
        df = df.with_columns(
            pl.col("log_idx_sale_typed")
            .fill_null(pl.col("log_idx_sale_all"))
            .alias("_log_index_sale"),
            pl.col("log_idx_cur_typed")
            .fill_null(pl.col("log_idx_cur_all"))
            .alias("_log_index_current"),
        )
    else:
        df = df.join(
            index.select(
                "sector", "year", pl.col("log_index").alias("_log_index_sale")
            ),
            left_on=["_sector", "_sale_year"],
            right_on=["sector", "year"],
            how="left",
        )
        index_current = index.filter(pl.col("year") == CURRENT_YEAR).select(
            "sector", pl.col("log_index").alias("_log_index_current")
        )
        df = df.join(index_current, left_on="_sector", right_on="sector", how="left")

    # Compute estimate — only for rows with a known price
    df = df.with_columns(
        pl.when(has_price)
        .then(
            pl.col("Last known price").cast(pl.Float64)
            * (pl.col("_log_index_current") - pl.col("_log_index_sale")).exp()
        )
        .otherwise(pl.lit(None))
        .alias(_EST_PRICE),
    )

    # A row is only really adjusted when BOTH index values were found; with
    # either one missing the estimate above is null.
    n_adjusted = df.filter(
        has_price
        & pl.col("_log_index_sale").is_not_null()
        & pl.col("_log_index_current").is_not_null()
    ).height
    n_with_price = df.filter(has_price).height
    print(
        f" {n_adjusted:,} of {n_with_price:,} properties adjusted by index ({n_adjusted / max(n_with_price, 1) * 100:.1f}%)"
    )
    return df


def _apply_hedonic_blending(df: pl.DataFrame, model_path: Path) -> pl.DataFrame:
    """Blend the index-based estimate with a per-type hedonic prediction.

    The model JSON provides ``type_models`` (per type group: ``beta_fa``,
    ``sector_intercepts``, ``national_intercept``) and an optional ``tau``
    (default 15.0). Blending happens in log space with weight
    ``hold_years / (hold_years + tau)`` on the hedonic prediction, so the
    hedonic share grows with the time since the last sale. Rows whose type
    group has no model keep the index-based estimate.
    """
    print("\nApplying hedonic blending...")
    with open(model_path, encoding="utf-8") as f:
        model = json.load(f)

    type_models = model["type_models"]
    tau = model.get("tau", 15.0)
    print(f" tau = {tau}, {len(type_models)} type models")

    # Add type_group for per-type lookup
    df = df.with_columns(type_group_expr())

    hedonic_mask = (
        _has_price_expr()
        & pl.col(_EST_PRICE).is_not_null()
        & pl.col(_FLOOR_AREA).is_not_null()
        & (pl.col(_FLOOR_AREA) > 0)
        & pl.col("type_group").is_not_null()
    )
    # Materialise the mask once so the filter and the write-back use the very
    # same set of rows.
    mask = df.select(hedonic_mask.fill_null(False)).to_series()
    eligible = df.filter(mask)

    if eligible.height == 0:
        print(" No eligible properties for hedonic blending")
        return df

    log_fa = np.log(
        np.maximum(eligible[_FLOOR_AREA].to_numpy().astype(np.float64), 1.0)
    )
    sectors = eligible["_sector"].to_list()
    types = eligible["type_group"].to_list()

    # Per-type hedonic prediction; NaN marks rows without a model for their type
    log_hedonic = np.full(eligible.height, np.nan)
    for i, (type_group, sector) in enumerate(zip(types, sectors)):
        tm = type_models.get(type_group)
        if tm is None:
            continue
        alpha = tm["sector_intercepts"].get(sector, tm["national_intercept"])
        log_hedonic[i] = tm["beta_fa"] * log_fa[i] + alpha

    valid = np.isfinite(log_hedonic)

    # Hold years and blend weight
    sale_years = eligible["_sale_year"].to_numpy().astype(np.float64)
    hold_years = np.maximum(CURRENT_YEAR - sale_years, 0.0)
    blend_w = hold_years / (hold_years + tau)

    # Blend in log space
    log_index_est = np.log(eligible[_EST_PRICE].to_numpy().astype(np.float64))
    log_blended = np.where(
        valid,
        (1 - blend_w) * log_index_est + blend_w * log_hedonic,
        log_index_est,
    )
    blended_prices = np.exp(log_blended)

    # Write back into df. astype() returns a writable copy in which polars
    # nulls appear as NaN, so convert NaN back to null afterwards; otherwise
    # missing estimates would look "known" to later is_not_null() checks and
    # break the integer cast of the per-sqm column.
    price_arr = df[_EST_PRICE].to_numpy().astype(np.float64)
    price_arr[mask.to_numpy()] = blended_prices
    df = df.with_columns(
        pl.Series(_EST_PRICE, price_arr, dtype=pl.Float64).fill_nan(None),
    )

    n_blended = int(valid.sum())
    # Avoid numpy's empty-mean warning when no type model matched at all
    avg_w = float(np.mean(blend_w[valid])) if n_blended else float("nan")
    print(
        f" {n_blended:,} properties with hedonic blending (avg blend weight: {avg_w:.3f})"
    )
    return df


def _apply_renovation_premiums(df: pl.DataFrame, premium_path: Path) -> pl.DataFrame:
    """Scale estimates by time-decayed premiums for post-sale renovations.

    For every property, the most recent event of each type that happened after
    the sale year contributes ``log_premium * exp(-DECAY_RATE * age)``; the sum
    over event types is applied as ``price * exp(sum)``. Premiums are looked up
    per (sector, type group, event type), falling back to the "All" type group
    and then to 0 (no adjustment).
    """
    print("\nApplying renovation premiums...")
    reno_prem = pl.read_parquet(premium_path)
    print(f" Loaded {len(reno_prem):,} premium rows")

    # Find properties with post-sale renovation events
    has_reno = (
        pl.col("renovation_history").is_not_null()
        & (pl.col("renovation_history").list.len() > 0)
        & pl.col(_EST_PRICE).is_not_null()
    )

    # Number the rows BEFORE filtering so _row_idx is directly the position in
    # df, then explode renovation events and keep post-sale ones only.
    reno_rows = (
        df.lazy()
        .select("_sector", "_type_group", "_sale_year", "renovation_history", _EST_PRICE)
        .with_row_index("_row_idx")
        .filter(has_reno)
        .drop(_EST_PRICE)
        .explode("renovation_history")
        .with_columns(
            pl.col("renovation_history").struct.field("year").alias("_event_year"),
            pl.col("renovation_history").struct.field("event").alias("_event_type"),
        )
        .filter(pl.col("_event_year") > pl.col("_sale_year"))
        .collect()
    )

    if reno_rows.height == 0:
        print(" No properties with post-sale renovation events")
        return df

    # Take most recent event per (row, event_type) and its time decay
    latest = (
        reno_rows.group_by("_row_idx", "_event_type", "_sector", "_type_group")
        .agg(pl.col("_event_year").max().alias("_event_year"))
        .with_columns(
            (-DECAY_RATE * (CURRENT_YEAR - pl.col("_event_year")).cast(pl.Float64))
            .exp()
            .alias("_decay"),
        )
    )

    # Join with renovation_premium.parquet — try typed first, fall back to "All"
    rp_typed = reno_prem.filter(pl.col("type_group") != "All")
    rp_all = reno_prem.filter(pl.col("type_group") == "All")

    latest = (
        latest.join(
            rp_typed.select(
                "sector",
                "type_group",
                "event_type",
                pl.col("log_premium").alias("_lp_typed"),
            ),
            left_on=["_sector", "_type_group", "_event_type"],
            right_on=["sector", "type_group", "event_type"],
            how="left",
        )
        .join(
            rp_all.select(
                "sector", "event_type", pl.col("log_premium").alias("_lp_all")
            ),
            left_on=["_sector", "_event_type"],
            right_on=["sector", "event_type"],
            how="left",
        )
        .with_columns(
            pl.col("_lp_typed")
            .fill_null(pl.col("_lp_all"))
            .fill_null(0.0)
            .alias("_log_premium"),
        )
    )

    # Total decayed log premium per property
    per_property = (
        latest.with_columns(
            (pl.col("_log_premium") * pl.col("_decay")).alias("_decayed_lp"),
        )
        .group_by("_row_idx")
        .agg(pl.col("_decayed_lp").sum().alias("_reno_log_premium"))
    )

    # Scatter the per-property premiums into a full-length column of zeros
    reno_log_prem = np.zeros(df.height, dtype=np.float64)
    reno_log_prem[per_property["_row_idx"].to_numpy()] = per_property[
        "_reno_log_premium"
    ].to_numpy()
    df = df.with_columns(
        pl.Series("_reno_log_premium", reno_log_prem, dtype=pl.Float64),
    )

    # Apply: multiply estimated price by exp(reno_log_premium) wherever the
    # premium is non-zero (negative premiums lower the estimate)
    df = df.with_columns(
        pl.when(pl.col("_reno_log_premium") != 0.0)
        .then(pl.col(_EST_PRICE) * pl.col("_reno_log_premium").exp())
        .otherwise(pl.col(_EST_PRICE))
        .alias(_EST_PRICE),
    )

    n_with_premium = per_property.height
    print(f" {n_with_premium:,} properties with renovation premium applied")

    # mean() is None when every premium is zero (no event matched a premium
    # row); math.exp(None) would raise, so skip the average in that case.
    mean_log_premium = per_property.filter(pl.col("_reno_log_premium") != 0.0)[
        "_reno_log_premium"
    ].mean()
    if mean_log_premium is not None:
        avg_multiplier = math.exp(mean_log_premium)
        print(
            f" Average premium multiplier: {avg_multiplier:.3f} ({avg_multiplier - 1:.1%} uplift)"
        )
    return df


def _add_price_per_sqm(df: pl.DataFrame) -> pl.DataFrame:
    """Add "Est. price per sqm" where price and a positive floor area exist.

    Rows with a missing or non-positive floor area get null: dividing by zero
    would yield inf, which cannot be cast to an integer.
    """
    floor_area = pl.col(_FLOOR_AREA)
    return df.with_columns(
        pl.when(floor_area > 0)
        .then((pl.col(_EST_PRICE) / floor_area).round(0))
        .otherwise(None)
        .fill_nan(None)
        .cast(pl.Int32)
        .alias(_EST_PRICE_PER_SQM),
    )


def main():
    """Run the augmentation: load, estimate, optionally adjust, write back.

    Reads the parquet file given by --input, adds "Estimated current price"
    and "Est. price per sqm", removes all temporary columns and overwrites the
    input file.
    """
    args = _parse_args()

    print("Loading wide.parquet...")
    df = pl.read_parquet(args.input)
    print(f" {len(df):,} rows, {len(df.columns)} columns")

    # Drop existing estimated columns if re-running
    stale = [c for c in (_EST_PRICE, _EST_PRICE_PER_SQM) if c in df.columns]
    if stale:
        df = df.drop(stale)

    # Derive helper columns for the join
    df = df.with_columns(
        sector_expr().alias("_sector"),
        pl.col("Date of last transaction").dt.year().alias("_sale_year"),
        type_group_expr().alias("_type_group"),
    )

    index = pl.read_parquet(args.index)
    if "type_group" in index.columns:
        print(
            f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
            f"{index['type_group'].n_unique()} type groups"
        )
    else:
        print(
            f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors (unstratified)"
        )

    df = _apply_price_index(df, index)

    # Apply hedonic blending if model provided
    if args.hedonic_model is not None:
        df = _apply_hedonic_blending(df, args.hedonic_model)

    # Apply renovation premiums if provided
    if args.renovation_premium is not None:
        df = _apply_renovation_premiums(df, args.renovation_premium)

    # Derive estimated price per sqm where both estimated price and floor area exist
    df = _add_price_per_sqm(df)

    # Drop all temporary columns
    temp_cols = [c for c in df.columns if c.startswith(("_", "log_idx_"))]
    # Also drop hedonic-derived column if it was added
    if "type_group" in df.columns:
        temp_cols.append("type_group")
    df = df.drop(temp_cols)

    df.write_parquet(args.input)
    size_mb = args.input.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
    print(
        f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
    )


if __name__ == "__main__":
    main()