# perfect-postcode/pipeline/transform/price_estimate.py

"""Augment wide.parquet with an estimated current price column.

Joins the precomputed repeat-sales price index (from price_index.py) with each
property's last known sale to produce an inflation-adjusted current price estimate.
Uses type-stratified index when available, falling back to "All" type.

Optionally applies renovation premiums from renovation_premium.py: for properties
with post-sale renovation events, the estimated price is adjusted upward based on
data-driven per-area premiums with time decay.

Modifies wide.parquet in-place, adding the "Estimated current price" and
"Est. price per sqm" columns.
"""

import argparse
import json
import math
from pathlib import Path

import numpy as np
import polars as pl

from pipeline.transform._price_utils import (
    CURRENT_YEAR,
    sector_expr,
    type_group_expr,
)

# Half-life of a renovation premium, in years: the decay factor applied in
# main() is exp(-DECAY_RATE * (CURRENT_YEAR - event_year)), which equals 0.5
# once HALF_LIFE years have passed since the renovation event.
HALF_LIFE = 10.0
# Continuous exponential decay constant derived from HALF_LIFE (ln 2 / half-life).
DECAY_RATE = math.log(2) / HALF_LIFE


def _parse_args():
    """Parse the command-line options of the price-estimate step.

    Returns:
        An ``argparse.Namespace`` with ``input`` and ``index`` (required
        paths) plus ``renovation_premium`` and ``hedonic_model`` (optional
        paths, ``None`` when not supplied).
    """
    parser = argparse.ArgumentParser(
        description="Augment wide.parquet with estimated current prices"
    )
    parser.add_argument(
        "--input",
        type=Path,
        required=True,
        help="Path to wide.parquet (modified in-place)",
    )
    parser.add_argument(
        "--index", type=Path, required=True, help="Path to price_index.parquet"
    )
    parser.add_argument(
        "--renovation-premium",
        type=Path,
        default=None,
        help="Path to renovation_premium.parquet (optional)",
    )
    parser.add_argument(
        "--hedonic-model",
        type=Path,
        default=None,
        help="Path to hedonic_model.json (optional)",
    )
    return parser.parse_args()


def _join_price_index(df, index):
    """Attach the log price index at sale year and at CURRENT_YEAR.

    Adds ``_log_index_sale`` and ``_log_index_current`` to ``df`` via left
    joins on the ``_sector`` / ``_type_group`` / ``_sale_year`` helper columns.
    When ``index`` has a ``type_group`` column the type-specific value is
    preferred and the "All" row of the same sector fills the gaps; otherwise
    the index is joined on sector (and year) only.

    Args:
        df: Wide frame that already carries the helper columns.
        index: Price index with ``sector``, ``year``, ``log_index`` and
            optionally ``type_group``.

    Returns:
        ``df`` with the two log-index columns (and, in the stratified case,
        the intermediate ``log_idx_*`` columns) appended.
    """
    is_current = pl.col("year") == CURRENT_YEAR

    if "type_group" not in index.columns:
        at_sale = index.select(
            "sector", "year", pl.col("log_index").alias("_log_index_sale")
        )
        at_current = index.filter(is_current).select(
            "sector", pl.col("log_index").alias("_log_index_current")
        )
        df = df.join(
            at_sale,
            left_on=["_sector", "_sale_year"],
            right_on=["sector", "year"],
            how="left",
        )
        return df.join(at_current, left_on="_sector", right_on="sector", how="left")

    typed = index.filter(pl.col("type_group") != "All")
    pooled = index.filter(pl.col("type_group") == "All")

    # Type-specific index at sale year
    df = df.join(
        typed.select(
            "sector",
            "type_group",
            "year",
            pl.col("log_index").alias("log_idx_sale_typed"),
        ),
        left_on=["_sector", "_type_group", "_sale_year"],
        right_on=["sector", "type_group", "year"],
        how="left",
    )
    # "All" index at sale year
    df = df.join(
        pooled.select("sector", "year", pl.col("log_index").alias("log_idx_sale_all")),
        left_on=["_sector", "_sale_year"],
        right_on=["sector", "year"],
        how="left",
    )
    # Type-specific index at current year
    df = df.join(
        typed.filter(is_current).select(
            "sector", "type_group", pl.col("log_index").alias("log_idx_cur_typed")
        ),
        left_on=["_sector", "_type_group"],
        right_on=["sector", "type_group"],
        how="left",
    )
    # "All" index at current year
    df = df.join(
        pooled.filter(is_current).select(
            "sector", pl.col("log_index").alias("log_idx_cur_all")
        ),
        left_on="_sector",
        right_on="sector",
        how="left",
    )

    # Prefer the typed index, fall back to "All" where the typed value is missing.
    return df.with_columns(
        pl.col("log_idx_sale_typed")
        .fill_null(pl.col("log_idx_sale_all"))
        .alias("_log_index_sale"),
        pl.col("log_idx_cur_typed")
        .fill_null(pl.col("log_idx_cur_all"))
        .alias("_log_index_current"),
    )


def _apply_hedonic_blend(df, has_price, model):
    """Blend the index-based estimate with a per-type hedonic prediction.

    For every row with a known price, a non-null estimate, a positive floor
    area and a type group, the hedonic log price is
    ``beta_fa * log(floor_area) + intercept`` (sector intercept when present,
    otherwise the national one). It is mixed with the index estimate in log
    space using weight ``hold_years / (hold_years + tau)``. Rows whose type has
    no model keep the pure index estimate; rows outside the mask are left
    untouched, so missing estimates stay null.

    Args:
        df: Wide frame with "Estimated current price" and helper columns.
        has_price: Boolean expression selecting rows with a usable last sale.
        model: Parsed hedonic model with ``type_models`` and optional ``tau``
            (defaults to 15.0).

    Returns:
        ``df`` with "Estimated current price" updated and a ``type_group``
        column added (removed again by ``main``).
    """
    type_models = model["type_models"]
    tau = model.get("tau", 15.0)
    print(f"  tau = {tau}, {len(type_models)} type models")

    # Add type_group for per-type lookup
    # (assumes type_group_expr() yields a column named "type_group").
    df = df.with_columns(type_group_expr())
    floor_area = pl.col("Total floor area (sqm)")
    hedonic_mask = (
        has_price
        & pl.col("Estimated current price").is_not_null()
        & floor_area.is_not_null()
        & (floor_area > 0)
        & pl.col("type_group").is_not_null()
    )
    eligible = df.filter(hedonic_mask)

    if eligible.height == 0:
        print("  No eligible properties for hedonic blending")
        return df

    log_fa = np.log(
        np.maximum(
            eligible["Total floor area (sqm)"].to_numpy().astype(np.float64),
            1.0,
        )
    )

    # Per-type hedonic prediction; NaN marks rows whose type has no model.
    log_hedonic = np.full(eligible.height, np.nan)
    type_and_sector = zip(
        eligible["type_group"].to_list(), eligible["_sector"].to_list()
    )
    for i, (type_name, sector) in enumerate(type_and_sector):
        tm = type_models.get(type_name)
        if tm is None:
            continue
        alpha = tm["sector_intercepts"].get(sector, tm["national_intercept"])
        log_hedonic[i] = tm["beta_fa"] * log_fa[i] + alpha

    valid = np.isfinite(log_hedonic)

    # Hold years and blend weight: the older the sale, the more the hedonic
    # prediction counts.
    sale_years = eligible["_sale_year"].to_numpy().astype(np.float64)
    hold_years = np.maximum(CURRENT_YEAR - sale_years, 0.0)
    blend_w = hold_years / (hold_years + tau)

    # Blend in log space
    log_index_est = np.log(
        eligible["Estimated current price"].to_numpy().astype(np.float64)
    )
    log_blended = np.where(
        valid,
        (1 - blend_w) * log_index_est + blend_w * log_hedonic,
        log_index_est,
    )

    # Scatter the blended prices to their row positions and overwrite only the
    # eligible rows. Round-tripping the whole column through NumPy would turn
    # every null estimate into NaN.
    row_positions = df.select(hedonic_mask).to_series().arg_true().to_numpy()
    blended_full = np.full(df.height, np.nan)
    blended_full[row_positions] = np.exp(log_blended)
    df = df.with_columns(
        pl.when(hedonic_mask)
        .then(pl.lit(pl.Series("_blended", blended_full, dtype=pl.Float64)))
        .otherwise(pl.col("Estimated current price"))
        .alias("Estimated current price"),
    )

    n_blended = int(valid.sum())
    # Avoid the mean of an empty slice when no type model matched.
    avg_w = float(blend_w[valid].mean()) if n_blended else 0.0
    print(
        f"  {n_blended:,} properties with hedonic blending (avg blend weight: {avg_w:.3f})"
    )
    return df


def _apply_renovation_premiums(df, reno_prem):
    """Scale estimates by time-decayed premiums for post-sale renovations.

    For each property with a non-null estimate, the most recent event of each
    type that happened after the sale year is looked up in ``reno_prem``
    (type-specific row first, then "All", then 0). Each log premium is
    multiplied by ``exp(-DECAY_RATE * years_since_event)``, the results are
    summed per property, and the estimate is multiplied by the exponential of
    that sum.

    Args:
        df: Wide frame with "Estimated current price", ``renovation_history``
            and the ``_sector`` / ``_type_group`` / ``_sale_year`` helpers.
        reno_prem: Premium table with ``sector``, ``type_group``,
            ``event_type`` and ``log_premium``.

    Returns:
        ``df`` with "Estimated current price" adjusted; when post-sale events
        exist a temporary ``_reno_log_premium`` column is added as well.
    """
    has_reno = (
        pl.col("renovation_history").is_not_null()
        & (pl.col("renovation_history").list.len() > 0)
        & pl.col("Estimated current price").is_not_null()
    )

    # Number the rows BEFORE filtering so _row_idx is the real position in df
    # and no filtered-index -> df-index translation is needed afterwards.
    reno_rows = (
        df.lazy()
        .with_row_index("_row_idx")
        .filter(has_reno)
        .select(
            "_row_idx", "_sector", "_type_group", "_sale_year", "renovation_history"
        )
        .explode("renovation_history")
        .with_columns(
            pl.col("renovation_history").struct.field("year").alias("_event_year"),
            pl.col("renovation_history").struct.field("event").alias("_event_type"),
        )
        .filter(pl.col("_event_year") > pl.col("_sale_year"))
        .collect()
    )

    if reno_rows.height == 0:
        print("  No properties with post-sale renovation events")
        return df

    # Most recent event per (row, event_type), with its time decay.
    latest = (
        reno_rows.lazy()
        .group_by("_row_idx", "_event_type", "_sector", "_type_group")
        .agg(pl.col("_event_year").max().alias("_event_year"))
        .with_columns(
            (-DECAY_RATE * (CURRENT_YEAR - pl.col("_event_year")).cast(pl.Float64))
            .exp()
            .alias("_decay"),
        )
        .collect()
    )

    # Premium lookup: typed first, then "All", then no premium at all.
    rp_typed = reno_prem.filter(pl.col("type_group") != "All")
    rp_all = reno_prem.filter(pl.col("type_group") == "All")
    latest = (
        latest.join(
            rp_typed.select(
                "sector",
                "type_group",
                "event_type",
                pl.col("log_premium").alias("_lp_typed"),
            ),
            left_on=["_sector", "_type_group", "_event_type"],
            right_on=["sector", "type_group", "event_type"],
            how="left",
        )
        .join(
            rp_all.select(
                "sector", "event_type", pl.col("log_premium").alias("_lp_all")
            ),
            left_on=["_sector", "_event_type"],
            right_on=["sector", "event_type"],
            how="left",
        )
        .with_columns(
            pl.col("_lp_typed")
            .fill_null(pl.col("_lp_all"))
            .fill_null(0.0)
            .alias("_log_premium"),
        )
    )

    # Total decayed log premium per property
    per_property = (
        latest.lazy()
        .with_columns(
            (pl.col("_log_premium") * pl.col("_decay")).alias("_decayed_lp"),
        )
        .group_by("_row_idx")
        .agg(pl.col("_decayed_lp").sum().alias("_reno_log_premium"))
        .collect()
    )

    # Full-length column of zeros with the per-property premiums scattered in.
    reno_log_prem = np.zeros(df.height, dtype=np.float64)
    reno_log_prem[per_property["_row_idx"].to_numpy()] = per_property[
        "_reno_log_premium"
    ].to_numpy()
    df = df.with_columns(
        pl.Series("_reno_log_premium", reno_log_prem, dtype=pl.Float64),
    )

    # Apply: multiply the estimate by exp(premium) wherever the premium is
    # non-zero (negative premiums reduce the estimate).
    df = df.with_columns(
        pl.when(pl.col("_reno_log_premium") != 0.0)
        .then(pl.col("Estimated current price") * pl.col("_reno_log_premium").exp())
        .otherwise(pl.col("Estimated current price"))
        .alias("Estimated current price"),
    )

    applied = per_property["_reno_log_premium"].filter(
        per_property["_reno_log_premium"] != 0.0
    )
    n_with_premium = applied.len()
    if n_with_premium == 0:
        # mean() of an empty Series is None, which math.exp() cannot take.
        print("  No matching renovation premiums found for post-sale events")
        return df

    avg_multiplier = math.exp(applied.mean())
    print(f"  {n_with_premium:,} properties with renovation premium applied")
    print(
        f"  Average premium multiplier: {avg_multiplier:.3f} ({avg_multiplier - 1:.1%} uplift)"
    )
    return df


def main():
    """Add "Estimated current price" and "Est. price per sqm" to wide.parquet.

    Steps: load the wide frame, derive sector / sale-year / type-group helper
    columns, join the price index and scale the last known price by
    ``exp(log_index_current - log_index_sale)``, optionally blend with the
    hedonic model, optionally apply renovation premiums, derive the per-sqm
    price, drop all helper columns and write the result back over ``--input``.
    Progress and summary statistics are printed to stdout.
    """
    args = _parse_args()

    print("Loading wide.parquet...")
    df = pl.read_parquet(args.input)
    print(f"  {len(df):,} rows, {len(df.columns)} columns")

    # Drop existing estimated columns if re-running
    stale = [
        col
        for col in ("Estimated current price", "Est. price per sqm")
        if col in df.columns
    ]
    if stale:
        df = df.drop(stale)

    # Rows that have everything needed to index the last sale forward.
    has_price = (
        pl.col("Last known price").is_not_null()
        & pl.col("Postcode").is_not_null()
        & pl.col("Date of last transaction").is_not_null()
    )

    # Derive helper columns for the joins
    df = df.with_columns(
        sector_expr().alias("_sector"),
        pl.col("Date of last transaction").dt.year().alias("_sale_year"),
        type_group_expr().alias("_type_group"),
    )

    index = pl.read_parquet(args.index)
    if "type_group" in index.columns:
        print(
            f"  Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
            f"{index['type_group'].n_unique()} type groups"
        )
    else:
        print(
            f"  Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors (unstratified)"
        )

    print("\nApplying repeat-sales index...")
    df = _join_price_index(df, index)

    # Compute estimate — only for rows with a known price. The result is null
    # when either the sale-year or the current-year index is missing.
    df = df.with_columns(
        pl.when(has_price)
        .then(
            pl.col("Last known price").cast(pl.Float64)
            * (pl.col("_log_index_current") - pl.col("_log_index_sale")).exp()
        )
        .otherwise(pl.lit(None))
        .alias("Estimated current price"),
    )

    # Count rows that actually received an estimate (both index values found),
    # not merely rows with a sale-year index.
    n_adjusted, n_with_price = df.select(
        pl.col("Estimated current price").is_not_null().sum().alias("adjusted"),
        has_price.sum().alias("with_price"),
    ).row(0)
    print(
        f"  {n_adjusted:,} of {n_with_price:,} properties adjusted by index ({n_adjusted / max(n_with_price, 1) * 100:.1f}%)"
    )

    # Apply hedonic blending if model provided
    if args.hedonic_model is not None:
        print("\nApplying hedonic blending...")
        with open(args.hedonic_model) as f:
            model = json.load(f)
        df = _apply_hedonic_blend(df, has_price, model)

    # Apply renovation premiums if provided
    if args.renovation_premium is not None:
        print("\nApplying renovation premiums...")
        reno_prem = pl.read_parquet(args.renovation_premium)
        print(f"  Loaded {len(reno_prem):,} premium rows")
        df = _apply_renovation_premiums(df, reno_prem)

    # Derive estimated price per sqm where both estimated price and a positive
    # floor area exist. A zero area would produce inf/NaN, which cannot be
    # represented as Int32, so those rows get null instead.
    floor_area = pl.col("Total floor area (sqm)")
    df = df.with_columns(
        pl.when(floor_area > 0)
        .then(pl.col("Estimated current price") / floor_area)
        .otherwise(None)
        .fill_nan(None)
        .round(0)
        .cast(pl.Int32)
        .alias("Est. price per sqm"),
    )

    # Drop all temporary columns
    temp_cols = [c for c in df.columns if c.startswith(("_", "log_idx_"))]
    # Also drop hedonic-derived column if it was added
    if "type_group" in df.columns:
        temp_cols.append("type_group")
    df = df.drop(temp_cols)

    # Write next to the target and swap it in, so a failure mid-write cannot
    # leave a truncated wide.parquet behind.
    tmp_path = args.input.with_name(args.input.name + ".tmp")
    df.write_parquet(tmp_path)
    tmp_path.replace(args.input)

    size_mb = args.input.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
    print(
        f"  {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
    )


# Run the pipeline step only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()