"""Augment wide.parquet with estimated current prices. For properties with a known prior sale, applies the repeat-sales price index to adjust the last known price to the current date, then blends with kNN estimates from nearby recently-sold properties. Includes: - Capping extreme index adjustments - Seasonal month-of-sale adjustment - kNN spatial blending Modifies wide.parquet in-place. """ import argparse from pathlib import Path import numpy as np import polars as pl from pipeline.transform.price_estimation.knn import ( KNN_BLEND_WEIGHT, build_knn_pool, knn_median_psm, ) from pipeline.transform.price_estimation.utils import ( CURRENT_FRAC_YEAR, CURRENT_MONTH, MAX_LOG_ADJUSTMENT, compute_seasonal_factors, interpolate_log_index, sector_expr, type_group_expr, ) def main(): parser = argparse.ArgumentParser( description="Augment wide.parquet with estimated current prices" ) parser.add_argument( "--input", type=Path, required=True, help="Path to wide.parquet (modified in-place)", ) parser.add_argument( "--index", type=Path, required=True, help="Path to price_index.parquet" ) args = parser.parse_args() print("Loading wide.parquet...") df = pl.read_parquet(args.input) print(f" {len(df):,} rows, {len(df.columns)} columns") # Drop existing estimated columns if re-running for col in ["Estimated current price", "Est. price per sqm"]: if col in df.columns: df = df.drop(col) # Compute seasonal factors seasonal = compute_seasonal_factors(args.input) months = [ "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", ] print( f" Seasonal factors: {', '.join(f'{m}={f:.3f}' for m, f in zip(months, seasonal))}" ) # Build seasonal adjustment: seasonal[current_month] / seasonal[sale_month] sale_month = ( df["Date of last transaction"] .dt.month() .fill_null(6) .to_numpy() .astype(np.int32) ) seasonal_adj = seasonal[CURRENT_MONTH - 1] / seasonal[sale_month - 1] # Derive helper columns df = df.with_columns( sector_expr().alias("_sector"), ( pl.col("Date of last transaction").dt.year().cast(pl.Float64) + (pl.col("Date of last transaction").dt.month().cast(pl.Float64) - 1.0) / 12.0 ).alias("_sale_frac_year"), type_group_expr().alias("_type_group"), pl.lit(CURRENT_FRAC_YEAR).alias("_current_frac_year"), pl.Series("_seasonal_adj", seasonal_adj, dtype=pl.Float64), ) index = pl.read_parquet(args.index) print( f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, " f"{index['type_group'].n_unique()} type groups" ) print("\nApplying repeat-sales index with fractional year interpolation...") df = interpolate_log_index( index, df, "_sector", "_type_group", "_sale_frac_year", "_log_index_sale_interp" ) df = interpolate_log_index( index, df, "_sector", "_type_group", "_current_frac_year", "_log_index_current_interp", ) # Compute index-adjusted estimate with cap and seasonal adjustment has_price = ( pl.col("Last known price").is_not_null() & pl.col("Postcode").is_not_null() & pl.col("Date of last transaction").is_not_null() ) df = df.with_columns( pl.when(has_price) .then( pl.col("Last known price").cast(pl.Float64) * ( pl.col("_log_index_current_interp") - pl.col("_log_index_sale_interp") ) .clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT) .exp() * pl.col("_seasonal_adj") ) .otherwise(pl.lit(None)) .alias("Estimated current price"), ) n_estimated = df.filter(pl.col("Estimated current price").is_not_null()).height n_with_price = df.filter(has_price).height print( f" {n_estimated:,} of {n_with_price:,} properties estimated " f"({n_estimated / max(n_with_price, 1) * 100:.1f}%)" ) # --- kNN blending --- print("\nBuilding kNN estimates...") trees = build_knn_pool(args.input, index, CURRENT_FRAC_YEAR) lat = df["lat"].cast(pl.Float64).to_numpy() lon = df["lon"].cast(pl.Float64).to_numpy() tg = df["_type_group"].fill_null("").to_numpy() fa = df["Total floor area (sqm)"].cast(pl.Float64).fill_null(0.0).to_numpy() knn_psm = knn_median_psm(trees, lat, lon, tg) knn_est = knn_psm * fa # No temporal adj: ref == current df = df.with_columns( pl.Series("_knn_est", knn_est, dtype=pl.Float64), ) # Blend: where kNN available, use weighted average; else keep index df = df.with_columns( pl.when( pl.col("Estimated current price").is_not_null() & pl.col("_knn_est").is_not_null() & pl.col("_knn_est").is_finite() & (pl.col("_knn_est") > 0) ) .then( (1 - KNN_BLEND_WEIGHT) * pl.col("Estimated current price") + KNN_BLEND_WEIGHT * pl.col("_knn_est") ) .when(pl.col("Estimated current price").is_not_null()) .then(pl.col("Estimated current price")) .otherwise(pl.lit(None)) .alias("Estimated current price"), ) n_blended = df.filter( pl.col("_knn_est").is_not_null() & pl.col("_knn_est").is_finite() & (pl.col("_knn_est") > 0) & pl.col("Estimated current price").is_not_null() ).height print(f" kNN blended: {n_blended:,} of {n_estimated:,} estimates") # Derive estimated price per sqm where both estimated price and floor area exist df = df.with_columns( (pl.col("Estimated current price") / pl.col("Total floor area (sqm)")) .round(0) .cast(pl.Int32, strict=False) .alias("Est. price per sqm"), ) # Drop all temporary columns temp_cols = [c for c in df.columns if c.startswith("_") or c.startswith("log_idx_")] df = df.drop(temp_cols) df.write_parquet(args.input) size_mb = args.input.stat().st_size / (1024 * 1024) print(f"\nWrote {args.input} ({size_mb:.1f} MB)") print( f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')" ) if __name__ == "__main__": main()