"""Apply repeat-sales price index to estimate current property prices. Joins the precomputed price index (from price_index.py) with each property's last known sale to produce an inflation-adjusted current price estimate. Output: estimated_prices.parquet with per-property estimates. """ import argparse from pathlib import Path import polars as pl CURRENT_YEAR = 2025 def main(): parser = argparse.ArgumentParser(description="Estimate current property prices") parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet") parser.add_argument("--index", type=Path, required=True, help="Path to price_index.parquet") parser.add_argument("--output", type=Path, required=True, help="Output estimated_prices.parquet") args = parser.parse_args() print("Loading property data...") df = ( pl.scan_parquet(args.input) .select("Postcode", "Address per Property Register", "Last known price", "Date of last transaction") .filter( pl.col("Last known price").is_not_null(), pl.col("Postcode").is_not_null(), ) .with_columns( pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"), pl.col("Date of last transaction").dt.year().alias("sale_year"), ) .collect() ) print(f" {len(df):,} properties with known price and postcode") index = pl.read_parquet(args.index) print(f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors") print("\nApplying repeat-sales index...") # Join index at sale year df = df.join( index.select("sector", "year", pl.col("log_index").alias("log_index_sale")), left_on=["sector", "sale_year"], right_on=["sector", "year"], how="left", ) # Join index at current year index_current = ( index.filter(pl.col("year") == CURRENT_YEAR) .select("sector", pl.col("log_index").alias("log_index_current")) ) df = df.join(index_current, on="sector", how="left") # Compute estimate; fall back to raw price when no index available df = df.with_columns( ( pl.col("Last known price").cast(pl.Float64) * (pl.col("log_index_current") - pl.col("log_index_sale")).exp() ) .fill_null(pl.col("Last known price").cast(pl.Float64)) .alias("estimated_price"), ) n_adjusted = df.filter(pl.col("log_index_sale").is_not_null()).height print(f" {n_adjusted:,} properties adjusted by index ({n_adjusted / len(df) * 100:.1f}%)") # Select output columns output = df.select( "Postcode", "Address per Property Register", pl.col("Last known price").alias("last_price"), "sale_year", "sector", "estimated_price", ) output.write_parquet(args.output) size_mb = args.output.stat().st_size / (1024 * 1024) print(f"\nWrote {args.output} ({size_mb:.1f} MB)") print(f" {len(output):,} rows") if __name__ == "__main__": main()