90 lines
3 KiB
Python
90 lines
3 KiB
Python
"""Apply repeat-sales price index to estimate current property prices.

Joins the precomputed price index (from price_index.py) with each property's
last known sale to produce an inflation-adjusted current price estimate.

Output: estimated_prices.parquet with per-property estimates.
"""
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
import polars as pl
|
|
|
|
# Index year whose level main() treats as the "current" price level when
# scaling each property's last known sale price.
CURRENT_YEAR = 2025
|
|
|
|
|
|
def main() -> None:
    """Estimate current property prices from last sales and a price index.

    Command-line arguments:
        --input: parquet file of properties; must contain the columns
            "Postcode", "Address per Property Register", "Last known price"
            and "Date of last transaction".
        --index: parquet file of the price index with columns "sector",
            "year" and "log_index".
        --output: destination parquet file for the per-property estimates.
        --current-year: index year to treat as "now" (defaults to
            CURRENT_YEAR).

    For every property with a known price and postcode the estimate is
    ``last_price * exp(log_index[current year] - log_index[sale year])`` for
    the property's sector. When either index value is missing the last known
    price is passed through unchanged.

    Writes the result to ``--output`` and prints progress to stdout.
    """
    parser = argparse.ArgumentParser(description="Estimate current property prices")
    parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
    parser.add_argument("--index", type=Path, required=True, help="Path to price_index.parquet")
    parser.add_argument("--output", type=Path, required=True, help="Output estimated_prices.parquet")
    parser.add_argument(
        "--current-year",
        type=int,
        default=CURRENT_YEAR,
        help="Index year to treat as current (default: %(default)s)",
    )
    args = parser.parse_args()

    print("Loading property data...")
    df = (
        pl.scan_parquet(args.input)
        .select("Postcode", "Address per Property Register", "Last known price", "Date of last transaction")
        .filter(
            pl.col("Last known price").is_not_null(),
            pl.col("Postcode").is_not_null(),
        )
        .with_columns(
            # Sector key = postcode without its last two characters, trimmed.
            pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
            pl.col("Date of last transaction").dt.year().alias("sale_year"),
        )
        .collect()
    )
    print(f" {len(df):,} properties with known price and postcode")

    index = pl.read_parquet(args.index)
    print(f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")

    print("\nApplying repeat-sales index...")

    # Join index at sale year
    df = df.join(
        index.select("sector", "year", pl.col("log_index").alias("log_index_sale")),
        left_on=["sector", "sale_year"],
        right_on=["sector", "year"],
        how="left",
    )

    # Join index at current year
    index_current = (
        index.filter(pl.col("year") == args.current_year)
        .select("sector", pl.col("log_index").alias("log_index_current"))
    )
    df = df.join(index_current, on="sector", how="left")

    # Compute estimate; fall back to raw price when no index available
    last_price = pl.col("Last known price").cast(pl.Float64)
    growth = (pl.col("log_index_current") - pl.col("log_index_sale")).exp()
    df = df.with_columns(
        (last_price * growth).fill_null(last_price).alias("estimated_price"),
    )

    # A row is only really adjusted when BOTH index values exist; a missing
    # current-year value also triggers the raw-price fallback above.
    has_both_indices = (
        pl.col("log_index_sale").is_not_null()
        & pl.col("log_index_current").is_not_null()
    )
    n_adjusted = df.filter(has_both_indices).height
    # Guard the percentage against an empty input (no rows survived the filter).
    pct_adjusted = n_adjusted / len(df) * 100 if len(df) else 0.0
    print(f" {n_adjusted:,} properties adjusted by index ({pct_adjusted:.1f}%)")

    # Select output columns
    output = df.select(
        "Postcode",
        "Address per Property Register",
        pl.col("Last known price").alias("last_price"),
        "sale_year",
        "sector",
        "estimated_price",
    )

    output.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
    print(f" {len(output):,} rows")
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|