perfect-postcode/pipeline/transform/price_estimation/estimate.py
2026-05-14 08:17:10 +01:00

246 lines
7.9 KiB
Python

"""Augment properties.parquet with estimated current prices.
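
For properties with a known prior sale, applies the repeat-sales price index
to adjust the last known price to the current date, then blends the result
with kNN estimates from nearby recently-sold properties. Includes:

- Capping extreme index adjustments (MAX_LOG_ADJUSTMENT)
- kNN spatial blending, applied only when the kNN estimate lies within
  [MIN_KNN_TO_INDEX_RATIO, MAX_KNN_TO_INDEX_RATIO] times the index estimate
- Capping the final estimate at MAX_ESTIMATE_TO_LAST_PRICE_RATIO times the
  last known price

Modifies properties.parquet in-place, (re)writing the "Estimated current
price" and "Est. price per sqm" columns. Temporarily joins postcode.parquet
for lat/lon needed by kNN, then drops those columns before writing.
"""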
import argparse
from pathlib import Path
import numpy as np
import polars as pl
from pipeline.transform.price_estimation.knn import (
KNN_BLEND_WEIGHT,
build_knn_pool,
knn_median_psm,
)
from pipeline.transform.price_estimation.utils import (
CURRENT_FRAC_YEAR,
MAX_LOG_ADJUSTMENT,
interpolate_log_index,
sector_expr,
type_group_expr,
)
# A kNN estimate is blended in only when it lies within
# [MIN_KNN_TO_INDEX_RATIO, MAX_KNN_TO_INDEX_RATIO] times the index estimate;
# outside that band the index estimate is used on its own.
MAX_KNN_TO_INDEX_RATIO = 2.0
MIN_KNN_TO_INDEX_RATIO = 0.5
# Upper bound on the final estimate, as a multiple of the last known sale price.
MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 6.0
def guarded_blend_estimates(
    index_est: np.ndarray,
    knn_est: np.ndarray,
    last_prices: np.ndarray,
    weight: float = KNN_BLEND_WEIGHT,
) -> np.ndarray:
    """Combine index and kNN estimates, ignoring unstable kNN values.

    Args:
        index_est: Index-adjusted estimates; non-finite or non-positive
            entries are treated as missing.
        knn_est: kNN-derived estimates; non-finite or non-positive entries
            are treated as missing.
        last_prices: Last known sale prices, used only for the final cap.
        weight: Share given to the kNN estimate when blending; the index
            estimate receives ``1 - weight``.

    Returns:
        A float64 array. Entries without a usable index estimate are NaN.
        Entries whose kNN estimate is missing or outside the
        MIN/MAX_KNN_TO_INDEX_RATIO band keep the index estimate. Wherever a
        finite positive last price exists, the result is limited to
        MAX_ESTIMATE_TO_LAST_PRICE_RATIO times that price.
    """
    idx = index_est.astype(np.float64, copy=False)
    knn = knn_est.astype(np.float64, copy=False)
    prev = last_prices.astype(np.float64, copy=False)

    idx_ok = np.isfinite(idx) & (idx > 0)
    knn_ok = np.isfinite(knn) & (knn > 0)

    # kNN is trusted only when it agrees with the index estimate to within
    # the configured ratio band.
    within_band = (knn >= idx * MIN_KNN_TO_INDEX_RATIO) & (
        knn <= idx * MAX_KNN_TO_INDEX_RATIO
    )
    use_knn = idx_ok & knn_ok & within_band

    index_only = np.where(idx_ok, idx, np.nan)
    mixed = (1 - weight) * idx + weight * knn
    estimate = np.where(use_knn, mixed, index_only)

    # The ceiling is NaN where no valid last price exists, which disables
    # the cap for that entry.
    prev_ok = np.isfinite(prev) & (prev > 0)
    ceiling = np.where(prev_ok, prev * MAX_ESTIMATE_TO_LAST_PRICE_RATIO, np.nan)

    cappable = np.isfinite(ceiling) & np.isfinite(estimate)
    return np.where(cappable, np.minimum(estimate, ceiling), estimate)
def _parse_args() -> argparse.Namespace:
    """Parse the command-line arguments for the estimation step."""
    parser = argparse.ArgumentParser(
        description="Augment properties.parquet with estimated current prices"
    )
    parser.add_argument(
        "--properties",
        type=Path,
        required=True,
        help="Path to properties.parquet (modified in-place)",
    )
    parser.add_argument(
        "--postcodes",
        type=Path,
        required=True,
        help="Path to postcode.parquet (for lat/lon needed by kNN)",
    )
    parser.add_argument(
        "--index", type=Path, required=True, help="Path to price_index.parquet"
    )
    return parser.parse_args()


def _write_parquet_atomically(df: "pl.DataFrame", target: Path) -> None:
    """Write *df* to a sibling temp file, then rename it over *target*.

    The target is also this script's input, so a failed or interrupted write
    must not leave a truncated file in its place.
    """
    tmp_path = target.with_name(target.name + ".tmp")
    try:
        df.write_parquet(tmp_path)
        tmp_path.replace(target)
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        tmp_path.unlink(missing_ok=True)


def main():
    """Add estimated-price columns to properties.parquet in place.

    Steps:
      1. Load properties and left-join lat/lon from postcode.parquet.
      2. Scale each last known price by the clipped change in the
         interpolated log price index between sale date and
         CURRENT_FRAC_YEAR.
      3. Build kNN estimates (per-sqm value times floor area) and blend them
         with the index estimates via ``guarded_blend_estimates``.
      4. Derive "Est. price per sqm", drop temporary columns and lat/lon,
         and write the result back over the input file.
    """
    args = _parse_args()

    print("Loading properties.parquet...")
    df = pl.read_parquet(args.properties)
    print(f" {len(df):,} rows, {len(df.columns)} columns")

    # Join lat/lon from postcode.parquet for kNN spatial queries.
    # De-duplicate on the join key first: a repeated postcode would otherwise
    # fan out property rows in the left join and be written back to disk.
    postcodes = (
        pl.read_parquet(args.postcodes)
        .select("Postcode", "lat", "lon")
        .unique(subset="Postcode", keep="first", maintain_order=True)
    )
    df = df.join(postcodes, on="Postcode", how="left")
    print(f" Joined lat/lon from {len(postcodes):,} postcodes")

    # Drop existing estimated columns if re-running
    for col in ["Estimated current price", "Est. price per sqm"]:
        if col in df.columns:
            df = df.drop(col)

    # Derive helper columns (all "_"-prefixed so they are dropped at the end).
    sale_date = pl.col("Date of last transaction")
    df = df.with_columns(
        sector_expr().alias("_sector"),
        (
            sale_date.dt.year().cast(pl.Float64)
            + (sale_date.dt.month().cast(pl.Float64) - 1.0) / 12.0
        ).alias("_sale_frac_year"),
        type_group_expr().alias("_type_group"),
        pl.lit(CURRENT_FRAC_YEAR).alias("_current_frac_year"),
    )

    index = pl.read_parquet(args.index)
    print(
        f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
        f"{index['type_group'].n_unique()} type groups"
    )

    print("\nApplying repeat-sales index with fractional year interpolation...")
    # Each call returns df with one extra column (named by the last argument)
    # that is read below: the log index at sale time and at the current date.
    df = interpolate_log_index(
        index, df, "_sector", "_type_group", "_sale_frac_year", "_log_index_sale_interp"
    )
    df = interpolate_log_index(
        index,
        df,
        "_sector",
        "_type_group",
        "_current_frac_year",
        "_log_index_current_interp",
    )

    # Compute index-adjusted estimate with cap
    has_price = (
        pl.col("Last known price").is_not_null()
        & pl.col("Postcode").is_not_null()
        & pl.col("Date of last transaction").is_not_null()
    )
    # Growth factor = exp(clipped change in log index since the sale).
    growth = (
        (pl.col("_log_index_current_interp") - pl.col("_log_index_sale_interp"))
        .clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
        .exp()
    )
    df = df.with_columns(
        pl.when(has_price)
        .then(pl.col("Last known price").cast(pl.Float64) * growth)
        .otherwise(pl.lit(None))
        .alias("Estimated current price"),
    )
    n_estimated = df.filter(pl.col("Estimated current price").is_not_null()).height
    n_with_price = df.filter(has_price).height
    print(
        f" {n_estimated:,} of {n_with_price:,} properties estimated "
        f"({n_estimated / max(n_with_price, 1) * 100:.1f}%)"
    )

    # --- kNN blending ---
    print("\nBuilding kNN estimates...")
    trees = build_knn_pool(df.lazy(), index, CURRENT_FRAC_YEAR)
    lat = df["lat"].cast(pl.Float64).to_numpy()
    lon = df["lon"].cast(pl.Float64).to_numpy()
    tg = df["_type_group"].fill_null("").to_numpy()
    # Missing floor area becomes 0, giving a kNN estimate of 0, which
    # guarded_blend_estimates rejects because it requires knn_est > 0.
    fa = df["Total floor area (sqm)"].cast(pl.Float64).fill_null(0.0).to_numpy()
    last_prices = (
        df["Last known price"].cast(pl.Float64).fill_null(float("nan")).to_numpy()
    )
    # Sale dates as integer days since the epoch; -1 marks a missing date.
    last_sale_dates = (
        df["Date of last transaction"]
        .dt.epoch("d")
        .fill_null(-1)
        .to_numpy()
        .astype(np.int64)
    )
    knn_psm = knn_median_psm(
        trees,
        lat,
        lon,
        tg,
        postcodes=df["Postcode"].fill_null("").to_numpy(),
        last_prices=last_prices,
        last_sale_dates=last_sale_dates,
    )
    knn_est = knn_psm * fa  # No temporal adj: ref == current
    df = df.with_columns(
        pl.Series("_knn_est", knn_est, dtype=pl.Float64),
    )

    # Blend only when kNN is close to the index estimate; otherwise keep index.
    index_est = (
        df["Estimated current price"]
        .cast(pl.Float64)
        .fill_null(float("nan"))
        .to_numpy()
    )
    blended = guarded_blend_estimates(index_est, knn_est, last_prices)
    df = df.with_columns(
        pl.Series("_index_est", index_est, dtype=pl.Float64),
        pl.Series("Estimated current price", blended, dtype=pl.Float64),
    ).with_columns(
        # NaN was only a numpy-side placeholder; store missing values as null.
        pl.col("Estimated current price").fill_nan(None),
    )

    # Count rows that pass the same ratio test used by the blend (reporting only).
    n_blended = df.filter(
        pl.col("_knn_est").is_not_null()
        & pl.col("_knn_est").is_finite()
        & (pl.col("_knn_est") > 0)
        & (pl.col("_index_est").is_not_null())
        & (pl.col("_knn_est") >= pl.col("_index_est") * MIN_KNN_TO_INDEX_RATIO)
        & (pl.col("_knn_est") <= pl.col("_index_est") * MAX_KNN_TO_INDEX_RATIO)
        & pl.col("Estimated current price").is_not_null()
    ).height
    print(f" kNN blended: {n_blended:,} of {n_estimated:,} estimates")

    # Derive estimated price per sqm where both estimated price and floor area exist
    df = df.with_columns(
        (pl.col("Estimated current price") / pl.col("Total floor area (sqm)"))
        .round(0)
        # Non-strict cast: values that cannot be represented become null.
        .cast(pl.Int32, strict=False)
        .alias("Est. price per sqm"),
    )

    # Drop all temporary columns and joined lat/lon (those belong in postcode.parquet)
    temp_cols = [c for c in df.columns if c.startswith(("_", "log_idx_"))]
    df = df.drop(temp_cols).drop("lat", "lon")

    _write_parquet_atomically(df, args.properties)
    size_mb = args.properties.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.properties} ({size_mb:.1f} MB)")
    print(
        f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
    )
# Run the estimation step only when executed as a script, not on import.
if __name__ == "__main__":
    main()