lmao
This commit is contained in:
parent
03445188ea
commit
524580eb25
102 changed files with 36625 additions and 1295 deletions
204
pipeline/transform/price_estimation/estimate.py
Normal file
204
pipeline/transform/price_estimation/estimate.py
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
"""Augment wide.parquet with estimated current prices.
|
||||
|
||||
For properties with a known prior sale, applies the repeat-sales price index
|
||||
to adjust the last known price to the current date, then blends with kNN
|
||||
estimates from nearby recently-sold properties. Includes:
|
||||
- Capping extreme index adjustments
|
||||
- Seasonal month-of-sale adjustment
|
||||
- kNN spatial blending
|
||||
|
||||
Modifies wide.parquet in-place.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.price_estimation.knn import (
|
||||
KNN_BLEND_WEIGHT,
|
||||
build_knn_pool,
|
||||
knn_median_psm,
|
||||
)
|
||||
from pipeline.transform.price_estimation.utils import (
|
||||
CURRENT_FRAC_YEAR,
|
||||
CURRENT_MONTH,
|
||||
MAX_LOG_ADJUSTMENT,
|
||||
compute_seasonal_factors,
|
||||
interpolate_log_index,
|
||||
sector_expr,
|
||||
type_group_expr,
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Augment wide.parquet with estimated current prices"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Path to wide.parquet (modified in-place)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--index", type=Path, required=True, help="Path to price_index.parquet"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
print("Loading wide.parquet...")
|
||||
df = pl.read_parquet(args.input)
|
||||
print(f" {len(df):,} rows, {len(df.columns)} columns")
|
||||
|
||||
# Drop existing estimated columns if re-running
|
||||
for col in ["Estimated current price", "Est. price per sqm"]:
|
||||
if col in df.columns:
|
||||
df = df.drop(col)
|
||||
|
||||
# Compute seasonal factors
|
||||
seasonal = compute_seasonal_factors(args.input)
|
||||
months = [
|
||||
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
|
||||
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
|
||||
]
|
||||
print(
|
||||
f" Seasonal factors: {', '.join(f'{m}={f:.3f}' for m, f in zip(months, seasonal))}"
|
||||
)
|
||||
|
||||
# Build seasonal adjustment: seasonal[current_month] / seasonal[sale_month]
|
||||
sale_month = (
|
||||
df["Date of last transaction"]
|
||||
.dt.month()
|
||||
.fill_null(6)
|
||||
.to_numpy()
|
||||
.astype(np.int32)
|
||||
)
|
||||
seasonal_adj = seasonal[CURRENT_MONTH - 1] / seasonal[sale_month - 1]
|
||||
|
||||
# Derive helper columns
|
||||
df = df.with_columns(
|
||||
sector_expr().alias("_sector"),
|
||||
(
|
||||
pl.col("Date of last transaction").dt.year().cast(pl.Float64)
|
||||
+ (pl.col("Date of last transaction").dt.month().cast(pl.Float64) - 1.0)
|
||||
/ 12.0
|
||||
).alias("_sale_frac_year"),
|
||||
type_group_expr().alias("_type_group"),
|
||||
pl.lit(CURRENT_FRAC_YEAR).alias("_current_frac_year"),
|
||||
pl.Series("_seasonal_adj", seasonal_adj, dtype=pl.Float64),
|
||||
)
|
||||
|
||||
index = pl.read_parquet(args.index)
|
||||
print(
|
||||
f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
|
||||
f"{index['type_group'].n_unique()} type groups"
|
||||
)
|
||||
|
||||
print("\nApplying repeat-sales index with fractional year interpolation...")
|
||||
|
||||
df = interpolate_log_index(
|
||||
index, df, "_sector", "_type_group", "_sale_frac_year", "_log_index_sale_interp"
|
||||
)
|
||||
df = interpolate_log_index(
|
||||
index,
|
||||
df,
|
||||
"_sector",
|
||||
"_type_group",
|
||||
"_current_frac_year",
|
||||
"_log_index_current_interp",
|
||||
)
|
||||
|
||||
# Compute index-adjusted estimate with cap and seasonal adjustment
|
||||
has_price = (
|
||||
pl.col("Last known price").is_not_null()
|
||||
& pl.col("Postcode").is_not_null()
|
||||
& pl.col("Date of last transaction").is_not_null()
|
||||
)
|
||||
|
||||
df = df.with_columns(
|
||||
pl.when(has_price)
|
||||
.then(
|
||||
pl.col("Last known price").cast(pl.Float64)
|
||||
* (
|
||||
pl.col("_log_index_current_interp") - pl.col("_log_index_sale_interp")
|
||||
)
|
||||
.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
|
||||
.exp()
|
||||
* pl.col("_seasonal_adj")
|
||||
)
|
||||
.otherwise(pl.lit(None))
|
||||
.alias("Estimated current price"),
|
||||
)
|
||||
|
||||
n_estimated = df.filter(pl.col("Estimated current price").is_not_null()).height
|
||||
n_with_price = df.filter(has_price).height
|
||||
print(
|
||||
f" {n_estimated:,} of {n_with_price:,} properties estimated "
|
||||
f"({n_estimated / max(n_with_price, 1) * 100:.1f}%)"
|
||||
)
|
||||
|
||||
# --- kNN blending ---
|
||||
print("\nBuilding kNN estimates...")
|
||||
trees = build_knn_pool(args.input, index, CURRENT_FRAC_YEAR)
|
||||
|
||||
lat = df["lat"].cast(pl.Float64).to_numpy()
|
||||
lon = df["lon"].cast(pl.Float64).to_numpy()
|
||||
tg = df["_type_group"].fill_null("").to_numpy()
|
||||
fa = df["Total floor area (sqm)"].cast(pl.Float64).fill_null(0.0).to_numpy()
|
||||
|
||||
knn_psm = knn_median_psm(trees, lat, lon, tg)
|
||||
knn_est = knn_psm * fa # No temporal adj: ref == current
|
||||
|
||||
df = df.with_columns(
|
||||
pl.Series("_knn_est", knn_est, dtype=pl.Float64),
|
||||
)
|
||||
|
||||
# Blend: where kNN available, use weighted average; else keep index
|
||||
df = df.with_columns(
|
||||
pl.when(
|
||||
pl.col("Estimated current price").is_not_null()
|
||||
& pl.col("_knn_est").is_not_null()
|
||||
& pl.col("_knn_est").is_finite()
|
||||
& (pl.col("_knn_est") > 0)
|
||||
)
|
||||
.then(
|
||||
(1 - KNN_BLEND_WEIGHT) * pl.col("Estimated current price")
|
||||
+ KNN_BLEND_WEIGHT * pl.col("_knn_est")
|
||||
)
|
||||
.when(pl.col("Estimated current price").is_not_null())
|
||||
.then(pl.col("Estimated current price"))
|
||||
.otherwise(pl.lit(None))
|
||||
.alias("Estimated current price"),
|
||||
)
|
||||
|
||||
n_blended = df.filter(
|
||||
pl.col("_knn_est").is_not_null()
|
||||
& pl.col("_knn_est").is_finite()
|
||||
& (pl.col("_knn_est") > 0)
|
||||
& pl.col("Estimated current price").is_not_null()
|
||||
).height
|
||||
print(f" kNN blended: {n_blended:,} of {n_estimated:,} estimates")
|
||||
|
||||
# Derive estimated price per sqm where both estimated price and floor area exist
|
||||
df = df.with_columns(
|
||||
(pl.col("Estimated current price") / pl.col("Total floor area (sqm)"))
|
||||
.round(0)
|
||||
.cast(pl.Int32, strict=False)
|
||||
.alias("Est. price per sqm"),
|
||||
)
|
||||
|
||||
# Drop all temporary columns
|
||||
temp_cols = [c for c in df.columns if c.startswith("_") or c.startswith("log_idx_")]
|
||||
df = df.drop(temp_cols)
|
||||
|
||||
df.write_parquet(args.input)
|
||||
size_mb = args.input.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
|
||||
print(
|
||||
f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue