This commit is contained in:
Andras Schmelczer 2026-02-18 21:22:15 +00:00
parent 524580eb25
commit ffe080adef
82 changed files with 2652 additions and 2956 deletions

View file

@ -1,19 +1,18 @@
"""Augment wide.parquet with estimated current prices.
"""Augment properties.parquet with estimated current prices.
For properties with a known prior sale, applies the repeat-sales price index
to adjust the last known price to the current date, then blends with kNN
estimates from nearby recently-sold properties. Includes:
- Capping extreme index adjustments
- Seasonal month-of-sale adjustment
- kNN spatial blending
Modifies wide.parquet in-place.
Modifies properties.parquet in-place. Temporarily joins postcode.parquet
for lat/lon needed by kNN, then drops those columns before writing.
"""
import argparse
from pathlib import Path
import numpy as np
import polars as pl
from pipeline.transform.price_estimation.knn import (
@ -23,9 +22,7 @@ from pipeline.transform.price_estimation.knn import (
)
from pipeline.transform.price_estimation.utils import (
CURRENT_FRAC_YEAR,
CURRENT_MONTH,
MAX_LOG_ADJUSTMENT,
compute_seasonal_factors,
interpolate_log_index,
sector_expr,
type_group_expr,
@ -34,48 +31,39 @@ from pipeline.transform.price_estimation.utils import (
def main():
parser = argparse.ArgumentParser(
description="Augment wide.parquet with estimated current prices"
description="Augment properties.parquet with estimated current prices"
)
parser.add_argument(
"--input",
"--properties",
type=Path,
required=True,
help="Path to wide.parquet (modified in-place)",
help="Path to properties.parquet (modified in-place)",
)
parser.add_argument(
"--postcodes",
type=Path,
required=True,
help="Path to postcode.parquet (for lat/lon needed by kNN)",
)
parser.add_argument(
"--index", type=Path, required=True, help="Path to price_index.parquet"
)
args = parser.parse_args()
print("Loading wide.parquet...")
df = pl.read_parquet(args.input)
print("Loading properties.parquet...")
df = pl.read_parquet(args.properties)
print(f" {len(df):,} rows, {len(df.columns)} columns")
# Join lat/lon from postcode.parquet for kNN spatial queries
postcodes = pl.read_parquet(args.postcodes).select("Postcode", "lat", "lon")
df = df.join(postcodes, on="Postcode", how="left")
print(f" Joined lat/lon from {len(postcodes):,} postcodes")
# Drop existing estimated columns if re-running
for col in ["Estimated current price", "Est. price per sqm"]:
if col in df.columns:
df = df.drop(col)
# Compute seasonal factors
seasonal = compute_seasonal_factors(args.input)
months = [
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]
print(
f" Seasonal factors: {', '.join(f'{m}={f:.3f}' for m, f in zip(months, seasonal))}"
)
# Build seasonal adjustment: seasonal[current_month] / seasonal[sale_month]
sale_month = (
df["Date of last transaction"]
.dt.month()
.fill_null(6)
.to_numpy()
.astype(np.int32)
)
seasonal_adj = seasonal[CURRENT_MONTH - 1] / seasonal[sale_month - 1]
# Derive helper columns
df = df.with_columns(
sector_expr().alias("_sector"),
@ -86,7 +74,6 @@ def main():
).alias("_sale_frac_year"),
type_group_expr().alias("_type_group"),
pl.lit(CURRENT_FRAC_YEAR).alias("_current_frac_year"),
pl.Series("_seasonal_adj", seasonal_adj, dtype=pl.Float64),
)
index = pl.read_parquet(args.index)
@ -109,7 +96,7 @@ def main():
"_log_index_current_interp",
)
# Compute index-adjusted estimate with cap and seasonal adjustment
# Compute index-adjusted estimate with cap
has_price = (
pl.col("Last known price").is_not_null()
& pl.col("Postcode").is_not_null()
@ -125,7 +112,6 @@ def main():
)
.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
.exp()
* pl.col("_seasonal_adj")
)
.otherwise(pl.lit(None))
.alias("Estimated current price"),
@ -140,7 +126,7 @@ def main():
# --- kNN blending ---
print("\nBuilding kNN estimates...")
trees = build_knn_pool(args.input, index, CURRENT_FRAC_YEAR)
trees = build_knn_pool(df.lazy(), index, CURRENT_FRAC_YEAR)
lat = df["lat"].cast(pl.Float64).to_numpy()
lon = df["lon"].cast(pl.Float64).to_numpy()
@ -188,13 +174,13 @@ def main():
.alias("Est. price per sqm"),
)
# Drop all temporary columns
# Drop all temporary columns and joined lat/lon (those belong in postcode.parquet)
temp_cols = [c for c in df.columns if c.startswith("_") or c.startswith("log_idx_")]
df = df.drop(temp_cols)
df = df.drop(temp_cols).drop("lat", "lon")
df.write_parquet(args.input)
size_mb = args.input.stat().st_size / (1024 * 1024)
print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
df.write_parquet(args.properties)
size_mb = args.properties.stat().st_size / (1024 * 1024)
print(f"\nWrote {args.properties} ({size_mb:.1f} MB)")
print(
f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
)