This commit is contained in:
Andras Schmelczer 2026-02-18 21:22:15 +00:00
parent 524580eb25
commit ffe080adef
82 changed files with 2652 additions and 2956 deletions

View file

@ -20,7 +20,6 @@ from pipeline.transform.price_estimation.knn import (
from pipeline.transform.price_estimation.utils import (
CURRENT_YEAR,
MAX_LOG_ADJUSTMENT,
compute_seasonal_factors,
interpolate_log_index,
sector_expr,
type_group_expr,
@ -91,7 +90,7 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
"""Index-based prediction with interpolation, capping, and seasonal adjustment."""
"""Index-based prediction with interpolation and capping."""
test = interpolate_log_index(
index, test, "sector", "type_group", "input_frac_year", "log_index_input"
)
@ -105,7 +104,6 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
* (pl.col("log_index_actual") - pl.col("log_index_input"))
.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
.exp()
* pl.col("_seasonal_adj")
)
.fill_null(pl.col("input_price").cast(pl.Float64))
.alias("predicted"),
@ -175,7 +173,10 @@ def print_metrics_table(metrics_by_stage: dict):
def main():
parser = argparse.ArgumentParser(description="Backtest price estimation model")
parser.add_argument(
"--input", type=Path, required=True, help="Path to wide.parquet"
"--input", type=Path, required=True, help="Path to properties.parquet"
)
parser.add_argument(
"--postcodes", type=Path, required=True, help="Path to postcode.parquet (for lat/lon)"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output backtest_results.parquet"
@ -184,38 +185,28 @@ def main():
# Build index from pre-test data only (temporal holdout)
print(f"Building price index (pairs with year2 < {TEST_YEAR_MIN})...")
index = build_index(args.input, max_pair_year=TEST_YEAR_MIN)
index = build_index(args.input, max_pair_year=TEST_YEAR_MIN, postcodes_path=args.postcodes)
print(
f"\nHoldout index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
f"{index['type_group'].n_unique()} type groups"
)
# Compute seasonal factors from pre-test data only
seasonal = compute_seasonal_factors(args.input, max_sale_year=TEST_YEAR_MIN)
months = [
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]
print(
f"Seasonal factors: {', '.join(f'{m}={f:.3f}' for m, f in zip(months, seasonal))}"
)
test = extract_test_set(args.input)
# Compute seasonal adjustment for each test pair
input_months = test["input_month"].fill_null(6).to_numpy().astype(np.int32)
actual_months = test["actual_month"].fill_null(6).to_numpy().astype(np.int32)
seasonal_adj = seasonal[actual_months - 1] / seasonal[input_months - 1]
test = test.with_columns(
pl.Series("_seasonal_adj", seasonal_adj, dtype=pl.Float64),
)
# Join lat/lon from postcode.parquet (properties.parquet no longer has them)
postcodes = pl.read_parquet(args.postcodes).select("Postcode", "lat", "lon")
test = test.join(postcodes, on="Postcode", how="left")
print("\nPredicting with price index...")
test = predict(test, index)
# --- kNN ---
ref_fy = float(TEST_YEAR_MIN)
trees = build_knn_pool(args.input, index, ref_fy, max_sale_year=TEST_YEAR_MIN)
# Pass joined LazyFrame (with lat/lon) instead of raw properties path
pool_lf = pl.scan_parquet(args.input).join(
postcodes.lazy(), on="Postcode", how="left"
)
trees = build_knn_pool(pool_lf, index, ref_fy, max_sale_year=TEST_YEAR_MIN)
# Interpolate log_index at reference year for temporal adjustment
test = test.with_columns(pl.lit(ref_fy).alias("_ref_fy"))

View file

@ -1,19 +1,18 @@
"""Augment wide.parquet with estimated current prices.
"""Augment properties.parquet with estimated current prices.
For properties with a known prior sale, applies the repeat-sales price index
to adjust the last known price to the current date, then blends with kNN
estimates from nearby recently-sold properties. Includes:
- Capping extreme index adjustments
- Seasonal month-of-sale adjustment
- kNN spatial blending
Modifies wide.parquet in-place.
Modifies properties.parquet in-place. Temporarily joins postcode.parquet
for lat/lon needed by kNN, then drops those columns before writing.
"""
import argparse
from pathlib import Path
import numpy as np
import polars as pl
from pipeline.transform.price_estimation.knn import (
@ -23,9 +22,7 @@ from pipeline.transform.price_estimation.knn import (
)
from pipeline.transform.price_estimation.utils import (
CURRENT_FRAC_YEAR,
CURRENT_MONTH,
MAX_LOG_ADJUSTMENT,
compute_seasonal_factors,
interpolate_log_index,
sector_expr,
type_group_expr,
@ -34,48 +31,39 @@ from pipeline.transform.price_estimation.utils import (
def main():
parser = argparse.ArgumentParser(
description="Augment wide.parquet with estimated current prices"
description="Augment properties.parquet with estimated current prices"
)
parser.add_argument(
"--input",
"--properties",
type=Path,
required=True,
help="Path to wide.parquet (modified in-place)",
help="Path to properties.parquet (modified in-place)",
)
parser.add_argument(
"--postcodes",
type=Path,
required=True,
help="Path to postcode.parquet (for lat/lon needed by kNN)",
)
parser.add_argument(
"--index", type=Path, required=True, help="Path to price_index.parquet"
)
args = parser.parse_args()
print("Loading wide.parquet...")
df = pl.read_parquet(args.input)
print("Loading properties.parquet...")
df = pl.read_parquet(args.properties)
print(f" {len(df):,} rows, {len(df.columns)} columns")
# Join lat/lon from postcode.parquet for kNN spatial queries
postcodes = pl.read_parquet(args.postcodes).select("Postcode", "lat", "lon")
df = df.join(postcodes, on="Postcode", how="left")
print(f" Joined lat/lon from {len(postcodes):,} postcodes")
# Drop existing estimated columns if re-running
for col in ["Estimated current price", "Est. price per sqm"]:
if col in df.columns:
df = df.drop(col)
# Compute seasonal factors
seasonal = compute_seasonal_factors(args.input)
months = [
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]
print(
f" Seasonal factors: {', '.join(f'{m}={f:.3f}' for m, f in zip(months, seasonal))}"
)
# Build seasonal adjustment: seasonal[current_month] / seasonal[sale_month]
sale_month = (
df["Date of last transaction"]
.dt.month()
.fill_null(6)
.to_numpy()
.astype(np.int32)
)
seasonal_adj = seasonal[CURRENT_MONTH - 1] / seasonal[sale_month - 1]
# Derive helper columns
df = df.with_columns(
sector_expr().alias("_sector"),
@ -86,7 +74,6 @@ def main():
).alias("_sale_frac_year"),
type_group_expr().alias("_type_group"),
pl.lit(CURRENT_FRAC_YEAR).alias("_current_frac_year"),
pl.Series("_seasonal_adj", seasonal_adj, dtype=pl.Float64),
)
index = pl.read_parquet(args.index)
@ -109,7 +96,7 @@ def main():
"_log_index_current_interp",
)
# Compute index-adjusted estimate with cap and seasonal adjustment
# Compute index-adjusted estimate with cap
has_price = (
pl.col("Last known price").is_not_null()
& pl.col("Postcode").is_not_null()
@ -125,7 +112,6 @@ def main():
)
.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
.exp()
* pl.col("_seasonal_adj")
)
.otherwise(pl.lit(None))
.alias("Estimated current price"),
@ -140,7 +126,7 @@ def main():
# --- kNN blending ---
print("\nBuilding kNN estimates...")
trees = build_knn_pool(args.input, index, CURRENT_FRAC_YEAR)
trees = build_knn_pool(df.lazy(), index, CURRENT_FRAC_YEAR)
lat = df["lat"].cast(pl.Float64).to_numpy()
lon = df["lon"].cast(pl.Float64).to_numpy()
@ -188,13 +174,13 @@ def main():
.alias("Est. price per sqm"),
)
# Drop all temporary columns
# Drop all temporary columns and joined lat/lon (those belong in postcode.parquet)
temp_cols = [c for c in df.columns if c.startswith("_") or c.startswith("log_idx_")]
df = df.drop(temp_cols)
df = df.drop(temp_cols).drop("lat", "lon")
df.write_parquet(args.input)
size_mb = args.input.stat().st_size / (1024 * 1024)
print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
df.write_parquet(args.properties)
size_mb = args.properties.stat().st_size / (1024 * 1024)
print(f"\nWrote {args.properties} ({size_mb:.1f} MB)")
print(
f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
)

View file

@ -328,14 +328,19 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
return filled
def build_index(input_path: Path, max_pair_year: int | None = None) -> pl.DataFrame:
def build_index(
input_path: Path,
max_pair_year: int | None = None,
postcodes_path: Path | None = None,
) -> pl.DataFrame:
"""Build the full price index from raw data.
If max_pair_year is set, only pairs before that year are used (backtesting holdout).
The index is still forward-filled to CURRENT_YEAR.
postcodes_path: if provided, lat/lon are read from this file instead of input_path.
"""
pairs = extract_pairs(input_path, max_year2=max_pair_year)
centroids = extract_centroids(input_path)
centroids = extract_centroids(postcodes_path or input_path)
min_year = int(pairs["year1"].min())
max_year = CURRENT_YEAR
@ -448,10 +453,12 @@ def main():
description="Build improved repeat-sales price index"
)
parser.add_argument("--input", type=Path, required=True)
parser.add_argument("--postcodes", type=Path, required=True,
help="Path to postcode.parquet (for lat/lon centroids)")
parser.add_argument("--output", type=Path, required=True)
args = parser.parse_args()
result = build_index(args.input)
result = build_index(args.input, postcodes_path=args.postcodes)
result.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024)

View file

@ -29,7 +29,7 @@ def _scale_coords(lat: np.ndarray, lon: np.ndarray) -> np.ndarray:
def build_knn_pool(
input_path: Path,
source: Path | pl.LazyFrame,
index: pl.DataFrame,
ref_frac_year: float,
max_sale_year: int | None = None,
@ -42,8 +42,9 @@ def build_knn_pool(
Returns dict mapping type_group -> (KDTree over scaled lat/lon, adjusted_psm array).
"""
print("Building kNN pool...")
lf = pl.scan_parquet(source) if isinstance(source, Path) else source
query = (
pl.scan_parquet(input_path)
lf
.select(
"Postcode",
"Property type",

View file

@ -1,7 +1,6 @@
"""Shared utilities for price estimation modules."""
from datetime import date
from pathlib import Path
import numpy as np
import polars as pl
@ -9,7 +8,6 @@ import polars as pl
CURRENT_YEAR = 2026
_today = date.today()
CURRENT_FRAC_YEAR = _today.year + (_today.month - 1) / 12
CURRENT_MONTH = _today.month
# Cap on log(index_ratio) to prevent wild estimates from thin sectors
MAX_LOG_ADJUSTMENT = 3.0 # ~20x max price change
@ -181,53 +179,3 @@ def join_type_stratified_index(
).drop(_typed, _all)
return df
def compute_seasonal_factors(
input_path: Path, max_sale_year: int | None = None
) -> np.ndarray:
"""Compute 12 multiplicative monthly price factors from price-per-sqm.
Detrends by normalizing median £/sqm within each year, then averages
across years. Returns array of 12 factors (index 0 = January).
Normalized so mean = 1.0.
"""
query = (
pl.scan_parquet(input_path)
.select("Last known price", "Total floor area (sqm)", "Date of last transaction")
.filter(
pl.col("Last known price").is_not_null(),
pl.col("Last known price") > 0,
pl.col("Total floor area (sqm)").is_not_null(),
pl.col("Total floor area (sqm)") > 0,
pl.col("Date of last transaction").is_not_null(),
)
.with_columns(
(
pl.col("Last known price").cast(pl.Float64)
/ pl.col("Total floor area (sqm)").cast(pl.Float64)
).alias("psm"),
pl.col("Date of last transaction").dt.month().alias("month"),
pl.col("Date of last transaction").dt.year().alias("year"),
)
)
if max_sale_year is not None:
query = query.filter(pl.col("year") < max_sale_year)
monthly = (
query.group_by("year", "month")
.agg(pl.col("psm").median().alias("median_psm"))
.with_columns(
pl.col("median_psm").mean().over("year").alias("year_mean"),
)
.with_columns(
(pl.col("median_psm") / pl.col("year_mean")).alias("ratio"),
)
.group_by("month")
.agg(pl.col("ratio").mean().alias("factor"))
.sort("month")
.collect()
)
factors = monthly["factor"].to_numpy().astype(np.float64)
return factors / factors.mean()