Refactor and other improvements
This commit is contained in:
parent
04a78e7bfe
commit
6c90cf3c0f
47 changed files with 2705 additions and 1568 deletions
167
pipeline/transform/price_backtest.py
Normal file
167
pipeline/transform/price_backtest.py
Normal file
|
|
@ -0,0 +1,167 @@
|
|||
"""Backtesting: Evaluate price index model on held-out recent sales.
|
||||
|
||||
Test set: properties with 2+ sales where the last sale is 2022-2025.
|
||||
Uses the second-to-last sale as input, predicts the last sale price.
|
||||
Compares index-based prediction against a naive baseline (raw input price).
|
||||
|
||||
Output: backtest_results.parquet with predictions vs actuals.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
CURRENT_YEAR = 2025
|
||||
TEST_YEAR_MIN = 2022
|
||||
|
||||
|
||||
def extract_test_set(input_path: Path) -> pl.DataFrame:
|
||||
"""Extract test pairs: second-to-last sale as input, last sale as ground truth."""
|
||||
print("Loading test set...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select("Postcode", "historical_prices")
|
||||
.filter(
|
||||
pl.col("Postcode").is_not_null(),
|
||||
pl.col("historical_prices").list.len() >= 2,
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
|
||||
# Last sale (ground truth)
|
||||
pl.col("historical_prices").list.last().struct.field("year").alias("actual_year"),
|
||||
pl.col("historical_prices").list.last().struct.field("price").alias("actual_price"),
|
||||
# Second-to-last sale (input)
|
||||
pl.col("historical_prices").list.get(-2).struct.field("year").alias("input_year"),
|
||||
pl.col("historical_prices").list.get(-2).struct.field("price").alias("input_price"),
|
||||
)
|
||||
.filter(
|
||||
pl.col("actual_year") >= TEST_YEAR_MIN,
|
||||
pl.col("input_price") > 0,
|
||||
pl.col("actual_price") > 0,
|
||||
pl.col("actual_year") > pl.col("input_year"),
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
print(f" {len(df):,} test pairs (last sale {TEST_YEAR_MIN}-{CURRENT_YEAR})")
|
||||
return df
|
||||
|
||||
|
||||
def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
||||
"""Index-based prediction: adjust input price by sector index change."""
|
||||
# Join index at input year
|
||||
test = test.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
|
||||
left_on=["sector", "input_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join index at actual year
|
||||
test = test.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
|
||||
left_on=["sector", "actual_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
test = test.with_columns(
|
||||
(
|
||||
pl.col("input_price").cast(pl.Float64)
|
||||
* (pl.col("log_index_actual") - pl.col("log_index_input")).exp()
|
||||
).fill_null(pl.col("input_price").cast(pl.Float64)).alias("predicted"),
|
||||
)
|
||||
return test
|
||||
|
||||
|
||||
def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
|
||||
"""Compute error metrics."""
|
||||
valid = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0)
|
||||
actual = actual[valid]
|
||||
predicted = predicted[valid]
|
||||
|
||||
ape = np.abs(predicted - actual) / actual
|
||||
signed_err = predicted - actual
|
||||
|
||||
return {
|
||||
"MdAPE (%)": float(np.median(ape) * 100),
|
||||
"% within 10%": float(np.mean(ape <= 0.10) * 100),
|
||||
"% within 20%": float(np.mean(ape <= 0.20) * 100),
|
||||
"% within 30%": float(np.mean(ape <= 0.30) * 100),
|
||||
"MAE (£)": float(np.mean(np.abs(signed_err))),
|
||||
"Mean signed error (£)": float(np.mean(signed_err)),
|
||||
"n": int(len(actual)),
|
||||
}
|
||||
|
||||
|
||||
def print_metrics_table(metrics_by_stage: dict):
|
||||
"""Print a comparison table of metrics."""
|
||||
print("\n" + "=" * 55)
|
||||
print("BACKTEST RESULTS")
|
||||
print("=" * 55)
|
||||
|
||||
metric_names = ["MdAPE (%)", "% within 10%", "% within 20%", "% within 30%", "MAE (£)", "Mean signed error (£)", "n"]
|
||||
stages = list(metrics_by_stage.keys())
|
||||
|
||||
# Header
|
||||
header = f"{'Metric':<25s}"
|
||||
for stage in stages:
|
||||
header += f" {stage:>14s}"
|
||||
print(header)
|
||||
print("-" * 55)
|
||||
|
||||
for metric in metric_names:
|
||||
row = f"{metric:<25s}"
|
||||
for stage in stages:
|
||||
val = metrics_by_stage[stage][metric]
|
||||
if metric == "n":
|
||||
row += f" {val:>14,d}"
|
||||
elif "£" in metric:
|
||||
row += f" {val:>13,.0f}"
|
||||
else:
|
||||
row += f" {val:>13.1f}%"
|
||||
print(row)
|
||||
|
||||
print("=" * 55)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Backtest price estimation model")
|
||||
parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
|
||||
parser.add_argument("--index", type=Path, required=True, help="Path to price_index.parquet")
|
||||
parser.add_argument("--output", type=Path, required=True, help="Output backtest_results.parquet")
|
||||
args = parser.parse_args()
|
||||
|
||||
index = pl.read_parquet(args.index)
|
||||
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
|
||||
|
||||
test = extract_test_set(args.input)
|
||||
|
||||
print("\nPredicting with price index...")
|
||||
test = predict(test, index)
|
||||
|
||||
# Compute and print metrics
|
||||
actual = test["actual_price"].to_numpy().astype(np.float64)
|
||||
metrics = {
|
||||
"Naive": compute_metrics(actual, test["input_price"].to_numpy().astype(np.float64)),
|
||||
"Index": compute_metrics(actual, test["predicted"].to_numpy().astype(np.float64)),
|
||||
}
|
||||
|
||||
print_metrics_table(metrics)
|
||||
|
||||
# Save results
|
||||
result = test.select(
|
||||
"Postcode", "sector",
|
||||
"input_year", "input_price",
|
||||
"actual_year", "actual_price",
|
||||
"predicted",
|
||||
)
|
||||
|
||||
result.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
|
||||
print(f" {len(result):,} rows")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
90
pipeline/transform/price_estimate.py
Normal file
90
pipeline/transform/price_estimate.py
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
"""Apply repeat-sales price index to estimate current property prices.
|
||||
|
||||
Joins the precomputed price index (from price_index.py) with each property's
|
||||
last known sale to produce an inflation-adjusted current price estimate.
|
||||
|
||||
Output: estimated_prices.parquet with per-property estimates.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
||||
CURRENT_YEAR = 2025
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Estimate current property prices")
|
||||
parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
|
||||
parser.add_argument("--index", type=Path, required=True, help="Path to price_index.parquet")
|
||||
parser.add_argument("--output", type=Path, required=True, help="Output estimated_prices.parquet")
|
||||
args = parser.parse_args()
|
||||
|
||||
print("Loading property data...")
|
||||
df = (
|
||||
pl.scan_parquet(args.input)
|
||||
.select("Postcode", "Address per Property Register", "Last known price", "Date of last transaction")
|
||||
.filter(
|
||||
pl.col("Last known price").is_not_null(),
|
||||
pl.col("Postcode").is_not_null(),
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
|
||||
pl.col("Date of last transaction").dt.year().alias("sale_year"),
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
print(f" {len(df):,} properties with known price and postcode")
|
||||
|
||||
index = pl.read_parquet(args.index)
|
||||
print(f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
|
||||
|
||||
print("\nApplying repeat-sales index...")
|
||||
|
||||
# Join index at sale year
|
||||
df = df.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_sale")),
|
||||
left_on=["sector", "sale_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
# Join index at current year
|
||||
index_current = (
|
||||
index.filter(pl.col("year") == CURRENT_YEAR)
|
||||
.select("sector", pl.col("log_index").alias("log_index_current"))
|
||||
)
|
||||
df = df.join(index_current, on="sector", how="left")
|
||||
|
||||
# Compute estimate; fall back to raw price when no index available
|
||||
df = df.with_columns(
|
||||
(
|
||||
pl.col("Last known price").cast(pl.Float64)
|
||||
* (pl.col("log_index_current") - pl.col("log_index_sale")).exp()
|
||||
)
|
||||
.fill_null(pl.col("Last known price").cast(pl.Float64))
|
||||
.alias("estimated_price"),
|
||||
)
|
||||
|
||||
n_adjusted = df.filter(pl.col("log_index_sale").is_not_null()).height
|
||||
print(f" {n_adjusted:,} properties adjusted by index ({n_adjusted / len(df) * 100:.1f}%)")
|
||||
|
||||
# Select output columns
|
||||
output = df.select(
|
||||
"Postcode",
|
||||
"Address per Property Register",
|
||||
pl.col("Last known price").alias("last_price"),
|
||||
"sale_year",
|
||||
"sector",
|
||||
"estimated_price",
|
||||
)
|
||||
|
||||
output.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
|
||||
print(f" {len(output):,} rows")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
272
pipeline/transform/price_index.py
Normal file
272
pipeline/transform/price_index.py
Normal file
|
|
@ -0,0 +1,272 @@
|
|||
"""Stage 1: Repeat-Sales Price Index
|
||||
|
||||
Builds a hierarchical Case-Shiller repeat-sales price index from historical
|
||||
transaction data. Solves WLS regression per postcode sector, district, area,
|
||||
and nationally, then applies Bayesian shrinkage toward parent geographies.
|
||||
|
||||
Output: price_index.parquet with columns: sector, year, log_index, n_pairs
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
from scipy.sparse import csc_matrix
|
||||
from scipy.sparse.linalg import lsqr
|
||||
from tqdm import tqdm
|
||||
|
||||
MIN_PAIRS = 5 # minimum pairs to compute an index
|
||||
SHRINKAGE_K = 50 # shrinkage parameter: higher = more shrinkage toward parent
|
||||
OUTLIER_THRESHOLD = 2.5 # |log_ratio| > this → drop (>12x price change)
|
||||
|
||||
|
||||
def extract_pairs(input_path: Path) -> pl.DataFrame:
|
||||
"""Extract consecutive sale pairs from historical_prices."""
|
||||
print("Loading historical prices...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select("Postcode", "historical_prices")
|
||||
.filter(
|
||||
pl.col("Postcode").is_not_null(),
|
||||
pl.col("historical_prices").list.len() >= 2,
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
print(f" {len(df):,} properties with 2+ transactions")
|
||||
|
||||
print("Extracting consecutive pairs...")
|
||||
pairs = (
|
||||
df.lazy()
|
||||
.with_columns(
|
||||
pl.col("historical_prices").list.slice(0, pl.col("historical_prices").list.len() - 1).alias("from_txn"),
|
||||
pl.col("historical_prices").list.slice(1).alias("to_txn"),
|
||||
)
|
||||
.explode("from_txn", "to_txn")
|
||||
.with_columns(
|
||||
pl.col("from_txn").struct.field("year").alias("year1"),
|
||||
pl.col("from_txn").struct.field("price").alias("price1"),
|
||||
pl.col("to_txn").struct.field("year").alias("year2"),
|
||||
pl.col("to_txn").struct.field("price").alias("price2"),
|
||||
)
|
||||
.select("sector", "year1", "price1", "year2", "price2")
|
||||
.filter(
|
||||
pl.col("price1") > 0,
|
||||
pl.col("price2") > 0,
|
||||
pl.col("year2") > pl.col("year1"),
|
||||
)
|
||||
.with_columns(
|
||||
(pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64)).log().alias("log_ratio"),
|
||||
(1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias("weight"),
|
||||
)
|
||||
.filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
|
||||
.collect()
|
||||
)
|
||||
print(f" {len(pairs):,} consecutive pairs extracted")
|
||||
return pairs
|
||||
|
||||
|
||||
def solve_wls_index(years1: np.ndarray, years2: np.ndarray, log_ratios: np.ndarray, weights: np.ndarray) -> dict[int, float]:
|
||||
"""Solve WLS repeat-sales regression for a set of pairs.
|
||||
|
||||
Model: log(P2/P1) = beta[year2] - beta[year1], weighted by 1/sqrt(gap).
|
||||
Pin beta[min_year] = 0.
|
||||
Returns dict mapping year -> log_index (cumulative).
|
||||
"""
|
||||
if len(years1) < MIN_PAIRS:
|
||||
return {}
|
||||
|
||||
all_years = np.union1d(years1, years2)
|
||||
min_year = int(all_years.min())
|
||||
# Map years to column indices, skipping min_year (pinned to 0)
|
||||
col = 0
|
||||
year_to_col = {}
|
||||
for y in all_years:
|
||||
if int(y) != min_year:
|
||||
year_to_col[int(y)] = col
|
||||
col += 1
|
||||
n_cols = len(year_to_col)
|
||||
if n_cols == 0:
|
||||
return {}
|
||||
|
||||
n_rows = len(years1)
|
||||
row_idx = []
|
||||
col_idx = []
|
||||
data = []
|
||||
|
||||
for i in range(n_rows):
|
||||
y1, y2 = int(years1[i]), int(years2[i])
|
||||
if y2 in year_to_col:
|
||||
row_idx.append(i)
|
||||
col_idx.append(year_to_col[y2])
|
||||
data.append(weights[i])
|
||||
if y1 in year_to_col:
|
||||
row_idx.append(i)
|
||||
col_idx.append(year_to_col[y1])
|
||||
data.append(-weights[i])
|
||||
|
||||
A = csc_matrix((data, (row_idx, col_idx)), shape=(n_rows, n_cols))
|
||||
b = log_ratios * weights
|
||||
|
||||
result = lsqr(A, b, atol=1e-10, btol=1e-10)
|
||||
betas = result[0]
|
||||
|
||||
index = {min_year: 0.0}
|
||||
for year, col in year_to_col.items():
|
||||
index[year] = float(betas[col])
|
||||
return index
|
||||
|
||||
|
||||
def compute_indices_for_level(pairs: pl.DataFrame, group_col: str) -> dict[str, dict[int, float]]:
|
||||
"""Compute raw indices for each geographic group."""
|
||||
groups = pairs.group_by(group_col).agg(
|
||||
pl.col("year1"), pl.col("year2"), pl.col("log_ratio"), pl.col("weight"),
|
||||
)
|
||||
|
||||
indices = {}
|
||||
n_pairs_map = {}
|
||||
for row in tqdm(groups.iter_rows(named=True), total=len(groups), desc=f" Solving {group_col}"):
|
||||
key = row[group_col]
|
||||
y1 = np.array(row["year1"], dtype=np.int32)
|
||||
y2 = np.array(row["year2"], dtype=np.int32)
|
||||
lr = np.array(row["log_ratio"], dtype=np.float64)
|
||||
w = np.array(row["weight"], dtype=np.float64)
|
||||
idx = solve_wls_index(y1, y2, lr, w)
|
||||
if idx:
|
||||
indices[key] = idx
|
||||
n_pairs_map[key] = len(y1)
|
||||
return indices, n_pairs_map
|
||||
|
||||
|
||||
def shrink_index(raw: dict[int, float], parent: dict[int, float], n_pairs: int) -> dict[int, float]:
|
||||
"""Bayesian shrinkage toward parent index."""
|
||||
w = n_pairs / (n_pairs + SHRINKAGE_K)
|
||||
result = {}
|
||||
all_years = set(raw.keys()) | set(parent.keys())
|
||||
for y in all_years:
|
||||
raw_val = raw.get(y, parent.get(y, 0.0))
|
||||
parent_val = parent.get(y, raw.get(y, 0.0))
|
||||
result[y] = w * raw_val + (1 - w) * parent_val
|
||||
return result
|
||||
|
||||
|
||||
def forward_fill_index(index: dict[int, float], min_year: int, max_year: int) -> dict[int, float]:
|
||||
"""Forward-fill missing years so index is continuous."""
|
||||
filled = {}
|
||||
last_val = 0.0
|
||||
for y in range(min_year, max_year + 1):
|
||||
if y in index:
|
||||
last_val = index[y]
|
||||
filled[y] = last_val
|
||||
return filled
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Build repeat-sales price index")
|
||||
parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
|
||||
parser.add_argument("--output", type=Path, required=True, help="Output price_index.parquet")
|
||||
args = parser.parse_args()
|
||||
|
||||
pairs = extract_pairs(args.input)
|
||||
|
||||
# Derive geographic hierarchy columns
|
||||
pairs = pairs.with_columns(
|
||||
# district = sector minus trailing digit(s), e.g. "SW1A 1" -> "SW1A"
|
||||
pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
|
||||
).with_columns(
|
||||
# area = leading letters only, e.g. "SW1A" -> "SW"
|
||||
pl.col("district").str.replace(r"\d.*$", "").alias("area"),
|
||||
)
|
||||
|
||||
# Solve indices at each level
|
||||
print("\nComputing national index...")
|
||||
pairs_np = pairs.select("year1", "year2", "log_ratio", "weight")
|
||||
national_idx = solve_wls_index(
|
||||
pairs_np["year1"].to_numpy(),
|
||||
pairs_np["year2"].to_numpy(),
|
||||
pairs_np["log_ratio"].to_numpy(),
|
||||
pairs_np["weight"].to_numpy(),
|
||||
)
|
||||
print(f" National index: {len(national_idx)} years")
|
||||
|
||||
print("\nComputing area indices...")
|
||||
area_indices, area_pairs = compute_indices_for_level(pairs, "area")
|
||||
print(f" {len(area_indices)} areas with indices")
|
||||
|
||||
print("\nComputing district indices...")
|
||||
district_indices, district_pairs = compute_indices_for_level(pairs, "district")
|
||||
print(f" {len(district_indices)} districts with indices")
|
||||
|
||||
print("\nComputing sector indices...")
|
||||
sector_indices, sector_pairs = compute_indices_for_level(pairs, "sector")
|
||||
print(f" {len(sector_indices)} sectors with indices")
|
||||
|
||||
# Shrink area -> national
|
||||
print("\nApplying hierarchical shrinkage...")
|
||||
for area, idx in tqdm(area_indices.items(), desc=" Area shrinkage"):
|
||||
area_indices[area] = shrink_index(idx, national_idx, area_pairs[area])
|
||||
|
||||
# Shrink district -> area
|
||||
for dist, idx in tqdm(district_indices.items(), desc=" District shrinkage"):
|
||||
area = dist.replace(r"\d.*$", "")
|
||||
# Extract area from district (leading letters)
|
||||
area_key = ""
|
||||
for ch in dist:
|
||||
if ch.isalpha():
|
||||
area_key += ch
|
||||
else:
|
||||
break
|
||||
parent = area_indices.get(area_key, national_idx)
|
||||
district_indices[dist] = shrink_index(idx, parent, district_pairs[dist])
|
||||
|
||||
# Shrink sector -> district
|
||||
for sector, idx in tqdm(sector_indices.items(), desc=" Sector shrinkage"):
|
||||
# District = sector minus trailing space+digit
|
||||
dist_key = sector.rsplit(" ", 1)[0] if " " in sector else sector
|
||||
parent = district_indices.get(dist_key, national_idx)
|
||||
sector_indices[sector] = shrink_index(idx, parent, sector_pairs[sector])
|
||||
|
||||
# For sectors without enough data, fall back to district/area/national
|
||||
all_sectors = pairs["sector"].unique().to_list()
|
||||
min_year = int(pairs["year1"].min())
|
||||
max_year = max(int(pairs["year2"].max()), 2025)
|
||||
|
||||
print(f"\nFilling gaps and forward-filling ({min_year}-{max_year})...")
|
||||
rows = []
|
||||
for sector in tqdm(all_sectors, desc=" Forward-fill"):
|
||||
if sector in sector_indices:
|
||||
idx = sector_indices[sector]
|
||||
else:
|
||||
# Fall back to district, area, national
|
||||
dist_key = sector.rsplit(" ", 1)[0] if " " in sector else sector
|
||||
area_key = ""
|
||||
for ch in dist_key:
|
||||
if ch.isalpha():
|
||||
area_key += ch
|
||||
else:
|
||||
break
|
||||
idx = district_indices.get(dist_key, area_indices.get(area_key, national_idx))
|
||||
|
||||
n = sector_pairs.get(sector, 0)
|
||||
filled = forward_fill_index(idx, min_year, max_year)
|
||||
for year, log_idx in filled.items():
|
||||
rows.append((sector, year, log_idx, n))
|
||||
|
||||
result = pl.DataFrame(
|
||||
rows,
|
||||
schema={"sector": pl.String, "year": pl.Int32, "log_index": pl.Float64, "n_pairs": pl.Int64},
|
||||
orient="row",
|
||||
)
|
||||
|
||||
result = result.sort("sector", "year")
|
||||
result.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
|
||||
print(f" {result['sector'].n_unique():,} sectors × {max_year - min_year + 1} years = {len(result):,} rows")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue