More
This commit is contained in:
parent
128b3191e7
commit
03445188ea
54 changed files with 596953 additions and 3577 deletions
|
|
@ -1,121 +0,0 @@
|
|||
"""Shared utilities for price index, price estimate, and renovation premium scripts."""
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
CURRENT_YEAR = 2025
|
||||
TERRACE_TYPES = [
|
||||
"Mid-Terrace",
|
||||
"End-Terrace",
|
||||
"Enclosed Mid-Terrace",
|
||||
"Enclosed End-Terrace",
|
||||
"Terraced",
|
||||
]
|
||||
FLAT_TYPES = ["Flats/Maisonettes", "Flat", "Maisonette"]
|
||||
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats", "Bungalow"]
|
||||
SHRINKAGE_K = 50
|
||||
|
||||
|
||||
def type_group_expr():
|
||||
"""Polars expression: Property type -> type_group."""
|
||||
return (
|
||||
pl.when(pl.col("Property type").is_in(TERRACE_TYPES))
|
||||
.then(pl.lit("Terraced"))
|
||||
.when(pl.col("Property type").is_in(FLAT_TYPES))
|
||||
.then(pl.lit("Flats"))
|
||||
.when(pl.col("Property type") == "Bungalow")
|
||||
.then(pl.lit("Bungalow"))
|
||||
.when(pl.col("Property type").is_in(["Detached", "Semi-Detached"]))
|
||||
.then(pl.col("Property type"))
|
||||
.otherwise(pl.lit(None))
|
||||
.alias("type_group")
|
||||
)
|
||||
|
||||
|
||||
def sector_expr():
|
||||
"""Polars expression: Postcode -> sector (drop last 2 chars, strip)."""
|
||||
return (
|
||||
pl.col("Postcode")
|
||||
.str.slice(0, pl.col("Postcode").str.len_chars() - 2)
|
||||
.str.strip_chars()
|
||||
.alias("sector")
|
||||
)
|
||||
|
||||
|
||||
def hierarchy_keys(sector: str) -> tuple[str, str]:
|
||||
"""Return (district, area) for a sector string."""
|
||||
district = sector.rsplit(" ", 1)[0] if " " in sector else sector
|
||||
area = ""
|
||||
for ch in district:
|
||||
if ch.isalpha():
|
||||
area += ch
|
||||
else:
|
||||
break
|
||||
return district, area
|
||||
|
||||
|
||||
AGE_BREAKS = [1900, 1930, 1950, 1967, 1983, 2000, 2010]
|
||||
AGE_LABELS = [
|
||||
"pre-1900",
|
||||
"1900-1929",
|
||||
"1930-1949",
|
||||
"1950-1966",
|
||||
"1967-1982",
|
||||
"1983-1999",
|
||||
"2000-2009",
|
||||
"2010+",
|
||||
]
|
||||
|
||||
HEDONIC_COLUMNS = [
|
||||
"Last known price",
|
||||
"Date of last transaction",
|
||||
"Property type",
|
||||
"Total floor area (sqm)",
|
||||
"Postcode",
|
||||
]
|
||||
|
||||
|
||||
def age_band_expr():
|
||||
"""Polars expression: Construction age (UInt16 year) → age band string."""
|
||||
expr = pl.when(pl.col("Construction age").is_null()).then(pl.lit(None))
|
||||
for i, brk in enumerate(AGE_BREAKS):
|
||||
expr = expr.when(pl.col("Construction age") < brk).then(pl.lit(AGE_LABELS[i]))
|
||||
return expr.otherwise(pl.lit(AGE_LABELS[-1])).alias("age_band")
|
||||
|
||||
|
||||
NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats", "Bungalow"]
|
||||
|
||||
|
||||
def build_hedonic_features(df: pl.DataFrame) -> np.ndarray:
|
||||
"""Build hedonic feature matrix from a DataFrame with type_group column.
|
||||
|
||||
Columns (5 total): log(floor_area), 4 type dummies (ref: Detached).
|
||||
Sector fixed effects do the heavy lifting — additional property features
|
||||
(EPC, rooms, age) add no predictive value after sector demeaning.
|
||||
"""
|
||||
fa = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
|
||||
log_fa = np.log(np.maximum(fa, 1.0)).reshape(-1, 1)
|
||||
tg = df["type_group"].to_numpy()
|
||||
parts = [log_fa]
|
||||
for t in NON_REF_TYPES:
|
||||
parts.append((tg == t).astype(np.float32).reshape(-1, 1))
|
||||
return np.hstack(parts)
|
||||
|
||||
|
||||
def extract_centroids(input_path) -> dict[str, tuple[float, float]]:
|
||||
"""Compute mean lat/lon per postcode sector."""
|
||||
print("Computing sector centroids...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select("Postcode", "lat", "lon")
|
||||
.filter(pl.col("Postcode").is_not_null(), pl.col("lat").is_not_null())
|
||||
.with_columns(sector_expr())
|
||||
.group_by("sector")
|
||||
.agg(pl.col("lat").mean(), pl.col("lon").mean())
|
||||
.collect()
|
||||
)
|
||||
centroids = {}
|
||||
for row in df.iter_rows(named=True):
|
||||
centroids[row["sector"]] = (row["lat"], row["lon"])
|
||||
print(f" {len(centroids):,} sector centroids")
|
||||
return centroids
|
||||
|
|
@ -1,300 +0,0 @@
|
|||
"""Cross-Sectional Hedonic Model (Per-Type)
|
||||
|
||||
Trains separate OLS models per property type on recent sales (last 5 years)
|
||||
with sector fixed effects via Frisch-Waugh-Lovell demeaning:
|
||||
|
||||
log(price) = beta_type * log(floor_area) + alpha_sector_type + epsilon
|
||||
|
||||
Each type gets its own floor area elasticity and sector intercepts, capturing
|
||||
that detached houses (beta=0.74) have higher price sensitivity to size than
|
||||
terraced houses (beta=0.60), and a sector's value differs by property type.
|
||||
|
||||
Sector intercepts are hierarchically shrunk (sector → district → area → national)
|
||||
and spatially smoothed via KD-tree nearest neighbors.
|
||||
|
||||
Output: hedonic_model.json with per-type betas and sector intercepts.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
from scipy.spatial import KDTree
|
||||
|
||||
from pipeline.transform._price_utils import (
|
||||
CURRENT_YEAR,
|
||||
HEDONIC_COLUMNS,
|
||||
SHRINKAGE_K,
|
||||
TYPE_GROUPS,
|
||||
extract_centroids,
|
||||
hierarchy_keys,
|
||||
sector_expr,
|
||||
type_group_expr,
|
||||
)
|
||||
|
||||
TRAINING_YEARS = 5
|
||||
SPATIAL_NEIGHBORS = 5
|
||||
SPATIAL_BLEND_K = 30
|
||||
|
||||
|
||||
def load_training_data(input_path: Path) -> pl.DataFrame:
|
||||
"""Load recent sales with complete hedonic features."""
|
||||
min_year = CURRENT_YEAR - TRAINING_YEARS
|
||||
print(f"Loading training data (sales {min_year}-{CURRENT_YEAR})...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select(*HEDONIC_COLUMNS)
|
||||
.filter(
|
||||
pl.col("Last known price").is_not_null(),
|
||||
pl.col("Total floor area (sqm)").is_not_null(),
|
||||
pl.col("Total floor area (sqm)") > 0,
|
||||
pl.col("Postcode").is_not_null(),
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("Date of last transaction").dt.year().alias("sale_year"),
|
||||
type_group_expr(),
|
||||
sector_expr(),
|
||||
)
|
||||
.filter(
|
||||
pl.col("type_group").is_not_null(),
|
||||
pl.col("sale_year").is_not_null(),
|
||||
pl.col("sale_year") >= min_year,
|
||||
pl.col("sale_year") <= CURRENT_YEAR,
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
print(f" {len(df):,} complete cases")
|
||||
return df
|
||||
|
||||
|
||||
def train_type_model(
|
||||
df: pl.DataFrame, type_group: str
|
||||
) -> tuple[float, dict[str, float], dict[str, int], float]:
|
||||
"""Train hedonic model for a single property type.
|
||||
|
||||
Returns (beta_fa, sector_intercepts, sector_counts, national_intercept).
|
||||
"""
|
||||
t_df = df.filter(pl.col("type_group") == type_group)
|
||||
y = np.log(t_df["Last known price"].to_numpy().astype(np.float64))
|
||||
log_fa = np.log(
|
||||
np.maximum(t_df["Total floor area (sqm)"].to_numpy().astype(np.float64), 1.0)
|
||||
)
|
||||
X = log_fa.reshape(-1, 1)
|
||||
sectors = t_df["sector"].to_list()
|
||||
|
||||
# Group by sector for demeaning
|
||||
sector_indices: dict[str, list[int]] = {}
|
||||
for i, s in enumerate(sectors):
|
||||
sector_indices.setdefault(s, []).append(i)
|
||||
|
||||
# Compute sector means and demean
|
||||
X_demeaned = np.empty_like(X)
|
||||
y_demeaned = np.empty_like(y)
|
||||
sector_X_means: dict[str, np.ndarray] = {}
|
||||
sector_y_means: dict[str, float] = {}
|
||||
sector_counts: dict[str, int] = {}
|
||||
|
||||
for s, idxs in sector_indices.items():
|
||||
idx = np.array(idxs)
|
||||
X_mean = X[idx].mean(axis=0)
|
||||
y_mean = y[idx].mean()
|
||||
sector_X_means[s] = X_mean
|
||||
sector_y_means[s] = y_mean
|
||||
X_demeaned[idx] = X[idx] - X_mean
|
||||
y_demeaned[idx] = y[idx] - y_mean
|
||||
sector_counts[s] = len(idxs)
|
||||
|
||||
# OLS on demeaned data
|
||||
beta = np.linalg.lstsq(X_demeaned, y_demeaned, rcond=None)[0]
|
||||
beta_fa = float(beta[0])
|
||||
|
||||
# Recover sector intercepts
|
||||
sector_intercepts = {}
|
||||
for s in sector_indices:
|
||||
sector_intercepts[s] = float(sector_y_means[s] - beta_fa * sector_X_means[s][0])
|
||||
|
||||
national_intercept = float(np.mean(list(sector_intercepts.values())))
|
||||
|
||||
# R-squared
|
||||
y_pred = X[:, 0] * beta_fa
|
||||
for i, s in enumerate(sectors):
|
||||
y_pred[i] += sector_intercepts[s]
|
||||
ss_res = np.sum((y - y_pred) ** 2)
|
||||
ss_tot = np.sum((y - y.mean()) ** 2)
|
||||
r2 = 1 - ss_res / ss_tot
|
||||
|
||||
print(
|
||||
f" {type_group:<15s}: n={len(t_df):>9,} β_fa={beta_fa:.4f} "
|
||||
f"R²={r2:.4f} sectors={len(sector_intercepts):,}"
|
||||
)
|
||||
|
||||
return beta_fa, sector_intercepts, sector_counts, national_intercept
|
||||
|
||||
|
||||
def shrink_intercepts(
|
||||
sector_intercepts: dict[str, float],
|
||||
sector_counts: dict[str, int],
|
||||
) -> dict[str, float]:
|
||||
"""Hierarchical shrinkage: sector -> district -> area -> national."""
|
||||
national = float(np.mean(list(sector_intercepts.values())))
|
||||
|
||||
sector_to_dist: dict[str, str] = {}
|
||||
dist_to_area: dict[str, str] = {}
|
||||
for s in sector_intercepts:
|
||||
d, a = hierarchy_keys(s)
|
||||
sector_to_dist[s] = d
|
||||
dist_to_area[d] = a
|
||||
|
||||
# Area-level intercepts (weighted mean of sectors in area)
|
||||
area_vals: dict[str, list[tuple[float, int]]] = {}
|
||||
for s, val in sector_intercepts.items():
|
||||
d = sector_to_dist[s]
|
||||
a = dist_to_area[d]
|
||||
area_vals.setdefault(a, []).append((val, sector_counts.get(s, 0)))
|
||||
|
||||
area_intercepts: dict[str, float] = {}
|
||||
area_counts: dict[str, int] = {}
|
||||
for a, entries in area_vals.items():
|
||||
total_n = sum(n for _, n in entries)
|
||||
if total_n > 0:
|
||||
area_intercepts[a] = sum(v * n for v, n in entries) / total_n
|
||||
else:
|
||||
area_intercepts[a] = sum(v for v, _ in entries) / len(entries)
|
||||
area_counts[a] = total_n
|
||||
|
||||
# District-level intercepts
|
||||
dist_vals: dict[str, list[tuple[float, int]]] = {}
|
||||
for s, val in sector_intercepts.items():
|
||||
d = sector_to_dist[s]
|
||||
dist_vals.setdefault(d, []).append((val, sector_counts.get(s, 0)))
|
||||
|
||||
dist_intercepts: dict[str, float] = {}
|
||||
dist_counts: dict[str, int] = {}
|
||||
for d, entries in dist_vals.items():
|
||||
total_n = sum(n for _, n in entries)
|
||||
if total_n > 0:
|
||||
dist_intercepts[d] = sum(v * n for v, n in entries) / total_n
|
||||
else:
|
||||
dist_intercepts[d] = sum(v for v, _ in entries) / len(entries)
|
||||
dist_counts[d] = total_n
|
||||
|
||||
# Shrink: area -> national
|
||||
area_shrunk: dict[str, float] = {}
|
||||
for a, val in area_intercepts.items():
|
||||
n = area_counts[a]
|
||||
w = n / (n + SHRINKAGE_K)
|
||||
area_shrunk[a] = w * val + (1 - w) * national
|
||||
|
||||
# Shrink: district -> area
|
||||
dist_shrunk: dict[str, float] = {}
|
||||
for d, val in dist_intercepts.items():
|
||||
a = dist_to_area[d]
|
||||
parent = area_shrunk.get(a, national)
|
||||
n = dist_counts[d]
|
||||
w = n / (n + SHRINKAGE_K)
|
||||
dist_shrunk[d] = w * val + (1 - w) * parent
|
||||
|
||||
# Shrink: sector -> district
|
||||
result: dict[str, float] = {}
|
||||
for s, val in sector_intercepts.items():
|
||||
d = sector_to_dist[s]
|
||||
parent = dist_shrunk.get(d, national)
|
||||
n = sector_counts.get(s, 0)
|
||||
w = n / (n + SHRINKAGE_K)
|
||||
result[s] = w * val + (1 - w) * parent
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def spatial_smooth_intercepts(
|
||||
sector_intercepts: dict[str, float],
|
||||
centroids: dict[str, tuple[float, float]],
|
||||
sector_counts: dict[str, int],
|
||||
) -> dict[str, float]:
|
||||
"""Blend sparse sector intercepts with K nearest neighbors."""
|
||||
sectors_with_coords = [s for s in sector_intercepts if s in centroids]
|
||||
if len(sectors_with_coords) < SPATIAL_NEIGHBORS + 1:
|
||||
return sector_intercepts
|
||||
|
||||
coords = np.array([centroids[s] for s in sectors_with_coords])
|
||||
mean_lat = np.mean(coords[:, 0])
|
||||
scale = np.cos(np.radians(mean_lat))
|
||||
scaled_coords = np.column_stack([coords[:, 0], coords[:, 1] * scale])
|
||||
tree = KDTree(scaled_coords)
|
||||
|
||||
result = dict(sector_intercepts)
|
||||
for i, sec in enumerate(sectors_with_coords):
|
||||
n = sector_counts.get(sec, 0)
|
||||
self_w = n / (n + SPATIAL_BLEND_K)
|
||||
if self_w > 0.95:
|
||||
continue
|
||||
|
||||
dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1)
|
||||
neighbor_dists = dists[1:]
|
||||
neighbor_idxs = idxs[1:]
|
||||
|
||||
inv_dists = []
|
||||
neighbor_vals = []
|
||||
for d, j in zip(neighbor_dists, neighbor_idxs):
|
||||
ns = sectors_with_coords[j]
|
||||
if d > 0 and ns in sector_intercepts:
|
||||
inv_dists.append(1.0 / d)
|
||||
neighbor_vals.append(sector_intercepts[ns])
|
||||
|
||||
if not neighbor_vals:
|
||||
continue
|
||||
|
||||
total_inv = sum(inv_dists)
|
||||
nbr_w = 1.0 - self_w
|
||||
blended = self_w * sector_intercepts[sec]
|
||||
for val, iw in zip(neighbor_vals, inv_dists):
|
||||
blended += nbr_w * (iw / total_inv) * val
|
||||
result[sec] = blended
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Train cross-sectional hedonic model")
|
||||
parser.add_argument(
|
||||
"--input", type=Path, required=True, help="Path to wide.parquet"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output hedonic_model.json"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
df = load_training_data(args.input)
|
||||
centroids = extract_centroids(args.input)
|
||||
|
||||
print("\nTraining per-type models...")
|
||||
type_models = {}
|
||||
total_sectors = 0
|
||||
|
||||
for tg in TYPE_GROUPS:
|
||||
beta_fa, raw_intercepts, sector_counts, national = train_type_model(df, tg)
|
||||
|
||||
shrunk = shrink_intercepts(raw_intercepts, sector_counts)
|
||||
smoothed = spatial_smooth_intercepts(shrunk, centroids, sector_counts)
|
||||
total_sectors += len(smoothed)
|
||||
|
||||
type_models[tg] = {
|
||||
"beta_fa": beta_fa,
|
||||
"sector_intercepts": smoothed,
|
||||
"national_intercept": national,
|
||||
}
|
||||
|
||||
# Output
|
||||
args.output.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(args.output, "w") as f:
|
||||
json.dump({"type_models": type_models}, f, indent=2)
|
||||
|
||||
size_kb = args.output.stat().st_size / 1024
|
||||
print(f"\nWrote {args.output} ({size_kb:.0f} KB)")
|
||||
print(f" {len(TYPE_GROUPS)} type models, {total_sectors:,} total sector intercepts")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,385 +0,0 @@
|
|||
"""Backtesting: Evaluate price index model on held-out recent sales.
|
||||
|
||||
Test set: properties with 2+ sales where the last sale is 2022-2025.
|
||||
Uses the second-to-last sale as input, predicts the last sale price.
|
||||
Compares index-based prediction against a naive baseline (raw input price).
|
||||
Uses type-stratified index when available, falling back to "All" type.
|
||||
|
||||
Output: backtest_results.parquet with predictions vs actuals.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform._price_utils import (
|
||||
CURRENT_YEAR,
|
||||
HEDONIC_COLUMNS,
|
||||
sector_expr,
|
||||
type_group_expr,
|
||||
)
|
||||
|
||||
TEST_YEAR_MIN = 2022
|
||||
|
||||
|
||||
def extract_test_set(
|
||||
input_path: Path, include_hedonic_cols: bool = False
|
||||
) -> pl.DataFrame:
|
||||
"""Extract test pairs: second-to-last sale as input, last sale as ground truth."""
|
||||
print("Loading test set...")
|
||||
cols = ["Postcode", "historical_prices", "Property type"]
|
||||
if include_hedonic_cols:
|
||||
for c in HEDONIC_COLUMNS:
|
||||
if c not in cols:
|
||||
cols.append(c)
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select(cols)
|
||||
.filter(
|
||||
pl.col("Postcode").is_not_null(),
|
||||
pl.col("historical_prices").list.len() >= 2,
|
||||
)
|
||||
.with_columns(
|
||||
sector_expr(),
|
||||
type_group_expr(),
|
||||
# Last sale (ground truth)
|
||||
pl.col("historical_prices")
|
||||
.list.last()
|
||||
.struct.field("year")
|
||||
.alias("actual_year"),
|
||||
pl.col("historical_prices")
|
||||
.list.last()
|
||||
.struct.field("price")
|
||||
.alias("actual_price"),
|
||||
# Second-to-last sale (input)
|
||||
pl.col("historical_prices")
|
||||
.list.get(-2)
|
||||
.struct.field("year")
|
||||
.alias("input_year"),
|
||||
pl.col("historical_prices")
|
||||
.list.get(-2)
|
||||
.struct.field("price")
|
||||
.alias("input_price"),
|
||||
)
|
||||
.filter(
|
||||
pl.col("actual_year") >= TEST_YEAR_MIN,
|
||||
pl.col("input_price") > 0,
|
||||
pl.col("actual_price") > 0,
|
||||
pl.col("actual_year") > pl.col("input_year"),
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
print(f" {len(df):,} test pairs (last sale {TEST_YEAR_MIN}-{CURRENT_YEAR})")
|
||||
return df
|
||||
|
||||
|
||||
def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
||||
"""Index-based prediction with type-stratified fallback."""
|
||||
has_type_group = "type_group" in index.columns
|
||||
|
||||
if has_type_group:
|
||||
idx_typed = index.filter(pl.col("type_group") != "All")
|
||||
idx_all = index.filter(pl.col("type_group") == "All")
|
||||
|
||||
# Join type-specific index at input year
|
||||
test = test.join(
|
||||
idx_typed.select(
|
||||
"sector", "type_group", "year", pl.col("log_index").alias("li_in_typed")
|
||||
),
|
||||
left_on=["sector", "type_group", "input_year"],
|
||||
right_on=["sector", "type_group", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join "All" index at input year
|
||||
test = test.join(
|
||||
idx_all.select("sector", "year", pl.col("log_index").alias("li_in_all")),
|
||||
left_on=["sector", "input_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join type-specific index at actual year
|
||||
test = test.join(
|
||||
idx_typed.select(
|
||||
"sector",
|
||||
"type_group",
|
||||
"year",
|
||||
pl.col("log_index").alias("li_act_typed"),
|
||||
),
|
||||
left_on=["sector", "type_group", "actual_year"],
|
||||
right_on=["sector", "type_group", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join "All" index at actual year
|
||||
test = test.join(
|
||||
idx_all.select("sector", "year", pl.col("log_index").alias("li_act_all")),
|
||||
left_on=["sector", "actual_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
test = test.with_columns(
|
||||
pl.col("li_in_typed")
|
||||
.fill_null(pl.col("li_in_all"))
|
||||
.alias("log_index_input"),
|
||||
pl.col("li_act_typed")
|
||||
.fill_null(pl.col("li_act_all"))
|
||||
.alias("log_index_actual"),
|
||||
)
|
||||
else:
|
||||
# Unstratified index
|
||||
test = test.join(
|
||||
index.select(
|
||||
"sector", "year", pl.col("log_index").alias("log_index_input")
|
||||
),
|
||||
left_on=["sector", "input_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
test = test.join(
|
||||
index.select(
|
||||
"sector", "year", pl.col("log_index").alias("log_index_actual")
|
||||
),
|
||||
left_on=["sector", "actual_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
test = test.with_columns(
|
||||
(
|
||||
pl.col("input_price").cast(pl.Float64)
|
||||
* (pl.col("log_index_actual") - pl.col("log_index_input")).exp()
|
||||
)
|
||||
.fill_null(pl.col("input_price").cast(pl.Float64))
|
||||
.alias("predicted"),
|
||||
)
|
||||
return test
|
||||
|
||||
|
||||
def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
|
||||
valid = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0)
|
||||
actual = actual[valid]
|
||||
predicted = predicted[valid]
|
||||
|
||||
ape = np.abs(predicted - actual) / actual
|
||||
signed_err = predicted - actual
|
||||
|
||||
return {
|
||||
"MdAPE (%)": float(np.median(ape) * 100),
|
||||
"% within 10%": float(np.mean(ape <= 0.10) * 100),
|
||||
"% within 20%": float(np.mean(ape <= 0.20) * 100),
|
||||
"% within 30%": float(np.mean(ape <= 0.30) * 100),
|
||||
"MAE (£)": float(np.mean(np.abs(signed_err))),
|
||||
"Mean signed error (£)": float(np.mean(signed_err)),
|
||||
"n": int(len(actual)),
|
||||
}
|
||||
|
||||
|
||||
def print_metrics_table(metrics_by_stage: dict):
|
||||
print("\n" + "=" * 55)
|
||||
print("BACKTEST RESULTS")
|
||||
print("=" * 55)
|
||||
|
||||
metric_names = [
|
||||
"MdAPE (%)",
|
||||
"% within 10%",
|
||||
"% within 20%",
|
||||
"% within 30%",
|
||||
"MAE (£)",
|
||||
"Mean signed error (£)",
|
||||
"n",
|
||||
]
|
||||
stages = list(metrics_by_stage.keys())
|
||||
|
||||
header = f"{'Metric':<25s}"
|
||||
for stage in stages:
|
||||
header += f" {stage:>14s}"
|
||||
print(header)
|
||||
print("-" * 55)
|
||||
|
||||
for metric in metric_names:
|
||||
row = f"{metric:<25s}"
|
||||
for stage in stages:
|
||||
val = metrics_by_stage[stage][metric]
|
||||
if metric == "n":
|
||||
row += f" {val:>14,d}"
|
||||
elif "£" in metric:
|
||||
row += f" {val:>13,.0f}"
|
||||
else:
|
||||
row += f" {val:>13.1f}%"
|
||||
print(row)
|
||||
|
||||
print("=" * 55)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Backtest price estimation model")
|
||||
parser.add_argument(
|
||||
"--input", type=Path, required=True, help="Path to wide.parquet"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--index", type=Path, required=True, help="Path to price_index.parquet"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output backtest_results.parquet"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hedonic-model",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Path to hedonic_model.json (optional)",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
index = pl.read_parquet(args.index)
|
||||
has_type_group = "type_group" in index.columns
|
||||
if has_type_group:
|
||||
print(
|
||||
f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
|
||||
f"{index['type_group'].n_unique()} type groups"
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors"
|
||||
)
|
||||
|
||||
has_hedonic = args.hedonic_model is not None
|
||||
test = extract_test_set(args.input, include_hedonic_cols=has_hedonic)
|
||||
|
||||
print("\nPredicting with price index...")
|
||||
test = predict(test, index)
|
||||
|
||||
# Compute and print metrics
|
||||
actual = test["actual_price"].to_numpy().astype(np.float64)
|
||||
metrics = {
|
||||
"Naive": compute_metrics(
|
||||
actual, test["input_price"].to_numpy().astype(np.float64)
|
||||
),
|
||||
"Index": compute_metrics(
|
||||
actual, test["predicted"].to_numpy().astype(np.float64)
|
||||
),
|
||||
}
|
||||
|
||||
# Hedonic blending
|
||||
if has_hedonic:
|
||||
print("\nApplying hedonic blending...")
|
||||
with open(args.hedonic_model) as f:
|
||||
model = json.load(f)
|
||||
type_models = model["type_models"]
|
||||
|
||||
# Identify eligible rows for hedonic estimate
|
||||
hedonic_mask = (
|
||||
pl.col("Total floor area (sqm)").is_not_null()
|
||||
& (pl.col("Total floor area (sqm)") > 0)
|
||||
& pl.col("type_group").is_not_null()
|
||||
)
|
||||
eligible_mask = test.select(hedonic_mask).to_series()
|
||||
eligible = test.filter(eligible_mask)
|
||||
|
||||
if len(eligible) > 0:
|
||||
log_fa = np.log(
|
||||
np.maximum(
|
||||
eligible["Total floor area (sqm)"].to_numpy().astype(np.float64),
|
||||
1.0,
|
||||
)
|
||||
)
|
||||
sectors = eligible["sector"].to_list()
|
||||
types = eligible["type_group"].to_list()
|
||||
|
||||
# Per-type hedonic prediction
|
||||
log_hedonic = np.empty(len(eligible))
|
||||
for i in range(len(eligible)):
|
||||
tm = type_models.get(types[i])
|
||||
if tm is None:
|
||||
log_hedonic[i] = np.nan
|
||||
continue
|
||||
alpha = tm["sector_intercepts"].get(
|
||||
sectors[i], tm["national_intercept"]
|
||||
)
|
||||
log_hedonic[i] = tm["beta_fa"] * log_fa[i] + alpha
|
||||
|
||||
valid = np.isfinite(log_hedonic)
|
||||
|
||||
# Hold years: input_year to actual_year (simulating real prediction)
|
||||
input_years = eligible["input_year"].to_numpy().astype(np.float64)
|
||||
actual_years = eligible["actual_year"].to_numpy().astype(np.float64)
|
||||
hold_years = np.maximum(actual_years - input_years, 0.0)
|
||||
|
||||
log_index_pred = np.log(
|
||||
np.maximum(eligible["predicted"].to_numpy().astype(np.float64), 1.0)
|
||||
)
|
||||
|
||||
# Sweep tau values (only on valid hedonic rows)
|
||||
tau_values = [5.0, 10.0, 15.0, 20.0, 30.0]
|
||||
actual_eligible = eligible["actual_price"].to_numpy().astype(np.float64)
|
||||
best_tau = 15.0
|
||||
best_mdape = float("inf")
|
||||
|
||||
print(f"\n tau sweep ({valid.sum():,} eligible properties):")
|
||||
for tau in tau_values:
|
||||
blend_w = hold_years / (hold_years + tau)
|
||||
log_blended = np.where(
|
||||
valid,
|
||||
(1 - blend_w) * log_index_pred + blend_w * log_hedonic,
|
||||
log_index_pred,
|
||||
)
|
||||
blended = np.exp(log_blended)
|
||||
m = compute_metrics(actual_eligible, blended)
|
||||
marker = ""
|
||||
if m["MdAPE (%)"] < best_mdape:
|
||||
best_mdape = m["MdAPE (%)"]
|
||||
best_tau = tau
|
||||
marker = " <-- best"
|
||||
print(
|
||||
f" tau={tau:>4.0f}: MdAPE={m['MdAPE (%)']:>5.1f}%, "
|
||||
f"within 10%={m['% within 10%']:>5.1f}%{marker}"
|
||||
)
|
||||
|
||||
print(f"\n Best tau = {best_tau}")
|
||||
|
||||
# Compute blended predictions with best tau for full test set
|
||||
blend_w = hold_years / (hold_years + best_tau)
|
||||
log_blended = np.where(
|
||||
valid,
|
||||
(1 - blend_w) * log_index_pred + blend_w * log_hedonic,
|
||||
log_index_pred,
|
||||
)
|
||||
blended_eligible = np.exp(log_blended)
|
||||
|
||||
# Merge back: for non-eligible rows, use index prediction
|
||||
blended_all = test["predicted"].to_numpy().astype(np.float64).copy()
|
||||
eligible_indices = eligible_mask.arg_true()
|
||||
for i, idx in enumerate(eligible_indices):
|
||||
blended_all[idx] = blended_eligible[i]
|
||||
|
||||
test = test.with_columns(
|
||||
pl.Series("blended", blended_all, dtype=pl.Float64),
|
||||
)
|
||||
metrics["Blended"] = compute_metrics(actual, blended_all)
|
||||
|
||||
print_metrics_table(metrics)
|
||||
|
||||
# Save results
|
||||
result_cols = [
|
||||
"Postcode",
|
||||
"sector",
|
||||
"input_year",
|
||||
"input_price",
|
||||
"actual_year",
|
||||
"actual_price",
|
||||
"predicted",
|
||||
]
|
||||
if "blended" in test.columns:
|
||||
result_cols.append("blended")
|
||||
result = test.select(result_cols)
|
||||
|
||||
result.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
|
||||
print(f" {len(result):,} rows")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,414 +0,0 @@
|
|||
"""Augment wide.parquet with an estimated current price column.
|
||||
|
||||
Joins the precomputed repeat-sales price index (from price_index.py) with each
|
||||
property's last known sale to produce an inflation-adjusted current price estimate.
|
||||
Uses type-stratified index when available, falling back to "All" type.
|
||||
|
||||
Optionally applies renovation premiums from renovation_premium.py: for properties
|
||||
with post-sale renovation events, the estimated price is adjusted upward based on
|
||||
data-driven per-area premiums with time decay.
|
||||
|
||||
Modifies wide.parquet in-place, adding the "Estimated current price" column.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform._price_utils import (
|
||||
CURRENT_YEAR,
|
||||
sector_expr,
|
||||
type_group_expr,
|
||||
)
|
||||
|
||||
HALF_LIFE = 10.0
|
||||
DECAY_RATE = math.log(2) / HALF_LIFE
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Augment wide.parquet with estimated current prices"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Path to wide.parquet (modified in-place)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--index", type=Path, required=True, help="Path to price_index.parquet"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--renovation-premium",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Path to renovation_premium.parquet (optional)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hedonic-model",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Path to hedonic_model.json (optional)",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
print("Loading wide.parquet...")
|
||||
df = pl.read_parquet(args.input)
|
||||
print(f" {len(df):,} rows, {len(df.columns)} columns")
|
||||
|
||||
# Drop existing estimated columns if re-running
|
||||
for col in ["Estimated current price", "Est. price per sqm"]:
|
||||
if col in df.columns:
|
||||
df = df.drop(col)
|
||||
|
||||
# Derive helper columns for the join
|
||||
has_price = (
|
||||
pl.col("Last known price").is_not_null()
|
||||
& pl.col("Postcode").is_not_null()
|
||||
& pl.col("Date of last transaction").is_not_null()
|
||||
)
|
||||
|
||||
df = df.with_columns(
|
||||
sector_expr().alias("_sector"),
|
||||
pl.col("Date of last transaction").dt.year().alias("_sale_year"),
|
||||
type_group_expr().alias("_type_group"),
|
||||
)
|
||||
|
||||
index = pl.read_parquet(args.index)
|
||||
has_type_group = "type_group" in index.columns
|
||||
if has_type_group:
|
||||
print(
|
||||
f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
|
||||
f"{index['type_group'].n_unique()} type groups"
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors (unstratified)"
|
||||
)
|
||||
|
||||
print("\nApplying repeat-sales index...")
|
||||
|
||||
if has_type_group:
|
||||
idx_typed = index.filter(pl.col("type_group") != "All")
|
||||
idx_all = index.filter(pl.col("type_group") == "All")
|
||||
|
||||
# Join type-specific index at sale year
|
||||
df = df.join(
|
||||
idx_typed.select(
|
||||
"sector",
|
||||
"type_group",
|
||||
"year",
|
||||
pl.col("log_index").alias("log_idx_sale_typed"),
|
||||
),
|
||||
left_on=["_sector", "_type_group", "_sale_year"],
|
||||
right_on=["sector", "type_group", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join "All" index at sale year
|
||||
df = df.join(
|
||||
idx_all.select(
|
||||
"sector", "year", pl.col("log_index").alias("log_idx_sale_all")
|
||||
),
|
||||
left_on=["_sector", "_sale_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join type-specific index at current year
|
||||
df = df.join(
|
||||
idx_typed.filter(pl.col("year") == CURRENT_YEAR).select(
|
||||
"sector", "type_group", pl.col("log_index").alias("log_idx_cur_typed")
|
||||
),
|
||||
left_on=["_sector", "_type_group"],
|
||||
right_on=["sector", "type_group"],
|
||||
how="left",
|
||||
)
|
||||
# Join "All" index at current year
|
||||
df = df.join(
|
||||
idx_all.filter(pl.col("year") == CURRENT_YEAR).select(
|
||||
"sector", pl.col("log_index").alias("log_idx_cur_all")
|
||||
),
|
||||
left_on="_sector",
|
||||
right_on="sector",
|
||||
how="left",
|
||||
)
|
||||
|
||||
df = df.with_columns(
|
||||
pl.col("log_idx_sale_typed")
|
||||
.fill_null(pl.col("log_idx_sale_all"))
|
||||
.alias("_log_index_sale"),
|
||||
pl.col("log_idx_cur_typed")
|
||||
.fill_null(pl.col("log_idx_cur_all"))
|
||||
.alias("_log_index_current"),
|
||||
)
|
||||
else:
|
||||
df = df.join(
|
||||
index.select(
|
||||
"sector", "year", pl.col("log_index").alias("_log_index_sale")
|
||||
),
|
||||
left_on=["_sector", "_sale_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
index_current = index.filter(pl.col("year") == CURRENT_YEAR).select(
|
||||
"sector", pl.col("log_index").alias("_log_index_current")
|
||||
)
|
||||
df = df.join(index_current, left_on="_sector", right_on="sector", how="left")
|
||||
|
||||
# Compute estimate — only for rows with a known price
|
||||
df = df.with_columns(
|
||||
pl.when(has_price)
|
||||
.then(
|
||||
pl.col("Last known price").cast(pl.Float64)
|
||||
* (pl.col("_log_index_current") - pl.col("_log_index_sale")).exp()
|
||||
)
|
||||
.otherwise(pl.lit(None))
|
||||
.alias("Estimated current price"),
|
||||
)
|
||||
|
||||
n_adjusted = df.filter(has_price & pl.col("_log_index_sale").is_not_null()).height
|
||||
n_with_price = df.filter(has_price).height
|
||||
print(
|
||||
f" {n_adjusted:,} of {n_with_price:,} properties adjusted by index ({n_adjusted / max(n_with_price, 1) * 100:.1f}%)"
|
||||
)
|
||||
|
||||
# Apply hedonic blending if model provided
|
||||
if args.hedonic_model is not None:
|
||||
print("\nApplying hedonic blending...")
|
||||
with open(args.hedonic_model) as f:
|
||||
model = json.load(f)
|
||||
type_models = model["type_models"]
|
||||
tau = model.get("tau", 15.0)
|
||||
print(f" tau = {tau}, {len(type_models)} type models")
|
||||
|
||||
# Add type_group for per-type lookup
|
||||
df = df.with_columns(type_group_expr())
|
||||
hedonic_mask = (
|
||||
has_price
|
||||
& pl.col("Estimated current price").is_not_null()
|
||||
& pl.col("Total floor area (sqm)").is_not_null()
|
||||
& (pl.col("Total floor area (sqm)") > 0)
|
||||
& pl.col("type_group").is_not_null()
|
||||
)
|
||||
eligible = df.filter(hedonic_mask)
|
||||
|
||||
if len(eligible) > 0:
|
||||
log_fa = np.log(
|
||||
np.maximum(
|
||||
eligible["Total floor area (sqm)"].to_numpy().astype(np.float64),
|
||||
1.0,
|
||||
)
|
||||
)
|
||||
sectors = eligible["_sector"].to_list()
|
||||
types = eligible["type_group"].to_list()
|
||||
|
||||
# Per-type hedonic prediction
|
||||
log_hedonic = np.empty(len(eligible))
|
||||
for i in range(len(eligible)):
|
||||
tm = type_models.get(types[i])
|
||||
if tm is None:
|
||||
log_hedonic[i] = np.nan
|
||||
continue
|
||||
alpha = tm["sector_intercepts"].get(
|
||||
sectors[i], tm["national_intercept"]
|
||||
)
|
||||
log_hedonic[i] = tm["beta_fa"] * log_fa[i] + alpha
|
||||
|
||||
valid = np.isfinite(log_hedonic)
|
||||
|
||||
# Hold years and blend weight
|
||||
sale_years = eligible["_sale_year"].to_numpy().astype(np.float64)
|
||||
hold_years = np.maximum(CURRENT_YEAR - sale_years, 0.0)
|
||||
blend_w = hold_years / (hold_years + tau)
|
||||
|
||||
# Blend in log space
|
||||
log_index_est = np.log(
|
||||
eligible["Estimated current price"].to_numpy().astype(np.float64)
|
||||
)
|
||||
log_blended = np.where(
|
||||
valid,
|
||||
(1 - blend_w) * log_index_est + blend_w * log_hedonic,
|
||||
log_index_est,
|
||||
)
|
||||
blended_prices = np.exp(log_blended)
|
||||
|
||||
# Write back into df
|
||||
eligible_indices = df.select(hedonic_mask).to_series().arg_true()
|
||||
price_arr = df["Estimated current price"].to_numpy().astype(np.float64)
|
||||
for i, idx in enumerate(eligible_indices):
|
||||
price_arr[idx] = blended_prices[i]
|
||||
df = df.with_columns(
|
||||
pl.Series("Estimated current price", price_arr, dtype=pl.Float64),
|
||||
)
|
||||
|
||||
n_blended = int(valid.sum())
|
||||
avg_w = float(np.mean(blend_w[valid]))
|
||||
print(
|
||||
f" {n_blended:,} properties with hedonic blending (avg blend weight: {avg_w:.3f})"
|
||||
)
|
||||
else:
|
||||
print(" No eligible properties for hedonic blending")
|
||||
|
||||
# Apply renovation premiums if provided
|
||||
if args.renovation_premium is not None:
|
||||
print("\nApplying renovation premiums...")
|
||||
reno_prem = pl.read_parquet(args.renovation_premium)
|
||||
print(f" Loaded {len(reno_prem):,} premium rows")
|
||||
|
||||
# Find properties with post-sale renovation events
|
||||
has_reno = (
|
||||
pl.col("renovation_history").is_not_null()
|
||||
& (pl.col("renovation_history").list.len() > 0)
|
||||
& pl.col("Estimated current price").is_not_null()
|
||||
)
|
||||
|
||||
# Explode renovation events, filter to post-sale only
|
||||
reno_rows = (
|
||||
df.lazy()
|
||||
.filter(has_reno)
|
||||
.select("_sector", "_type_group", "_sale_year", "renovation_history")
|
||||
.with_row_index("_row_idx")
|
||||
.explode("renovation_history")
|
||||
.with_columns(
|
||||
pl.col("renovation_history").struct.field("year").alias("_event_year"),
|
||||
pl.col("renovation_history").struct.field("event").alias("_event_type"),
|
||||
)
|
||||
.filter(pl.col("_event_year") > pl.col("_sale_year"))
|
||||
.collect()
|
||||
)
|
||||
|
||||
if len(reno_rows) > 0:
|
||||
# Take most recent event per (row, event_type)
|
||||
latest = (
|
||||
reno_rows.lazy()
|
||||
.group_by("_row_idx", "_event_type", "_sector", "_type_group")
|
||||
.agg(pl.col("_event_year").max().alias("_event_year"))
|
||||
.collect()
|
||||
)
|
||||
|
||||
# Compute time-decayed premium
|
||||
latest = latest.with_columns(
|
||||
(-DECAY_RATE * (CURRENT_YEAR - pl.col("_event_year")).cast(pl.Float64))
|
||||
.exp()
|
||||
.alias("_decay"),
|
||||
)
|
||||
|
||||
# Join with renovation_premium.parquet — try typed first, fall back to "All"
|
||||
rp_typed = reno_prem.filter(pl.col("type_group") != "All")
|
||||
rp_all = reno_prem.filter(pl.col("type_group") == "All")
|
||||
|
||||
latest = (
|
||||
latest.join(
|
||||
rp_typed.select(
|
||||
"sector",
|
||||
"type_group",
|
||||
"event_type",
|
||||
pl.col("log_premium").alias("_lp_typed"),
|
||||
),
|
||||
left_on=["_sector", "_type_group", "_event_type"],
|
||||
right_on=["sector", "type_group", "event_type"],
|
||||
how="left",
|
||||
)
|
||||
.join(
|
||||
rp_all.select(
|
||||
"sector", "event_type", pl.col("log_premium").alias("_lp_all")
|
||||
),
|
||||
left_on=["_sector", "_event_type"],
|
||||
right_on=["sector", "event_type"],
|
||||
how="left",
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("_lp_typed")
|
||||
.fill_null(pl.col("_lp_all"))
|
||||
.fill_null(0.0)
|
||||
.alias("_log_premium"),
|
||||
)
|
||||
)
|
||||
|
||||
# Compute total decayed log premium per property
|
||||
per_property = (
|
||||
latest.lazy()
|
||||
.with_columns(
|
||||
(pl.col("_log_premium") * pl.col("_decay")).alias("_decayed_lp"),
|
||||
)
|
||||
.group_by("_row_idx")
|
||||
.agg(pl.col("_decayed_lp").sum().alias("_reno_log_premium"))
|
||||
.collect()
|
||||
)
|
||||
|
||||
# We need to map _row_idx back to the main df. Re-derive the row indices.
|
||||
# _row_idx was generated from filtered rows — we need the actual df row indices.
|
||||
reno_mask = df.select(has_reno).to_series()
|
||||
actual_indices = reno_mask.arg_true()
|
||||
|
||||
# Build a mapping: _row_idx -> actual df row
|
||||
idx_map = per_property.with_columns(
|
||||
pl.col("_row_idx")
|
||||
.map_elements(
|
||||
lambda i: int(actual_indices[i]),
|
||||
return_dtype=pl.UInt32,
|
||||
)
|
||||
.alias("_df_row"),
|
||||
)
|
||||
|
||||
# Create a full-length column of zeros, then fill in premium values
|
||||
reno_log_prem = [0.0] * len(df)
|
||||
for row in idx_map.iter_rows(named=True):
|
||||
reno_log_prem[row["_df_row"]] = row["_reno_log_premium"]
|
||||
|
||||
df = df.with_columns(
|
||||
pl.Series("_reno_log_premium", reno_log_prem, dtype=pl.Float64),
|
||||
)
|
||||
|
||||
# Apply: multiply estimated price by exp(reno_log_premium) where premium > 0
|
||||
df = df.with_columns(
|
||||
pl.when(pl.col("_reno_log_premium") != 0.0)
|
||||
.then(
|
||||
pl.col("Estimated current price")
|
||||
* pl.col("_reno_log_premium").exp()
|
||||
)
|
||||
.otherwise(pl.col("Estimated current price"))
|
||||
.alias("Estimated current price"),
|
||||
)
|
||||
|
||||
n_with_premium = idx_map.height
|
||||
avg_multiplier = math.exp(
|
||||
per_property["_reno_log_premium"]
|
||||
.filter(per_property["_reno_log_premium"] != 0.0)
|
||||
.mean()
|
||||
)
|
||||
print(f" {n_with_premium:,} properties with renovation premium applied")
|
||||
print(
|
||||
f" Average premium multiplier: {avg_multiplier:.3f} ({avg_multiplier - 1:.1%} uplift)"
|
||||
)
|
||||
else:
|
||||
print(" No properties with post-sale renovation events")
|
||||
|
||||
# Derive estimated price per sqm where both estimated price and floor area exist
|
||||
df = df.with_columns(
|
||||
(pl.col("Estimated current price") / pl.col("Total floor area (sqm)"))
|
||||
.round(0)
|
||||
.cast(pl.Int32)
|
||||
.alias("Est. price per sqm"),
|
||||
)
|
||||
|
||||
# Drop all temporary columns
|
||||
temp_cols = [c for c in df.columns if c.startswith("_") or c.startswith("log_idx_")]
|
||||
# Also drop hedonic-derived column if it was added
|
||||
if "type_group" in df.columns:
|
||||
temp_cols.append("type_group")
|
||||
df = df.drop(temp_cols)
|
||||
|
||||
df.write_parquet(args.input)
|
||||
size_mb = args.input.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
|
||||
print(
|
||||
f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,523 +0,0 @@
|
|||
"""Repeat-Sales Price Index (improved)
|
||||
|
||||
Builds a hierarchical repeat-sales price index with:
|
||||
1. Stratification by property type (Detached/Semi-Detached/Terraced/Flats)
|
||||
2. Robust regression (IRLS with Huber weights) instead of hard outlier cutoff
|
||||
3. National hedonic time-dummy model as ultimate shrinkage fallback
|
||||
4. Spatial smoothing for sparse sectors via KD-tree nearest neighbors
|
||||
|
||||
Output: price_index.parquet — sector × type_group × year → log_index
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
from scipy.sparse import csc_matrix
|
||||
from scipy.sparse.linalg import lsqr
|
||||
from scipy.spatial import KDTree
|
||||
from tqdm import tqdm
|
||||
|
||||
from pipeline.transform._price_utils import (
|
||||
CURRENT_YEAR,
|
||||
SHRINKAGE_K,
|
||||
TYPE_GROUPS,
|
||||
build_hedonic_features,
|
||||
extract_centroids,
|
||||
hierarchy_keys,
|
||||
sector_expr,
|
||||
type_group_expr,
|
||||
)
|
||||
|
||||
# --- Constants ---
|
||||
MIN_PAIRS = 5
|
||||
OUTLIER_THRESHOLD = 3.0 # hard pre-filter; Huber handles the rest
|
||||
HUBER_K = 1.345
|
||||
IRLS_ITERATIONS = 5
|
||||
SPATIAL_NEIGHBORS = 5
|
||||
SPATIAL_BLEND_K = 30
|
||||
|
||||
|
||||
# --- Pair extraction ---
|
||||
|
||||
|
||||
def extract_pairs(input_path: Path) -> pl.DataFrame:
|
||||
print("Extracting repeat-sale pairs...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select("Postcode", "historical_prices", "Property type")
|
||||
.filter(
|
||||
pl.col("Postcode").is_not_null(),
|
||||
pl.col("historical_prices").list.len() >= 2,
|
||||
)
|
||||
.with_columns(sector_expr(), type_group_expr())
|
||||
.collect()
|
||||
)
|
||||
print(f" {len(df):,} properties with 2+ transactions")
|
||||
|
||||
pairs = (
|
||||
df.lazy()
|
||||
.with_columns(
|
||||
pl.col("historical_prices")
|
||||
.list.slice(0, pl.col("historical_prices").list.len() - 1)
|
||||
.alias("from_txn"),
|
||||
pl.col("historical_prices").list.slice(1).alias("to_txn"),
|
||||
)
|
||||
.explode("from_txn", "to_txn")
|
||||
.with_columns(
|
||||
pl.col("from_txn").struct.field("year").alias("year1"),
|
||||
pl.col("from_txn").struct.field("price").alias("price1"),
|
||||
pl.col("to_txn").struct.field("year").alias("year2"),
|
||||
pl.col("to_txn").struct.field("price").alias("price2"),
|
||||
)
|
||||
.select("sector", "type_group", "year1", "price1", "year2", "price2")
|
||||
.filter(
|
||||
pl.col("price1") > 0,
|
||||
pl.col("price2") > 0,
|
||||
pl.col("year2") > pl.col("year1"),
|
||||
)
|
||||
.with_columns(
|
||||
(pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64))
|
||||
.log()
|
||||
.alias("log_ratio"),
|
||||
(1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias(
|
||||
"weight"
|
||||
),
|
||||
)
|
||||
.filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
|
||||
.collect()
|
||||
)
|
||||
|
||||
# Add hierarchy columns
|
||||
pairs = pairs.with_columns(
|
||||
pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
|
||||
).with_columns(
|
||||
pl.col("district").str.replace(r"\d.*$", "").alias("area"),
|
||||
)
|
||||
|
||||
print(f" {len(pairs):,} pairs extracted")
|
||||
return pairs
|
||||
|
||||
|
||||
# --- Robust IRLS solver ---
|
||||
|
||||
|
||||
def solve_robust_index(
|
||||
years1: np.ndarray,
|
||||
years2: np.ndarray,
|
||||
log_ratios: np.ndarray,
|
||||
base_weights: np.ndarray,
|
||||
) -> dict[int, float]:
|
||||
"""IRLS Huber M-estimation for the Case-Shiller repeat-sales model."""
|
||||
n = len(years1)
|
||||
if n < MIN_PAIRS:
|
||||
return {}
|
||||
|
||||
all_years = np.union1d(years1, years2)
|
||||
min_year = int(all_years.min())
|
||||
|
||||
col = 0
|
||||
year_to_col = {}
|
||||
for y in all_years:
|
||||
iy = int(y)
|
||||
if iy != min_year:
|
||||
year_to_col[iy] = col
|
||||
col += 1
|
||||
n_cols = len(year_to_col)
|
||||
if n_cols == 0:
|
||||
return {}
|
||||
|
||||
# Vectorized column index mapping
|
||||
col2 = np.full(n, -1, dtype=np.int32)
|
||||
col1 = np.full(n, -1, dtype=np.int32)
|
||||
for year, c in year_to_col.items():
|
||||
col2[years2 == year] = c
|
||||
col1[years1 == year] = c
|
||||
|
||||
# Sparse matrix structure (fixed across iterations)
|
||||
mask2 = col2 >= 0
|
||||
mask1 = col1 >= 0
|
||||
rows_arr = np.concatenate([np.where(mask2)[0], np.where(mask1)[0]])
|
||||
cols_arr = np.concatenate([col2[mask2], col1[mask1]])
|
||||
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
|
||||
|
||||
weights = base_weights.copy()
|
||||
|
||||
for _ in range(IRLS_ITERATIONS):
|
||||
data = signs_arr * weights[rows_arr]
|
||||
A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
|
||||
b = log_ratios * weights
|
||||
betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]
|
||||
|
||||
# Residuals
|
||||
predicted = np.zeros(n)
|
||||
predicted[mask2] += betas[col2[mask2]]
|
||||
predicted[mask1] -= betas[col1[mask1]]
|
||||
residuals = log_ratios - predicted
|
||||
|
||||
# Huber reweighting
|
||||
abs_r = np.abs(residuals)
|
||||
huber_w = np.where(abs_r <= HUBER_K, 1.0, HUBER_K / np.maximum(abs_r, 1e-10))
|
||||
weights = base_weights * huber_w
|
||||
|
||||
index = {min_year: 0.0}
|
||||
for year, c in year_to_col.items():
|
||||
index[year] = float(betas[c])
|
||||
return index
|
||||
|
||||
|
||||
def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
|
||||
"""Solve robust indices for each group. Returns (indices, n_pairs) dicts."""
|
||||
groups = pairs.group_by(group_col).agg(
|
||||
pl.col("year1"),
|
||||
pl.col("year2"),
|
||||
pl.col("log_ratio"),
|
||||
pl.col("weight"),
|
||||
)
|
||||
indices = {}
|
||||
n_pairs = {}
|
||||
for row in tqdm(
|
||||
groups.iter_rows(named=True), total=len(groups), desc=f" {group_col}"
|
||||
):
|
||||
key = row[group_col]
|
||||
y1 = np.array(row["year1"], dtype=np.int32)
|
||||
y2 = np.array(row["year2"], dtype=np.int32)
|
||||
lr = np.array(row["log_ratio"], dtype=np.float64)
|
||||
w = np.array(row["weight"], dtype=np.float64)
|
||||
idx = solve_robust_index(y1, y2, lr, w)
|
||||
if idx:
|
||||
indices[key] = idx
|
||||
n_pairs[key] = len(y1)
|
||||
return indices, n_pairs
|
||||
|
||||
|
||||
# --- Hedonic model ---
|
||||
|
||||
|
||||
def compute_hedonic_index(
|
||||
input_path: Path, min_year: int, max_year: int
|
||||
) -> dict[int, float]:
|
||||
"""Two-step hedonic index: regress log(price) on features, average residual by year."""
|
||||
print("Computing hedonic index...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select(
|
||||
"Last known price",
|
||||
"Date of last transaction",
|
||||
"Property type",
|
||||
"Total floor area (sqm)",
|
||||
)
|
||||
.filter(
|
||||
pl.col("Last known price").is_not_null(),
|
||||
pl.col("Total floor area (sqm)").is_not_null(),
|
||||
pl.col("Total floor area (sqm)") > 0,
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("Date of last transaction").dt.year().alias("sale_year"),
|
||||
type_group_expr(),
|
||||
)
|
||||
.filter(
|
||||
pl.col("type_group").is_not_null(),
|
||||
pl.col("sale_year").is_not_null(),
|
||||
pl.col("sale_year") >= min_year,
|
||||
pl.col("sale_year") <= max_year,
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
print(f" {len(df):,} complete cases for hedonic model")
|
||||
|
||||
# Target
|
||||
log_price = np.log(df["Last known price"].to_numpy().astype(np.float64))
|
||||
sale_years = df["sale_year"].to_numpy()
|
||||
|
||||
# Build feature matrix (18 hedonic features + intercept)
|
||||
X = build_hedonic_features(df)
|
||||
F = np.hstack([X, np.ones((len(df), 1), dtype=np.float32)])
|
||||
print(f" Feature matrix: {F.shape[0]:,} × {F.shape[1]}")
|
||||
|
||||
# Step 1: regress log(price) on features → quality score
|
||||
betas = np.linalg.lstsq(F.astype(np.float64), log_price, rcond=None)[0]
|
||||
quality_score = F.astype(np.float64) @ betas
|
||||
residuals = log_price - quality_score
|
||||
|
||||
# Step 2: average residual by year = hedonic index
|
||||
hedonic = {}
|
||||
for y in range(min_year, max_year + 1):
|
||||
mask = sale_years == y
|
||||
if mask.sum() > 0:
|
||||
hedonic[y] = float(np.mean(residuals[mask]))
|
||||
|
||||
# Normalize: min_year = 0
|
||||
base = hedonic.get(min_year, 0.0)
|
||||
for y in hedonic:
|
||||
hedonic[y] -= base
|
||||
|
||||
print(
|
||||
f" Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}"
|
||||
)
|
||||
return hedonic
|
||||
|
||||
|
||||
# --- Shrinkage ---
|
||||
|
||||
|
||||
def shrink_index(raw: dict, parent: dict, n_pairs: int, k: int = SHRINKAGE_K) -> dict:
|
||||
w = n_pairs / (n_pairs + k)
|
||||
result = {}
|
||||
for y in set(raw) | set(parent):
|
||||
r = raw.get(y, parent.get(y, 0.0))
|
||||
p = parent.get(y, raw.get(y, 0.0))
|
||||
result[y] = w * r + (1 - w) * p
|
||||
return result
|
||||
|
||||
|
||||
def apply_shrinkage(
|
||||
sector_idx,
|
||||
sector_n,
|
||||
district_idx,
|
||||
district_n,
|
||||
area_idx,
|
||||
area_n,
|
||||
national_idx,
|
||||
national_n,
|
||||
hedonic_idx,
|
||||
all_sectors,
|
||||
sector_to_dist,
|
||||
dist_to_area,
|
||||
):
|
||||
"""Top-down hierarchical shrinkage: national→hedonic, area→national, etc."""
|
||||
# National → hedonic
|
||||
national_shrunk = shrink_index(national_idx, hedonic_idx, national_n)
|
||||
|
||||
# Area → national
|
||||
area_shrunk = {}
|
||||
for area, idx in area_idx.items():
|
||||
area_shrunk[area] = shrink_index(idx, national_shrunk, area_n[area])
|
||||
|
||||
# District → area
|
||||
district_shrunk = {}
|
||||
for dist, idx in district_idx.items():
|
||||
a = dist_to_area.get(dist, "")
|
||||
parent = area_shrunk.get(a, national_shrunk)
|
||||
district_shrunk[dist] = shrink_index(idx, parent, district_n[dist])
|
||||
|
||||
# Sector → district
|
||||
sector_shrunk = {}
|
||||
for sec, idx in sector_idx.items():
|
||||
d = sector_to_dist.get(sec, "")
|
||||
parent = district_shrunk.get(d, national_shrunk)
|
||||
sector_shrunk[sec] = shrink_index(idx, parent, sector_n[sec])
|
||||
|
||||
# Fill sectors without their own index
|
||||
for sec in all_sectors:
|
||||
if sec not in sector_shrunk:
|
||||
d = sector_to_dist.get(sec, "")
|
||||
a = dist_to_area.get(d, "")
|
||||
sector_shrunk[sec] = district_shrunk.get(
|
||||
d, area_shrunk.get(a, national_shrunk)
|
||||
)
|
||||
|
||||
return sector_shrunk
|
||||
|
||||
|
||||
# --- Spatial smoothing ---
|
||||
|
||||
|
||||
def spatial_smooth(
|
||||
sector_indices: dict,
|
||||
centroids: dict,
|
||||
n_pairs_map: dict,
|
||||
) -> dict:
|
||||
"""Blend sparse sector indices with K nearest neighbors."""
|
||||
# Build coordinate arrays for sectors with centroids
|
||||
sectors_with_coords = [s for s in sector_indices if s in centroids]
|
||||
if len(sectors_with_coords) < SPATIAL_NEIGHBORS + 1:
|
||||
return sector_indices
|
||||
|
||||
coords = np.array([centroids[s] for s in sectors_with_coords])
|
||||
# Scale longitude by cos(mean_lat) for approximate Euclidean distance
|
||||
mean_lat = np.mean(coords[:, 0])
|
||||
scale = np.cos(np.radians(mean_lat))
|
||||
scaled_coords = np.column_stack([coords[:, 0], coords[:, 1] * scale])
|
||||
tree = KDTree(scaled_coords)
|
||||
|
||||
result = dict(sector_indices)
|
||||
for i, sec in enumerate(sectors_with_coords):
|
||||
n = n_pairs_map.get(sec, 0)
|
||||
self_w = n / (n + SPATIAL_BLEND_K)
|
||||
if self_w > 0.95:
|
||||
continue # enough data, skip smoothing
|
||||
|
||||
dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1)
|
||||
# Skip self (index 0, distance ~0)
|
||||
neighbor_dists = dists[1:]
|
||||
neighbor_idxs = idxs[1:]
|
||||
|
||||
inv_dists = []
|
||||
neighbor_indices = []
|
||||
for d, j in zip(neighbor_dists, neighbor_idxs):
|
||||
ns = sectors_with_coords[j]
|
||||
if d > 0 and ns in sector_indices:
|
||||
inv_dists.append(1.0 / d)
|
||||
neighbor_indices.append(sector_indices[ns])
|
||||
|
||||
if not neighbor_indices:
|
||||
continue
|
||||
|
||||
total_inv = sum(inv_dists)
|
||||
nbr_w = 1.0 - self_w
|
||||
ws = [iw / total_inv * nbr_w for iw in inv_dists]
|
||||
|
||||
blended = {}
|
||||
all_years = set(sector_indices[sec])
|
||||
for ni in neighbor_indices:
|
||||
all_years |= set(ni)
|
||||
for y in all_years:
|
||||
val = self_w * sector_indices[sec].get(y, 0.0)
|
||||
for ni, w in zip(neighbor_indices, ws):
|
||||
val += w * ni.get(y, 0.0)
|
||||
blended[y] = val
|
||||
result[sec] = blended
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# --- Forward fill ---
|
||||
|
||||
|
||||
def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
|
||||
filled = {}
|
||||
last = 0.0
|
||||
for y in range(min_year, max_year + 1):
|
||||
if y in index:
|
||||
last = index[y]
|
||||
filled[y] = last
|
||||
return filled
|
||||
|
||||
|
||||
# --- Main ---
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Build improved repeat-sales price index"
|
||||
)
|
||||
parser.add_argument("--input", type=Path, required=True)
|
||||
parser.add_argument("--output", type=Path, required=True)
|
||||
args = parser.parse_args()
|
||||
|
||||
pairs = extract_pairs(args.input)
|
||||
centroids = extract_centroids(args.input)
|
||||
|
||||
min_year = int(pairs["year1"].min())
|
||||
max_year = max(int(pairs["year2"].max()), CURRENT_YEAR)
|
||||
|
||||
hedonic_idx = compute_hedonic_index(args.input, min_year, max_year)
|
||||
|
||||
# Precompute hierarchy
|
||||
all_sectors = pairs["sector"].unique().to_list()
|
||||
sector_to_dist = {}
|
||||
dist_to_area = {}
|
||||
for s in all_sectors:
|
||||
d, a = hierarchy_keys(s)
|
||||
sector_to_dist[s] = d
|
||||
dist_to_area[d] = a
|
||||
|
||||
# Process each type group + "All"
|
||||
all_type_groups = ["All"] + TYPE_GROUPS
|
||||
final = {} # {type_group: {sector: {year: log_index}}}
|
||||
final_n = {} # {type_group: {sector: n_pairs}}
|
||||
|
||||
for tg in all_type_groups:
|
||||
print(f"\n--- {tg} ---")
|
||||
typed = pairs if tg == "All" else pairs.filter(pl.col("type_group") == tg)
|
||||
if len(typed) < MIN_PAIRS:
|
||||
print(f" Skipping (only {len(typed)} pairs)")
|
||||
final[tg] = {s: dict(hedonic_idx) for s in all_sectors}
|
||||
final_n[tg] = {s: 0 for s in all_sectors}
|
||||
continue
|
||||
|
||||
print(f" {len(typed):,} pairs")
|
||||
|
||||
# National
|
||||
np_arrs = typed.select("year1", "year2", "log_ratio", "weight")
|
||||
national_idx = solve_robust_index(
|
||||
np_arrs["year1"].to_numpy(),
|
||||
np_arrs["year2"].to_numpy(),
|
||||
np_arrs["log_ratio"].to_numpy(),
|
||||
np_arrs["weight"].to_numpy(),
|
||||
)
|
||||
national_n = len(typed)
|
||||
print(f" National: {len(national_idx)} years")
|
||||
|
||||
# Area, district, sector
|
||||
print(" Computing per-level indices:")
|
||||
area_idx, area_n = compute_indices_for_level(typed, "area")
|
||||
district_idx, district_n = compute_indices_for_level(typed, "district")
|
||||
sector_idx, sector_n = compute_indices_for_level(typed, "sector")
|
||||
print(
|
||||
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
|
||||
)
|
||||
|
||||
# Shrinkage
|
||||
print(" Applying shrinkage...")
|
||||
sector_shrunk = apply_shrinkage(
|
||||
sector_idx,
|
||||
sector_n,
|
||||
district_idx,
|
||||
district_n,
|
||||
area_idx,
|
||||
area_n,
|
||||
national_idx,
|
||||
national_n,
|
||||
hedonic_idx,
|
||||
all_sectors,
|
||||
sector_to_dist,
|
||||
dist_to_area,
|
||||
)
|
||||
|
||||
# Spatial smoothing
|
||||
print(" Spatial smoothing...")
|
||||
sector_smoothed = spatial_smooth(sector_shrunk, centroids, sector_n)
|
||||
|
||||
# Forward fill
|
||||
for sec in all_sectors:
|
||||
sector_smoothed[sec] = forward_fill(
|
||||
sector_smoothed.get(sec, hedonic_idx), min_year, max_year
|
||||
)
|
||||
|
||||
final[tg] = sector_smoothed
|
||||
final_n[tg] = sector_n
|
||||
|
||||
# Assemble output
|
||||
print("\nAssembling output...")
|
||||
rows = []
|
||||
for tg in all_type_groups:
|
||||
for sec in all_sectors:
|
||||
n = final_n[tg].get(sec, 0)
|
||||
for year, log_idx in final[tg][sec].items():
|
||||
rows.append((sec, tg, year, log_idx, n))
|
||||
|
||||
result = pl.DataFrame(
|
||||
rows,
|
||||
schema={
|
||||
"sector": pl.String,
|
||||
"type_group": pl.String,
|
||||
"year": pl.Int32,
|
||||
"log_index": pl.Float64,
|
||||
"n_pairs": pl.Int64,
|
||||
},
|
||||
orient="row",
|
||||
).sort("type_group", "sector", "year")
|
||||
|
||||
result.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
|
||||
print(
|
||||
f" {result['sector'].n_unique():,} sectors × {len(all_type_groups)} types × {max_year - min_year + 1} years = {len(result):,} rows"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,572 +0,0 @@
|
|||
"""Estimate per-area renovation premiums from repeat-sale residuals.
|
||||
|
||||
For each repeat-sale pair, computes the residual after removing the price-index
|
||||
predicted return. Pairs where renovation events occurred between sales should have
|
||||
systematically higher residuals. A WLS regression estimates the log-premium per
|
||||
event type, with hierarchical shrinkage and spatial smoothing.
|
||||
|
||||
Output: renovation_premium.parquet — sector × type_group × event_type → log_premium
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import math
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
from scipy.spatial import KDTree
|
||||
|
||||
from pipeline.transform._price_utils import (
|
||||
SHRINKAGE_K,
|
||||
TYPE_GROUPS,
|
||||
extract_centroids,
|
||||
hierarchy_keys,
|
||||
sector_expr,
|
||||
type_group_expr,
|
||||
)
|
||||
|
||||
HALF_LIFE = 10.0
|
||||
DECAY_RATE = math.log(2) / HALF_LIFE
|
||||
OUTLIER_THRESHOLD = 3.0
|
||||
MIN_PAIRS = 10
|
||||
SPATIAL_NEIGHBORS = 5
|
||||
SPATIAL_BLEND_K = 30
|
||||
EVENT_TYPES = ["Extension", "Renovation", "Remodeling"]
|
||||
|
||||
|
||||
def extract_pairs_with_events(input_path: Path, index_path: Path) -> pl.DataFrame:
|
||||
"""Extract repeat-sale pairs with renovation events and index residuals."""
|
||||
print("Extracting repeat-sale pairs with renovation events...")
|
||||
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select("Postcode", "historical_prices", "Property type", "renovation_history")
|
||||
.filter(
|
||||
pl.col("Postcode").is_not_null(),
|
||||
pl.col("historical_prices").list.len() >= 2,
|
||||
)
|
||||
.with_columns(sector_expr(), type_group_expr())
|
||||
.collect()
|
||||
)
|
||||
print(f" {len(df):,} properties with 2+ transactions")
|
||||
|
||||
# Build consecutive pairs
|
||||
pairs = (
|
||||
df.lazy()
|
||||
.with_columns(
|
||||
pl.col("historical_prices")
|
||||
.list.slice(0, pl.col("historical_prices").list.len() - 1)
|
||||
.alias("from_txn"),
|
||||
pl.col("historical_prices").list.slice(1).alias("to_txn"),
|
||||
)
|
||||
.explode("from_txn", "to_txn")
|
||||
.with_columns(
|
||||
pl.col("from_txn").struct.field("year").alias("year1"),
|
||||
pl.col("from_txn").struct.field("price").alias("price1"),
|
||||
pl.col("to_txn").struct.field("year").alias("year2"),
|
||||
pl.col("to_txn").struct.field("price").alias("price2"),
|
||||
)
|
||||
.select(
|
||||
"sector",
|
||||
"type_group",
|
||||
"year1",
|
||||
"price1",
|
||||
"year2",
|
||||
"price2",
|
||||
"renovation_history",
|
||||
)
|
||||
.filter(
|
||||
pl.col("price1") > 0,
|
||||
pl.col("price2") > 0,
|
||||
pl.col("year2") > pl.col("year1"),
|
||||
)
|
||||
.with_columns(
|
||||
(pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64))
|
||||
.log()
|
||||
.alias("log_ratio"),
|
||||
)
|
||||
.filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
|
||||
.collect()
|
||||
)
|
||||
print(f" {len(pairs):,} repeat-sale pairs")
|
||||
|
||||
# Join price index to compute residuals
|
||||
index = pl.read_parquet(index_path)
|
||||
has_type_group = "type_group" in index.columns
|
||||
|
||||
if has_type_group:
|
||||
idx_typed = index.filter(pl.col("type_group") != "All")
|
||||
idx_all = index.filter(pl.col("type_group") == "All")
|
||||
|
||||
# Join at year1
|
||||
pairs = pairs.join(
|
||||
idx_typed.select(
|
||||
"sector", "type_group", "year", pl.col("log_index").alias("li1_typed")
|
||||
),
|
||||
left_on=["sector", "type_group", "year1"],
|
||||
right_on=["sector", "type_group", "year"],
|
||||
how="left",
|
||||
).join(
|
||||
idx_all.select("sector", "year", pl.col("log_index").alias("li1_all")),
|
||||
left_on=["sector", "year1"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join at year2
|
||||
pairs = pairs.join(
|
||||
idx_typed.select(
|
||||
"sector", "type_group", "year", pl.col("log_index").alias("li2_typed")
|
||||
),
|
||||
left_on=["sector", "type_group", "year2"],
|
||||
right_on=["sector", "type_group", "year"],
|
||||
how="left",
|
||||
).join(
|
||||
idx_all.select("sector", "year", pl.col("log_index").alias("li2_all")),
|
||||
left_on=["sector", "year2"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
pairs = pairs.with_columns(
|
||||
(pl.col("li1_typed").fill_null(pl.col("li1_all"))).alias("_li1"),
|
||||
(pl.col("li2_typed").fill_null(pl.col("li2_all"))).alias("_li2"),
|
||||
)
|
||||
else:
|
||||
pairs = pairs.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("_li1")),
|
||||
left_on=["sector", "year1"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
).join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("_li2")),
|
||||
left_on=["sector", "year2"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
# Compute residual = log_ratio - (index2 - index1)
|
||||
pairs = pairs.with_columns(
|
||||
(
|
||||
pl.col("log_ratio")
|
||||
- (pl.col("_li2").fill_null(0.0) - pl.col("_li1").fill_null(0.0))
|
||||
).alias("residual"),
|
||||
(1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias(
|
||||
"weight"
|
||||
),
|
||||
)
|
||||
|
||||
# For each pair, compute time-decayed renovation indicators
|
||||
# Use row index for unique identification (composite keys aren't unique per pair)
|
||||
pairs = pairs.with_row_index("_pair_idx")
|
||||
|
||||
for et in EVENT_TYPES:
|
||||
col_name = f"has_{et.lower()}"
|
||||
pairs = pairs.with_columns(pl.lit(0.0).alias(col_name))
|
||||
|
||||
# Process properties that have renovation history
|
||||
has_reno = pairs.filter(
|
||||
pl.col("renovation_history").is_not_null()
|
||||
& (pl.col("renovation_history").list.len() > 0)
|
||||
)
|
||||
|
||||
if len(has_reno) > 0:
|
||||
reno_exploded = (
|
||||
has_reno.select("_pair_idx", "year1", "year2", "renovation_history")
|
||||
.explode("renovation_history")
|
||||
.with_columns(
|
||||
pl.col("renovation_history").struct.field("year").alias("event_year"),
|
||||
pl.col("renovation_history").struct.field("event").alias("event_type"),
|
||||
)
|
||||
# Only events between the two sales
|
||||
.filter(
|
||||
(pl.col("event_year") > pl.col("year1"))
|
||||
& (pl.col("event_year") <= pl.col("year2"))
|
||||
)
|
||||
)
|
||||
|
||||
if len(reno_exploded) > 0:
|
||||
# For each pair + event type, take the most recent event
|
||||
latest_events = reno_exploded.group_by(
|
||||
"_pair_idx", "event_type", "year2"
|
||||
).agg(pl.col("event_year").max().alias("latest_event_year"))
|
||||
|
||||
# Compute time-decayed indicator: exp(-decay_rate * (year2 - event_year))
|
||||
latest_events = latest_events.with_columns(
|
||||
(
|
||||
-DECAY_RATE
|
||||
* (pl.col("year2") - pl.col("latest_event_year")).cast(pl.Float64)
|
||||
)
|
||||
.exp()
|
||||
.alias("decayed_indicator"),
|
||||
)
|
||||
|
||||
# Pivot to wide format using _pair_idx for unique join
|
||||
for et in EVENT_TYPES:
|
||||
et_data = latest_events.filter(pl.col("event_type") == et)
|
||||
if len(et_data) > 0:
|
||||
col_name = f"has_{et.lower()}"
|
||||
pairs = (
|
||||
pairs.join(
|
||||
et_data.select(
|
||||
"_pair_idx",
|
||||
pl.col("decayed_indicator").alias(f"_{col_name}"),
|
||||
),
|
||||
on="_pair_idx",
|
||||
how="left",
|
||||
)
|
||||
.with_columns(
|
||||
pl.col(f"_{col_name}").fill_null(0.0).alias(col_name),
|
||||
)
|
||||
.drop(f"_{col_name}")
|
||||
)
|
||||
|
||||
pairs = pairs.drop("_pair_idx")
|
||||
|
||||
# Add hierarchy columns
|
||||
pairs = pairs.with_columns(
|
||||
pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
|
||||
).with_columns(
|
||||
pl.col("district").str.replace(r"\d.*$", "").alias("area"),
|
||||
)
|
||||
|
||||
# Count reno pairs
|
||||
reno_mask = (
|
||||
(pl.col("has_extension") > 0)
|
||||
| (pl.col("has_renovation") > 0)
|
||||
| (pl.col("has_remodeling") > 0)
|
||||
)
|
||||
n_reno = pairs.filter(reno_mask).height
|
||||
print(
|
||||
f" {n_reno:,} pairs with renovation events ({n_reno / len(pairs) * 100:.1f}%)"
|
||||
)
|
||||
|
||||
# Drop temporary columns from index join + renovation_history (no longer needed)
|
||||
temp_cols = [
|
||||
c
|
||||
for c in pairs.columns
|
||||
if c.startswith("_li") or c.startswith("li1_") or c.startswith("li2_")
|
||||
]
|
||||
pairs = pairs.drop(temp_cols + ["renovation_history"])
|
||||
|
||||
return pairs
|
||||
|
||||
|
||||
def wls_regression(
|
||||
residuals: np.ndarray,
|
||||
weights: np.ndarray,
|
||||
X: np.ndarray,
|
||||
) -> np.ndarray:
|
||||
"""Weighted least squares: residual ~ X (with intercept column in X).
|
||||
|
||||
Uses sqrt(weights) scaling to avoid building a full N×N diagonal matrix.
|
||||
"""
|
||||
sqrt_w = np.sqrt(weights)[:, np.newaxis]
|
||||
Xw = X * sqrt_w
|
||||
yw = residuals * sqrt_w.ravel()
|
||||
try:
|
||||
betas = np.linalg.lstsq(Xw, yw, rcond=None)[0]
|
||||
except np.linalg.LinAlgError:
|
||||
betas = np.zeros(X.shape[1])
|
||||
return betas
|
||||
|
||||
|
||||
def compute_premiums_for_group(df: pl.DataFrame) -> dict[str, float]:
|
||||
"""Run WLS regression for a group, return {event_type: log_premium}."""
|
||||
n = len(df)
|
||||
if n < MIN_PAIRS:
|
||||
return {}
|
||||
|
||||
residuals = df["residual"].to_numpy().astype(np.float64)
|
||||
weights = df["weight"].to_numpy().astype(np.float64)
|
||||
|
||||
# Build design matrix: intercept + 3 event indicators
|
||||
X = np.column_stack(
|
||||
[
|
||||
np.ones(n),
|
||||
df["has_extension"].to_numpy().astype(np.float64),
|
||||
df["has_renovation"].to_numpy().astype(np.float64),
|
||||
df["has_remodeling"].to_numpy().astype(np.float64),
|
||||
]
|
||||
)
|
||||
|
||||
# Check if we have any renovation pairs in this group
|
||||
reno_sum = X[:, 1:].sum()
|
||||
if reno_sum < 1.0:
|
||||
return {}
|
||||
|
||||
betas = wls_regression(residuals, weights, X)
|
||||
# betas[0] is intercept, betas[1:4] are the premiums
|
||||
return {
|
||||
"Extension": float(betas[1]),
|
||||
"Renovation": float(betas[2]),
|
||||
"Remodeling": float(betas[3]),
|
||||
}
|
||||
|
||||
|
||||
def compute_premiums_for_level(
|
||||
pairs: pl.DataFrame, group_col: str
|
||||
) -> tuple[dict, dict]:
|
||||
"""Compute premiums per group at a given hierarchy level.
|
||||
|
||||
Returns (premiums, n_reno_pairs) dicts keyed by group value.
|
||||
premiums[key] = {event_type: log_premium}
|
||||
"""
|
||||
groups = pairs.group_by(group_col)
|
||||
premiums = {}
|
||||
n_reno_pairs = {}
|
||||
for key, group_df in groups:
|
||||
key_val = key[0]
|
||||
result = compute_premiums_for_group(group_df)
|
||||
if result:
|
||||
premiums[key_val] = result
|
||||
# Count pairs with any reno indicator
|
||||
reno_mask = (
|
||||
(group_df["has_extension"].to_numpy() > 0)
|
||||
| (group_df["has_renovation"].to_numpy() > 0)
|
||||
| (group_df["has_remodeling"].to_numpy() > 0)
|
||||
)
|
||||
n_reno_pairs[key_val] = int(reno_mask.sum())
|
||||
return premiums, n_reno_pairs
|
||||
|
||||
|
||||
def shrink_premium(
|
||||
raw: dict[str, float], parent: dict[str, float], n: int
|
||||
) -> dict[str, float]:
|
||||
"""Shrink raw premiums toward parent level."""
|
||||
w = n / (n + SHRINKAGE_K)
|
||||
result = {}
|
||||
for et in EVENT_TYPES:
|
||||
r = raw.get(et, parent.get(et, 0.0))
|
||||
p = parent.get(et, raw.get(et, 0.0))
|
||||
result[et] = w * r + (1 - w) * p
|
||||
return result
|
||||
|
||||
|
||||
def apply_shrinkage(
|
||||
sector_prem,
|
||||
sector_n,
|
||||
district_prem,
|
||||
district_n,
|
||||
area_prem,
|
||||
area_n,
|
||||
national_prem,
|
||||
national_n,
|
||||
all_sectors,
|
||||
sector_to_dist,
|
||||
dist_to_area,
|
||||
):
|
||||
"""Top-down hierarchical shrinkage for premiums."""
|
||||
# Area -> national
|
||||
area_shrunk = {}
|
||||
for area, prem in area_prem.items():
|
||||
area_shrunk[area] = shrink_premium(prem, national_prem, area_n.get(area, 0))
|
||||
|
||||
# District -> area
|
||||
district_shrunk = {}
|
||||
for dist, prem in district_prem.items():
|
||||
a = dist_to_area.get(dist, "")
|
||||
parent = area_shrunk.get(a, national_prem)
|
||||
district_shrunk[dist] = shrink_premium(prem, parent, district_n.get(dist, 0))
|
||||
|
||||
# Sector -> district
|
||||
sector_shrunk = {}
|
||||
for sec, prem in sector_prem.items():
|
||||
d = sector_to_dist.get(sec, "")
|
||||
parent = district_shrunk.get(d, national_prem)
|
||||
sector_shrunk[sec] = shrink_premium(prem, parent, sector_n.get(sec, 0))
|
||||
|
||||
# Fill missing sectors
|
||||
for sec in all_sectors:
|
||||
if sec not in sector_shrunk:
|
||||
d = sector_to_dist.get(sec, "")
|
||||
a = dist_to_area.get(d, "")
|
||||
sector_shrunk[sec] = district_shrunk.get(
|
||||
d, area_shrunk.get(a, national_prem)
|
||||
)
|
||||
|
||||
return sector_shrunk
|
||||
|
||||
|
||||
def spatial_smooth(
|
||||
sector_premiums: dict[str, dict[str, float]],
|
||||
centroids: dict[str, tuple[float, float]],
|
||||
n_reno_map: dict[str, int],
|
||||
) -> dict[str, dict[str, float]]:
|
||||
"""Blend sparse sector premiums with K nearest neighbors."""
|
||||
sectors_with_coords = [s for s in sector_premiums if s in centroids]
|
||||
if len(sectors_with_coords) < SPATIAL_NEIGHBORS + 1:
|
||||
return sector_premiums
|
||||
|
||||
coords = np.array([centroids[s] for s in sectors_with_coords])
|
||||
mean_lat = np.mean(coords[:, 0])
|
||||
scale = np.cos(np.radians(mean_lat))
|
||||
scaled_coords = np.column_stack([coords[:, 0], coords[:, 1] * scale])
|
||||
tree = KDTree(scaled_coords)
|
||||
|
||||
result = dict(sector_premiums)
|
||||
for i, sec in enumerate(sectors_with_coords):
|
||||
n = n_reno_map.get(sec, 0)
|
||||
self_w = n / (n + SPATIAL_BLEND_K)
|
||||
if self_w > 0.95:
|
||||
continue
|
||||
|
||||
dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1)
|
||||
neighbor_dists = dists[1:]
|
||||
neighbor_idxs = idxs[1:]
|
||||
|
||||
inv_dists = []
|
||||
neighbor_prems = []
|
||||
for d, j in zip(neighbor_dists, neighbor_idxs):
|
||||
ns = sectors_with_coords[j]
|
||||
if d > 0 and ns in sector_premiums:
|
||||
inv_dists.append(1.0 / d)
|
||||
neighbor_prems.append(sector_premiums[ns])
|
||||
|
||||
if not neighbor_prems:
|
||||
continue
|
||||
|
||||
total_inv = sum(inv_dists)
|
||||
nbr_w = 1.0 - self_w
|
||||
ws = [iw / total_inv * nbr_w for iw in inv_dists]
|
||||
|
||||
blended = {}
|
||||
for et in EVENT_TYPES:
|
||||
val = self_w * sector_premiums[sec].get(et, 0.0)
|
||||
for np_dict, w in zip(neighbor_prems, ws):
|
||||
val += w * np_dict.get(et, 0.0)
|
||||
blended[et] = val
|
||||
result[sec] = blended
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Estimate renovation premiums from repeat-sale residuals"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input", type=Path, required=True, help="Path to wide.parquet"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--index", type=Path, required=True, help="Path to price_index.parquet"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output renovation_premium.parquet"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
pairs = extract_pairs_with_events(args.input, args.index)
|
||||
centroids = extract_centroids(args.input)
|
||||
|
||||
# Precompute hierarchy
|
||||
all_sectors = pairs["sector"].unique().to_list()
|
||||
sector_to_dist = {}
|
||||
dist_to_area = {}
|
||||
for s in all_sectors:
|
||||
d, a = hierarchy_keys(s)
|
||||
sector_to_dist[s] = d
|
||||
dist_to_area[d] = a
|
||||
|
||||
all_type_groups = ["All"] + TYPE_GROUPS
|
||||
rows = []
|
||||
|
||||
for tg in all_type_groups:
|
||||
print(f"\n--- {tg} ---")
|
||||
typed = pairs if tg == "All" else pairs.filter(pl.col("type_group") == tg)
|
||||
if len(typed) < MIN_PAIRS:
|
||||
print(f" Skipping (only {len(typed)} pairs)")
|
||||
continue
|
||||
|
||||
print(f" {len(typed):,} pairs")
|
||||
|
||||
# National
|
||||
national_prem = compute_premiums_for_group(typed)
|
||||
national_reno = typed.filter(
|
||||
(pl.col("has_extension") > 0)
|
||||
| (pl.col("has_renovation") > 0)
|
||||
| (pl.col("has_remodeling") > 0)
|
||||
).height
|
||||
if not national_prem:
|
||||
print(" No renovation pairs at national level, skipping")
|
||||
continue
|
||||
|
||||
print(
|
||||
" National premiums: "
|
||||
+ ", ".join(
|
||||
f"{et}: {v:.4f} ({math.exp(v) - 1:.1%})"
|
||||
for et, v in national_prem.items()
|
||||
)
|
||||
)
|
||||
|
||||
# Per-level
|
||||
print(" Computing per-level premiums:")
|
||||
area_prem, area_n = compute_premiums_for_level(typed, "area")
|
||||
district_prem, district_n = compute_premiums_for_level(typed, "district")
|
||||
sector_prem, sector_n = compute_premiums_for_level(typed, "sector")
|
||||
print(
|
||||
f" {len(area_prem)} areas, {len(district_prem)} districts, {len(sector_prem)} sectors with data"
|
||||
)
|
||||
|
||||
# Shrinkage
|
||||
print(" Applying shrinkage...")
|
||||
sector_shrunk = apply_shrinkage(
|
||||
sector_prem,
|
||||
sector_n,
|
||||
district_prem,
|
||||
district_n,
|
||||
area_prem,
|
||||
area_n,
|
||||
national_prem,
|
||||
national_reno,
|
||||
all_sectors,
|
||||
sector_to_dist,
|
||||
dist_to_area,
|
||||
)
|
||||
|
||||
# Spatial smoothing
|
||||
print(" Spatial smoothing...")
|
||||
sector_smoothed = spatial_smooth(sector_shrunk, centroids, sector_n)
|
||||
|
||||
# Collect rows
|
||||
for sec in all_sectors:
|
||||
prem = sector_smoothed.get(sec, national_prem)
|
||||
n = sector_n.get(sec, 0)
|
||||
for et in EVENT_TYPES:
|
||||
rows.append((sec, tg, et, prem.get(et, 0.0), n))
|
||||
|
||||
result = pl.DataFrame(
|
||||
rows,
|
||||
schema={
|
||||
"sector": pl.String,
|
||||
"type_group": pl.String,
|
||||
"event_type": pl.String,
|
||||
"log_premium": pl.Float64,
|
||||
"n_reno_pairs": pl.Int64,
|
||||
},
|
||||
orient="row",
|
||||
).sort("type_group", "sector", "event_type")
|
||||
|
||||
result.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
|
||||
print(
|
||||
f" {result['sector'].n_unique():,} sectors x {len(all_type_groups)} types x {len(EVENT_TYPES)} events = {len(result):,} rows"
|
||||
)
|
||||
|
||||
# Print summary statistics
|
||||
print("\nNational premium summary:")
|
||||
national = (
|
||||
result.filter(pl.col("type_group") == "All")
|
||||
.group_by("event_type")
|
||||
.agg(
|
||||
pl.col("log_premium").mean().alias("mean_log_premium"),
|
||||
)
|
||||
)
|
||||
for row in national.iter_rows(named=True):
|
||||
et = row["event_type"]
|
||||
lp = row["mean_log_premium"]
|
||||
print(f" {et}: log_premium={lp:.4f} ({math.exp(lp) - 1:.1%} price uplift)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue