523 lines
16 KiB
Python
523 lines
16 KiB
Python
"""Repeat-Sales Price Index (improved)
|
||
|
||
Builds a hierarchical repeat-sales price index with:
|
||
1. Stratification by property type (Detached/Semi-Detached/Terraced/Flats)
|
||
2. Robust regression (IRLS with Huber weights) instead of hard outlier cutoff
|
||
3. National hedonic time-dummy model as ultimate shrinkage fallback
|
||
4. Spatial smoothing for sparse sectors via KD-tree nearest neighbors
|
||
|
||
Output: price_index.parquet — sector × type_group × year → log_index
|
||
"""
|
||
|
||
import argparse
|
||
from pathlib import Path
|
||
|
||
import numpy as np
|
||
import polars as pl
|
||
from scipy.sparse import csc_matrix
|
||
from scipy.sparse.linalg import lsqr
|
||
from scipy.spatial import KDTree
|
||
from tqdm import tqdm
|
||
|
||
from pipeline.transform._price_utils import (
|
||
CURRENT_YEAR,
|
||
SHRINKAGE_K,
|
||
TYPE_GROUPS,
|
||
build_hedonic_features,
|
||
extract_centroids,
|
||
hierarchy_keys,
|
||
sector_expr,
|
||
type_group_expr,
|
||
)
|
||
|
||
# --- Constants ---
|
||
MIN_PAIRS = 5
|
||
OUTLIER_THRESHOLD = 3.0 # hard pre-filter; Huber handles the rest
|
||
HUBER_K = 1.345
|
||
IRLS_ITERATIONS = 5
|
||
SPATIAL_NEIGHBORS = 5
|
||
SPATIAL_BLEND_K = 30
|
||
|
||
|
||
# --- Pair extraction ---
|
||
|
||
|
||
def extract_pairs(input_path: Path) -> pl.DataFrame:
|
||
print("Extracting repeat-sale pairs...")
|
||
df = (
|
||
pl.scan_parquet(input_path)
|
||
.select("Postcode", "historical_prices", "Property type")
|
||
.filter(
|
||
pl.col("Postcode").is_not_null(),
|
||
pl.col("historical_prices").list.len() >= 2,
|
||
)
|
||
.with_columns(sector_expr(), type_group_expr())
|
||
.collect()
|
||
)
|
||
print(f" {len(df):,} properties with 2+ transactions")
|
||
|
||
pairs = (
|
||
df.lazy()
|
||
.with_columns(
|
||
pl.col("historical_prices")
|
||
.list.slice(0, pl.col("historical_prices").list.len() - 1)
|
||
.alias("from_txn"),
|
||
pl.col("historical_prices").list.slice(1).alias("to_txn"),
|
||
)
|
||
.explode("from_txn", "to_txn")
|
||
.with_columns(
|
||
pl.col("from_txn").struct.field("year").alias("year1"),
|
||
pl.col("from_txn").struct.field("price").alias("price1"),
|
||
pl.col("to_txn").struct.field("year").alias("year2"),
|
||
pl.col("to_txn").struct.field("price").alias("price2"),
|
||
)
|
||
.select("sector", "type_group", "year1", "price1", "year2", "price2")
|
||
.filter(
|
||
pl.col("price1") > 0,
|
||
pl.col("price2") > 0,
|
||
pl.col("year2") > pl.col("year1"),
|
||
)
|
||
.with_columns(
|
||
(pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64))
|
||
.log()
|
||
.alias("log_ratio"),
|
||
(1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias(
|
||
"weight"
|
||
),
|
||
)
|
||
.filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
|
||
.collect()
|
||
)
|
||
|
||
# Add hierarchy columns
|
||
pairs = pairs.with_columns(
|
||
pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
|
||
).with_columns(
|
||
pl.col("district").str.replace(r"\d.*$", "").alias("area"),
|
||
)
|
||
|
||
print(f" {len(pairs):,} pairs extracted")
|
||
return pairs
|
||
|
||
|
||
# --- Robust IRLS solver ---
|
||
|
||
|
||
def solve_robust_index(
|
||
years1: np.ndarray,
|
||
years2: np.ndarray,
|
||
log_ratios: np.ndarray,
|
||
base_weights: np.ndarray,
|
||
) -> dict[int, float]:
|
||
"""IRLS Huber M-estimation for the Case-Shiller repeat-sales model."""
|
||
n = len(years1)
|
||
if n < MIN_PAIRS:
|
||
return {}
|
||
|
||
all_years = np.union1d(years1, years2)
|
||
min_year = int(all_years.min())
|
||
|
||
col = 0
|
||
year_to_col = {}
|
||
for y in all_years:
|
||
iy = int(y)
|
||
if iy != min_year:
|
||
year_to_col[iy] = col
|
||
col += 1
|
||
n_cols = len(year_to_col)
|
||
if n_cols == 0:
|
||
return {}
|
||
|
||
# Vectorized column index mapping
|
||
col2 = np.full(n, -1, dtype=np.int32)
|
||
col1 = np.full(n, -1, dtype=np.int32)
|
||
for year, c in year_to_col.items():
|
||
col2[years2 == year] = c
|
||
col1[years1 == year] = c
|
||
|
||
# Sparse matrix structure (fixed across iterations)
|
||
mask2 = col2 >= 0
|
||
mask1 = col1 >= 0
|
||
rows_arr = np.concatenate([np.where(mask2)[0], np.where(mask1)[0]])
|
||
cols_arr = np.concatenate([col2[mask2], col1[mask1]])
|
||
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
|
||
|
||
weights = base_weights.copy()
|
||
|
||
for _ in range(IRLS_ITERATIONS):
|
||
data = signs_arr * weights[rows_arr]
|
||
A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
|
||
b = log_ratios * weights
|
||
betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]
|
||
|
||
# Residuals
|
||
predicted = np.zeros(n)
|
||
predicted[mask2] += betas[col2[mask2]]
|
||
predicted[mask1] -= betas[col1[mask1]]
|
||
residuals = log_ratios - predicted
|
||
|
||
# Huber reweighting
|
||
abs_r = np.abs(residuals)
|
||
huber_w = np.where(abs_r <= HUBER_K, 1.0, HUBER_K / np.maximum(abs_r, 1e-10))
|
||
weights = base_weights * huber_w
|
||
|
||
index = {min_year: 0.0}
|
||
for year, c in year_to_col.items():
|
||
index[year] = float(betas[c])
|
||
return index
|
||
|
||
|
||
def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
|
||
"""Solve robust indices for each group. Returns (indices, n_pairs) dicts."""
|
||
groups = pairs.group_by(group_col).agg(
|
||
pl.col("year1"),
|
||
pl.col("year2"),
|
||
pl.col("log_ratio"),
|
||
pl.col("weight"),
|
||
)
|
||
indices = {}
|
||
n_pairs = {}
|
||
for row in tqdm(
|
||
groups.iter_rows(named=True), total=len(groups), desc=f" {group_col}"
|
||
):
|
||
key = row[group_col]
|
||
y1 = np.array(row["year1"], dtype=np.int32)
|
||
y2 = np.array(row["year2"], dtype=np.int32)
|
||
lr = np.array(row["log_ratio"], dtype=np.float64)
|
||
w = np.array(row["weight"], dtype=np.float64)
|
||
idx = solve_robust_index(y1, y2, lr, w)
|
||
if idx:
|
||
indices[key] = idx
|
||
n_pairs[key] = len(y1)
|
||
return indices, n_pairs
|
||
|
||
|
||
# --- Hedonic model ---
|
||
|
||
|
||
def compute_hedonic_index(
|
||
input_path: Path, min_year: int, max_year: int
|
||
) -> dict[int, float]:
|
||
"""Two-step hedonic index: regress log(price) on features, average residual by year."""
|
||
print("Computing hedonic index...")
|
||
df = (
|
||
pl.scan_parquet(input_path)
|
||
.select(
|
||
"Last known price",
|
||
"Date of last transaction",
|
||
"Property type",
|
||
"Total floor area (sqm)",
|
||
)
|
||
.filter(
|
||
pl.col("Last known price").is_not_null(),
|
||
pl.col("Total floor area (sqm)").is_not_null(),
|
||
pl.col("Total floor area (sqm)") > 0,
|
||
)
|
||
.with_columns(
|
||
pl.col("Date of last transaction").dt.year().alias("sale_year"),
|
||
type_group_expr(),
|
||
)
|
||
.filter(
|
||
pl.col("type_group").is_not_null(),
|
||
pl.col("sale_year").is_not_null(),
|
||
pl.col("sale_year") >= min_year,
|
||
pl.col("sale_year") <= max_year,
|
||
)
|
||
.collect()
|
||
)
|
||
print(f" {len(df):,} complete cases for hedonic model")
|
||
|
||
# Target
|
||
log_price = np.log(df["Last known price"].to_numpy().astype(np.float64))
|
||
sale_years = df["sale_year"].to_numpy()
|
||
|
||
# Build feature matrix (18 hedonic features + intercept)
|
||
X = build_hedonic_features(df)
|
||
F = np.hstack([X, np.ones((len(df), 1), dtype=np.float32)])
|
||
print(f" Feature matrix: {F.shape[0]:,} × {F.shape[1]}")
|
||
|
||
# Step 1: regress log(price) on features → quality score
|
||
betas = np.linalg.lstsq(F.astype(np.float64), log_price, rcond=None)[0]
|
||
quality_score = F.astype(np.float64) @ betas
|
||
residuals = log_price - quality_score
|
||
|
||
# Step 2: average residual by year = hedonic index
|
||
hedonic = {}
|
||
for y in range(min_year, max_year + 1):
|
||
mask = sale_years == y
|
||
if mask.sum() > 0:
|
||
hedonic[y] = float(np.mean(residuals[mask]))
|
||
|
||
# Normalize: min_year = 0
|
||
base = hedonic.get(min_year, 0.0)
|
||
for y in hedonic:
|
||
hedonic[y] -= base
|
||
|
||
print(
|
||
f" Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}"
|
||
)
|
||
return hedonic
|
||
|
||
|
||
# --- Shrinkage ---
|
||
|
||
|
||
def shrink_index(raw: dict, parent: dict, n_pairs: int, k: int = SHRINKAGE_K) -> dict:
|
||
w = n_pairs / (n_pairs + k)
|
||
result = {}
|
||
for y in set(raw) | set(parent):
|
||
r = raw.get(y, parent.get(y, 0.0))
|
||
p = parent.get(y, raw.get(y, 0.0))
|
||
result[y] = w * r + (1 - w) * p
|
||
return result
|
||
|
||
|
||
def apply_shrinkage(
|
||
sector_idx,
|
||
sector_n,
|
||
district_idx,
|
||
district_n,
|
||
area_idx,
|
||
area_n,
|
||
national_idx,
|
||
national_n,
|
||
hedonic_idx,
|
||
all_sectors,
|
||
sector_to_dist,
|
||
dist_to_area,
|
||
):
|
||
"""Top-down hierarchical shrinkage: national→hedonic, area→national, etc."""
|
||
# National → hedonic
|
||
national_shrunk = shrink_index(national_idx, hedonic_idx, national_n)
|
||
|
||
# Area → national
|
||
area_shrunk = {}
|
||
for area, idx in area_idx.items():
|
||
area_shrunk[area] = shrink_index(idx, national_shrunk, area_n[area])
|
||
|
||
# District → area
|
||
district_shrunk = {}
|
||
for dist, idx in district_idx.items():
|
||
a = dist_to_area.get(dist, "")
|
||
parent = area_shrunk.get(a, national_shrunk)
|
||
district_shrunk[dist] = shrink_index(idx, parent, district_n[dist])
|
||
|
||
# Sector → district
|
||
sector_shrunk = {}
|
||
for sec, idx in sector_idx.items():
|
||
d = sector_to_dist.get(sec, "")
|
||
parent = district_shrunk.get(d, national_shrunk)
|
||
sector_shrunk[sec] = shrink_index(idx, parent, sector_n[sec])
|
||
|
||
# Fill sectors without their own index
|
||
for sec in all_sectors:
|
||
if sec not in sector_shrunk:
|
||
d = sector_to_dist.get(sec, "")
|
||
a = dist_to_area.get(d, "")
|
||
sector_shrunk[sec] = district_shrunk.get(
|
||
d, area_shrunk.get(a, national_shrunk)
|
||
)
|
||
|
||
return sector_shrunk
|
||
|
||
|
||
# --- Spatial smoothing ---
|
||
|
||
|
||
def spatial_smooth(
|
||
sector_indices: dict,
|
||
centroids: dict,
|
||
n_pairs_map: dict,
|
||
) -> dict:
|
||
"""Blend sparse sector indices with K nearest neighbors."""
|
||
# Build coordinate arrays for sectors with centroids
|
||
sectors_with_coords = [s for s in sector_indices if s in centroids]
|
||
if len(sectors_with_coords) < SPATIAL_NEIGHBORS + 1:
|
||
return sector_indices
|
||
|
||
coords = np.array([centroids[s] for s in sectors_with_coords])
|
||
# Scale longitude by cos(mean_lat) for approximate Euclidean distance
|
||
mean_lat = np.mean(coords[:, 0])
|
||
scale = np.cos(np.radians(mean_lat))
|
||
scaled_coords = np.column_stack([coords[:, 0], coords[:, 1] * scale])
|
||
tree = KDTree(scaled_coords)
|
||
|
||
result = dict(sector_indices)
|
||
for i, sec in enumerate(sectors_with_coords):
|
||
n = n_pairs_map.get(sec, 0)
|
||
self_w = n / (n + SPATIAL_BLEND_K)
|
||
if self_w > 0.95:
|
||
continue # enough data, skip smoothing
|
||
|
||
dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1)
|
||
# Skip self (index 0, distance ~0)
|
||
neighbor_dists = dists[1:]
|
||
neighbor_idxs = idxs[1:]
|
||
|
||
inv_dists = []
|
||
neighbor_indices = []
|
||
for d, j in zip(neighbor_dists, neighbor_idxs):
|
||
ns = sectors_with_coords[j]
|
||
if d > 0 and ns in sector_indices:
|
||
inv_dists.append(1.0 / d)
|
||
neighbor_indices.append(sector_indices[ns])
|
||
|
||
if not neighbor_indices:
|
||
continue
|
||
|
||
total_inv = sum(inv_dists)
|
||
nbr_w = 1.0 - self_w
|
||
ws = [iw / total_inv * nbr_w for iw in inv_dists]
|
||
|
||
blended = {}
|
||
all_years = set(sector_indices[sec])
|
||
for ni in neighbor_indices:
|
||
all_years |= set(ni)
|
||
for y in all_years:
|
||
val = self_w * sector_indices[sec].get(y, 0.0)
|
||
for ni, w in zip(neighbor_indices, ws):
|
||
val += w * ni.get(y, 0.0)
|
||
blended[y] = val
|
||
result[sec] = blended
|
||
|
||
return result
|
||
|
||
|
||
# --- Forward fill ---
|
||
|
||
|
||
def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
|
||
filled = {}
|
||
last = 0.0
|
||
for y in range(min_year, max_year + 1):
|
||
if y in index:
|
||
last = index[y]
|
||
filled[y] = last
|
||
return filled
|
||
|
||
|
||
# --- Main ---
|
||
|
||
|
||
def main():
|
||
parser = argparse.ArgumentParser(
|
||
description="Build improved repeat-sales price index"
|
||
)
|
||
parser.add_argument("--input", type=Path, required=True)
|
||
parser.add_argument("--output", type=Path, required=True)
|
||
args = parser.parse_args()
|
||
|
||
pairs = extract_pairs(args.input)
|
||
centroids = extract_centroids(args.input)
|
||
|
||
min_year = int(pairs["year1"].min())
|
||
max_year = max(int(pairs["year2"].max()), CURRENT_YEAR)
|
||
|
||
hedonic_idx = compute_hedonic_index(args.input, min_year, max_year)
|
||
|
||
# Precompute hierarchy
|
||
all_sectors = pairs["sector"].unique().to_list()
|
||
sector_to_dist = {}
|
||
dist_to_area = {}
|
||
for s in all_sectors:
|
||
d, a = hierarchy_keys(s)
|
||
sector_to_dist[s] = d
|
||
dist_to_area[d] = a
|
||
|
||
# Process each type group + "All"
|
||
all_type_groups = ["All"] + TYPE_GROUPS
|
||
final = {} # {type_group: {sector: {year: log_index}}}
|
||
final_n = {} # {type_group: {sector: n_pairs}}
|
||
|
||
for tg in all_type_groups:
|
||
print(f"\n--- {tg} ---")
|
||
typed = pairs if tg == "All" else pairs.filter(pl.col("type_group") == tg)
|
||
if len(typed) < MIN_PAIRS:
|
||
print(f" Skipping (only {len(typed)} pairs)")
|
||
final[tg] = {s: dict(hedonic_idx) for s in all_sectors}
|
||
final_n[tg] = {s: 0 for s in all_sectors}
|
||
continue
|
||
|
||
print(f" {len(typed):,} pairs")
|
||
|
||
# National
|
||
np_arrs = typed.select("year1", "year2", "log_ratio", "weight")
|
||
national_idx = solve_robust_index(
|
||
np_arrs["year1"].to_numpy(),
|
||
np_arrs["year2"].to_numpy(),
|
||
np_arrs["log_ratio"].to_numpy(),
|
||
np_arrs["weight"].to_numpy(),
|
||
)
|
||
national_n = len(typed)
|
||
print(f" National: {len(national_idx)} years")
|
||
|
||
# Area, district, sector
|
||
print(" Computing per-level indices:")
|
||
area_idx, area_n = compute_indices_for_level(typed, "area")
|
||
district_idx, district_n = compute_indices_for_level(typed, "district")
|
||
sector_idx, sector_n = compute_indices_for_level(typed, "sector")
|
||
print(
|
||
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
|
||
)
|
||
|
||
# Shrinkage
|
||
print(" Applying shrinkage...")
|
||
sector_shrunk = apply_shrinkage(
|
||
sector_idx,
|
||
sector_n,
|
||
district_idx,
|
||
district_n,
|
||
area_idx,
|
||
area_n,
|
||
national_idx,
|
||
national_n,
|
||
hedonic_idx,
|
||
all_sectors,
|
||
sector_to_dist,
|
||
dist_to_area,
|
||
)
|
||
|
||
# Spatial smoothing
|
||
print(" Spatial smoothing...")
|
||
sector_smoothed = spatial_smooth(sector_shrunk, centroids, sector_n)
|
||
|
||
# Forward fill
|
||
for sec in all_sectors:
|
||
sector_smoothed[sec] = forward_fill(
|
||
sector_smoothed.get(sec, hedonic_idx), min_year, max_year
|
||
)
|
||
|
||
final[tg] = sector_smoothed
|
||
final_n[tg] = sector_n
|
||
|
||
# Assemble output
|
||
print("\nAssembling output...")
|
||
rows = []
|
||
for tg in all_type_groups:
|
||
for sec in all_sectors:
|
||
n = final_n[tg].get(sec, 0)
|
||
for year, log_idx in final[tg][sec].items():
|
||
rows.append((sec, tg, year, log_idx, n))
|
||
|
||
result = pl.DataFrame(
|
||
rows,
|
||
schema={
|
||
"sector": pl.String,
|
||
"type_group": pl.String,
|
||
"year": pl.Int32,
|
||
"log_index": pl.Float64,
|
||
"n_pairs": pl.Int64,
|
||
},
|
||
orient="row",
|
||
).sort("type_group", "sector", "year")
|
||
|
||
result.write_parquet(args.output)
|
||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
|
||
print(
|
||
f" {result['sector'].n_unique():,} sectors × {len(all_type_groups)} types × {max_year - min_year + 1} years = {len(result):,} rows"
|
||
)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|