perfect-postcode/pipeline/transform/price_index.py
2026-02-15 22:39:53 +00:00

523 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Repeat-Sales Price Index (improved)
Builds a hierarchical repeat-sales price index with:
1. Stratification by property type (Detached/Semi-Detached/Terraced/Flats)
2. Robust regression (IRLS with Huber weights) instead of hard outlier cutoff
3. National hedonic time-dummy model as ultimate shrinkage fallback
4. Spatial smoothing for sparse sectors via KD-tree nearest neighbors
Output: price_index.parquet — sector × type_group × year → log_index
"""
import argparse
from pathlib import Path
import numpy as np
import polars as pl
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import lsqr
from scipy.spatial import KDTree
from tqdm import tqdm
from pipeline.transform._price_utils import (
CURRENT_YEAR,
SHRINKAGE_K,
TYPE_GROUPS,
build_hedonic_features,
extract_centroids,
hierarchy_keys,
sector_expr,
type_group_expr,
)
# --- Constants ---
MIN_PAIRS = 5
OUTLIER_THRESHOLD = 3.0 # hard pre-filter; Huber handles the rest
HUBER_K = 1.345
IRLS_ITERATIONS = 5
SPATIAL_NEIGHBORS = 5
SPATIAL_BLEND_K = 30
# --- Pair extraction ---
def extract_pairs(input_path: Path) -> pl.DataFrame:
print("Extracting repeat-sale pairs...")
df = (
pl.scan_parquet(input_path)
.select("Postcode", "historical_prices", "Property type")
.filter(
pl.col("Postcode").is_not_null(),
pl.col("historical_prices").list.len() >= 2,
)
.with_columns(sector_expr(), type_group_expr())
.collect()
)
print(f" {len(df):,} properties with 2+ transactions")
pairs = (
df.lazy()
.with_columns(
pl.col("historical_prices")
.list.slice(0, pl.col("historical_prices").list.len() - 1)
.alias("from_txn"),
pl.col("historical_prices").list.slice(1).alias("to_txn"),
)
.explode("from_txn", "to_txn")
.with_columns(
pl.col("from_txn").struct.field("year").alias("year1"),
pl.col("from_txn").struct.field("price").alias("price1"),
pl.col("to_txn").struct.field("year").alias("year2"),
pl.col("to_txn").struct.field("price").alias("price2"),
)
.select("sector", "type_group", "year1", "price1", "year2", "price2")
.filter(
pl.col("price1") > 0,
pl.col("price2") > 0,
pl.col("year2") > pl.col("year1"),
)
.with_columns(
(pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64))
.log()
.alias("log_ratio"),
(1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias(
"weight"
),
)
.filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
.collect()
)
# Add hierarchy columns
pairs = pairs.with_columns(
pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
).with_columns(
pl.col("district").str.replace(r"\d.*$", "").alias("area"),
)
print(f" {len(pairs):,} pairs extracted")
return pairs
# --- Robust IRLS solver ---
def solve_robust_index(
years1: np.ndarray,
years2: np.ndarray,
log_ratios: np.ndarray,
base_weights: np.ndarray,
) -> dict[int, float]:
"""IRLS Huber M-estimation for the Case-Shiller repeat-sales model."""
n = len(years1)
if n < MIN_PAIRS:
return {}
all_years = np.union1d(years1, years2)
min_year = int(all_years.min())
col = 0
year_to_col = {}
for y in all_years:
iy = int(y)
if iy != min_year:
year_to_col[iy] = col
col += 1
n_cols = len(year_to_col)
if n_cols == 0:
return {}
# Vectorized column index mapping
col2 = np.full(n, -1, dtype=np.int32)
col1 = np.full(n, -1, dtype=np.int32)
for year, c in year_to_col.items():
col2[years2 == year] = c
col1[years1 == year] = c
# Sparse matrix structure (fixed across iterations)
mask2 = col2 >= 0
mask1 = col1 >= 0
rows_arr = np.concatenate([np.where(mask2)[0], np.where(mask1)[0]])
cols_arr = np.concatenate([col2[mask2], col1[mask1]])
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
weights = base_weights.copy()
for _ in range(IRLS_ITERATIONS):
data = signs_arr * weights[rows_arr]
A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
b = log_ratios * weights
betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]
# Residuals
predicted = np.zeros(n)
predicted[mask2] += betas[col2[mask2]]
predicted[mask1] -= betas[col1[mask1]]
residuals = log_ratios - predicted
# Huber reweighting
abs_r = np.abs(residuals)
huber_w = np.where(abs_r <= HUBER_K, 1.0, HUBER_K / np.maximum(abs_r, 1e-10))
weights = base_weights * huber_w
index = {min_year: 0.0}
for year, c in year_to_col.items():
index[year] = float(betas[c])
return index
def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
"""Solve robust indices for each group. Returns (indices, n_pairs) dicts."""
groups = pairs.group_by(group_col).agg(
pl.col("year1"),
pl.col("year2"),
pl.col("log_ratio"),
pl.col("weight"),
)
indices = {}
n_pairs = {}
for row in tqdm(
groups.iter_rows(named=True), total=len(groups), desc=f" {group_col}"
):
key = row[group_col]
y1 = np.array(row["year1"], dtype=np.int32)
y2 = np.array(row["year2"], dtype=np.int32)
lr = np.array(row["log_ratio"], dtype=np.float64)
w = np.array(row["weight"], dtype=np.float64)
idx = solve_robust_index(y1, y2, lr, w)
if idx:
indices[key] = idx
n_pairs[key] = len(y1)
return indices, n_pairs
# --- Hedonic model ---
def compute_hedonic_index(
input_path: Path, min_year: int, max_year: int
) -> dict[int, float]:
"""Two-step hedonic index: regress log(price) on features, average residual by year."""
print("Computing hedonic index...")
df = (
pl.scan_parquet(input_path)
.select(
"Last known price",
"Date of last transaction",
"Property type",
"Total floor area (sqm)",
)
.filter(
pl.col("Last known price").is_not_null(),
pl.col("Total floor area (sqm)").is_not_null(),
pl.col("Total floor area (sqm)") > 0,
)
.with_columns(
pl.col("Date of last transaction").dt.year().alias("sale_year"),
type_group_expr(),
)
.filter(
pl.col("type_group").is_not_null(),
pl.col("sale_year").is_not_null(),
pl.col("sale_year") >= min_year,
pl.col("sale_year") <= max_year,
)
.collect()
)
print(f" {len(df):,} complete cases for hedonic model")
# Target
log_price = np.log(df["Last known price"].to_numpy().astype(np.float64))
sale_years = df["sale_year"].to_numpy()
# Build feature matrix (18 hedonic features + intercept)
X = build_hedonic_features(df)
F = np.hstack([X, np.ones((len(df), 1), dtype=np.float32)])
print(f" Feature matrix: {F.shape[0]:,} × {F.shape[1]}")
# Step 1: regress log(price) on features → quality score
betas = np.linalg.lstsq(F.astype(np.float64), log_price, rcond=None)[0]
quality_score = F.astype(np.float64) @ betas
residuals = log_price - quality_score
# Step 2: average residual by year = hedonic index
hedonic = {}
for y in range(min_year, max_year + 1):
mask = sale_years == y
if mask.sum() > 0:
hedonic[y] = float(np.mean(residuals[mask]))
# Normalize: min_year = 0
base = hedonic.get(min_year, 0.0)
for y in hedonic:
hedonic[y] -= base
print(
f" Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}"
)
return hedonic
# --- Shrinkage ---
def shrink_index(raw: dict, parent: dict, n_pairs: int, k: int = SHRINKAGE_K) -> dict:
w = n_pairs / (n_pairs + k)
result = {}
for y in set(raw) | set(parent):
r = raw.get(y, parent.get(y, 0.0))
p = parent.get(y, raw.get(y, 0.0))
result[y] = w * r + (1 - w) * p
return result
def apply_shrinkage(
sector_idx,
sector_n,
district_idx,
district_n,
area_idx,
area_n,
national_idx,
national_n,
hedonic_idx,
all_sectors,
sector_to_dist,
dist_to_area,
):
"""Top-down hierarchical shrinkage: national→hedonic, area→national, etc."""
# National → hedonic
national_shrunk = shrink_index(national_idx, hedonic_idx, national_n)
# Area → national
area_shrunk = {}
for area, idx in area_idx.items():
area_shrunk[area] = shrink_index(idx, national_shrunk, area_n[area])
# District → area
district_shrunk = {}
for dist, idx in district_idx.items():
a = dist_to_area.get(dist, "")
parent = area_shrunk.get(a, national_shrunk)
district_shrunk[dist] = shrink_index(idx, parent, district_n[dist])
# Sector → district
sector_shrunk = {}
for sec, idx in sector_idx.items():
d = sector_to_dist.get(sec, "")
parent = district_shrunk.get(d, national_shrunk)
sector_shrunk[sec] = shrink_index(idx, parent, sector_n[sec])
# Fill sectors without their own index
for sec in all_sectors:
if sec not in sector_shrunk:
d = sector_to_dist.get(sec, "")
a = dist_to_area.get(d, "")
sector_shrunk[sec] = district_shrunk.get(
d, area_shrunk.get(a, national_shrunk)
)
return sector_shrunk
# --- Spatial smoothing ---
def spatial_smooth(
sector_indices: dict,
centroids: dict,
n_pairs_map: dict,
) -> dict:
"""Blend sparse sector indices with K nearest neighbors."""
# Build coordinate arrays for sectors with centroids
sectors_with_coords = [s for s in sector_indices if s in centroids]
if len(sectors_with_coords) < SPATIAL_NEIGHBORS + 1:
return sector_indices
coords = np.array([centroids[s] for s in sectors_with_coords])
# Scale longitude by cos(mean_lat) for approximate Euclidean distance
mean_lat = np.mean(coords[:, 0])
scale = np.cos(np.radians(mean_lat))
scaled_coords = np.column_stack([coords[:, 0], coords[:, 1] * scale])
tree = KDTree(scaled_coords)
result = dict(sector_indices)
for i, sec in enumerate(sectors_with_coords):
n = n_pairs_map.get(sec, 0)
self_w = n / (n + SPATIAL_BLEND_K)
if self_w > 0.95:
continue # enough data, skip smoothing
dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1)
# Skip self (index 0, distance ~0)
neighbor_dists = dists[1:]
neighbor_idxs = idxs[1:]
inv_dists = []
neighbor_indices = []
for d, j in zip(neighbor_dists, neighbor_idxs):
ns = sectors_with_coords[j]
if d > 0 and ns in sector_indices:
inv_dists.append(1.0 / d)
neighbor_indices.append(sector_indices[ns])
if not neighbor_indices:
continue
total_inv = sum(inv_dists)
nbr_w = 1.0 - self_w
ws = [iw / total_inv * nbr_w for iw in inv_dists]
blended = {}
all_years = set(sector_indices[sec])
for ni in neighbor_indices:
all_years |= set(ni)
for y in all_years:
val = self_w * sector_indices[sec].get(y, 0.0)
for ni, w in zip(neighbor_indices, ws):
val += w * ni.get(y, 0.0)
blended[y] = val
result[sec] = blended
return result
# --- Forward fill ---
def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
filled = {}
last = 0.0
for y in range(min_year, max_year + 1):
if y in index:
last = index[y]
filled[y] = last
return filled
# --- Main ---
def main():
parser = argparse.ArgumentParser(
description="Build improved repeat-sales price index"
)
parser.add_argument("--input", type=Path, required=True)
parser.add_argument("--output", type=Path, required=True)
args = parser.parse_args()
pairs = extract_pairs(args.input)
centroids = extract_centroids(args.input)
min_year = int(pairs["year1"].min())
max_year = max(int(pairs["year2"].max()), CURRENT_YEAR)
hedonic_idx = compute_hedonic_index(args.input, min_year, max_year)
# Precompute hierarchy
all_sectors = pairs["sector"].unique().to_list()
sector_to_dist = {}
dist_to_area = {}
for s in all_sectors:
d, a = hierarchy_keys(s)
sector_to_dist[s] = d
dist_to_area[d] = a
# Process each type group + "All"
all_type_groups = ["All"] + TYPE_GROUPS
final = {} # {type_group: {sector: {year: log_index}}}
final_n = {} # {type_group: {sector: n_pairs}}
for tg in all_type_groups:
print(f"\n--- {tg} ---")
typed = pairs if tg == "All" else pairs.filter(pl.col("type_group") == tg)
if len(typed) < MIN_PAIRS:
print(f" Skipping (only {len(typed)} pairs)")
final[tg] = {s: dict(hedonic_idx) for s in all_sectors}
final_n[tg] = {s: 0 for s in all_sectors}
continue
print(f" {len(typed):,} pairs")
# National
np_arrs = typed.select("year1", "year2", "log_ratio", "weight")
national_idx = solve_robust_index(
np_arrs["year1"].to_numpy(),
np_arrs["year2"].to_numpy(),
np_arrs["log_ratio"].to_numpy(),
np_arrs["weight"].to_numpy(),
)
national_n = len(typed)
print(f" National: {len(national_idx)} years")
# Area, district, sector
print(" Computing per-level indices:")
area_idx, area_n = compute_indices_for_level(typed, "area")
district_idx, district_n = compute_indices_for_level(typed, "district")
sector_idx, sector_n = compute_indices_for_level(typed, "sector")
print(
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
)
# Shrinkage
print(" Applying shrinkage...")
sector_shrunk = apply_shrinkage(
sector_idx,
sector_n,
district_idx,
district_n,
area_idx,
area_n,
national_idx,
national_n,
hedonic_idx,
all_sectors,
sector_to_dist,
dist_to_area,
)
# Spatial smoothing
print(" Spatial smoothing...")
sector_smoothed = spatial_smooth(sector_shrunk, centroids, sector_n)
# Forward fill
for sec in all_sectors:
sector_smoothed[sec] = forward_fill(
sector_smoothed.get(sec, hedonic_idx), min_year, max_year
)
final[tg] = sector_smoothed
final_n[tg] = sector_n
# Assemble output
print("\nAssembling output...")
rows = []
for tg in all_type_groups:
for sec in all_sectors:
n = final_n[tg].get(sec, 0)
for year, log_idx in final[tg][sec].items():
rows.append((sec, tg, year, log_idx, n))
result = pl.DataFrame(
rows,
schema={
"sector": pl.String,
"type_group": pl.String,
"year": pl.Int32,
"log_index": pl.Float64,
"n_pairs": pl.Int64,
},
orient="row",
).sort("type_group", "sector", "year")
result.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024)
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
print(
f" {result['sector'].n_unique():,} sectors × {len(all_type_groups)} types × {max_year - min_year + 1} years = {len(result):,} rows"
)
if __name__ == "__main__":
main()