perfect-postcode/pipeline/transform/hedonic_quality.py
2026-02-15 22:39:53 +00:00

300 lines
9.7 KiB
Python

"""Cross-Sectional Hedonic Model (Per-Type)
Trains separate OLS models per property type on recent sales (last 5 years)
with sector fixed effects via Frisch-Waugh-Lovell demeaning:
log(price) = beta_type * log(floor_area) + alpha_sector_type + epsilon
Each type gets its own floor area elasticity and sector intercepts, capturing
that detached houses (beta=0.74) have higher price sensitivity to size than
terraced houses (beta=0.60), and a sector's value differs by property type.
Sector intercepts are hierarchically shrunk (sector → district → area → national)
and spatially smoothed via KD-tree nearest neighbors.
Output: hedonic_model.json with per-type betas and sector intercepts.
"""
import argparse
import json
from pathlib import Path
import numpy as np
import polars as pl
from scipy.spatial import KDTree
from pipeline.transform._price_utils import (
CURRENT_YEAR,
HEDONIC_COLUMNS,
SHRINKAGE_K,
TYPE_GROUPS,
extract_centroids,
hierarchy_keys,
sector_expr,
type_group_expr,
)
# Width of the training window: sales from CURRENT_YEAR - TRAINING_YEARS
# through CURRENT_YEAR (inclusive) are used.
TRAINING_YEARS = 5
# Number of nearest neighbouring sectors queried from the KD-tree when
# spatially smoothing a sector intercept.
SPATIAL_NEIGHBORS = 5
# Pseudo-count for spatial blending: a sector with n sales keeps weight
# n / (n + SPATIAL_BLEND_K) on its own intercept; the rest goes to neighbours.
SPATIAL_BLEND_K = 30
def load_training_data(input_path: Path) -> pl.DataFrame:
    """Load recent sales with complete hedonic features.

    Lazily scans the parquet file at ``input_path``, keeps only
    ``HEDONIC_COLUMNS`` and returns the rows usable for training:

    * price, floor area and postcode are present;
    * price and floor area are strictly positive, so both have a finite
      logarithm when the model is fitted;
    * a ``type_group`` could be derived for the property;
    * the sale year lies in ``[CURRENT_YEAR - TRAINING_YEARS, CURRENT_YEAR]``.

    The returned frame carries a derived ``sale_year`` column plus the
    columns produced by ``type_group_expr()`` and ``sector_expr()`` (read
    elsewhere in this module as ``type_group`` and ``sector``).
    """
    min_year = CURRENT_YEAR - TRAINING_YEARS
    print(f"Loading training data (sales {min_year}-{CURRENT_YEAR})...")

    price = pl.col("Last known price")
    floor_area = pl.col("Total floor area (sqm)")
    sale_year = pl.col("sale_year")

    df = (
        pl.scan_parquet(input_path)
        .select(*HEDONIC_COLUMNS)
        .filter(
            price.is_not_null(),
            # The model regresses on log(price); a zero or negative price
            # would become -inf / NaN and poison the whole fit.
            price > 0,
            floor_area.is_not_null(),
            floor_area > 0,
            pl.col("Postcode").is_not_null(),
        )
        .with_columns(
            pl.col("Date of last transaction").dt.year().alias("sale_year"),
            type_group_expr(),
            sector_expr(),
        )
        .filter(
            pl.col("type_group").is_not_null(),
            sale_year.is_not_null(),
            sale_year >= min_year,
            sale_year <= CURRENT_YEAR,
        )
        .collect()
    )
    print(f" {len(df):,} complete cases")
    return df
def train_type_model(
    df: pl.DataFrame, type_group: str
) -> tuple[float, dict[str, float], dict[str, int], float]:
    """Train hedonic model for a single property type.

    Fits ``log(price) = beta * log(floor_area) + alpha_sector`` on the rows of
    ``df`` whose ``type_group`` equals ``type_group``.  The sector fixed
    effects are absorbed by demeaning both sides within each sector, the
    slope is estimated by least squares on the demeaned data, and each
    sector's intercept is then recovered from its means.  Floor areas below
    1 sqm are clamped to 1 before taking the log.

    Prints a one-line summary (row count, slope, in-sample R², sector count).

    Returns (beta_fa, sector_intercepts, sector_counts, national_intercept),
    where ``national_intercept`` is the unweighted mean of the sector
    intercepts.
    """
    t_df = df.filter(pl.col("type_group") == type_group)
    y = np.log(t_df["Last known price"].to_numpy().astype(np.float64))
    log_fa = np.log(
        np.maximum(t_df["Total floor area (sqm)"].to_numpy().astype(np.float64), 1.0)
    )
    X = log_fa.reshape(-1, 1)
    sectors = t_df["sector"].to_list()

    # Row positions of every sector, kept in first-appearance order so the
    # returned dicts iterate in the same order as the data.
    rows_by_sector: dict[str, list[int]] = {}
    for i, s in enumerate(sectors):
        rows_by_sector.setdefault(s, []).append(i)
    sector_rows = {s: np.array(rows) for s, rows in rows_by_sector.items()}

    # Compute sector means and demean (Frisch-Waugh-Lovell step)
    X_demeaned = np.empty_like(X)
    y_demeaned = np.empty_like(y)
    sector_X_means: dict[str, np.ndarray] = {}
    sector_y_means: dict[str, float] = {}
    sector_counts: dict[str, int] = {}
    for s, idx in sector_rows.items():
        X_mean = X[idx].mean(axis=0)
        y_mean = y[idx].mean()
        sector_X_means[s] = X_mean
        sector_y_means[s] = y_mean
        X_demeaned[idx] = X[idx] - X_mean
        y_demeaned[idx] = y[idx] - y_mean
        sector_counts[s] = len(idx)

    # OLS on demeaned data
    beta = np.linalg.lstsq(X_demeaned, y_demeaned, rcond=None)[0]
    beta_fa = float(beta[0])

    # Recover sector intercepts: alpha_s = mean(y_s) - beta * mean(x_s)
    sector_intercepts = {
        s: float(sector_y_means[s] - beta_fa * sector_X_means[s][0])
        for s in sector_rows
    }
    national_intercept = float(np.mean(list(sector_intercepts.values())))

    # R-squared.  Intercepts are added one sector at a time using the index
    # arrays built above, rather than one row at a time in Python.
    y_pred = X[:, 0] * beta_fa
    for s, idx in sector_rows.items():
        y_pred[idx] += sector_intercepts[s]
    ss_res = np.sum((y - y_pred) ** 2)
    ss_tot = np.sum((y - y.mean()) ** 2) if y.size else 0.0
    # R² is undefined when the target has no variance (or there is no data).
    r2 = 1 - ss_res / ss_tot if ss_tot > 0 else float("nan")

    print(
        f" {type_group:<15s}: n={len(t_df):>9,} β_fa={beta_fa:.4f} "
        f"R²={r2:.4f} sectors={len(sector_intercepts):,}"
    )
    return beta_fa, sector_intercepts, sector_counts, national_intercept
def shrink_intercepts(
    sector_intercepts: dict[str, float],
    sector_counts: dict[str, int],
) -> dict[str, float]:
    """Hierarchical shrinkage: sector -> district -> area -> national.

    Each level's intercept is pulled toward its (already shrunk) parent with
    weight ``n / (n + SHRINKAGE_K)`` on its own value, where ``n`` is the
    number of sales behind it.  District and area intercepts are count-weighted
    means of their sectors; the national value is the unweighted mean of all
    sector intercepts.

    Args:
        sector_intercepts: raw intercept per sector.
        sector_counts: number of sales per sector; missing sectors count as 0.

    Returns:
        A new dict with one shrunk intercept per input sector.  Empty input
        yields an empty dict.
    """
    if not sector_intercepts:
        # Nothing to shrink; also avoids taking the mean of an empty list.
        return {}

    national = float(np.mean(list(sector_intercepts.values())))

    sector_to_dist: dict[str, str] = {}
    dist_to_area: dict[str, str] = {}
    for s in sector_intercepts:
        d, a = hierarchy_keys(s)
        sector_to_dist[s] = d
        dist_to_area[d] = a
    # A sector's area is looked up through its district so that every sector
    # in a district agrees on the area.
    sector_to_area = {s: dist_to_area[d] for s, d in sector_to_dist.items()}

    area_intercepts, area_counts = _weighted_level_means(
        sector_intercepts, sector_counts, sector_to_area
    )
    dist_intercepts, dist_counts = _weighted_level_means(
        sector_intercepts, sector_counts, sector_to_dist
    )

    # Shrink: area -> national
    area_shrunk = {
        a: _shrink_toward(val, area_counts[a], national)
        for a, val in area_intercepts.items()
    }

    # Shrink: district -> area
    dist_shrunk = {
        d: _shrink_toward(
            val, dist_counts[d], area_shrunk.get(dist_to_area[d], national)
        )
        for d, val in dist_intercepts.items()
    }

    # Shrink: sector -> district
    return {
        s: _shrink_toward(
            val,
            sector_counts.get(s, 0),
            dist_shrunk.get(sector_to_dist[s], national),
        )
        for s, val in sector_intercepts.items()
    }


def _weighted_level_means(
    sector_intercepts: dict[str, float],
    sector_counts: dict[str, int],
    group_of: dict[str, str],
) -> tuple[dict[str, float], dict[str, int]]:
    """Aggregate sector intercepts to a coarser level (district or area).

    Returns ``(means, totals)`` keyed by group: the count-weighted mean
    intercept and the total sale count.  A group whose sectors all have zero
    count falls back to the plain mean of its sectors.
    """
    grouped: dict[str, list[tuple[float, int]]] = {}
    for s, val in sector_intercepts.items():
        grouped.setdefault(group_of[s], []).append((val, sector_counts.get(s, 0)))

    means: dict[str, float] = {}
    totals: dict[str, int] = {}
    for g, entries in grouped.items():
        total_n = sum(n for _, n in entries)
        if total_n > 0:
            means[g] = sum(v * n for v, n in entries) / total_n
        else:
            means[g] = sum(v for v, _ in entries) / len(entries)
        totals[g] = total_n
    return means, totals


def _shrink_toward(value: float, n: int, parent: float) -> float:
    """Blend ``value`` with ``parent``, trusting ``value`` more as ``n`` grows."""
    w = n / (n + SHRINKAGE_K)
    return w * value + (1 - w) * parent
def spatial_smooth_intercepts(
    sector_intercepts: dict[str, float],
    centroids: dict[str, tuple[float, float]],
    sector_counts: dict[str, int],
) -> dict[str, float]:
    """Blend sparse sector intercepts with K nearest neighbors.

    A sector with ``n`` sales keeps weight ``n / (n + SPATIAL_BLEND_K)`` on its
    own intercept; the remainder is spread over its nearest neighbouring
    sectors in proportion to inverse distance.  Sectors whose own weight
    exceeds 0.95, sectors without a centroid, and sectors with no neighbour at
    a positive distance are left unchanged.  Neighbour values are always taken
    from the input intercepts, never from already-smoothed ones.

    Centroids are treated as (lat, lon) pairs; the second coordinate is scaled
    by the cosine of the mean latitude before distances are measured.

    If fewer than ``SPATIAL_NEIGHBORS + 1`` sectors have a centroid the input
    dict itself is returned; otherwise a new dict is returned.
    """
    located = [name for name in sector_intercepts if name in centroids]
    if len(located) <= SPATIAL_NEIGHBORS:
        return sector_intercepts

    points = np.array([centroids[name] for name in located])
    lon_scale = np.cos(np.radians(np.mean(points[:, 0])))
    projected = np.column_stack([points[:, 0], points[:, 1] * lon_scale])
    index = KDTree(projected)

    smoothed = dict(sector_intercepts)
    for pos, name in enumerate(located):
        count = sector_counts.get(name, 0)
        own_weight = count / (count + SPATIAL_BLEND_K)
        if own_weight > 0.95:
            continue

        # Ask for one extra hit: the closest result is dropped as the query
        # point itself.
        distances, hits = index.query(projected[pos], k=SPATIAL_NEIGHBORS + 1)

        # (inverse distance, intercept) for every neighbour at a positive
        # distance; every hit indexes `located`, so it has an intercept.
        usable = [
            (1.0 / dist, sector_intercepts[located[hit]])
            for dist, hit in zip(distances[1:], hits[1:])
            if dist > 0
        ]
        if not usable:
            continue

        inv_total = sum(inv for inv, _ in usable)
        share = 1.0 - own_weight
        mix = own_weight * sector_intercepts[name]
        for inv, value in usable:
            mix += share * (inv / inv_total) * value
        smoothed[name] = mix

    return smoothed
def main():
    """Command-line entry point.

    Reads ``--input`` (parquet), trains one model per entry in ``TYPE_GROUPS``,
    shrinks and spatially smooths each model's sector intercepts, and writes
    everything to ``--output`` as JSON under the ``type_models`` key.
    """
    cli = argparse.ArgumentParser(description="Train cross-sectional hedonic model")
    cli.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
    cli.add_argument(
        "--output", type=Path, required=True, help="Output hedonic_model.json"
    )
    opts = cli.parse_args()
    out_path = opts.output

    sales = load_training_data(opts.input)
    centroids = extract_centroids(opts.input)

    print("\nTraining per-type models...")
    type_models = {}
    total_sectors = 0
    for group in TYPE_GROUPS:
        beta_fa, raw, counts, national = train_type_model(sales, group)
        # Shrink toward the postcode hierarchy first, then smooth spatially.
        intercepts = spatial_smooth_intercepts(
            shrink_intercepts(raw, counts), centroids, counts
        )
        total_sectors += len(intercepts)
        type_models[group] = {
            "beta_fa": beta_fa,
            "sector_intercepts": intercepts,
            "national_intercept": national,
        }

    # Output
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w") as f:
        json.dump({"type_models": type_models}, f, indent=2)

    size_kb = out_path.stat().st_size / 1024
    print(f"\nWrote {out_path} ({size_kb:.0f} KB)")
    print(f" {len(TYPE_GROUPS)} type models, {total_sectors:,} total sector intercepts")


if __name__ == "__main__":
    main()