300 lines
9.7 KiB
Python
300 lines
9.7 KiB
Python
"""Cross-Sectional Hedonic Model (Per-Type)

Trains separate OLS models per property type on recent sales (last 5 years)
with sector fixed effects via Frisch-Waugh-Lovell demeaning:

    log(price) = beta_type * log(floor_area) + alpha_sector_type + epsilon

Each type gets its own floor area elasticity and sector intercepts, capturing
that detached houses (beta=0.74) have higher price sensitivity to size than
terraced houses (beta=0.60), and a sector's value differs by property type.

Sector intercepts are hierarchically shrunk (sector → district → area → national)
and spatially smoothed via KD-tree nearest neighbors.

Output: hedonic_model.json with per-type betas and sector intercepts.
"""
|
|
|
|
import argparse
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import polars as pl
|
|
from scipy.spatial import KDTree
|
|
|
|
from pipeline.transform._price_utils import (
|
|
CURRENT_YEAR,
|
|
HEDONIC_COLUMNS,
|
|
SHRINKAGE_K,
|
|
TYPE_GROUPS,
|
|
extract_centroids,
|
|
hierarchy_keys,
|
|
sector_expr,
|
|
type_group_expr,
|
|
)
|
|
|
|
# Length of the training window: sales dated from CURRENT_YEAR - TRAINING_YEARS
# up to CURRENT_YEAR (inclusive) are kept by load_training_data().
TRAINING_YEARS = 5

# How many nearest sector centroids spatial_smooth_intercepts() blends in.
SPATIAL_NEIGHBORS = 5

# Pseudo-count for spatial blending: a sector with n sales keeps weight
# n / (n + SPATIAL_BLEND_K) on its own intercept.
SPATIAL_BLEND_K = 30
|
|
|
|
|
|
def load_training_data(input_path: Path) -> pl.DataFrame:
    """Read the sales rows usable for fitting the hedonic models.

    The parquet file at ``input_path`` is scanned lazily, reduced to
    ``HEDONIC_COLUMNS`` and filtered to rows that have a price, a strictly
    positive floor area and a postcode.  A ``sale_year`` column is derived
    from the transaction date and the ``type_group_expr()`` /
    ``sector_expr()`` columns are added (presumably ``type_group`` and
    ``sector``, which ``train_type_model`` reads -- confirm in
    ``_price_utils``).  Finally only rows with a known type group and a sale
    year within ``[CURRENT_YEAR - TRAINING_YEARS, CURRENT_YEAR]`` are kept.

    Args:
        input_path: Path to the wide parquet file.

    Returns:
        The collected (eager) DataFrame of complete training cases.
    """
    first_year = CURRENT_YEAR - TRAINING_YEARS
    print(f"Loading training data (sales {first_year}-{CURRENT_YEAR})...")

    price = pl.col("Last known price")
    floor_area = pl.col("Total floor area (sqm)")
    postcode = pl.col("Postcode")
    sale_year = pl.col("sale_year")

    # Stage 1: only the columns the model needs, with the core fields present.
    raw = pl.scan_parquet(input_path).select(*HEDONIC_COLUMNS)
    usable = raw.filter(
        price.is_not_null(),
        floor_area.is_not_null(),
        floor_area > 0,
        postcode.is_not_null(),
    )

    # Stage 2: derived columns used for grouping and for the time window.
    derived = usable.with_columns(
        pl.col("Date of last transaction").dt.year().alias("sale_year"),
        type_group_expr(),
        sector_expr(),
    )

    # Stage 3: keep typed rows sold inside the training window.
    recent = derived.filter(
        pl.col("type_group").is_not_null(),
        sale_year.is_not_null(),
        sale_year >= first_year,
        sale_year <= CURRENT_YEAR,
    )

    frame = recent.collect()
    print(f" {len(frame):,} complete cases")
    return frame
|
|
|
|
|
|
def train_type_model(
    df: pl.DataFrame, type_group: str
) -> tuple[float, dict[str, float], dict[str, int], float]:
    """Fit the fixed-effects hedonic regression for one property type.

    Rows of ``df`` whose ``type_group`` equals ``type_group`` are used to fit
    ``log(price) = beta * log(floor_area) + alpha_sector``.  The sector
    effects are removed by subtracting per-sector means from both sides
    (within transformation), ``beta`` is estimated by least squares on the
    demeaned data, and each sector intercept is then recovered as
    ``mean(log price) - beta * mean(log floor area)`` for that sector.
    Floor areas below 1 are clamped to 1 before taking the log.  A one-line
    summary including the in-sample R² is printed.

    Args:
        df: Training frame with ``type_group``, ``sector``,
            ``Last known price`` and ``Total floor area (sqm)`` columns.
        type_group: Property type to fit.

    Returns:
        ``(beta_fa, sector_intercepts, sector_counts, national_intercept)``
        where ``national_intercept`` is the unweighted mean of the sector
        intercepts.
    """
    subset = df.filter(pl.col("type_group") == type_group)
    prices = subset["Last known price"].to_numpy().astype(np.float64)
    areas = subset["Total floor area (sqm)"].to_numpy().astype(np.float64)

    log_price = np.log(prices)
    # Single-column design matrix: lstsq needs a 2-D array.
    design = np.log(np.maximum(areas, 1.0)).reshape(-1, 1)
    sector_labels = subset["sector"].to_list()

    # Row positions belonging to each sector, in first-seen order.
    rows_by_sector: dict[str, list[int]] = {}
    for row, label in enumerate(sector_labels):
        rows_by_sector.setdefault(label, []).append(row)

    # Within transformation: subtract each sector's mean from its own rows.
    design_within = np.empty_like(design)
    price_within = np.empty_like(log_price)
    mean_design: dict[str, np.ndarray] = {}
    mean_price: dict[str, float] = {}
    sector_counts: dict[str, int] = {}
    for label, rows in rows_by_sector.items():
        take = np.array(rows)
        mean_design[label] = design[take].mean(axis=0)
        mean_price[label] = log_price[take].mean()
        design_within[take] = design[take] - mean_design[label]
        price_within[take] = log_price[take] - mean_price[label]
        sector_counts[label] = len(rows)

    # Least squares on the demeaned data gives the floor-area elasticity.
    solution = np.linalg.lstsq(design_within, price_within, rcond=None)[0]
    beta_fa = float(solution[0])

    # The fixed effect is whatever the slope leaves unexplained in the means.
    sector_intercepts = {
        label: float(mean_price[label] - beta_fa * mean_design[label][0])
        for label in rows_by_sector
    }
    national_intercept = float(np.mean(list(sector_intercepts.values())))

    # In-sample R² of the full model (slope plus sector intercepts).
    offsets = np.array(
        [sector_intercepts[label] for label in sector_labels], dtype=np.float64
    )
    fitted = design[:, 0] * beta_fa + offsets
    ss_res = np.sum((log_price - fitted) ** 2)
    ss_tot = np.sum((log_price - log_price.mean()) ** 2)
    r2 = 1 - ss_res / ss_tot

    print(
        f" {type_group:<15s}: n={len(subset):>9,} β_fa={beta_fa:.4f} "
        f"R²={r2:.4f} sectors={len(sector_intercepts):,}"
    )

    return beta_fa, sector_intercepts, sector_counts, national_intercept
|
|
|
|
|
|
def shrink_intercepts(
    sector_intercepts: dict[str, float],
    sector_counts: dict[str, int],
) -> dict[str, float]:
    """Shrink sector intercepts along sector -> district -> area -> national.

    District and area intercepts are count-weighted means of their member
    sectors (plain means when every member has a zero count).  Each level is
    then pulled toward its already-shrunk parent with weight
    ``n / (n + SHRINKAGE_K)`` on its own value, where ``n`` is the number of
    observations at that level; the national value is the unweighted mean of
    all sector intercepts.

    Args:
        sector_intercepts: Raw intercept per sector.
        sector_counts: Observation count per sector; missing sectors count 0.

    Returns:
        A new dict with one shrunk intercept per input sector.
    """
    national = float(np.mean(list(sector_intercepts.values())))

    # Resolve every sector's district and every district's area once.
    sector_to_dist: dict[str, str] = {}
    dist_to_area: dict[str, str] = {}
    for sector in sector_intercepts:
        district, area = hierarchy_keys(sector)
        sector_to_dist[sector] = district
        dist_to_area[district] = area

    def pooled(group_of):
        """Return (count-weighted mean, total count) per group of sectors."""
        members: dict[str, list[tuple[float, int]]] = {}
        for sector, value in sector_intercepts.items():
            members.setdefault(group_of(sector), []).append(
                (value, sector_counts.get(sector, 0))
            )
        means: dict[str, float] = {}
        sizes: dict[str, int] = {}
        for key, entries in members.items():
            size = sum(n for _, n in entries)
            if size > 0:
                means[key] = sum(v * n for v, n in entries) / size
            else:
                # No observations at all: fall back to a plain average.
                means[key] = sum(v for v, _ in entries) / len(entries)
            sizes[key] = size
        return means, sizes

    def pull(value, n, parent):
        """Blend ``value`` toward ``parent``; more data means less shrinkage."""
        w = n / (n + SHRINKAGE_K)
        return w * value + (1 - w) * parent

    area_intercepts, area_counts = pooled(
        lambda sector: dist_to_area[sector_to_dist[sector]]
    )
    dist_intercepts, dist_counts = pooled(lambda sector: sector_to_dist[sector])

    # Top-down: each level shrinks toward the shrunk value of its parent.
    area_shrunk = {
        area: pull(value, area_counts[area], national)
        for area, value in area_intercepts.items()
    }
    dist_shrunk = {
        district: pull(
            value,
            dist_counts[district],
            area_shrunk.get(dist_to_area[district], national),
        )
        for district, value in dist_intercepts.items()
    }
    return {
        sector: pull(
            value,
            sector_counts.get(sector, 0),
            dist_shrunk.get(sector_to_dist[sector], national),
        )
        for sector, value in sector_intercepts.items()
    }
|
|
|
|
|
|
def spatial_smooth_intercepts(
    sector_intercepts: dict[str, float],
    centroids: dict[str, tuple[float, float]],
    sector_counts: dict[str, int],
    neighbors: "int | None" = None,
    blend_k: "float | None" = None,
) -> dict[str, float]:
    """Blend sparse sector intercepts with their nearest neighbours.

    A sector with ``n`` observations keeps weight ``n / (n + blend_k)`` on its
    own intercept; the remainder goes to an inverse-distance-weighted mean of
    the intercepts of its ``neighbors`` nearest sectors.  Sectors whose own
    weight exceeds 0.95, sectors without a centroid, and sectors whose
    neighbours all sit at distance zero are left unchanged.  Neighbour values
    are always read from the input, never from already-smoothed results.

    Args:
        sector_intercepts: Intercept per sector (not modified).
        centroids: Coordinates per sector.  The first component is treated as
            latitude in degrees and the second is scaled by ``cos(mean
            latitude)`` -- presumably longitude; confirm against
            ``extract_centroids``.
        sector_counts: Observation count per sector; missing sectors count 0.
        neighbors: Number of nearest sectors to blend in.  Defaults to
            ``SPATIAL_NEIGHBORS``.
        blend_k: Pseudo-count controlling how quickly a sector trusts its own
            intercept.  Defaults to ``SPATIAL_BLEND_K``.

    Returns:
        A new dict with the same keys as ``sector_intercepts``.  If fewer than
        ``neighbors + 1`` sectors have centroids, it is an unmodified copy.

    Raises:
        ValueError: If ``neighbors < 1`` or ``blend_k <= 0``.
    """
    if neighbors is None:
        neighbors = SPATIAL_NEIGHBORS
    if blend_k is None:
        blend_k = SPATIAL_BLEND_K
    if neighbors < 1:
        raise ValueError(f"neighbors must be >= 1, got {neighbors}")
    if blend_k <= 0:
        raise ValueError(f"blend_k must be > 0, got {blend_k}")

    # Always hand back a fresh dict so callers never alias the input.
    result = dict(sector_intercepts)
    located = [s for s in sector_intercepts if s in centroids]
    if len(located) < neighbors + 1:
        return result

    coords = np.array([centroids[s] for s in located], dtype=np.float64)
    # Shrink the second coordinate so Euclidean distance in degrees is
    # roughly isotropic at the data's mean latitude.
    lon_scale = np.cos(np.radians(coords[:, 0].mean()))
    scaled = np.column_stack([coords[:, 0], coords[:, 1] * lon_scale])

    values = np.array([sector_intercepts[s] for s in located], dtype=np.float64)
    counts = np.array(
        [sector_counts.get(s, 0) for s in located], dtype=np.float64
    )
    self_w = counts / (counts + blend_k)

    # Only sectors that do not already dominate their own estimate need work.
    sparse = np.flatnonzero(~(self_w > 0.95))
    if sparse.size == 0:
        return result

    # One batched query; column 0 is the query point itself (distance 0).
    dists, idxs = KDTree(scaled).query(scaled[sparse], k=neighbors + 1)
    nbr_dists = dists[:, 1:]
    nbr_vals = values[idxs[:, 1:]]

    # Inverse-distance weights; coincident centroids (d == 0) get weight 0.
    inv = np.zeros_like(nbr_dists)
    np.divide(1.0, nbr_dists, out=inv, where=nbr_dists > 0)
    inv_total = inv.sum(axis=1)
    weighted = (inv * nbr_vals).sum(axis=1)

    for pos, row in enumerate(sparse):
        if inv_total[pos] <= 0:
            # No usable neighbour: keep the sector's own intercept.
            continue
        nbr_mean = weighted[pos] / inv_total[pos]
        own_w = self_w[row]
        result[located[row]] = float(
            own_w * values[row] + (1.0 - own_w) * nbr_mean
        )

    return result
|
|
|
|
|
|
def main():
    """Command-line entry point: train every per-type model and write JSON.

    Parses ``--input`` (parquet path) and ``--output`` (JSON path), loads the
    training sales and sector centroids, and for each entry of
    ``TYPE_GROUPS`` fits the model, shrinks the sector intercepts and
    spatially smooths them.  The result is written as
    ``{"type_models": {...}}``; parent directories of the output path are
    created if missing.
    """
    cli = argparse.ArgumentParser(description="Train cross-sectional hedonic model")
    cli.add_argument(
        "--input", type=Path, required=True, help="Path to wide.parquet"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output hedonic_model.json"
    )
    args = cli.parse_args()

    sales = load_training_data(args.input)
    centroids = extract_centroids(args.input)

    print("\nTraining per-type models...")
    type_models = {}
    total_sectors = 0

    for type_group in TYPE_GROUPS:
        beta_fa, raw, counts, national = train_type_model(sales, type_group)

        # Shrink toward the postcode hierarchy first, then smooth in space.
        intercepts = spatial_smooth_intercepts(
            shrink_intercepts(raw, counts), centroids, counts
        )
        total_sectors += len(intercepts)

        type_models[type_group] = {
            "beta_fa": beta_fa,
            "sector_intercepts": intercepts,
            "national_intercept": national,
        }

    # Output
    out_path = args.output
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w") as handle:
        json.dump({"type_models": type_models}, handle, indent=2)

    size_kb = out_path.stat().st_size / 1024
    print(f"\nWrote {out_path} ({size_kb:.0f} KB)")
    print(f" {len(TYPE_GROUPS)} type models, {total_sectors:,} total sector intercepts")


# Run the trainer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|