"""Cross-Sectional Hedonic Model (Per-Type) Trains separate OLS models per property type on recent sales (last 5 years) with sector fixed effects via Frisch-Waugh-Lovell demeaning: log(price) = beta_type * log(floor_area) + alpha_sector_type + epsilon Each type gets its own floor area elasticity and sector intercepts, capturing that detached houses (beta=0.74) have higher price sensitivity to size than terraced houses (beta=0.60), and a sector's value differs by property type. Sector intercepts are hierarchically shrunk (sector → district → area → national) and spatially smoothed via KD-tree nearest neighbors. Output: hedonic_model.json with per-type betas and sector intercepts. """ import argparse import json from pathlib import Path import numpy as np import polars as pl from scipy.spatial import KDTree from pipeline.transform._price_utils import ( CURRENT_YEAR, HEDONIC_COLUMNS, SHRINKAGE_K, TYPE_GROUPS, extract_centroids, hierarchy_keys, sector_expr, type_group_expr, ) TRAINING_YEARS = 5 SPATIAL_NEIGHBORS = 5 SPATIAL_BLEND_K = 30 def load_training_data(input_path: Path) -> pl.DataFrame: """Load recent sales with complete hedonic features.""" min_year = CURRENT_YEAR - TRAINING_YEARS print(f"Loading training data (sales {min_year}-{CURRENT_YEAR})...") df = ( pl.scan_parquet(input_path) .select(*HEDONIC_COLUMNS) .filter( pl.col("Last known price").is_not_null(), pl.col("Total floor area (sqm)").is_not_null(), pl.col("Total floor area (sqm)") > 0, pl.col("Postcode").is_not_null(), ) .with_columns( pl.col("Date of last transaction").dt.year().alias("sale_year"), type_group_expr(), sector_expr(), ) .filter( pl.col("type_group").is_not_null(), pl.col("sale_year").is_not_null(), pl.col("sale_year") >= min_year, pl.col("sale_year") <= CURRENT_YEAR, ) .collect() ) print(f" {len(df):,} complete cases") return df def train_type_model( df: pl.DataFrame, type_group: str ) -> tuple[float, dict[str, float], dict[str, int], float]: """Train hedonic model for a single property type. Returns (beta_fa, sector_intercepts, sector_counts, national_intercept). """ t_df = df.filter(pl.col("type_group") == type_group) y = np.log(t_df["Last known price"].to_numpy().astype(np.float64)) log_fa = np.log( np.maximum(t_df["Total floor area (sqm)"].to_numpy().astype(np.float64), 1.0) ) X = log_fa.reshape(-1, 1) sectors = t_df["sector"].to_list() # Group by sector for demeaning sector_indices: dict[str, list[int]] = {} for i, s in enumerate(sectors): sector_indices.setdefault(s, []).append(i) # Compute sector means and demean X_demeaned = np.empty_like(X) y_demeaned = np.empty_like(y) sector_X_means: dict[str, np.ndarray] = {} sector_y_means: dict[str, float] = {} sector_counts: dict[str, int] = {} for s, idxs in sector_indices.items(): idx = np.array(idxs) X_mean = X[idx].mean(axis=0) y_mean = y[idx].mean() sector_X_means[s] = X_mean sector_y_means[s] = y_mean X_demeaned[idx] = X[idx] - X_mean y_demeaned[idx] = y[idx] - y_mean sector_counts[s] = len(idxs) # OLS on demeaned data beta = np.linalg.lstsq(X_demeaned, y_demeaned, rcond=None)[0] beta_fa = float(beta[0]) # Recover sector intercepts sector_intercepts = {} for s in sector_indices: sector_intercepts[s] = float(sector_y_means[s] - beta_fa * sector_X_means[s][0]) national_intercept = float(np.mean(list(sector_intercepts.values()))) # R-squared y_pred = X[:, 0] * beta_fa for i, s in enumerate(sectors): y_pred[i] += sector_intercepts[s] ss_res = np.sum((y - y_pred) ** 2) ss_tot = np.sum((y - y.mean()) ** 2) r2 = 1 - ss_res / ss_tot print( f" {type_group:<15s}: n={len(t_df):>9,} β_fa={beta_fa:.4f} " f"R²={r2:.4f} sectors={len(sector_intercepts):,}" ) return beta_fa, sector_intercepts, sector_counts, national_intercept def shrink_intercepts( sector_intercepts: dict[str, float], sector_counts: dict[str, int], ) -> dict[str, float]: """Hierarchical shrinkage: sector -> district -> area -> national.""" national = float(np.mean(list(sector_intercepts.values()))) sector_to_dist: dict[str, str] = {} dist_to_area: dict[str, str] = {} for s in sector_intercepts: d, a = hierarchy_keys(s) sector_to_dist[s] = d dist_to_area[d] = a # Area-level intercepts (weighted mean of sectors in area) area_vals: dict[str, list[tuple[float, int]]] = {} for s, val in sector_intercepts.items(): d = sector_to_dist[s] a = dist_to_area[d] area_vals.setdefault(a, []).append((val, sector_counts.get(s, 0))) area_intercepts: dict[str, float] = {} area_counts: dict[str, int] = {} for a, entries in area_vals.items(): total_n = sum(n for _, n in entries) if total_n > 0: area_intercepts[a] = sum(v * n for v, n in entries) / total_n else: area_intercepts[a] = sum(v for v, _ in entries) / len(entries) area_counts[a] = total_n # District-level intercepts dist_vals: dict[str, list[tuple[float, int]]] = {} for s, val in sector_intercepts.items(): d = sector_to_dist[s] dist_vals.setdefault(d, []).append((val, sector_counts.get(s, 0))) dist_intercepts: dict[str, float] = {} dist_counts: dict[str, int] = {} for d, entries in dist_vals.items(): total_n = sum(n for _, n in entries) if total_n > 0: dist_intercepts[d] = sum(v * n for v, n in entries) / total_n else: dist_intercepts[d] = sum(v for v, _ in entries) / len(entries) dist_counts[d] = total_n # Shrink: area -> national area_shrunk: dict[str, float] = {} for a, val in area_intercepts.items(): n = area_counts[a] w = n / (n + SHRINKAGE_K) area_shrunk[a] = w * val + (1 - w) * national # Shrink: district -> area dist_shrunk: dict[str, float] = {} for d, val in dist_intercepts.items(): a = dist_to_area[d] parent = area_shrunk.get(a, national) n = dist_counts[d] w = n / (n + SHRINKAGE_K) dist_shrunk[d] = w * val + (1 - w) * parent # Shrink: sector -> district result: dict[str, float] = {} for s, val in sector_intercepts.items(): d = sector_to_dist[s] parent = dist_shrunk.get(d, national) n = sector_counts.get(s, 0) w = n / (n + SHRINKAGE_K) result[s] = w * val + (1 - w) * parent return result def spatial_smooth_intercepts( sector_intercepts: dict[str, float], centroids: dict[str, tuple[float, float]], sector_counts: dict[str, int], ) -> dict[str, float]: """Blend sparse sector intercepts with K nearest neighbors.""" sectors_with_coords = [s for s in sector_intercepts if s in centroids] if len(sectors_with_coords) < SPATIAL_NEIGHBORS + 1: return sector_intercepts coords = np.array([centroids[s] for s in sectors_with_coords]) mean_lat = np.mean(coords[:, 0]) scale = np.cos(np.radians(mean_lat)) scaled_coords = np.column_stack([coords[:, 0], coords[:, 1] * scale]) tree = KDTree(scaled_coords) result = dict(sector_intercepts) for i, sec in enumerate(sectors_with_coords): n = sector_counts.get(sec, 0) self_w = n / (n + SPATIAL_BLEND_K) if self_w > 0.95: continue dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1) neighbor_dists = dists[1:] neighbor_idxs = idxs[1:] inv_dists = [] neighbor_vals = [] for d, j in zip(neighbor_dists, neighbor_idxs): ns = sectors_with_coords[j] if d > 0 and ns in sector_intercepts: inv_dists.append(1.0 / d) neighbor_vals.append(sector_intercepts[ns]) if not neighbor_vals: continue total_inv = sum(inv_dists) nbr_w = 1.0 - self_w blended = self_w * sector_intercepts[sec] for val, iw in zip(neighbor_vals, inv_dists): blended += nbr_w * (iw / total_inv) * val result[sec] = blended return result def main(): parser = argparse.ArgumentParser(description="Train cross-sectional hedonic model") parser.add_argument( "--input", type=Path, required=True, help="Path to wide.parquet" ) parser.add_argument( "--output", type=Path, required=True, help="Output hedonic_model.json" ) args = parser.parse_args() df = load_training_data(args.input) centroids = extract_centroids(args.input) print("\nTraining per-type models...") type_models = {} total_sectors = 0 for tg in TYPE_GROUPS: beta_fa, raw_intercepts, sector_counts, national = train_type_model(df, tg) shrunk = shrink_intercepts(raw_intercepts, sector_counts) smoothed = spatial_smooth_intercepts(shrunk, centroids, sector_counts) total_sectors += len(smoothed) type_models[tg] = { "beta_fa": beta_fa, "sector_intercepts": smoothed, "national_intercept": national, } # Output args.output.parent.mkdir(parents=True, exist_ok=True) with open(args.output, "w") as f: json.dump({"type_models": type_models}, f, indent=2) size_kb = args.output.stat().st_size / 1024 print(f"\nWrote {args.output} ({size_kb:.0f} KB)") print(f" {len(TYPE_GROUPS)} type models, {total_sectors:,} total sector intercepts") if __name__ == "__main__": main()