Changes

2026-02-14 12:53:29 +00:00 · 2026-02-14 12:53:29 +00:00 · 128b3191e7
commit 128b3191e7
parent 3a3f899ea2
68 changed files with 28060 additions and 1152 deletions
--- a/pipeline/transform/renovation_premium.py
+++ b/pipeline/transform/renovation_premium.py
@ -0,0 +1,572 @@
+"""Estimate per-area renovation premiums from repeat-sale residuals.
+
+For each repeat-sale pair, computes the residual after removing the price-index
+predicted return. Pairs where renovation events occurred between sales should have
+systematically higher residuals. A WLS regression estimates the log-premium per
+event type, with hierarchical shrinkage and spatial smoothing.
+
+Output: renovation_premium.parquet — sector × type_group × event_type → log_premium
+"""
+
+import argparse
+import math
+from pathlib import Path
+
+import numpy as np
+import polars as pl
+from scipy.spatial import KDTree
+
+from pipeline.transform._price_utils import (
+    SHRINKAGE_K,
+    TYPE_GROUPS,
+    extract_centroids,
+    hierarchy_keys,
+    sector_expr,
+    type_group_expr,
+)
+
+# Half-life of a renovation event's decayed indicator, expressed in the same
+# units as the sale/event ``year`` fields it is multiplied with.
+HALF_LIFE = 10.0
+# Exponential rate chosen so that exp(-DECAY_RATE * HALF_LIFE) == 0.5.
+DECAY_RATE = math.log(2) / HALF_LIFE
+# Pairs whose |log(price2 / price1)| exceeds this value are discarded.
+OUTLIER_THRESHOLD = 3.0
+# Minimum number of pairs a group needs before a regression is attempted.
+MIN_PAIRS = 10
+# Number of nearest sectors blended into a sector by spatial_smooth.
+SPATIAL_NEIGHBORS = 5
+# spatial_smooth keeps a sector's own value with weight n / (n + SPATIAL_BLEND_K).
+SPATIAL_BLEND_K = 30
+# Event labels matched against the renovation_history "event" field; each label
+# gets a ``has_<lowercased label>`` indicator column.
+EVENT_TYPES = ["Extension", "Renovation", "Remodeling"]
+
+
+def extract_pairs_with_events(input_path: Path, index_path: Path) -> pl.DataFrame:
+    """Extract repeat-sale pairs with index residuals and renovation indicators.
+
+    Each property's consecutive ``historical_prices`` entries become one
+    (sale 1, sale 2) pair. For every pair the log price ratio is reduced by the
+    change in ``log_index`` between the two sale years, and one time-decayed
+    indicator per entry of ``EVENT_TYPES`` records whether such an event
+    happened after sale 1 and no later than sale 2.
+
+    Args:
+        input_path: Parquet file providing the ``Postcode``,
+            ``historical_prices``, ``Property type`` and ``renovation_history``
+            columns. ``historical_prices`` entries expose ``year`` and
+            ``price`` fields; ``renovation_history`` entries expose ``year``
+            and ``event`` fields.
+        index_path: Parquet file with ``sector``, ``year`` and ``log_index``
+            columns. If it also has a ``type_group`` column, rows whose value
+            is ``"All"`` are treated as the pooled series and every other value
+            as a type-specific series.
+
+    Returns:
+        One row per retained pair with the columns ``sector``, ``type_group``,
+        ``year1``, ``price1``, ``year2``, ``price2``, ``log_ratio``,
+        ``residual``, ``weight``, ``has_extension``, ``has_renovation``,
+        ``has_remodeling``, ``district`` and ``area``.
+    """
+    print("Extracting repeat-sale pairs with renovation events...")
+
+    # sector_expr() / type_group_expr() are project helpers; the columns they
+    # add are selected below as "sector" and "type_group".
+    # NOTE(review): presumably derived from "Postcode" / "Property type" — confirm.
+    df = (
+        pl.scan_parquet(input_path)
+        .select("Postcode", "historical_prices", "Property type", "renovation_history")
+        .filter(
+            pl.col("Postcode").is_not_null(),
+            pl.col("historical_prices").list.len() >= 2,
+        )
+        .with_columns(sector_expr(), type_group_expr())
+        .collect()
+    )
+    print(f"  {len(df):,} properties with 2+ transactions")
+
+    # Build consecutive pairs: element i of the list is paired with element i+1.
+    pairs = (
+        df.lazy()
+        .with_columns(
+            pl.col("historical_prices")
+            .list.slice(0, pl.col("historical_prices").list.len() - 1)
+            .alias("from_txn"),
+            pl.col("historical_prices").list.slice(1).alias("to_txn"),
+        )
+        .explode("from_txn", "to_txn")
+        .with_columns(
+            pl.col("from_txn").struct.field("year").alias("year1"),
+            pl.col("from_txn").struct.field("price").alias("price1"),
+            pl.col("to_txn").struct.field("year").alias("year2"),
+            pl.col("to_txn").struct.field("price").alias("price2"),
+        )
+        .select(
+            "sector",
+            "type_group",
+            "year1",
+            "price1",
+            "year2",
+            "price2",
+            "renovation_history",
+        )
+        # Strictly increasing years also guarantee a non-zero holding period
+        # for the 1/sqrt(years) weight computed further down.
+        .filter(
+            pl.col("price1") > 0,
+            pl.col("price2") > 0,
+            pl.col("year2") > pl.col("year1"),
+        )
+        .with_columns(
+            (pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64))
+            .log()
+            .alias("log_ratio"),
+        )
+        .filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
+        .collect()
+    )
+    print(f"  {len(pairs):,} repeat-sale pairs")
+
+    # Join price index to compute residuals
+    index = pl.read_parquet(index_path)
+    has_type_group = "type_group" in index.columns
+
+    if has_type_group:
+        idx_typed = index.filter(pl.col("type_group") != "All")
+        idx_all = index.filter(pl.col("type_group") == "All")
+
+        # Join at year1
+        pairs = pairs.join(
+            idx_typed.select(
+                "sector", "type_group", "year", pl.col("log_index").alias("li1_typed")
+            ),
+            left_on=["sector", "type_group", "year1"],
+            right_on=["sector", "type_group", "year"],
+            how="left",
+        ).join(
+            idx_all.select("sector", "year", pl.col("log_index").alias("li1_all")),
+            left_on=["sector", "year1"],
+            right_on=["sector", "year"],
+            how="left",
+        )
+        # Join at year2
+        pairs = pairs.join(
+            idx_typed.select(
+                "sector", "type_group", "year", pl.col("log_index").alias("li2_typed")
+            ),
+            left_on=["sector", "type_group", "year2"],
+            right_on=["sector", "type_group", "year"],
+            how="left",
+        ).join(
+            idx_all.select("sector", "year", pl.col("log_index").alias("li2_all")),
+            left_on=["sector", "year2"],
+            right_on=["sector", "year"],
+            how="left",
+        )
+
+        # Use the type-specific series only when it covers BOTH sale years.
+        # Otherwise take both endpoints from the pooled "All" series, so the
+        # index change is never a difference between two different series.
+        typed_ok = (
+            pl.col("li1_typed").is_not_null() & pl.col("li2_typed").is_not_null()
+        )
+        pairs = pairs.with_columns(
+            pl.when(typed_ok)
+            .then(pl.col("li1_typed"))
+            .otherwise(pl.col("li1_all"))
+            .alias("_li1"),
+            pl.when(typed_ok)
+            .then(pl.col("li2_typed"))
+            .otherwise(pl.col("li2_all"))
+            .alias("_li2"),
+        )
+    else:
+        pairs = pairs.join(
+            index.select("sector", "year", pl.col("log_index").alias("_li1")),
+            left_on=["sector", "year1"],
+            right_on=["sector", "year"],
+            how="left",
+        ).join(
+            index.select("sector", "year", pl.col("log_index").alias("_li2")),
+            left_on=["sector", "year2"],
+            right_on=["sector", "year"],
+            how="left",
+        )
+
+    # Compute residual = log_ratio - (index2 - index1).
+    # The null fill is applied to the DIFFERENCE: when either endpoint is
+    # missing no index adjustment is made, instead of subtracting a lone index
+    # level as if the other endpoint were 0.
+    pairs = pairs.with_columns(
+        (
+            pl.col("log_ratio")
+            - (pl.col("_li2") - pl.col("_li1")).fill_null(0.0)
+        ).alias("residual"),
+        (1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias(
+            "weight"
+        ),
+    )
+
+    # For each pair, compute time-decayed renovation indicators
+    # Use row index for unique identification (composite keys aren't unique per pair)
+    pairs = pairs.with_row_index("_pair_idx")
+
+    # Every indicator starts at 0.0 and is overwritten below where events exist.
+    pairs = pairs.with_columns(
+        [pl.lit(0.0).alias(f"has_{et.lower()}") for et in EVENT_TYPES]
+    )
+
+    # Process properties that have renovation history
+    has_reno = pairs.filter(
+        pl.col("renovation_history").is_not_null()
+        & (pl.col("renovation_history").list.len() > 0)
+    )
+
+    if len(has_reno) > 0:
+        reno_exploded = (
+            has_reno.select("_pair_idx", "year1", "year2", "renovation_history")
+            .explode("renovation_history")
+            .with_columns(
+                pl.col("renovation_history").struct.field("year").alias("event_year"),
+                pl.col("renovation_history").struct.field("event").alias("event_type"),
+            )
+            # Only events between the two sales
+            .filter(
+                (pl.col("event_year") > pl.col("year1"))
+                & (pl.col("event_year") <= pl.col("year2"))
+            )
+        )
+
+        if len(reno_exploded) > 0:
+            # For each pair + event type, take the most recent event
+            # (year2 is constant per pair; it is grouped on only to carry it along)
+            latest_events = reno_exploded.group_by(
+                "_pair_idx", "event_type", "year2"
+            ).agg(pl.col("event_year").max().alias("latest_event_year"))
+
+            # Compute time-decayed indicator: exp(-decay_rate * (year2 - event_year))
+            latest_events = latest_events.with_columns(
+                (
+                    -DECAY_RATE
+                    * (pl.col("year2") - pl.col("latest_event_year")).cast(pl.Float64)
+                )
+                .exp()
+                .alias("decayed_indicator"),
+            )
+
+            # Pivot to wide format using _pair_idx for unique join
+            for et in EVENT_TYPES:
+                et_data = latest_events.filter(pl.col("event_type") == et)
+                if len(et_data) > 0:
+                    col_name = f"has_{et.lower()}"
+                    pairs = (
+                        pairs.join(
+                            et_data.select(
+                                "_pair_idx",
+                                pl.col("decayed_indicator").alias(f"_{col_name}"),
+                            ),
+                            on="_pair_idx",
+                            how="left",
+                        )
+                        .with_columns(
+                            pl.col(f"_{col_name}").fill_null(0.0).alias(col_name),
+                        )
+                        .drop(f"_{col_name}")
+                    )
+
+    pairs = pairs.drop("_pair_idx")
+
+    # Add hierarchy columns: district = sector minus its trailing
+    # "<whitespace><digits>", area = district cut at its first digit.
+    pairs = pairs.with_columns(
+        pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
+    ).with_columns(
+        pl.col("district").str.replace(r"\d.*$", "").alias("area"),
+    )
+
+    # Count reno pairs
+    reno_mask = (
+        (pl.col("has_extension") > 0)
+        | (pl.col("has_renovation") > 0)
+        | (pl.col("has_remodeling") > 0)
+    )
+    n_reno = pairs.filter(reno_mask).height
+    # Guard the percentage: an empty frame would otherwise divide by zero.
+    reno_pct = n_reno / len(pairs) * 100 if len(pairs) else 0.0
+    print(f"  {n_reno:,} pairs with renovation events ({reno_pct:.1f}%)")
+
+    # Drop temporary columns from index join + renovation_history (no longer needed)
+    temp_cols = [
+        c
+        for c in pairs.columns
+        if c.startswith("_li") or c.startswith("li1_") or c.startswith("li2_")
+    ]
+    pairs = pairs.drop(temp_cols + ["renovation_history"])
+
+    return pairs
+
+
+def wls_regression(
+    residuals: np.ndarray,
+    weights: np.ndarray,
+    X: np.ndarray,
+) -> np.ndarray:
+    """Solve the weighted least-squares problem ``residuals ~ X``.
+
+    ``X`` must already contain any intercept column. Rows of ``X`` and the
+    entries of ``residuals`` are scaled by ``sqrt(weights)`` and handed to an
+    ordinary least-squares solver, so no N×N weight matrix is ever built.
+
+    Returns:
+        One coefficient per column of ``X``; all zeros if the solver raises
+        ``numpy.linalg.LinAlgError``.
+    """
+    root_w = np.sqrt(weights)
+    as_column = root_w[:, np.newaxis]
+    scaled_design = X * as_column
+    scaled_target = residuals * as_column.ravel()
+    try:
+        solution = np.linalg.lstsq(scaled_design, scaled_target, rcond=None)
+    except np.linalg.LinAlgError:
+        return np.zeros(X.shape[1])
+    return solution[0]
+
+
+def compute_premiums_for_group(df: pl.DataFrame) -> dict[str, float]:
+    """Run the WLS regression for one group of pairs.
+
+    Regresses ``residual`` on an intercept plus one ``has_<event>`` indicator
+    column per entry of ``EVENT_TYPES``, weighting rows by ``weight``.
+
+    Args:
+        df: Pairs carrying the ``residual``, ``weight`` and ``has_<event>``
+            columns.
+
+    Returns:
+        ``{event_type: log_premium}`` for each event type that has at least
+        one non-zero indicator in ``df``. The dict is empty when the group has
+        fewer than ``MIN_PAIRS`` rows or contains no renovation pair at all.
+    """
+    n = len(df)
+    if n < MIN_PAIRS:
+        return {}
+
+    residuals = df["residual"].to_numpy().astype(np.float64)
+    weights = df["weight"].to_numpy().astype(np.float64)
+
+    indicators = [
+        df[f"has_{et.lower()}"].to_numpy().astype(np.float64) for et in EVENT_TYPES
+    ]
+
+    # Indicators are time-decayed (values in (0, 1]), so their SUM can stay
+    # below 1 even when renovation pairs exist; test for presence instead.
+    observed = [bool((col > 0).any()) for col in indicators]
+    if not any(observed):
+        return {}
+
+    # Build design matrix: intercept + one column per event type
+    X = np.column_stack([np.ones(n), *indicators])
+    betas = wls_regression(residuals, weights, X)
+
+    # betas[0] is the intercept. An event type with no observations has an
+    # all-zero column, for which least squares returns a coefficient of 0.0;
+    # that is "no information", not an estimate, so it is left out and callers
+    # fall back to their own default for the missing key.
+    return {
+        et: float(beta)
+        for et, beta, seen in zip(EVENT_TYPES, betas[1:], observed)
+        if seen
+    }
+
+
+def compute_premiums_for_level(
+    pairs: pl.DataFrame, group_col: str
+) -> tuple[dict, dict]:
+    """Estimate premiums separately for every value of ``group_col``.
+
+    Groups for which ``compute_premiums_for_group`` returns nothing are left
+    out of both results.
+
+    Returns:
+        ``(premiums, n_reno_pairs)``, both keyed by group value:
+        ``premiums[key]`` is ``{event_type: log_premium}`` and
+        ``n_reno_pairs[key]`` is the number of rows in the group with at least
+        one positive renovation indicator.
+    """
+    indicator_cols = ("has_extension", "has_renovation", "has_remodeling")
+    premiums = {}
+    n_reno_pairs = {}
+
+    for key, group_df in pairs.group_by(group_col):
+        estimate = compute_premiums_for_group(group_df)
+        if not estimate:
+            continue
+
+        # group_by yields the key as a tuple; there is a single grouping column.
+        label = key[0]
+
+        # Row-wise OR over the indicator columns.
+        any_event = np.zeros(len(group_df), dtype=bool)
+        for col in indicator_cols:
+            any_event |= group_df[col].to_numpy() > 0
+
+        premiums[label] = estimate
+        n_reno_pairs[label] = int(any_event.sum())
+
+    return premiums, n_reno_pairs
+
+
+def shrink_premium(
+    raw: dict[str, float], parent: dict[str, float], n: int
+) -> dict[str, float]:
+    """Blend a group's raw premiums with its parent's premiums.
+
+    The raw value receives weight ``n / (n + SHRINKAGE_K)`` and the parent
+    value the remainder. An event type missing from one side takes the other
+    side's value, and 0.0 when it is missing from both.
+
+    Returns:
+        A dict with one entry per item of ``EVENT_TYPES``.
+    """
+    own_weight = n / (n + SHRINKAGE_K)
+    parent_weight = 1 - own_weight
+
+    shrunk = {}
+    for et in EVENT_TYPES:
+        if et in raw:
+            own = raw[et]
+            anchor = parent.get(et, own)
+        else:
+            # Nothing of our own: both sides collapse to the parent (or 0.0).
+            anchor = parent.get(et, 0.0)
+            own = anchor
+        shrunk[et] = own_weight * own + parent_weight * anchor
+    return shrunk
+
+
+def apply_shrinkage(
+    sector_prem,
+    sector_n,
+    district_prem,
+    district_n,
+    area_prem,
+    area_n,
+    national_prem,
+    national_n,
+    all_sectors,
+    sector_to_dist,
+    dist_to_area,
+):
+    """Top-down hierarchical shrinkage for premiums.
+
+    Areas are shrunk toward the national premiums, districts toward their
+    shrunk area, and sectors toward their shrunk district. Whenever a parent
+    level has no estimate, the next level up is used (district -> area ->
+    national).
+
+    Args:
+        sector_prem, district_prem, area_prem: ``{key: {event_type:
+            log_premium}}`` raw estimates per level.
+        sector_n, district_n, area_n: ``{key: n}`` counts used as shrinkage
+            weights; a missing key counts as 0.
+        national_prem: ``{event_type: log_premium}`` at the top of the
+            hierarchy.
+        national_n: Unused; kept so existing callers keep working.
+        all_sectors: Every sector that must appear in the result.
+        sector_to_dist: Sector -> district lookup.
+        dist_to_area: District -> area lookup.
+
+    Returns:
+        ``{sector: {event_type: log_premium}}`` covering every sector of
+        ``sector_prem`` and ``all_sectors``. Sectors without an estimate of
+        their own reuse the parent's dict object (not a copy).
+    """
+    # Area -> national
+    area_shrunk = {}
+    for area, prem in area_prem.items():
+        area_shrunk[area] = shrink_premium(prem, national_prem, area_n.get(area, 0))
+
+    # District -> area
+    district_shrunk = {}
+    for dist, prem in district_prem.items():
+        a = dist_to_area.get(dist, "")
+        parent = area_shrunk.get(a, national_prem)
+        district_shrunk[dist] = shrink_premium(prem, parent, district_n.get(dist, 0))
+
+    # Sector -> district. If the district has no estimate, fall back to the
+    # area before the national level, matching the fill step below.
+    sector_shrunk = {}
+    for sec, prem in sector_prem.items():
+        d = sector_to_dist.get(sec, "")
+        a = dist_to_area.get(d, "")
+        parent = district_shrunk.get(d, area_shrunk.get(a, national_prem))
+        sector_shrunk[sec] = shrink_premium(prem, parent, sector_n.get(sec, 0))
+
+    # Fill missing sectors
+    for sec in all_sectors:
+        if sec not in sector_shrunk:
+            d = sector_to_dist.get(sec, "")
+            a = dist_to_area.get(d, "")
+            sector_shrunk[sec] = district_shrunk.get(
+                d, area_shrunk.get(a, national_prem)
+            )
+
+    return sector_shrunk
+
+
+def spatial_smooth(
+    sector_premiums: dict[str, dict[str, float]],
+    centroids: dict[str, tuple[float, float]],
+    n_reno_map: dict[str, int],
+) -> dict[str, dict[str, float]]:
+    """Pull thinly observed sector premiums toward those of nearby sectors.
+
+    A sector keeps its own value with weight ``n / (n + SPATIAL_BLEND_K)``,
+    where ``n`` comes from ``n_reno_map``. The remaining weight is shared among
+    its ``SPATIAL_NEIGHBORS`` nearest sectors in proportion to inverse
+    distance. Sectors whose own weight exceeds 0.95, and sectors without a
+    centroid, are returned unchanged.
+
+    Neighbour values are always read from the input ``sector_premiums``, so the
+    result does not depend on iteration order.
+
+    Returns:
+        A new dict, or ``sector_premiums`` itself when fewer than
+        ``SPATIAL_NEIGHBORS + 1`` sectors have a centroid.
+    """
+    located = [name for name in sector_premiums if name in centroids]
+    if len(located) <= SPATIAL_NEIGHBORS:
+        return sector_premiums
+
+    raw_xy = np.array([centroids[name] for name in located])
+    # The first coordinate is treated as a latitude in degrees; the second is
+    # scaled by the cosine of the mean latitude before distances are measured.
+    lon_scale = np.cos(np.radians(np.mean(raw_xy[:, 0])))
+    points = np.column_stack([raw_xy[:, 0], raw_xy[:, 1] * lon_scale])
+    tree = KDTree(points)
+
+    smoothed = dict(sector_premiums)
+    for pos, name in enumerate(located):
+        count = n_reno_map.get(name, 0)
+        own_weight = count / (count + SPATIAL_BLEND_K)
+        if own_weight > 0.95:
+            continue
+
+        # Ask for one extra hit and drop the first one, which is expected to
+        # be the query point itself.
+        dists, idxs = tree.query(points[pos], k=SPATIAL_NEIGHBORS + 1)
+        usable = [
+            (1.0 / dist, sector_premiums[located[j]])
+            for dist, j in zip(dists[1:], idxs[1:])
+            if dist > 0 and located[j] in sector_premiums
+        ]
+        if not usable:
+            continue
+
+        total_inv = sum([inv for inv, _ in usable])
+        neighbour_share = 1.0 - own_weight
+        own = sector_premiums[name]
+
+        mixed = {}
+        for et in EVENT_TYPES:
+            acc = own_weight * own.get(et, 0.0)
+            for inv, prem in usable:
+                acc += inv / total_inv * neighbour_share * prem.get(et, 0.0)
+            mixed[et] = acc
+        smoothed[name] = mixed
+
+    return smoothed
+
+
+def main():
+    """Command-line entry point: estimate premiums and write them to parquet.
+
+    For the pooled "All" group and every entry of ``TYPE_GROUPS`` this computes
+    national, area, district and sector premiums, applies hierarchical
+    shrinkage and spatial smoothing, and emits one output row per
+    sector × type group × event type.
+    """
+    cli = argparse.ArgumentParser(
+        description="Estimate renovation premiums from repeat-sale residuals"
+    )
+    cli.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
+    cli.add_argument(
+        "--index", type=Path, required=True, help="Path to price_index.parquet"
+    )
+    cli.add_argument(
+        "--output", type=Path, required=True, help="Output renovation_premium.parquet"
+    )
+    args = cli.parse_args()
+
+    pairs = extract_pairs_with_events(args.input, args.index)
+    centroids = extract_centroids(args.input)
+
+    # Sector -> district and district -> area lookups, built once up front.
+    all_sectors = pairs["sector"].unique().to_list()
+    sector_to_dist = {}
+    dist_to_area = {}
+    for sector in all_sectors:
+        district, area = hierarchy_keys(sector)
+        sector_to_dist[sector] = district
+        dist_to_area[district] = area
+
+    # True for pairs with at least one positive renovation indicator.
+    any_event = (
+        (pl.col("has_extension") > 0)
+        | (pl.col("has_renovation") > 0)
+        | (pl.col("has_remodeling") > 0)
+    )
+
+    all_type_groups = ["All"] + TYPE_GROUPS
+    rows = []
+
+    for tg in all_type_groups:
+        print(f"\n--- {tg} ---")
+        if tg == "All":
+            typed = pairs
+        else:
+            typed = pairs.filter(pl.col("type_group") == tg)
+
+        if len(typed) < MIN_PAIRS:
+            print(f"  Skipping (only {len(typed)} pairs)")
+            continue
+
+        print(f"  {len(typed):,} pairs")
+
+        # National level
+        national_prem = compute_premiums_for_group(typed)
+        national_reno = typed.filter(any_event).height
+        if not national_prem:
+            print("  No renovation pairs at national level, skipping")
+            continue
+
+        national_text = ", ".join(
+            f"{et}: {v:.4f} ({math.exp(v) - 1:.1%})" for et, v in national_prem.items()
+        )
+        print("  National premiums: " + national_text)
+
+        # One regression per area, district and sector
+        print("  Computing per-level premiums:")
+        area_prem, area_n = compute_premiums_for_level(typed, "area")
+        district_prem, district_n = compute_premiums_for_level(typed, "district")
+        sector_prem, sector_n = compute_premiums_for_level(typed, "sector")
+        print(
+            f"  {len(area_prem)} areas, {len(district_prem)} districts, {len(sector_prem)} sectors with data"
+        )
+
+        print("  Applying shrinkage...")
+        sector_shrunk = apply_shrinkage(
+            sector_prem,
+            sector_n,
+            district_prem,
+            district_n,
+            area_prem,
+            area_n,
+            national_prem,
+            national_reno,
+            all_sectors,
+            sector_to_dist,
+            dist_to_area,
+        )
+
+        print("  Spatial smoothing...")
+        sector_smoothed = spatial_smooth(sector_shrunk, centroids, sector_n)
+
+        # One output row per sector and event type for this type group
+        for sec in all_sectors:
+            prem = sector_smoothed.get(sec, national_prem)
+            n = sector_n.get(sec, 0)
+            rows.extend((sec, tg, et, prem.get(et, 0.0), n) for et in EVENT_TYPES)
+
+    out_schema = {
+        "sector": pl.String,
+        "type_group": pl.String,
+        "event_type": pl.String,
+        "log_premium": pl.Float64,
+        "n_reno_pairs": pl.Int64,
+    }
+    result = pl.DataFrame(rows, schema=out_schema, orient="row")
+    result = result.sort("type_group", "sector", "event_type")
+
+    result.write_parquet(args.output)
+    size_mb = args.output.stat().st_size / (1024 * 1024)
+    print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
+    print(
+        f"  {result['sector'].n_unique():,} sectors x {len(all_type_groups)} types x {len(EVENT_TYPES)} events = {len(result):,} rows"
+    )
+
+    # Summary: mean sector-level log premium per event type for the "All" group
+    print("\nNational premium summary:")
+    pooled = result.filter(pl.col("type_group") == "All")
+    national = pooled.group_by("event_type").agg(
+        pl.col("log_premium").mean().alias("mean_log_premium"),
+    )
+    for record in national.iter_rows(named=True):
+        et = record["event_type"]
+        lp = record["mean_log_premium"]
+        print(f"  {et}: log_premium={lp:.4f} ({math.exp(lp) - 1:.1%} price uplift)")
+
+
+if __name__ == "__main__":
+    main()