Changes

2026-02-14 12:53:29 +00:00 · 2026-02-14 12:53:29 +00:00 · 128b3191e7
commit 128b3191e7
parent 3a3f899ea2
68 changed files with 28060 additions and 1152 deletions
--- a/pipeline/download/places.py
+++ b/pipeline/download/places.py
@ -1,6 +1,7 @@
-"""Extract place=* nodes from OSM PBF → data/places.parquet.
+"""Extract place=* nodes and railway stations from OSM PBF → data/places.parquet.

-Extracts named place nodes (cities, towns, suburbs, etc.) for typeahead search.
+Extracts named place nodes (cities, towns, suburbs, etc.) and railway stations
+(tube, national rail, DLR, etc.) for typeahead search.
 Reuses the same great-britain-latest.osm.pbf as pois.py.
 """

@ -18,13 +19,54 @@ PLACE_TYPES = {
    "borough",
    "town",
    "suburb",
+    "quarter",
    "neighbourhood",
    "village",
    "hamlet",
    "locality",
+    "island",
    "isolated_dwelling",
 }

+# Suffixes to strip from raw station names before appending the typed suffix.
+# Compared case-insensitively against the end of the name.  Every suffix that
+# _station_display_name() can append has a counterpart here, and the bare
+# " station" stays last so that the more specific entries win.
+_STATION_STRIP = (
+    " tube station",
+    " underground station",
+    " railway station",
+    " dlr station",
+    " overground station",
+    " elizabeth line station",
+    " tram stop",
+    " station",
+)
+
+
+def _station_display_name(name: str, tags: dict[str, str]) -> str:
+    """Build a descriptive station name like 'Bank tube station'.
+
+    Args:
+        name: Raw station name taken from the node's tags.
+        tags: The node's tags; only ``station`` and ``network`` are read.
+
+    Returns:
+        ``name`` with at most one existing station-type suffix removed,
+        followed by a suffix chosen from the ``station``/``network`` tags
+        ("railway station" when nothing more specific matches).
+    """
+    station_tag = tags.get("station", "")
+    network = tags.get("network", "").lower()
+
+    # First match wins, so the network checks for DLR / Overground /
+    # Elizabeth line run before the generic light_rail fallback.
+    if station_tag == "subway" or "underground" in network:
+        suffix = "tube station"
+    elif "docklands" in network or "dlr" in network:
+        suffix = "DLR station"
+    elif "overground" in network:
+        suffix = "overground station"
+    elif "elizabeth" in network:
+        suffix = "Elizabeth line station"
+    elif station_tag == "light_rail" or "tram" in network:
+        # "tram" is a substring of "tramlink", so one test covers both.
+        suffix = "tram stop"
+    else:
+        suffix = "railway station"
+
+    # Strip any existing station suffix from the raw name so it is not doubled
+    # up (e.g. "Paddington Elizabeth line station" must not gain a second
+    # "Elizabeth line station").
+    lower = name.lower()
+    for s in _STATION_STRIP:
+        if lower.endswith(s):
+            name = name[: len(name) - len(s)].rstrip()
+            break
+
+    return f"{name} {suffix}"
+

 class PlaceHandler(osmium.SimpleHandler):
    def __init__(self, progress: tqdm) -> None:
@ -32,6 +74,12 @@ class PlaceHandler(osmium.SimpleHandler):
        self._progress = progress
        self.places: list[dict] = []

+    def _add(self, name: str, place_type: str, lat: float, lon: float, population: int) -> None:
+        """Append one extracted record and update the progress-bar counter."""
+        record = {
+            "name": name,
+            "place_type": place_type,
+            "lat": lat,
+            "lon": lon,
+            "population": population,
+        }
+        self.places.append(record)
+        found = len(self.places)
+        # refresh=False: only the postfix text is updated here, no redraw is forced.
+        self._progress.set_postfix(places=f"{found:,}", refresh=False)
+
    def node(self, n: osmium.osm.Node) -> None:
        self._progress.update(1)
        if not n.location.valid:
@ -39,16 +87,28 @@ class PlaceHandler(osmium.SimpleHandler):
        lat, lon = n.location.lat, n.location.lon
        if not (UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH and UK_BBOX_WEST <= lon <= UK_BBOX_EAST):
            return
-        place_type = n.tags.get("place")
-        if place_type not in PLACE_TYPES:
-            return
+
        name = n.tags.get("name:en", n.tags.get("name", ""))
        if not name:
            return
-        self.places.append(
-            {"name": name, "place_type": place_type, "lat": lat, "lon": lon}
-        )
-        self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False)
+
+        pop_str = n.tags.get("population", "")
+        try:
+            population = int(pop_str)
+        except ValueError:
+            population = 0
+
+        # place=* nodes (cities, towns, suburbs, etc.)
+        place_type = n.tags.get("place")
+        if place_type in PLACE_TYPES:
+            self._add(name, place_type, lat, lon, population)
+            return
+
+        # railway=station nodes (tube, national rail, DLR, tram, etc.)
+        if n.tags.get("railway") == "station":
+            display_name = _station_display_name(name, dict(n.tags))
+            self._add(display_name, "station", lat, lon, population)
+            return


 def main() -> None:
@ -73,7 +133,7 @@ def main() -> None:
        else:
            print(f"Using cached PBF: {pbf_file}")

-    print(f"Extracting place nodes: {sorted(PLACE_TYPES)}")
+    print(f"Extracting place nodes: {sorted(PLACE_TYPES)} + railway=station")
    with tqdm(
        unit=" elements",
        unit_scale=True,
--- a/pipeline/transform/_price_utils.py
+++ b/pipeline/transform/_price_utils.py
@ -0,0 +1,121 @@
+"""Shared utilities for price index, price estimate, and renovation premium scripts."""
+
+import numpy as np
+import polars as pl
+
+# Reference year shared by the price scripts that import it (hard-coded).
+CURRENT_YEAR = 2025
+# Raw "Property type" values that type_group_expr() maps to "Terraced".
+TERRACE_TYPES = [
+    "Mid-Terrace",
+    "End-Terrace",
+    "Enclosed Mid-Terrace",
+    "Enclosed End-Terrace",
+    "Terraced",
+]
+# Raw "Property type" values that type_group_expr() maps to "Flats".
+FLAT_TYPES = ["Flats/Maisonettes", "Flat", "Maisonette"]
+# The non-null type_group values that type_group_expr() can produce.
+TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats", "Bungalow"]
+# Shrinkage pseudo-count shared by the scripts that import it.
+SHRINKAGE_K = 50
+
+
+def type_group_expr():
+    """Polars expression mapping "Property type" to a coarse ``type_group``.
+
+    Terrace variants collapse to "Terraced", flat/maisonette variants to
+    "Flats", "Bungalow" stays as is, "Detached"/"Semi-Detached" keep their own
+    name, and anything else becomes null.
+    """
+    prop_type = pl.col("Property type")
+    is_terrace = prop_type.is_in(TERRACE_TYPES)
+    is_flat = prop_type.is_in(FLAT_TYPES)
+    is_bungalow = prop_type == "Bungalow"
+    keeps_own_name = prop_type.is_in(["Detached", "Semi-Detached"])
+
+    grouped = (
+        pl.when(is_terrace)
+        .then(pl.lit("Terraced"))
+        .when(is_flat)
+        .then(pl.lit("Flats"))
+        .when(is_bungalow)
+        .then(pl.lit("Bungalow"))
+        .when(keeps_own_name)
+        .then(prop_type)
+        .otherwise(pl.lit(None))
+    )
+    return grouped.alias("type_group")
+
+
+def sector_expr():
+    """Polars expression deriving ``sector`` from "Postcode".
+
+    The last two characters of the postcode are dropped and the remainder is
+    stripped of surrounding whitespace.
+    """
+    postcode = pl.col("Postcode")
+    keep_chars = postcode.str.len_chars() - 2
+    without_unit = postcode.str.slice(0, keep_chars)
+    return without_unit.str.strip_chars().alias("sector")
+
+
+def hierarchy_keys(sector: str) -> tuple[str, str]:
+    """Split a sector string into its ``(district, area)`` parents.
+
+    The district is everything before the last space (or the whole string when
+    there is no space); the area is the district's leading run of letters.
+    """
+    head, sep, _ = sector.rpartition(" ")
+    district = head if sep else sector
+
+    # Count the leading alphabetic characters, stopping at the first non-letter.
+    n_alpha = 0
+    while n_alpha < len(district) and district[n_alpha].isalpha():
+        n_alpha += 1
+
+    return district, district[:n_alpha]
+
+
+# Exclusive upper bounds (construction year) of the age bands, ascending.
+# age_band_expr() assigns the first band whose break exceeds the year.
+AGE_BREAKS = [1900, 1930, 1950, 1967, 1983, 2000, 2010]
+# Band labels: AGE_LABELS[i] pairs with AGE_BREAKS[i]; the final label is the
+# catch-all for years at or above the last break.
+AGE_LABELS = [
+    "pre-1900",
+    "1900-1929",
+    "1930-1949",
+    "1950-1966",
+    "1967-1982",
+    "1983-1999",
+    "2000-2009",
+    "2010+",
+]
+
+# Input columns needed by the hedonic scripts (price, sale date, property
+# type, floor area, postcode).
+HEDONIC_COLUMNS = [
+    "Last known price",
+    "Date of last transaction",
+    "Property type",
+    "Total floor area (sqm)",
+    "Postcode",
+]
+
+
+def age_band_expr():
+    """Polars expression mapping "Construction age" (a year) to ``age_band``.
+
+    Null years stay null; otherwise the first band whose break is greater than
+    the year is used, and years past the last break get the final label.
+    """
+    year = pl.col("Construction age")
+    banded = pl.when(year.is_null()).then(pl.lit(None))
+    # zip() stops at the shorter AGE_BREAKS, leaving the last label for otherwise().
+    for upper, label in zip(AGE_BREAKS, AGE_LABELS):
+        banded = banded.when(year < upper).then(pl.lit(label))
+    return banded.otherwise(pl.lit(AGE_LABELS[-1])).alias("age_band")
+
+
+# Type groups that get their own dummy column; "Detached" is the reference.
+NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats", "Bungalow"]
+
+
+def build_hedonic_features(df: pl.DataFrame) -> np.ndarray:
+    """Assemble the hedonic design matrix for a frame with a type_group column.
+
+    The result has five float32 columns: log(floor area), with areas below 1
+    clamped to 1, followed by one 0/1 dummy per entry of NON_REF_TYPES.
+    Per the original design note, sector fixed effects carry most of the
+    signal, which is why no further property features (EPC, rooms, age) are
+    included here.
+    """
+    floor_area = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
+    type_labels = df["type_group"].to_numpy()
+
+    columns = [np.log(np.maximum(floor_area, 1.0)).reshape(-1, 1)]
+    columns.extend(
+        (type_labels == group).astype(np.float32).reshape(-1, 1)
+        for group in NON_REF_TYPES
+    )
+    return np.hstack(columns)
+
+
+def extract_centroids(input_path) -> dict[str, tuple[float, float]]:
+    """Compute mean lat/lon per postcode sector.
+
+    Args:
+        input_path: Parquet file with "Postcode", "lat" and "lon" columns.
+
+    Returns:
+        Mapping of sector -> (mean lat, mean lon).  Rows missing the postcode
+        or either coordinate are ignored, so both means are taken over the
+        same rows and neither component of a centroid can be null.
+    """
+    print("Computing sector centroids...")
+    df = (
+        pl.scan_parquet(input_path)
+        .select("Postcode", "lat", "lon")
+        .filter(
+            pl.col("Postcode").is_not_null(),
+            pl.col("lat").is_not_null(),
+            # Without this a sector whose rows all lack lon would get a
+            # (lat, None) centroid and break downstream float coordinate maths.
+            pl.col("lon").is_not_null(),
+        )
+        .with_columns(sector_expr())
+        .group_by("sector")
+        .agg(pl.col("lat").mean(), pl.col("lon").mean())
+        .collect()
+    )
+    centroids = {
+        row["sector"]: (row["lat"], row["lon"]) for row in df.iter_rows(named=True)
+    }
+    print(f"  {len(centroids):,} sector centroids")
+    return centroids
--- a/pipeline/transform/hedonic_quality.py
+++ b/pipeline/transform/hedonic_quality.py
@ -0,0 +1,300 @@
+"""Cross-Sectional Hedonic Model (Per-Type)
+
+Trains separate OLS models per property type on recent sales (sale years
+CURRENT_YEAR - 5 through CURRENT_YEAR, inclusive) with sector fixed effects
+via Frisch-Waugh-Lovell demeaning:
+
+    log(price) = beta_type * log(floor_area) + alpha_sector_type + epsilon
+
+Each type gets its own floor area elasticity and sector intercepts, capturing
+that detached houses (beta=0.74) have higher price sensitivity to size than
+terraced houses (beta=0.60), and a sector's value differs by property type.
+
+Sector intercepts are hierarchically shrunk (sector → district → area → national)
+and spatially smoothed via KD-tree nearest neighbors.
+
+Output: hedonic_model.json with per-type betas and sector intercepts.
+"""
+
+import argparse
+import json
+from pathlib import Path
+
+import numpy as np
+import polars as pl
+from scipy.spatial import KDTree
+
+from pipeline.transform._price_utils import (
+    CURRENT_YEAR,
+    HEDONIC_COLUMNS,
+    SHRINKAGE_K,
+    TYPE_GROUPS,
+    extract_centroids,
+    hierarchy_keys,
+    sector_expr,
+    type_group_expr,
+)
+
+# Training window width in sale years: load_training_data() keeps rows with
+# sale_year >= CURRENT_YEAR - TRAINING_YEARS.
+TRAINING_YEARS = 5
+# Number of nearest neighbouring sectors blended in by spatial_smooth_intercepts().
+SPATIAL_NEIGHBORS = 5
+# Pseudo-count in the self weight n / (n + SPATIAL_BLEND_K) used when smoothing.
+SPATIAL_BLEND_K = 30
+
+
+def load_training_data(input_path: Path) -> pl.DataFrame:
+    """Load recent sales with complete hedonic features.
+
+    Args:
+        input_path: Parquet file containing the HEDONIC_COLUMNS.
+
+    Returns:
+        A collected frame with the HEDONIC_COLUMNS plus ``sale_year``,
+        ``type_group`` and ``sector``.  Only rows with a positive price, a
+        positive floor area, a postcode, a recognised type group, and a sale
+        year in [CURRENT_YEAR - TRAINING_YEARS, CURRENT_YEAR] (inclusive) are
+        kept.
+    """
+    min_year = CURRENT_YEAR - TRAINING_YEARS
+    print(f"Loading training data (sales {min_year}-{CURRENT_YEAR})...")
+    df = (
+        pl.scan_parquet(input_path)
+        .select(*HEDONIC_COLUMNS)
+        .filter(
+            pl.col("Last known price").is_not_null(),
+            # The model regresses log(price); a zero or negative price would
+            # become -inf/NaN and corrupt the least-squares fit.
+            pl.col("Last known price") > 0,
+            pl.col("Total floor area (sqm)").is_not_null(),
+            pl.col("Total floor area (sqm)") > 0,
+            pl.col("Postcode").is_not_null(),
+        )
+        .with_columns(
+            pl.col("Date of last transaction").dt.year().alias("sale_year"),
+            type_group_expr(),
+            sector_expr(),
+        )
+        .filter(
+            pl.col("type_group").is_not_null(),
+            pl.col("sale_year").is_not_null(),
+            pl.col("sale_year") >= min_year,
+            pl.col("sale_year") <= CURRENT_YEAR,
+        )
+        .collect()
+    )
+    print(f"  {len(df):,} complete cases")
+    return df
+
+
+def train_type_model(
+    df: pl.DataFrame, type_group: str
+) -> tuple[float, dict[str, float], dict[str, int], float]:
+    """Fit the sector-fixed-effects hedonic regression for one property type.
+
+    log(price) is regressed on log(floor area) after subtracting each sector's
+    mean from both sides; the per-sector intercepts are then recovered from
+    the sector means.  A one-line fit summary is printed.
+
+    Returns (beta_fa, sector_intercepts, sector_counts, national_intercept),
+    where national_intercept is the unweighted mean of the sector intercepts.
+    """
+    subset = df.filter(pl.col("type_group") == type_group)
+    log_price = np.log(subset["Last known price"].to_numpy().astype(np.float64))
+    floor_area = subset["Total floor area (sqm)"].to_numpy().astype(np.float64)
+    X = np.log(np.maximum(floor_area, 1.0)).reshape(-1, 1)
+    sectors = subset["sector"].to_list()
+
+    # Row positions belonging to each sector, in first-seen order.
+    rows_by_sector: dict[str, list[int]] = {}
+    for row, sector in enumerate(sectors):
+        rows_by_sector.setdefault(sector, []).append(row)
+
+    # Within-sector demeaning of both the regressor and the target.
+    X_within = np.empty_like(X)
+    y_within = np.empty_like(log_price)
+    mean_X: dict[str, np.ndarray] = {}
+    mean_y: dict[str, float] = {}
+    for sector, rows in rows_by_sector.items():
+        take = np.array(rows)
+        mean_X[sector] = X[take].mean(axis=0)
+        mean_y[sector] = log_price[take].mean()
+        X_within[take] = X[take] - mean_X[sector]
+        y_within[take] = log_price[take] - mean_y[sector]
+    sector_counts = {sector: len(rows) for sector, rows in rows_by_sector.items()}
+
+    # Least squares on the demeaned data gives the floor-area elasticity.
+    coef, *_ = np.linalg.lstsq(X_within, y_within, rcond=None)
+    beta_fa = float(coef[0])
+
+    # Each sector's intercept is its mean target minus the fitted slope part.
+    sector_intercepts = {
+        sector: float(mean_y[sector] - beta_fa * mean_X[sector][0])
+        for sector in rows_by_sector
+    }
+    national_intercept = float(np.mean(list(sector_intercepts.values())))
+
+    # R-squared of the full model (slope plus sector intercepts).
+    offsets = np.array([sector_intercepts[s] for s in sectors])
+    fitted = X[:, 0] * beta_fa + offsets
+    ss_res = np.sum((log_price - fitted) ** 2)
+    ss_tot = np.sum((log_price - log_price.mean()) ** 2)
+    r2 = 1 - ss_res / ss_tot
+
+    print(
+        f"  {type_group:<15s}: n={len(subset):>9,}  β_fa={beta_fa:.4f}  "
+        f"R²={r2:.4f}  sectors={len(sector_intercepts):,}"
+    )
+
+    return beta_fa, sector_intercepts, sector_counts, national_intercept
+
+
+def _pooled_means(
+    sector_intercepts: dict[str, float],
+    sector_counts: dict[str, int],
+    group_of: dict[str, str],
+) -> tuple[dict[str, float], dict[str, int]]:
+    """Count-weighted mean intercept and total count per group of sectors."""
+    members: dict[str, list[tuple[float, int]]] = {}
+    for sector, value in sector_intercepts.items():
+        members.setdefault(group_of[sector], []).append(
+            (value, sector_counts.get(sector, 0))
+        )
+
+    means: dict[str, float] = {}
+    totals: dict[str, int] = {}
+    for key, entries in members.items():
+        total_n = sum(n for _, n in entries)
+        if total_n > 0:
+            means[key] = sum(v * n for v, n in entries) / total_n
+        else:
+            # No observations recorded at all: fall back to a plain average.
+            means[key] = sum(v for v, _ in entries) / len(entries)
+        totals[key] = total_n
+    return means, totals
+
+
+def _shrink_toward(value: float, n: int, parent: float) -> float:
+    """Blend ``value`` with ``parent`` using weight n / (n + SHRINKAGE_K)."""
+    w = n / (n + SHRINKAGE_K)
+    return w * value + (1 - w) * parent
+
+
+def shrink_intercepts(
+    sector_intercepts: dict[str, float],
+    sector_counts: dict[str, int],
+) -> dict[str, float]:
+    """Hierarchical shrinkage: sector -> district -> area -> national.
+
+    Area and district intercepts are count-weighted means of their sectors.
+    Areas are pulled toward the unweighted national mean, districts toward
+    their shrunk area, and sectors toward their shrunk district; at every
+    level the weight on the level's own value is n / (n + SHRINKAGE_K).
+    """
+    national = float(np.mean(list(sector_intercepts.values())))
+
+    sector_to_dist: dict[str, str] = {}
+    dist_to_area: dict[str, str] = {}
+    for sector in sector_intercepts:
+        district, area = hierarchy_keys(sector)
+        sector_to_dist[sector] = district
+        dist_to_area[district] = area
+    sector_to_area = {s: dist_to_area[d] for s, d in sector_to_dist.items()}
+
+    area_means, area_counts = _pooled_means(
+        sector_intercepts, sector_counts, sector_to_area
+    )
+    dist_means, dist_counts = _pooled_means(
+        sector_intercepts, sector_counts, sector_to_dist
+    )
+
+    # Shrink top-down so each level's parent is already shrunk.
+    area_shrunk = {
+        area: _shrink_toward(mean, area_counts[area], national)
+        for area, mean in area_means.items()
+    }
+    dist_shrunk = {
+        district: _shrink_toward(
+            mean,
+            dist_counts[district],
+            area_shrunk.get(dist_to_area[district], national),
+        )
+        for district, mean in dist_means.items()
+    }
+    return {
+        sector: _shrink_toward(
+            value,
+            sector_counts.get(sector, 0),
+            dist_shrunk.get(sector_to_dist[sector], national),
+        )
+        for sector, value in sector_intercepts.items()
+    }
+
+
+def spatial_smooth_intercepts(
+    sector_intercepts: dict[str, float],
+    sector_counts: dict[str, int],
+    *_unused,
+) -> dict[str, float]:
+    raise NotImplementedError
+
+
+def main():
+    """CLI entry point: train one model per type group and write the JSON file.
+
+    Reads --input, fits, shrinks and spatially smooths the sector intercepts
+    for every entry of TYPE_GROUPS, then writes {"type_models": ...} to
+    --output (creating its parent directory if needed).
+    """
+    parser = argparse.ArgumentParser(description="Train cross-sectional hedonic model")
+    parser.add_argument(
+        "--input", type=Path, required=True, help="Path to wide.parquet"
+    )
+    parser.add_argument(
+        "--output", type=Path, required=True, help="Output hedonic_model.json"
+    )
+    args = parser.parse_args()
+
+    training = load_training_data(args.input)
+    centroids = extract_centroids(args.input)
+
+    print("\nTraining per-type models...")
+    type_models = {}
+    for group in TYPE_GROUPS:
+        beta_fa, raw, counts, national = train_type_model(training, group)
+        # Shrink first, then smooth the shrunk values across neighbours.
+        smoothed = spatial_smooth_intercepts(
+            shrink_intercepts(raw, counts), centroids, counts
+        )
+        type_models[group] = {
+            "beta_fa": beta_fa,
+            "sector_intercepts": smoothed,
+            "national_intercept": national,
+        }
+    total_sectors = sum(len(m["sector_intercepts"]) for m in type_models.values())
+
+    # Output
+    out_path = args.output
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open("w") as f:
+        json.dump({"type_models": type_models}, f, indent=2)
+
+    size_kb = out_path.stat().st_size / 1024
+    print(f"\nWrote {args.output} ({size_kb:.0f} KB)")
+    print(f"  {len(TYPE_GROUPS)} type models, {total_sectors:,} total sector intercepts")
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -223,7 +223,6 @@ def _build_wide(
        )
        .drop(
            "inspection_date",
-            "floor_height",
            "_bedrooms",
            "LSOA name (2021)",
            "Local Authority District code (2024)",
@ -276,6 +275,7 @@ def _build_wide(
                "shrink_swell_risk": "Shrink-swell risk",
                "soluble_rocks_risk": "Soluble rocks risk",
                "median_monthly_rent": "Estimated monthly rent",
+                "floor_height": "Interior height (m)",
            }
        )
    )
--- a/pipeline/transform/price_backtest.py
+++ b/pipeline/transform/price_backtest.py
@ -9,45 +9,60 @@ Output: backtest_results.parquet with predictions vs actuals.
 """

 import argparse
+import json
 from pathlib import Path

 import numpy as np
 import polars as pl

-CURRENT_YEAR = 2025
+from pipeline.transform._price_utils import (
+    CURRENT_YEAR,
+    HEDONIC_COLUMNS,
+    sector_expr,
+    type_group_expr,
+)
+
 TEST_YEAR_MIN = 2022
-TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]


-def type_group_expr():
-    return (
-        pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
-        .when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
-        .when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
-        .otherwise(pl.lit(None))
-        .alias("type_group")
-    )
-
-
-def extract_test_set(input_path: Path) -> pl.DataFrame:
+def extract_test_set(
+    input_path: Path, include_hedonic_cols: bool = False
+) -> pl.DataFrame:
    """Extract test pairs: second-to-last sale as input, last sale as ground truth."""
    print("Loading test set...")
+    cols = ["Postcode", "historical_prices", "Property type"]
+    if include_hedonic_cols:
+        for c in HEDONIC_COLUMNS:
+            if c not in cols:
+                cols.append(c)
    df = (
        pl.scan_parquet(input_path)
-        .select("Postcode", "historical_prices", "Property type")
+        .select(cols)
        .filter(
            pl.col("Postcode").is_not_null(),
            pl.col("historical_prices").list.len() >= 2,
        )
        .with_columns(
-            pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
+            sector_expr(),
            type_group_expr(),
            # Last sale (ground truth)
-            pl.col("historical_prices").list.last().struct.field("year").alias("actual_year"),
-            pl.col("historical_prices").list.last().struct.field("price").alias("actual_price"),
+            pl.col("historical_prices")
+            .list.last()
+            .struct.field("year")
+            .alias("actual_year"),
+            pl.col("historical_prices")
+            .list.last()
+            .struct.field("price")
+            .alias("actual_price"),
            # Second-to-last sale (input)
-            pl.col("historical_prices").list.get(-2).struct.field("year").alias("input_year"),
-            pl.col("historical_prices").list.get(-2).struct.field("price").alias("input_price"),
+            pl.col("historical_prices")
+            .list.get(-2)
+            .struct.field("year")
+            .alias("input_year"),
+            pl.col("historical_prices")
+            .list.get(-2)
+            .struct.field("price")
+            .alias("input_price"),
        )
        .filter(
            pl.col("actual_year") >= TEST_YEAR_MIN,
@ -71,7 +86,9 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:

        # Join type-specific index at input year
        test = test.join(
-            idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_in_typed")),
+            idx_typed.select(
+                "sector", "type_group", "year", pl.col("log_index").alias("li_in_typed")
+            ),
            left_on=["sector", "type_group", "input_year"],
            right_on=["sector", "type_group", "year"],
            how="left",
@ -85,7 +102,12 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
        )
        # Join type-specific index at actual year
        test = test.join(
-            idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_act_typed")),
+            idx_typed.select(
+                "sector",
+                "type_group",
+                "year",
+                pl.col("log_index").alias("li_act_typed"),
+            ),
            left_on=["sector", "type_group", "actual_year"],
            right_on=["sector", "type_group", "year"],
            how="left",
@ -99,19 +121,27 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
        )

        test = test.with_columns(
-            pl.col("li_in_typed").fill_null(pl.col("li_in_all")).alias("log_index_input"),
-            pl.col("li_act_typed").fill_null(pl.col("li_act_all")).alias("log_index_actual"),
+            pl.col("li_in_typed")
+            .fill_null(pl.col("li_in_all"))
+            .alias("log_index_input"),
+            pl.col("li_act_typed")
+            .fill_null(pl.col("li_act_all"))
+            .alias("log_index_actual"),
        )
    else:
        # Unstratified index
        test = test.join(
-            index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
+            index.select(
+                "sector", "year", pl.col("log_index").alias("log_index_input")
+            ),
            left_on=["sector", "input_year"],
            right_on=["sector", "year"],
            how="left",
        )
        test = test.join(
-            index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
+            index.select(
+                "sector", "year", pl.col("log_index").alias("log_index_actual")
+            ),
            left_on=["sector", "actual_year"],
            right_on=["sector", "year"],
            how="left",
@ -121,7 +151,9 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
        (
            pl.col("input_price").cast(pl.Float64)
            * (pl.col("log_index_actual") - pl.col("log_index_input")).exp()
-        ).fill_null(pl.col("input_price").cast(pl.Float64)).alias("predicted"),
+        )
+        .fill_null(pl.col("input_price").cast(pl.Float64))
+        .alias("predicted"),
    )
    return test

@ -150,7 +182,15 @@ def print_metrics_table(metrics_by_stage: dict):
    print("BACKTEST RESULTS")
    print("=" * 55)

-    metric_names = ["MdAPE (%)", "% within 10%", "% within 20%", "% within 30%", "MAE (£)", "Mean signed error (£)", "n"]
+    metric_names = [
+        "MdAPE (%)",
+        "% within 10%",
+        "% within 20%",
+        "% within 30%",
+        "MAE (£)",
+        "Mean signed error (£)",
+        "n",
+    ]
    stages = list(metrics_by_stage.keys())

    header = f"{'Metric':<25s}"
@ -176,20 +216,37 @@ def print_metrics_table(metrics_by_stage: dict):

 def main():
    parser = argparse.ArgumentParser(description="Backtest price estimation model")
-    parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
-    parser.add_argument("--index", type=Path, required=True, help="Path to price_index.parquet")
-    parser.add_argument("--output", type=Path, required=True, help="Output backtest_results.parquet")
+    parser.add_argument(
+        "--input", type=Path, required=True, help="Path to wide.parquet"
+    )
+    parser.add_argument(
+        "--index", type=Path, required=True, help="Path to price_index.parquet"
+    )
+    parser.add_argument(
+        "--output", type=Path, required=True, help="Output backtest_results.parquet"
+    )
+    parser.add_argument(
+        "--hedonic-model",
+        type=Path,
+        default=None,
+        help="Path to hedonic_model.json (optional)",
+    )
    args = parser.parse_args()

    index = pl.read_parquet(args.index)
    has_type_group = "type_group" in index.columns
    if has_type_group:
-        print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
-              f"{index['type_group'].n_unique()} type groups")
+        print(
+            f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
+            f"{index['type_group'].n_unique()} type groups"
+        )
    else:
-        print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
+        print(
+            f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors"
+        )

-    test = extract_test_set(args.input)
+    has_hedonic = args.hedonic_model is not None
+    test = extract_test_set(args.input, include_hedonic_cols=has_hedonic)

    print("\nPredicting with price index...")
    test = predict(test, index)
@ -197,19 +254,126 @@ def main():
    # Compute and print metrics
    actual = test["actual_price"].to_numpy().astype(np.float64)
    metrics = {
-        "Naive": compute_metrics(actual, test["input_price"].to_numpy().astype(np.float64)),
-        "Index": compute_metrics(actual, test["predicted"].to_numpy().astype(np.float64)),
+        "Naive": compute_metrics(
+            actual, test["input_price"].to_numpy().astype(np.float64)
+        ),
+        "Index": compute_metrics(
+            actual, test["predicted"].to_numpy().astype(np.float64)
+        ),
    }

+    # Hedonic blending
+    if has_hedonic:
+        print("\nApplying hedonic blending...")
+        with open(args.hedonic_model) as f:
+            model = json.load(f)
+        type_models = model["type_models"]
+
+        # Identify eligible rows for hedonic estimate
+        hedonic_mask = (
+            pl.col("Total floor area (sqm)").is_not_null()
+            & (pl.col("Total floor area (sqm)") > 0)
+            & pl.col("type_group").is_not_null()
+        )
+        eligible_mask = test.select(hedonic_mask).to_series()
+        eligible = test.filter(eligible_mask)
+
+        if len(eligible) > 0:
+            log_fa = np.log(
+                np.maximum(
+                    eligible["Total floor area (sqm)"].to_numpy().astype(np.float64),
+                    1.0,
+                )
+            )
+            sectors = eligible["sector"].to_list()
+            types = eligible["type_group"].to_list()
+
+            # Per-type hedonic prediction
+            log_hedonic = np.empty(len(eligible))
+            for i in range(len(eligible)):
+                tm = type_models.get(types[i])
+                if tm is None:
+                    log_hedonic[i] = np.nan
+                    continue
+                alpha = tm["sector_intercepts"].get(
+                    sectors[i], tm["national_intercept"]
+                )
+                log_hedonic[i] = tm["beta_fa"] * log_fa[i] + alpha
+
+            valid = np.isfinite(log_hedonic)
+
+            # Hold years: input_year to actual_year (simulating real prediction)
+            input_years = eligible["input_year"].to_numpy().astype(np.float64)
+            actual_years = eligible["actual_year"].to_numpy().astype(np.float64)
+            hold_years = np.maximum(actual_years - input_years, 0.0)
+
+            log_index_pred = np.log(
+                np.maximum(eligible["predicted"].to_numpy().astype(np.float64), 1.0)
+            )
+
+            # Sweep tau values (only on valid hedonic rows)
+            tau_values = [5.0, 10.0, 15.0, 20.0, 30.0]
+            actual_eligible = eligible["actual_price"].to_numpy().astype(np.float64)
+            best_tau = 15.0
+            best_mdape = float("inf")
+
+            print(f"\n  tau sweep ({valid.sum():,} eligible properties):")
+            for tau in tau_values:
+                blend_w = hold_years / (hold_years + tau)
+                log_blended = np.where(
+                    valid,
+                    (1 - blend_w) * log_index_pred + blend_w * log_hedonic,
+                    log_index_pred,
+                )
+                blended = np.exp(log_blended)
+                m = compute_metrics(actual_eligible, blended)
+                marker = ""
+                if m["MdAPE (%)"] < best_mdape:
+                    best_mdape = m["MdAPE (%)"]
+                    best_tau = tau
+                    marker = " <-- best"
+                print(
+                    f"    tau={tau:>4.0f}: MdAPE={m['MdAPE (%)']:>5.1f}%, "
+                    f"within 10%={m['% within 10%']:>5.1f}%{marker}"
+                )
+
+            print(f"\n  Best tau = {best_tau}")
+
+            # Compute blended predictions with best tau for full test set
+            blend_w = hold_years / (hold_years + best_tau)
+            log_blended = np.where(
+                valid,
+                (1 - blend_w) * log_index_pred + blend_w * log_hedonic,
+                log_index_pred,
+            )
+            blended_eligible = np.exp(log_blended)
+
+            # Merge back: for non-eligible rows, use index prediction
+            blended_all = test["predicted"].to_numpy().astype(np.float64).copy()
+            eligible_indices = eligible_mask.arg_true()
+            for i, idx in enumerate(eligible_indices):
+                blended_all[idx] = blended_eligible[i]
+
+            test = test.with_columns(
+                pl.Series("blended", blended_all, dtype=pl.Float64),
+            )
+            metrics["Blended"] = compute_metrics(actual, blended_all)
+
    print_metrics_table(metrics)

    # Save results
-    result = test.select(
-        "Postcode", "sector",
-        "input_year", "input_price",
-        "actual_year", "actual_price",
+    result_cols = [
+        "Postcode",
+        "sector",
+        "input_year",
+        "input_price",
+        "actual_year",
+        "actual_price",
        "predicted",
-    )
+    ]
+    if "blended" in test.columns:
+        result_cols.append("blended")
+    result = test.select(result_cols)

    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
--- a/pipeline/transform/price_estimate.py
+++ b/pipeline/transform/price_estimate.py
@ -4,32 +4,56 @@ Joins the precomputed repeat-sales price index (from price_index.py) with each
 property's last known sale to produce an inflation-adjusted current price estimate.
 Uses type-stratified index when available, falling back to "All" type.

+Optionally blends in a hedonic floor-area estimate (--hedonic-model), and applies
+renovation premiums from renovation_premium.py: for properties with post-sale
+renovation events, the estimate is scaled by per-sector premiums with time decay.
+
 Modifies wide.parquet in-place, adding the "Estimated current price" column.
 """

 import argparse
+import json
+import math
 from pathlib import Path

+import numpy as np
 import polars as pl

-CURRENT_YEAR = 2025
-TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
+from pipeline.transform._price_utils import (
+    CURRENT_YEAR,
+    sector_expr,
+    type_group_expr,
+)

-
-def type_group_expr():
-    return (
-        pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
-        .when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
-        .when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
-        .otherwise(pl.lit(None))
-        .alias("type_group")
-    )
+HALF_LIFE = 10.0
+DECAY_RATE = math.log(2) / HALF_LIFE


 def main():
-    parser = argparse.ArgumentParser(description="Augment wide.parquet with estimated current prices")
-    parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet (modified in-place)")
-    parser.add_argument("--index", type=Path, required=True, help="Path to price_index.parquet")
+    parser = argparse.ArgumentParser(
+        description="Augment wide.parquet with estimated current prices"
+    )
+    parser.add_argument(
+        "--input",
+        type=Path,
+        required=True,
+        help="Path to wide.parquet (modified in-place)",
+    )
+    parser.add_argument(
+        "--index", type=Path, required=True, help="Path to price_index.parquet"
+    )
+    parser.add_argument(
+        "--renovation-premium",
+        type=Path,
+        default=None,
+        help="Path to renovation_premium.parquet (optional)",
+    )
+    parser.add_argument(
+        "--hedonic-model",
+        type=Path,
+        default=None,
+        help="Path to hedonic_model.json (optional)",
+    )
    args = parser.parse_args()

    print("Loading wide.parquet...")
@ -49,7 +73,7 @@ def main():
    )

    df = df.with_columns(
-        pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("_sector"),
+        sector_expr().alias("_sector"),
        pl.col("Date of last transaction").dt.year().alias("_sale_year"),
        type_group_expr().alias("_type_group"),
    )
@ -57,10 +81,14 @@ def main():
    index = pl.read_parquet(args.index)
    has_type_group = "type_group" in index.columns
    if has_type_group:
-        print(f"  Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
-              f"{index['type_group'].n_unique()} type groups")
+        print(
+            f"  Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
+            f"{index['type_group'].n_unique()} type groups"
+        )
    else:
-        print(f"  Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors (unstratified)")
+        print(
+            f"  Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors (unstratified)"
+        )

    print("\nApplying repeat-sales index...")

@ -70,49 +98,63 @@ def main():

        # Join type-specific index at sale year
        df = df.join(
-            idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("log_idx_sale_typed")),
+            idx_typed.select(
+                "sector",
+                "type_group",
+                "year",
+                pl.col("log_index").alias("log_idx_sale_typed"),
+            ),
            left_on=["_sector", "_type_group", "_sale_year"],
            right_on=["sector", "type_group", "year"],
            how="left",
        )
        # Join "All" index at sale year
        df = df.join(
-            idx_all.select("sector", "year", pl.col("log_index").alias("log_idx_sale_all")),
+            idx_all.select(
+                "sector", "year", pl.col("log_index").alias("log_idx_sale_all")
+            ),
            left_on=["_sector", "_sale_year"],
            right_on=["sector", "year"],
            how="left",
        )
        # Join type-specific index at current year
        df = df.join(
-            idx_typed.filter(pl.col("year") == CURRENT_YEAR)
-            .select("sector", "type_group", pl.col("log_index").alias("log_idx_cur_typed")),
+            idx_typed.filter(pl.col("year") == CURRENT_YEAR).select(
+                "sector", "type_group", pl.col("log_index").alias("log_idx_cur_typed")
+            ),
            left_on=["_sector", "_type_group"],
            right_on=["sector", "type_group"],
            how="left",
        )
        # Join "All" index at current year
        df = df.join(
-            idx_all.filter(pl.col("year") == CURRENT_YEAR)
-            .select("sector", pl.col("log_index").alias("log_idx_cur_all")),
+            idx_all.filter(pl.col("year") == CURRENT_YEAR).select(
+                "sector", pl.col("log_index").alias("log_idx_cur_all")
+            ),
            left_on="_sector",
            right_on="sector",
            how="left",
        )

        df = df.with_columns(
-            pl.col("log_idx_sale_typed").fill_null(pl.col("log_idx_sale_all")).alias("_log_index_sale"),
-            pl.col("log_idx_cur_typed").fill_null(pl.col("log_idx_cur_all")).alias("_log_index_current"),
+            pl.col("log_idx_sale_typed")
+            .fill_null(pl.col("log_idx_sale_all"))
+            .alias("_log_index_sale"),
+            pl.col("log_idx_cur_typed")
+            .fill_null(pl.col("log_idx_cur_all"))
+            .alias("_log_index_current"),
        )
    else:
        df = df.join(
-            index.select("sector", "year", pl.col("log_index").alias("_log_index_sale")),
+            index.select(
+                "sector", "year", pl.col("log_index").alias("_log_index_sale")
+            ),
            left_on=["_sector", "_sale_year"],
            right_on=["sector", "year"],
            how="left",
        )
-        index_current = (
-            index.filter(pl.col("year") == CURRENT_YEAR)
-            .select("sector", pl.col("log_index").alias("_log_index_current"))
+        index_current = index.filter(pl.col("year") == CURRENT_YEAR).select(
+            "sector", pl.col("log_index").alias("_log_index_current")
        )
        df = df.join(index_current, left_on="_sector", right_on="sector", how="left")

@ -127,6 +169,224 @@ def main():
        .alias("Estimated current price"),
    )

+    n_adjusted = df.filter(has_price & pl.col("_log_index_sale").is_not_null()).height
+    n_with_price = df.filter(has_price).height
+    print(
+        f"  {n_adjusted:,} of {n_with_price:,} properties adjusted by index ({n_adjusted / max(n_with_price, 1) * 100:.1f}%)"
+    )
+
+    # Apply hedonic blending if model provided
+    if args.hedonic_model is not None:
+        print("\nApplying hedonic blending...")
+        with open(args.hedonic_model) as f:
+            model = json.load(f)
+        type_models = model["type_models"]
+        tau = model.get("tau", 15.0)
+        print(f"  tau = {tau}, {len(type_models)} type models")
+
+        # Add type_group for per-type lookup
+        df = df.with_columns(type_group_expr())
+        hedonic_mask = (
+            has_price
+            & pl.col("Estimated current price").is_not_null()
+            & pl.col("Total floor area (sqm)").is_not_null()
+            & (pl.col("Total floor area (sqm)") > 0)
+            & pl.col("type_group").is_not_null()
+        )
+        eligible = df.filter(hedonic_mask)
+
+        if len(eligible) > 0:
+            log_fa = np.log(
+                np.maximum(
+                    eligible["Total floor area (sqm)"].to_numpy().astype(np.float64),
+                    1.0,
+                )
+            )
+            sectors = eligible["_sector"].to_list()
+            types = eligible["type_group"].to_list()
+
+            # Per-type hedonic prediction
+            log_hedonic = np.empty(len(eligible))
+            for i in range(len(eligible)):
+                tm = type_models.get(types[i])
+                if tm is None:
+                    log_hedonic[i] = np.nan
+                    continue
+                alpha = tm["sector_intercepts"].get(
+                    sectors[i], tm["national_intercept"]
+                )
+                log_hedonic[i] = tm["beta_fa"] * log_fa[i] + alpha
+
+            valid = np.isfinite(log_hedonic)
+
+            # Hold years and blend weight
+            sale_years = eligible["_sale_year"].to_numpy().astype(np.float64)
+            hold_years = np.maximum(CURRENT_YEAR - sale_years, 0.0)
+            blend_w = hold_years / (hold_years + tau)
+
+            # Blend in log space
+            log_index_est = np.log(
+                eligible["Estimated current price"].to_numpy().astype(np.float64)
+            )
+            log_blended = np.where(
+                valid,
+                (1 - blend_w) * log_index_est + blend_w * log_hedonic,
+                log_index_est,
+            )
+            blended_prices = np.exp(log_blended)
+
+            # Write back into df
+            eligible_indices = df.select(hedonic_mask).to_series().arg_true()
+            price_arr = df["Estimated current price"].to_numpy().astype(np.float64)
+            for i, idx in enumerate(eligible_indices):
+                price_arr[idx] = blended_prices[i]
+            df = df.with_columns(
+                pl.Series("Estimated current price", price_arr, dtype=pl.Float64),
+            )
+
+            n_blended = int(valid.sum())
+            avg_w = float(np.mean(blend_w[valid]))
+            print(
+                f"  {n_blended:,} properties with hedonic blending (avg blend weight: {avg_w:.3f})"
+            )
+        else:
+            print("  No eligible properties for hedonic blending")
+
+    # Apply renovation premiums if provided
+    if args.renovation_premium is not None:
+        print("\nApplying renovation premiums...")
+        reno_prem = pl.read_parquet(args.renovation_premium)
+        print(f"  Loaded {len(reno_prem):,} premium rows")
+
+        # Rows with a non-empty renovation history and an estimate (post-sale filter below)
+        has_reno = (
+            pl.col("renovation_history").is_not_null()
+            & (pl.col("renovation_history").list.len() > 0)
+            & pl.col("Estimated current price").is_not_null()
+        )
+
+        # Explode renovation events, filter to post-sale only
+        reno_rows = (
+            df.lazy()
+            .filter(has_reno)
+            .select("_sector", "_type_group", "_sale_year", "renovation_history")
+            .with_row_index("_row_idx")
+            .explode("renovation_history")
+            .with_columns(
+                pl.col("renovation_history").struct.field("year").alias("_event_year"),
+                pl.col("renovation_history").struct.field("event").alias("_event_type"),
+            )
+            .filter(pl.col("_event_year") > pl.col("_sale_year"))
+            .collect()
+        )
+
+        if len(reno_rows) > 0:
+            # Take most recent event per (row, event_type)
+            latest = (
+                reno_rows.lazy()
+                .group_by("_row_idx", "_event_type", "_sector", "_type_group")
+                .agg(pl.col("_event_year").max().alias("_event_year"))
+                .collect()
+            )
+
+            # Compute time-decayed premium
+            latest = latest.with_columns(
+                (-DECAY_RATE * (CURRENT_YEAR - pl.col("_event_year")).cast(pl.Float64))
+                .exp()
+                .alias("_decay"),
+            )
+
+            # Join with renovation_premium.parquet — try typed first, fall back to "All", then 0.0
+            rp_typed = reno_prem.filter(pl.col("type_group") != "All")
+            rp_all = reno_prem.filter(pl.col("type_group") == "All")
+
+            latest = (
+                latest.join(
+                    rp_typed.select(
+                        "sector",
+                        "type_group",
+                        "event_type",
+                        pl.col("log_premium").alias("_lp_typed"),
+                    ),
+                    left_on=["_sector", "_type_group", "_event_type"],
+                    right_on=["sector", "type_group", "event_type"],
+                    how="left",
+                )
+                .join(
+                    rp_all.select(
+                        "sector", "event_type", pl.col("log_premium").alias("_lp_all")
+                    ),
+                    left_on=["_sector", "_event_type"],
+                    right_on=["sector", "event_type"],
+                    how="left",
+                )
+                .with_columns(
+                    pl.col("_lp_typed")
+                    .fill_null(pl.col("_lp_all"))
+                    .fill_null(0.0)
+                    .alias("_log_premium"),
+                )
+            )
+
+            # Compute total decayed log premium per property
+            per_property = (
+                latest.lazy()
+                .with_columns(
+                    (pl.col("_log_premium") * pl.col("_decay")).alias("_decayed_lp"),
+                )
+                .group_by("_row_idx")
+                .agg(pl.col("_decayed_lp").sum().alias("_reno_log_premium"))
+                .collect()
+            )
+
+            # _row_idx numbers only the rows that passed has_reno, so translate it back
+            # to positions in the full df via the true indices of that mask.
+            reno_mask = df.select(has_reno).to_series()
+            actual_indices = reno_mask.arg_true()
+
+            # Build a mapping: _row_idx -> actual df row
+            idx_map = per_property.with_columns(
+                pl.col("_row_idx")
+                .map_elements(
+                    lambda i: int(actual_indices[i]),
+                    return_dtype=pl.UInt32,
+                )
+                .alias("_df_row"),
+            )
+
+            # Create a full-length column of zeros, then fill in premium values
+            reno_log_prem = [0.0] * len(df)
+            for row in idx_map.iter_rows(named=True):
+                reno_log_prem[row["_df_row"]] = row["_reno_log_premium"]
+
+            df = df.with_columns(
+                pl.Series("_reno_log_premium", reno_log_prem, dtype=pl.Float64),
+            )
+
+            # Apply: multiply estimated price by exp(reno_log_premium) where it is non-zero
+            df = df.with_columns(
+                pl.when(pl.col("_reno_log_premium") != 0.0)
+                .then(
+                    pl.col("Estimated current price")
+                    * pl.col("_reno_log_premium").exp()
+                )
+                .otherwise(pl.col("Estimated current price"))
+                .alias("Estimated current price"),
+            )
+
+            n_with_premium = idx_map.height
+            avg_multiplier = math.exp(
+                per_property["_reno_log_premium"]
+                .filter(per_property["_reno_log_premium"] != 0.0)
+                .mean()
+            )
+            print(f"  {n_with_premium:,} properties with renovation premium applied")
+            print(
+                f"  Average premium multiplier: {avg_multiplier:.3f} ({avg_multiplier - 1:.1%} uplift)"
+            )
+        else:
+            print("  No properties with post-sale renovation events")
+
    # Derive estimated price per sqm where both estimated price and floor area exist
    df = df.with_columns(
        (pl.col("Estimated current price") / pl.col("Total floor area (sqm)"))
@ -135,20 +395,19 @@ def main():
        .alias("Est. price per sqm"),
    )

-    n_adjusted = df.filter(
-        has_price & pl.col("_log_index_sale").is_not_null()
-    ).height
-    n_with_price = df.filter(has_price).height
-    print(f"  {n_adjusted:,} of {n_with_price:,} properties adjusted by index ({n_adjusted / max(n_with_price, 1) * 100:.1f}%)")
-
    # Drop all temporary columns
    temp_cols = [c for c in df.columns if c.startswith("_") or c.startswith("log_idx_")]
+    # Also drop hedonic-derived column if it was added
+    if "type_group" in df.columns:
+        temp_cols.append("type_group")
    df = df.drop(temp_cols)

    df.write_parquet(args.input)
    size_mb = args.input.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
-    print(f"  {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')")
+    print(
+        f"  {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
+    )


 if __name__ == "__main__":
--- a/pipeline/transform/price_index.py
+++ b/pipeline/transform/price_index.py
@ -19,66 +19,38 @@ from scipy.sparse.linalg import lsqr
 from scipy.spatial import KDTree
 from tqdm import tqdm

+from pipeline.transform._price_utils import (
+    CURRENT_YEAR,
+    SHRINKAGE_K,
+    TYPE_GROUPS,
+    build_hedonic_features,
+    extract_centroids,
+    hierarchy_keys,
+    sector_expr,
+    type_group_expr,
+)
+
 # --- Constants ---
 MIN_PAIRS = 5
-SHRINKAGE_K = 50
 OUTLIER_THRESHOLD = 3.0  # hard pre-filter; Huber handles the rest
 HUBER_K = 1.345
 IRLS_ITERATIONS = 5
 SPATIAL_NEIGHBORS = 5
 SPATIAL_BLEND_K = 30
-CURRENT_YEAR = 2025
-
-TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"]
-TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
-AGE_BREAKS = [1900, 1930, 1950, 1967, 1983, 2000, 2010]
-AGE_LABELS = ["pre-1900", "1900-1929", "1930-1949", "1950-1966", "1967-1982", "1983-1999", "2000-2009", "2010+"]
-
-
-def type_group_expr():
-    """Polars expression: Property type → type_group."""
-    return (
-        pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
-        .when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
-        .when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
-        .otherwise(pl.lit(None))
-        .alias("type_group")
-    )
-
-
-def age_band_expr():
-    """Polars expression: Construction age (UInt16 year) → age band string."""
-    expr = pl.when(pl.col("Construction age").is_null()).then(pl.lit(None))
-    for i, brk in enumerate(AGE_BREAKS):
-        expr = expr.when(pl.col("Construction age") < brk).then(pl.lit(AGE_LABELS[i]))
-    return expr.otherwise(pl.lit(AGE_LABELS[-1])).alias("age_band")
-
-
-def sector_expr():
-    """Polars expression: Postcode → sector (drop last 2 chars, strip)."""
-    return pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector")
-
-
-def hierarchy_keys(sector: str) -> tuple[str, str]:
-    """Return (district, area) for a sector string."""
-    district = sector.rsplit(" ", 1)[0] if " " in sector else sector
-    area = ""
-    for ch in district:
-        if ch.isalpha():
-            area += ch
-        else:
-            break
-    return district, area


 # --- Pair extraction ---

+
 def extract_pairs(input_path: Path) -> pl.DataFrame:
    print("Extracting repeat-sale pairs...")
    df = (
        pl.scan_parquet(input_path)
        .select("Postcode", "historical_prices", "Property type")
-        .filter(pl.col("Postcode").is_not_null(), pl.col("historical_prices").list.len() >= 2)
+        .filter(
+            pl.col("Postcode").is_not_null(),
+            pl.col("historical_prices").list.len() >= 2,
+        )
        .with_columns(sector_expr(), type_group_expr())
        .collect()
    )
@ -87,7 +59,9 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
    pairs = (
        df.lazy()
        .with_columns(
-            pl.col("historical_prices").list.slice(0, pl.col("historical_prices").list.len() - 1).alias("from_txn"),
+            pl.col("historical_prices")
+            .list.slice(0, pl.col("historical_prices").list.len() - 1)
+            .alias("from_txn"),
            pl.col("historical_prices").list.slice(1).alias("to_txn"),
        )
        .explode("from_txn", "to_txn")
@ -98,10 +72,18 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
            pl.col("to_txn").struct.field("price").alias("price2"),
        )
        .select("sector", "type_group", "year1", "price1", "year2", "price2")
-        .filter(pl.col("price1") > 0, pl.col("price2") > 0, pl.col("year2") > pl.col("year1"))
+        .filter(
+            pl.col("price1") > 0,
+            pl.col("price2") > 0,
+            pl.col("year2") > pl.col("year1"),
+        )
        .with_columns(
-            (pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64)).log().alias("log_ratio"),
-            (1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias("weight"),
+            (pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64))
+            .log()
+            .alias("log_ratio"),
+            (1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias(
+                "weight"
+            ),
        )
        .filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
        .collect()
@ -118,31 +100,14 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
    return pairs


-# --- Sector centroids ---
-
-def extract_centroids(input_path: Path) -> dict[str, tuple[float, float]]:
-    print("Computing sector centroids...")
-    df = (
-        pl.scan_parquet(input_path)
-        .select("Postcode", "lat", "lon")
-        .filter(pl.col("Postcode").is_not_null(), pl.col("lat").is_not_null())
-        .with_columns(sector_expr())
-        .group_by("sector")
-        .agg(pl.col("lat").mean(), pl.col("lon").mean())
-        .collect()
-    )
-    centroids = {}
-    for row in df.iter_rows(named=True):
-        centroids[row["sector"]] = (row["lat"], row["lon"])
-    print(f"  {len(centroids):,} sector centroids")
-    return centroids
-
-
 # --- Robust IRLS solver ---

+
 def solve_robust_index(
-    years1: np.ndarray, years2: np.ndarray,
-    log_ratios: np.ndarray, base_weights: np.ndarray,
+    years1: np.ndarray,
+    years2: np.ndarray,
+    log_ratios: np.ndarray,
+    base_weights: np.ndarray,
 ) -> dict[int, float]:
    """IRLS Huber M-estimation for the Case-Shiller repeat-sales model."""
    n = len(years1)
@ -205,11 +170,16 @@ def solve_robust_index(
 def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
    """Solve robust indices for each group. Returns (indices, n_pairs) dicts."""
    groups = pairs.group_by(group_col).agg(
-        pl.col("year1"), pl.col("year2"), pl.col("log_ratio"), pl.col("weight"),
+        pl.col("year1"),
+        pl.col("year2"),
+        pl.col("log_ratio"),
+        pl.col("weight"),
    )
    indices = {}
    n_pairs = {}
-    for row in tqdm(groups.iter_rows(named=True), total=len(groups), desc=f"    {group_col}"):
+    for row in tqdm(
+        groups.iter_rows(named=True), total=len(groups), desc=f"    {group_col}"
+    ):
        key = row[group_col]
        y1 = np.array(row["year1"], dtype=np.int32)
        y2 = np.array(row["year2"], dtype=np.int32)
@ -224,28 +194,28 @@ def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):

 # --- Hedonic model ---

-def compute_hedonic_index(input_path: Path, min_year: int, max_year: int) -> dict[int, float]:
+
+def compute_hedonic_index(
+    input_path: Path, min_year: int, max_year: int
+) -> dict[int, float]:
    """Two-step hedonic index: regress log(price) on features, average residual by year."""
    print("Computing hedonic index...")
    df = (
        pl.scan_parquet(input_path)
        .select(
-            "Last known price", "Date of last transaction", "Property type",
-            "Total floor area (sqm)", "Current energy rating",
-            "Number of bedrooms & living rooms", "Construction age",
+            "Last known price",
+            "Date of last transaction",
+            "Property type",
+            "Total floor area (sqm)",
        )
        .filter(
            pl.col("Last known price").is_not_null(),
            pl.col("Total floor area (sqm)").is_not_null(),
            pl.col("Total floor area (sqm)") > 0,
-            pl.col("Current energy rating").is_in(["A", "B", "C", "D", "E", "F", "G"]),
-            pl.col("Number of bedrooms & living rooms").is_not_null(),
-            pl.col("Construction age").is_not_null(),
        )
        .with_columns(
            pl.col("Date of last transaction").dt.year().alias("sale_year"),
            type_group_expr(),
-            age_band_expr(),
        )
        .filter(
            pl.col("type_group").is_not_null(),
@ -261,29 +231,9 @@ def compute_hedonic_index(input_path: Path, min_year: int, max_year: int) -> dic
    log_price = np.log(df["Last known price"].to_numpy().astype(np.float64))
    sale_years = df["sale_year"].to_numpy()

-    # Build feature matrix
-    parts = []
-    # log(floor_area)
-    fa = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
-    parts.append(np.log(np.maximum(fa, 1.0)).reshape(-1, 1))
-    # Type dummies (ref: Detached)
-    tg = df["type_group"].to_numpy()
-    for t in ["Terraced", "Semi-Detached", "Flats"]:
-        parts.append((tg == t).astype(np.float32).reshape(-1, 1))
-    # EPC dummies (ref: D)
-    epc = df["Current energy rating"].to_numpy()
-    for r in ["A", "B", "C", "E", "F", "G"]:
-        parts.append((epc == r).astype(np.float32).reshape(-1, 1))
-    # Rooms
-    parts.append(df["Number of bedrooms & living rooms"].to_numpy().astype(np.float32).reshape(-1, 1))
-    # Age band dummies (ref: pre-1900)
-    ab = df["age_band"].to_numpy()
-    for band in AGE_LABELS[1:]:
-        parts.append((ab == band).astype(np.float32).reshape(-1, 1))
-    # Intercept
-    parts.append(np.ones((len(df), 1), dtype=np.float32))
-
-    F = np.hstack(parts)
+    # Build feature matrix (18 hedonic features + intercept)
+    X = build_hedonic_features(df)
+    F = np.hstack([X, np.ones((len(df), 1), dtype=np.float32)])
    print(f"  Feature matrix: {F.shape[0]:,} × {F.shape[1]}")

    # Step 1: regress log(price) on features → quality score
@ -303,12 +253,15 @@ def compute_hedonic_index(input_path: Path, min_year: int, max_year: int) -> dic
    for y in hedonic:
        hedonic[y] -= base

-    print(f"  Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}")
+    print(
+        f"  Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}"
+    )
    return hedonic


 # --- Shrinkage ---

+
 def shrink_index(raw: dict, parent: dict, n_pairs: int, k: int = SHRINKAGE_K) -> dict:
    w = n_pairs / (n_pairs + k)
    result = {}
@ -320,9 +273,18 @@ def shrink_index(raw: dict, parent: dict, n_pairs: int, k: int = SHRINKAGE_K) ->


 def apply_shrinkage(
-    sector_idx, sector_n, district_idx, district_n,
-    area_idx, area_n, national_idx, national_n,
-    hedonic_idx, all_sectors, sector_to_dist, dist_to_area,
+    sector_idx,
+    sector_n,
+    district_idx,
+    district_n,
+    area_idx,
+    area_n,
+    national_idx,
+    national_n,
+    hedonic_idx,
+    all_sectors,
+    sector_to_dist,
+    dist_to_area,
 ):
    """Top-down hierarchical shrinkage: national→hedonic, area→national, etc."""
    # National → hedonic
@ -361,8 +323,11 @@ def apply_shrinkage(

 # --- Spatial smoothing ---

+
 def spatial_smooth(
-    sector_indices: dict, centroids: dict, n_pairs_map: dict,
+    sector_indices: dict,
+    centroids: dict,
+    n_pairs_map: dict,
 ) -> dict:
    """Blend sparse sector indices with K nearest neighbors."""
    # Build coordinate arrays for sectors with centroids
@ -420,6 +385,7 @@ def spatial_smooth(

 # --- Forward fill ---

+
 def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
    filled = {}
    last = 0.0
@ -432,8 +398,11 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:

 # --- Main ---

+
 def main():
-    parser = argparse.ArgumentParser(description="Build improved repeat-sales price index")
+    parser = argparse.ArgumentParser(
+        description="Build improved repeat-sales price index"
+    )
    parser.add_argument("--input", type=Path, required=True)
    parser.add_argument("--output", type=Path, required=True)
    args = parser.parse_args()
@ -474,8 +443,10 @@ def main():
        # National
        np_arrs = typed.select("year1", "year2", "log_ratio", "weight")
        national_idx = solve_robust_index(
-            np_arrs["year1"].to_numpy(), np_arrs["year2"].to_numpy(),
-            np_arrs["log_ratio"].to_numpy(), np_arrs["weight"].to_numpy(),
+            np_arrs["year1"].to_numpy(),
+            np_arrs["year2"].to_numpy(),
+            np_arrs["log_ratio"].to_numpy(),
+            np_arrs["weight"].to_numpy(),
        )
        national_n = len(typed)
        print(f"  National: {len(national_idx)} years")
@ -485,14 +456,25 @@ def main():
        area_idx, area_n = compute_indices_for_level(typed, "area")
        district_idx, district_n = compute_indices_for_level(typed, "district")
        sector_idx, sector_n = compute_indices_for_level(typed, "sector")
-        print(f"  {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors")
+        print(
+            f"  {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
+        )

        # Shrinkage
        print("  Applying shrinkage...")
        sector_shrunk = apply_shrinkage(
-            sector_idx, sector_n, district_idx, district_n,
-            area_idx, area_n, national_idx, national_n,
-            hedonic_idx, all_sectors, sector_to_dist, dist_to_area,
+            sector_idx,
+            sector_n,
+            district_idx,
+            district_n,
+            area_idx,
+            area_n,
+            national_idx,
+            national_n,
+            hedonic_idx,
+            all_sectors,
+            sector_to_dist,
+            dist_to_area,
        )

        # Spatial smoothing
@ -519,15 +501,22 @@ def main():

    result = pl.DataFrame(
        rows,
-        schema={"sector": pl.String, "type_group": pl.String, "year": pl.Int32,
-                "log_index": pl.Float64, "n_pairs": pl.Int64},
+        schema={
+            "sector": pl.String,
+            "type_group": pl.String,
+            "year": pl.Int32,
+            "log_index": pl.Float64,
+            "n_pairs": pl.Int64,
+        },
        orient="row",
    ).sort("type_group", "sector", "year")

    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
-    print(f"  {result['sector'].n_unique():,} sectors × {len(all_type_groups)} types × {max_year - min_year + 1} years = {len(result):,} rows")
+    print(
+        f"  {result['sector'].n_unique():,} sectors × {len(all_type_groups)} types × {max_year - min_year + 1} years = {len(result):,} rows"
+    )


 if __name__ == "__main__":
--- a/pipeline/transform/renovation_premium.py
+++ b/pipeline/transform/renovation_premium.py
@ -0,0 +1,572 @@
+"""Estimate per-area renovation premiums from repeat-sale residuals.
+
+For each repeat-sale pair, computes the residual after removing the price-index
+predicted return. Pairs where renovation events occurred between sales should have
+systematically higher residuals. A WLS regression estimates the log-premium per
+event type, with hierarchical shrinkage and spatial smoothing.
+
+Output: renovation_premium.parquet — sector × type_group × event_type → log_premium
+"""
+
+import argparse
+import math
+from pathlib import Path
+
+import numpy as np
+import polars as pl
+from scipy.spatial import KDTree
+
+from pipeline.transform._price_utils import (
+    SHRINKAGE_K,
+    TYPE_GROUPS,
+    extract_centroids,
+    hierarchy_keys,
+    sector_expr,
+    type_group_expr,
+)
+
+# Half-life of the renovation indicator's exponential decay, measured in the
+# same unit as the transaction/event "year" fields.
+HALF_LIFE = 10.0
+# Decay constant derived from the half-life: indicator = exp(-DECAY_RATE * dt).
+DECAY_RATE = math.log(2) / HALF_LIFE
+# Pairs whose |log(price2 / price1)| exceeds this value are discarded.
+OUTLIER_THRESHOLD = 3.0
+# Minimum number of pairs a group needs before a regression is attempted.
+MIN_PAIRS = 10
+# Number of nearest neighbouring sectors blended in by spatial smoothing.
+SPATIAL_NEIGHBORS = 5
+# Smoothing constant: a sector keeps weight n / (n + SPATIAL_BLEND_K) on itself.
+SPATIAL_BLEND_K = 30
+# Event labels matched against the "event" field of renovation_history entries.
+EVENT_TYPES = ["Extension", "Renovation", "Remodeling"]
+
+
+def extract_pairs_with_events(input_path: Path, index_path: Path) -> pl.DataFrame:
+    """Extract repeat-sale pairs with renovation indicators and index residuals.
+
+    Consecutive entries of each property's ``historical_prices`` are paired, the
+    price-index movement between the two sale years is subtracted from the
+    pair's log price ratio, and one time-decayed indicator per event type
+    records renovation events that fall between the two sales.
+
+    Args:
+        input_path: Parquet file providing ``Postcode``, ``historical_prices``,
+            ``Property type`` and ``renovation_history``.
+        index_path: Parquet file with ``sector``, ``year`` and ``log_index``
+            columns and, optionally, ``type_group``.
+
+    Returns:
+        One row per retained pair with ``sector``, ``type_group``, ``year1``,
+        ``price1``, ``year2``, ``price2``, ``log_ratio``, ``residual``,
+        ``weight`` (``1 / sqrt(year2 - year1)``), the ``has_extension`` /
+        ``has_renovation`` / ``has_remodeling`` indicators, ``district`` and
+        ``area``.
+    """
+    print("Extracting repeat-sale pairs with renovation events...")
+
+    df = (
+        pl.scan_parquet(input_path)
+        .select("Postcode", "historical_prices", "Property type", "renovation_history")
+        .filter(
+            pl.col("Postcode").is_not_null(),
+            pl.col("historical_prices").list.len() >= 2,
+        )
+        .with_columns(sector_expr(), type_group_expr())
+        .collect()
+    )
+    print(f"  {len(df):,} properties with 2+ transactions")
+
+    # Build consecutive pairs: element i of from_txn lines up with element i+1
+    # of the original list, so exploding both together yields (sale, next sale).
+    pairs = (
+        df.lazy()
+        .with_columns(
+            pl.col("historical_prices")
+            .list.slice(0, pl.col("historical_prices").list.len() - 1)
+            .alias("from_txn"),
+            pl.col("historical_prices").list.slice(1).alias("to_txn"),
+        )
+        .explode("from_txn", "to_txn")
+        .with_columns(
+            pl.col("from_txn").struct.field("year").alias("year1"),
+            pl.col("from_txn").struct.field("price").alias("price1"),
+            pl.col("to_txn").struct.field("year").alias("year2"),
+            pl.col("to_txn").struct.field("price").alias("price2"),
+        )
+        .select(
+            "sector",
+            "type_group",
+            "year1",
+            "price1",
+            "year2",
+            "price2",
+            "renovation_history",
+        )
+        .filter(
+            pl.col("price1") > 0,
+            pl.col("price2") > 0,
+            pl.col("year2") > pl.col("year1"),
+        )
+        .with_columns(
+            (pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64))
+            .log()
+            .alias("log_ratio"),
+        )
+        .filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
+        .collect()
+    )
+    print(f"  {len(pairs):,} repeat-sale pairs")
+
+    # Join price index to compute residuals
+    index = pl.read_parquet(index_path)
+    has_type_group = "type_group" in index.columns
+
+    if has_type_group:
+        # Prefer the type-specific index and fall back to the "All" index for
+        # sector/year combinations the typed index does not cover.
+        idx_typed = index.filter(pl.col("type_group") != "All")
+        idx_all = index.filter(pl.col("type_group") == "All")
+
+        # Join at year1
+        pairs = pairs.join(
+            idx_typed.select(
+                "sector", "type_group", "year", pl.col("log_index").alias("li1_typed")
+            ),
+            left_on=["sector", "type_group", "year1"],
+            right_on=["sector", "type_group", "year"],
+            how="left",
+        ).join(
+            idx_all.select("sector", "year", pl.col("log_index").alias("li1_all")),
+            left_on=["sector", "year1"],
+            right_on=["sector", "year"],
+            how="left",
+        )
+        # Join at year2
+        pairs = pairs.join(
+            idx_typed.select(
+                "sector", "type_group", "year", pl.col("log_index").alias("li2_typed")
+            ),
+            left_on=["sector", "type_group", "year2"],
+            right_on=["sector", "type_group", "year"],
+            how="left",
+        ).join(
+            idx_all.select("sector", "year", pl.col("log_index").alias("li2_all")),
+            left_on=["sector", "year2"],
+            right_on=["sector", "year"],
+            how="left",
+        )
+
+        pairs = pairs.with_columns(
+            (pl.col("li1_typed").fill_null(pl.col("li1_all"))).alias("_li1"),
+            (pl.col("li2_typed").fill_null(pl.col("li2_all"))).alias("_li2"),
+        )
+    else:
+        pairs = pairs.join(
+            index.select("sector", "year", pl.col("log_index").alias("_li1")),
+            left_on=["sector", "year1"],
+            right_on=["sector", "year"],
+            how="left",
+        ).join(
+            index.select("sector", "year", pl.col("log_index").alias("_li2")),
+            left_on=["sector", "year2"],
+            right_on=["sector", "year"],
+            how="left",
+        )
+
+    # Compute residual = log_ratio - (index2 - index1).
+    # The null fill is applied to the *difference*: if only one endpoint has an
+    # index value, filling each side with 0 separately would subtract an
+    # absolute index level instead of a return. With either endpoint missing
+    # the pair simply gets no index adjustment.
+    pairs = pairs.with_columns(
+        (
+            pl.col("log_ratio") - (pl.col("_li2") - pl.col("_li1")).fill_null(0.0)
+        ).alias("residual"),
+        (1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias(
+            "weight"
+        ),
+    )
+
+    # For each pair, compute time-decayed renovation indicators
+    # Use row index for unique identification (composite keys aren't unique per pair)
+    pairs = pairs.with_row_index("_pair_idx")
+
+    # Default every indicator to 0.0 so the columns exist even when no pair
+    # has a matching event.
+    for et in EVENT_TYPES:
+        col_name = f"has_{et.lower()}"
+        pairs = pairs.with_columns(pl.lit(0.0).alias(col_name))
+
+    # Process properties that have renovation history
+    has_reno = pairs.filter(
+        pl.col("renovation_history").is_not_null()
+        & (pl.col("renovation_history").list.len() > 0)
+    )
+
+    if len(has_reno) > 0:
+        reno_exploded = (
+            has_reno.select("_pair_idx", "year1", "year2", "renovation_history")
+            .explode("renovation_history")
+            .with_columns(
+                pl.col("renovation_history").struct.field("year").alias("event_year"),
+                pl.col("renovation_history").struct.field("event").alias("event_type"),
+            )
+            # Only events between the two sales (after the first, up to and
+            # including the year of the second)
+            .filter(
+                (pl.col("event_year") > pl.col("year1"))
+                & (pl.col("event_year") <= pl.col("year2"))
+            )
+        )
+
+        if len(reno_exploded) > 0:
+            # For each pair + event type, take the most recent event
+            latest_events = reno_exploded.group_by(
+                "_pair_idx", "event_type", "year2"
+            ).agg(pl.col("event_year").max().alias("latest_event_year"))
+
+            # Compute time-decayed indicator: exp(-decay_rate * (year2 - event_year))
+            latest_events = latest_events.with_columns(
+                (
+                    -DECAY_RATE
+                    * (pl.col("year2") - pl.col("latest_event_year")).cast(pl.Float64)
+                )
+                .exp()
+                .alias("decayed_indicator"),
+            )
+
+            # Pivot to wide format using _pair_idx for unique join
+            for et in EVENT_TYPES:
+                et_data = latest_events.filter(pl.col("event_type") == et)
+                if len(et_data) > 0:
+                    col_name = f"has_{et.lower()}"
+                    pairs = (
+                        pairs.join(
+                            et_data.select(
+                                "_pair_idx",
+                                pl.col("decayed_indicator").alias(f"_{col_name}"),
+                            ),
+                            on="_pair_idx",
+                            how="left",
+                        )
+                        .with_columns(
+                            pl.col(f"_{col_name}").fill_null(0.0).alias(col_name),
+                        )
+                        .drop(f"_{col_name}")
+                    )
+
+    pairs = pairs.drop("_pair_idx")
+
+    # Add hierarchy columns: strip the trailing sector digits to get the
+    # district, then everything from the first digit onward to get the area.
+    pairs = pairs.with_columns(
+        pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
+    ).with_columns(
+        pl.col("district").str.replace(r"\d.*$", "").alias("area"),
+    )
+
+    # Count reno pairs
+    reno_mask = (
+        (pl.col("has_extension") > 0)
+        | (pl.col("has_renovation") > 0)
+        | (pl.col("has_remodeling") > 0)
+    )
+    n_reno = pairs.filter(reno_mask).height
+    # Guard the percentage so an input with no usable pairs does not crash here.
+    reno_pct = n_reno / len(pairs) * 100 if len(pairs) else 0.0
+    print(f"  {n_reno:,} pairs with renovation events ({reno_pct:.1f}%)")
+
+    # Drop temporary columns from index join + renovation_history (no longer needed)
+    temp_cols = [c for c in pairs.columns if c.startswith(("_li", "li1_", "li2_"))]
+    pairs = pairs.drop(temp_cols + ["renovation_history"])
+
+    return pairs
+
+
+def wls_regression(
+    residuals: np.ndarray,
+    weights: np.ndarray,
+    X: np.ndarray,
+) -> np.ndarray:
+    """Fit ``residuals ~ X`` by weighted least squares and return the coefficients.
+
+    ``X`` must already contain any intercept column. Every row of the system is
+    multiplied by the square root of its weight, which weights the squared
+    errors without ever materialising an N×N diagonal weight matrix. If the
+    solver raises ``LinAlgError`` an all-zero coefficient vector is returned.
+    """
+    root_w = np.sqrt(weights)
+    design_scaled = X * root_w[:, np.newaxis]
+    target_scaled = residuals * root_w
+    try:
+        solution = np.linalg.lstsq(design_scaled, target_scaled, rcond=None)
+    except np.linalg.LinAlgError:
+        return np.zeros(X.shape[1])
+    return solution[0]
+
+
+def compute_premiums_for_group(df: pl.DataFrame) -> dict[str, float]:
+    """Run the WLS regression for one group of pairs.
+
+    Regresses ``residual`` on an intercept plus one ``has_<event>`` indicator
+    column per entry of ``EVENT_TYPES``, weighting rows by ``weight``.
+
+    Args:
+        df: Pairs for a single group; must provide ``residual``, ``weight`` and
+            the ``has_<event>`` indicator columns.
+
+    Returns:
+        ``{event_type: log_premium}`` for every entry of ``EVENT_TYPES``, or an
+        empty dict when the group has fewer than ``MIN_PAIRS`` rows or no row
+        with a positive indicator.
+    """
+    n = len(df)
+    if n < MIN_PAIRS:
+        return {}
+
+    residuals = df["residual"].to_numpy().astype(np.float64)
+    weights = df["weight"].to_numpy().astype(np.float64)
+
+    # Build design matrix: intercept + one indicator column per event type,
+    # in EVENT_TYPES order so the coefficients can be zipped back below.
+    indicator_cols = [
+        df[f"has_{et.lower()}"].to_numpy().astype(np.float64) for et in EVENT_TYPES
+    ]
+    X = np.column_stack([np.ones(n), *indicator_cols])
+
+    # Check if we have any renovation pairs in this group. The indicators are
+    # time-decayed values in (0, 1], so their sum can be below 1 even when
+    # renovation pairs exist; test for any positive entry instead.
+    if not np.any(X[:, 1:] > 0):
+        return {}
+
+    betas = wls_regression(residuals, weights, X)
+    # betas[0] is the intercept; the remaining coefficients are the premiums.
+    return {et: float(beta) for et, beta in zip(EVENT_TYPES, betas[1:])}
+
+
+def compute_premiums_for_level(
+    pairs: pl.DataFrame, group_col: str
+) -> tuple[dict, dict]:
+    """Fit premiums separately for every value of ``group_col``.
+
+    Groups for which ``compute_premiums_for_group`` returns nothing are left
+    out of both results.
+
+    Returns:
+        ``(premiums, n_reno_pairs)``, both keyed by group value.
+        ``premiums[key]`` is ``{event_type: log_premium}`` and
+        ``n_reno_pairs[key]`` counts the group's pairs with at least one
+        positive renovation indicator.
+    """
+    premiums = {}
+    n_reno_pairs = {}
+    for key, group_df in pairs.group_by(group_col):
+        fitted = compute_premiums_for_group(group_df)
+        if not fitted:
+            continue
+
+        # Iterating a group_by yields the key as a tuple; unwrap the single column.
+        group_value = key[0]
+        premiums[group_value] = fitted
+
+        extended = group_df["has_extension"].to_numpy() > 0
+        renovated = group_df["has_renovation"].to_numpy() > 0
+        remodeled = group_df["has_remodeling"].to_numpy() > 0
+        any_event = extended | renovated | remodeled
+        n_reno_pairs[group_value] = int(any_event.sum())
+    return premiums, n_reno_pairs
+
+
+def shrink_premium(
+    raw: dict[str, float], parent: dict[str, float], n: int
+) -> dict[str, float]:
+    """Blend a group's raw premiums with its parent's, weighting by sample size.
+
+    The raw estimate gets weight ``n / (n + SHRINKAGE_K)`` and the parent the
+    remainder. An event type missing from one side borrows the other side's
+    value; if it is missing from both, 0.0 is used.
+    """
+    own_weight = n / (n + SHRINKAGE_K)
+    parent_weight = 1 - own_weight
+
+    def _blend(event: str) -> float:
+        own = raw[event] if event in raw else parent.get(event, 0.0)
+        inherited = parent[event] if event in parent else raw.get(event, 0.0)
+        return own_weight * own + parent_weight * inherited
+
+    return {event: _blend(event) for event in EVENT_TYPES}
+
+
+def apply_shrinkage(
+    sector_prem,
+    sector_n,
+    district_prem,
+    district_n,
+    area_prem,
+    area_n,
+    national_prem,
+    national_n,
+    all_sectors,
+    sector_to_dist,
+    dist_to_area,
+):
+    """Top-down hierarchical shrinkage for premiums.
+
+    Areas are shrunk toward the national premiums, districts toward their
+    shrunk area, and sectors toward their shrunk district. Whenever a parent
+    level has no estimate, the next level up is used (district -> area ->
+    national). Sectors in ``all_sectors`` without their own estimate inherit a
+    copy of that parent.
+
+    Args:
+        sector_prem, district_prem, area_prem: ``{key: {event_type: premium}}``
+            raw estimates per level.
+        sector_n, district_n, area_n: ``{key: count}`` used as the shrinkage
+            sample size; missing keys count as 0.
+        national_prem: ``{event_type: premium}`` at the top of the hierarchy.
+        national_n: Unused; accepted so existing callers keep working.
+        all_sectors: Every sector that must appear in the result.
+        sector_to_dist: Maps sector -> district.
+        dist_to_area: Maps district -> area.
+
+    Returns:
+        ``{sector: {event_type: premium}}`` covering every sector in
+        ``sector_prem`` and ``all_sectors``.
+    """
+    # Area -> national
+    area_shrunk = {
+        area: shrink_premium(prem, national_prem, area_n.get(area, 0))
+        for area, prem in area_prem.items()
+    }
+
+    def _district_parent(dist):
+        # Shrunk area of the district, or national when the area has no estimate.
+        return area_shrunk.get(dist_to_area.get(dist, ""), national_prem)
+
+    # District -> area
+    district_shrunk = {
+        dist: shrink_premium(prem, _district_parent(dist), district_n.get(dist, 0))
+        for dist, prem in district_prem.items()
+    }
+
+    def _sector_parent(sec):
+        # Shrunk district of the sector, falling back through area to national
+        # so estimated and filled-in sectors share one fallback chain.
+        dist = sector_to_dist.get(sec, "")
+        if dist in district_shrunk:
+            return district_shrunk[dist]
+        return _district_parent(dist)
+
+    # Sector -> district
+    sector_shrunk = {
+        sec: shrink_premium(prem, _sector_parent(sec), sector_n.get(sec, 0))
+        for sec, prem in sector_prem.items()
+    }
+
+    # Fill missing sectors with a copy of their parent so that no two sectors
+    # share one mutable dict.
+    for sec in all_sectors:
+        if sec not in sector_shrunk:
+            sector_shrunk[sec] = dict(_sector_parent(sec))
+
+    return sector_shrunk
+
+
+def spatial_smooth(
+    sector_premiums: dict[str, dict[str, float]],
+    centroids: dict[str, tuple[float, float]],
+    n_reno_map: dict[str, int],
+) -> dict[str, dict[str, float]]:
+    """Blend thinly-supported sector premiums with those of nearby sectors.
+
+    A sector keeps weight ``n / (n + SPATIAL_BLEND_K)`` on its own premiums,
+    where ``n`` comes from ``n_reno_map``; the rest is shared between up to
+    ``SPATIAL_NEIGHBORS`` nearest sectors in proportion to inverse distance.
+    Sectors whose own weight exceeds 0.95, that lack a centroid, or that have
+    no neighbour at a positive distance are returned unchanged. The input is
+    returned as-is when too few sectors have centroids.
+    """
+    located = [sec for sec in sector_premiums if sec in centroids]
+    if len(located) <= SPATIAL_NEIGHBORS:
+        return sector_premiums
+
+    points = np.array([centroids[sec] for sec in located])
+    # Column 0 is treated as latitude in degrees; column 1 is shrunk by
+    # cos(mean latitude) before distances are measured.
+    lon_scale = np.cos(np.radians(np.mean(points[:, 0])))
+    scaled = np.column_stack([points[:, 0], points[:, 1] * lon_scale])
+    tree = KDTree(scaled)
+
+    smoothed = dict(sector_premiums)
+    for pos, sec in enumerate(located):
+        n_obs = n_reno_map.get(sec, 0)
+        own_weight = n_obs / (n_obs + SPATIAL_BLEND_K)
+        if own_weight > 0.95:
+            continue
+
+        dists, idxs = tree.query(scaled[pos], k=SPATIAL_NEIGHBORS + 1)
+        # The first hit is skipped as the query point itself; zero-distance
+        # hits are dropped because they have no finite inverse distance.
+        usable = [
+            (1.0 / dist, sector_premiums[located[j]])
+            for dist, j in zip(dists[1:], idxs[1:])
+            if dist > 0 and located[j] in sector_premiums
+        ]
+        if not usable:
+            continue
+
+        inv_total = sum(inv for inv, _ in usable)
+        neighbour_share = 1.0 - own_weight
+        weighted = [
+            (inv / inv_total * neighbour_share, prem) for inv, prem in usable
+        ]
+
+        own_prem = sector_premiums[sec]
+        blended = {}
+        for event in EVENT_TYPES:
+            total = own_weight * own_prem.get(event, 0.0)
+            for weight, prem in weighted:
+                total += weight * prem.get(event, 0.0)
+            blended[event] = total
+        smoothed[sec] = blended
+
+    return smoothed
+
+
+def main():
+    """Command-line entry point: estimate premiums and write them to parquet.
+
+    Reads ``--input`` and ``--index``, fits premiums for "All" and for every
+    entry of ``TYPE_GROUPS``, applies shrinkage and spatial smoothing, and
+    writes one row per sector × type group × event type to ``--output``.
+    Progress and a short summary are printed to stdout.
+    """
+    parser = argparse.ArgumentParser(
+        description="Estimate renovation premiums from repeat-sale residuals"
+    )
+    parser.add_argument(
+        "--input", type=Path, required=True, help="Path to wide.parquet"
+    )
+    parser.add_argument(
+        "--index", type=Path, required=True, help="Path to price_index.parquet"
+    )
+    parser.add_argument(
+        "--output", type=Path, required=True, help="Output renovation_premium.parquet"
+    )
+    args = parser.parse_args()
+
+    pairs = extract_pairs_with_events(args.input, args.index)
+    # presumably maps sector -> (lat, lon); verify against extract_centroids
+    centroids = extract_centroids(args.input)
+
+    # Precompute hierarchy
+    all_sectors = pairs["sector"].unique().to_list()
+    sector_to_dist = {}
+    dist_to_area = {}
+    for s in all_sectors:
+        # hierarchy_keys is unpacked as (district, area) for the sector
+        d, a = hierarchy_keys(s)
+        sector_to_dist[s] = d
+        dist_to_area[d] = a
+
+    # "All" pools every pair; the remaining groups are fitted on their own subset.
+    all_type_groups = ["All"] + TYPE_GROUPS
+    rows = []
+
+    for tg in all_type_groups:
+        print(f"\n--- {tg} ---")
+        typed = pairs if tg == "All" else pairs.filter(pl.col("type_group") == tg)
+        if len(typed) < MIN_PAIRS:
+            print(f"  Skipping (only {len(typed)} pairs)")
+            continue
+
+        print(f"  {len(typed):,} pairs")
+
+        # National
+        national_prem = compute_premiums_for_group(typed)
+        national_reno = typed.filter(
+            (pl.col("has_extension") > 0)
+            | (pl.col("has_renovation") > 0)
+            | (pl.col("has_remodeling") > 0)
+        ).height
+        # An empty result means no national estimate exists to shrink toward,
+        # so this type group contributes no rows to the output.
+        if not national_prem:
+            print("  No renovation pairs at national level, skipping")
+            continue
+
+        print(
+            "  National premiums: "
+            + ", ".join(
+                f"{et}: {v:.4f} ({math.exp(v) - 1:.1%})"
+                for et, v in national_prem.items()
+            )
+        )
+
+        # Per-level
+        print("  Computing per-level premiums:")
+        area_prem, area_n = compute_premiums_for_level(typed, "area")
+        district_prem, district_n = compute_premiums_for_level(typed, "district")
+        sector_prem, sector_n = compute_premiums_for_level(typed, "sector")
+        print(
+            f"  {len(area_prem)} areas, {len(district_prem)} districts, {len(sector_prem)} sectors with data"
+        )
+
+        # Shrinkage
+        print("  Applying shrinkage...")
+        sector_shrunk = apply_shrinkage(
+            sector_prem,
+            sector_n,
+            district_prem,
+            district_n,
+            area_prem,
+            area_n,
+            national_prem,
+            national_reno,
+            all_sectors,
+            sector_to_dist,
+            dist_to_area,
+        )
+
+        # Spatial smoothing
+        print("  Spatial smoothing...")
+        sector_smoothed = spatial_smooth(sector_shrunk, centroids, sector_n)
+
+        # Collect rows
+        for sec in all_sectors:
+            prem = sector_smoothed.get(sec, national_prem)
+            # Sectors without their own regression report 0 renovation pairs.
+            n = sector_n.get(sec, 0)
+            for et in EVENT_TYPES:
+                rows.append((sec, tg, et, prem.get(et, 0.0), n))
+
+    result = pl.DataFrame(
+        rows,
+        schema={
+            "sector": pl.String,
+            "type_group": pl.String,
+            "event_type": pl.String,
+            "log_premium": pl.Float64,
+            "n_reno_pairs": pl.Int64,
+        },
+        orient="row",
+    ).sort("type_group", "sector", "event_type")
+
+    result.write_parquet(args.output)
+    size_mb = args.output.stat().st_size / (1024 * 1024)
+    print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
+    # NOTE: len(all_type_groups) also counts type groups skipped above, so the
+    # printed product can exceed the actual row count.
+    print(
+        f"  {result['sector'].n_unique():,} sectors x {len(all_type_groups)} types x {len(EVENT_TYPES)} events = {len(result):,} rows"
+    )
+
+    # Print summary statistics
+    # NOTE: this is the unweighted mean of the per-sector output values for the
+    # "All" group, not the national regression estimate printed earlier.
+    print("\nNational premium summary:")
+    national = (
+        result.filter(pl.col("type_group") == "All")
+        .group_by("event_type")
+        .agg(
+            pl.col("log_premium").mean().alias("mean_log_premium"),
+        )
+    )
+    for row in national.iter_rows(named=True):
+        et = row["event_type"]
+        lp = row["mean_log_premium"]
+        print(f"  {et}: log_premium={lp:.4f} ({math.exp(lp) - 1:.1%} price uplift)")
+
+
+# Run the pipeline only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()