This commit is contained in:
Andras Schmelczer 2026-02-14 12:53:29 +00:00
parent 3a3f899ea2
commit 128b3191e7
68 changed files with 28060 additions and 1152 deletions

View file

@ -1,6 +1,7 @@
"""Extract place=* nodes from OSM PBF → data/places.parquet.
"""Extract place=* nodes and railway stations from OSM PBF → data/places.parquet.
Extracts named place nodes (cities, towns, suburbs, etc.) for typeahead search.
Extracts named place nodes (cities, towns, suburbs, etc.) and railway stations
(tube, national rail, DLR, etc.) for typeahead search.
Reuses the same great-britain-latest.osm.pbf as pois.py.
"""
@ -18,13 +19,54 @@ PLACE_TYPES = {
"borough",
"town",
"suburb",
"quarter",
"neighbourhood",
"village",
"hamlet",
"locality",
"island",
"isolated_dwelling",
}
# Suffixes to strip from raw station names before appending the typed suffix.
_STATION_STRIP = (
" tube station",
" underground station",
" railway station",
" dlr station",
" overground station",
" tram stop",
" station",
)
def _station_display_name(name: str, tags: dict[str, str]) -> str:
"""Build a descriptive station name like 'Bank tube station'."""
station_tag = tags.get("station", "")
network = tags.get("network", "").lower()
if station_tag == "subway" or "underground" in network:
suffix = "tube station"
elif "docklands" in network or "dlr" in network:
suffix = "DLR station"
elif "overground" in network:
suffix = "overground station"
elif "elizabeth" in network:
suffix = "Elizabeth line station"
elif station_tag == "light_rail" or "tramlink" in network or "tram" in network:
suffix = "tram stop"
else:
suffix = "railway station"
# Strip any existing station suffix from the raw name
lower = name.lower()
for s in _STATION_STRIP:
if lower.endswith(s):
name = name[: len(name) - len(s)].rstrip()
break
return f"{name} {suffix}"
class PlaceHandler(osmium.SimpleHandler):
def __init__(self, progress: tqdm) -> None:
@ -32,6 +74,12 @@ class PlaceHandler(osmium.SimpleHandler):
self._progress = progress
self.places: list[dict] = []
def _add(self, name: str, place_type: str, lat: float, lon: float, population: int) -> None:
self.places.append(
{"name": name, "place_type": place_type, "lat": lat, "lon": lon, "population": population}
)
self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False)
def node(self, n: osmium.osm.Node) -> None:
self._progress.update(1)
if not n.location.valid:
@ -39,16 +87,28 @@ class PlaceHandler(osmium.SimpleHandler):
lat, lon = n.location.lat, n.location.lon
if not (UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH and UK_BBOX_WEST <= lon <= UK_BBOX_EAST):
return
place_type = n.tags.get("place")
if place_type not in PLACE_TYPES:
return
name = n.tags.get("name:en", n.tags.get("name", ""))
if not name:
return
self.places.append(
{"name": name, "place_type": place_type, "lat": lat, "lon": lon}
)
self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False)
pop_str = n.tags.get("population", "")
try:
population = int(pop_str)
except ValueError:
population = 0
# place=* nodes (cities, towns, suburbs, etc.)
place_type = n.tags.get("place")
if place_type in PLACE_TYPES:
self._add(name, place_type, lat, lon, population)
return
# railway=station nodes (tube, national rail, DLR, tram, etc.)
if n.tags.get("railway") == "station":
display_name = _station_display_name(name, dict(n.tags))
self._add(display_name, "station", lat, lon, population)
return
def main() -> None:
@ -73,7 +133,7 @@ def main() -> None:
else:
print(f"Using cached PBF: {pbf_file}")
print(f"Extracting place nodes: {sorted(PLACE_TYPES)}")
print(f"Extracting place nodes: {sorted(PLACE_TYPES)} + railway=station")
with tqdm(
unit=" elements",
unit_scale=True,

View file

@ -0,0 +1,121 @@
"""Shared utilities for price index, price estimate, and renovation premium scripts."""
import numpy as np
import polars as pl
CURRENT_YEAR = 2025
TERRACE_TYPES = [
"Mid-Terrace",
"End-Terrace",
"Enclosed Mid-Terrace",
"Enclosed End-Terrace",
"Terraced",
]
FLAT_TYPES = ["Flats/Maisonettes", "Flat", "Maisonette"]
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats", "Bungalow"]
SHRINKAGE_K = 50
def type_group_expr():
"""Polars expression: Property type -> type_group."""
return (
pl.when(pl.col("Property type").is_in(TERRACE_TYPES))
.then(pl.lit("Terraced"))
.when(pl.col("Property type").is_in(FLAT_TYPES))
.then(pl.lit("Flats"))
.when(pl.col("Property type") == "Bungalow")
.then(pl.lit("Bungalow"))
.when(pl.col("Property type").is_in(["Detached", "Semi-Detached"]))
.then(pl.col("Property type"))
.otherwise(pl.lit(None))
.alias("type_group")
)
def sector_expr():
"""Polars expression: Postcode -> sector (drop last 2 chars, strip)."""
return (
pl.col("Postcode")
.str.slice(0, pl.col("Postcode").str.len_chars() - 2)
.str.strip_chars()
.alias("sector")
)
def hierarchy_keys(sector: str) -> tuple[str, str]:
"""Return (district, area) for a sector string."""
district = sector.rsplit(" ", 1)[0] if " " in sector else sector
area = ""
for ch in district:
if ch.isalpha():
area += ch
else:
break
return district, area
AGE_BREAKS = [1900, 1930, 1950, 1967, 1983, 2000, 2010]
AGE_LABELS = [
"pre-1900",
"1900-1929",
"1930-1949",
"1950-1966",
"1967-1982",
"1983-1999",
"2000-2009",
"2010+",
]
HEDONIC_COLUMNS = [
"Last known price",
"Date of last transaction",
"Property type",
"Total floor area (sqm)",
"Postcode",
]
def age_band_expr():
"""Polars expression: Construction age (UInt16 year) → age band string."""
expr = pl.when(pl.col("Construction age").is_null()).then(pl.lit(None))
for i, brk in enumerate(AGE_BREAKS):
expr = expr.when(pl.col("Construction age") < brk).then(pl.lit(AGE_LABELS[i]))
return expr.otherwise(pl.lit(AGE_LABELS[-1])).alias("age_band")
NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats", "Bungalow"]
def build_hedonic_features(df: pl.DataFrame) -> np.ndarray:
"""Build hedonic feature matrix from a DataFrame with type_group column.
Columns (5 total): log(floor_area), 4 type dummies (ref: Detached).
Sector fixed effects do the heavy lifting additional property features
(EPC, rooms, age) add no predictive value after sector demeaning.
"""
fa = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
log_fa = np.log(np.maximum(fa, 1.0)).reshape(-1, 1)
tg = df["type_group"].to_numpy()
parts = [log_fa]
for t in NON_REF_TYPES:
parts.append((tg == t).astype(np.float32).reshape(-1, 1))
return np.hstack(parts)
def extract_centroids(input_path) -> dict[str, tuple[float, float]]:
"""Compute mean lat/lon per postcode sector."""
print("Computing sector centroids...")
df = (
pl.scan_parquet(input_path)
.select("Postcode", "lat", "lon")
.filter(pl.col("Postcode").is_not_null(), pl.col("lat").is_not_null())
.with_columns(sector_expr())
.group_by("sector")
.agg(pl.col("lat").mean(), pl.col("lon").mean())
.collect()
)
centroids = {}
for row in df.iter_rows(named=True):
centroids[row["sector"]] = (row["lat"], row["lon"])
print(f" {len(centroids):,} sector centroids")
return centroids

View file

@ -0,0 +1,300 @@
"""Cross-Sectional Hedonic Model (Per-Type)
Trains separate OLS models per property type on recent sales (last 5 years)
with sector fixed effects via Frisch-Waugh-Lovell demeaning:
log(price) = beta_type * log(floor_area) + alpha_sector_type + epsilon
Each type gets its own floor area elasticity and sector intercepts, capturing
that detached houses (beta=0.74) have higher price sensitivity to size than
terraced houses (beta=0.60), and a sector's value differs by property type.
Sector intercepts are hierarchically shrunk (sector district area national)
and spatially smoothed via KD-tree nearest neighbors.
Output: hedonic_model.json with per-type betas and sector intercepts.
"""
import argparse
import json
from pathlib import Path
import numpy as np
import polars as pl
from scipy.spatial import KDTree
from pipeline.transform._price_utils import (
CURRENT_YEAR,
HEDONIC_COLUMNS,
SHRINKAGE_K,
TYPE_GROUPS,
extract_centroids,
hierarchy_keys,
sector_expr,
type_group_expr,
)
TRAINING_YEARS = 5
SPATIAL_NEIGHBORS = 5
SPATIAL_BLEND_K = 30
def load_training_data(input_path: Path) -> pl.DataFrame:
"""Load recent sales with complete hedonic features."""
min_year = CURRENT_YEAR - TRAINING_YEARS
print(f"Loading training data (sales {min_year}-{CURRENT_YEAR})...")
df = (
pl.scan_parquet(input_path)
.select(*HEDONIC_COLUMNS)
.filter(
pl.col("Last known price").is_not_null(),
pl.col("Total floor area (sqm)").is_not_null(),
pl.col("Total floor area (sqm)") > 0,
pl.col("Postcode").is_not_null(),
)
.with_columns(
pl.col("Date of last transaction").dt.year().alias("sale_year"),
type_group_expr(),
sector_expr(),
)
.filter(
pl.col("type_group").is_not_null(),
pl.col("sale_year").is_not_null(),
pl.col("sale_year") >= min_year,
pl.col("sale_year") <= CURRENT_YEAR,
)
.collect()
)
print(f" {len(df):,} complete cases")
return df
def train_type_model(
df: pl.DataFrame, type_group: str
) -> tuple[float, dict[str, float], dict[str, int], float]:
"""Train hedonic model for a single property type.
Returns (beta_fa, sector_intercepts, sector_counts, national_intercept).
"""
t_df = df.filter(pl.col("type_group") == type_group)
y = np.log(t_df["Last known price"].to_numpy().astype(np.float64))
log_fa = np.log(
np.maximum(t_df["Total floor area (sqm)"].to_numpy().astype(np.float64), 1.0)
)
X = log_fa.reshape(-1, 1)
sectors = t_df["sector"].to_list()
# Group by sector for demeaning
sector_indices: dict[str, list[int]] = {}
for i, s in enumerate(sectors):
sector_indices.setdefault(s, []).append(i)
# Compute sector means and demean
X_demeaned = np.empty_like(X)
y_demeaned = np.empty_like(y)
sector_X_means: dict[str, np.ndarray] = {}
sector_y_means: dict[str, float] = {}
sector_counts: dict[str, int] = {}
for s, idxs in sector_indices.items():
idx = np.array(idxs)
X_mean = X[idx].mean(axis=0)
y_mean = y[idx].mean()
sector_X_means[s] = X_mean
sector_y_means[s] = y_mean
X_demeaned[idx] = X[idx] - X_mean
y_demeaned[idx] = y[idx] - y_mean
sector_counts[s] = len(idxs)
# OLS on demeaned data
beta = np.linalg.lstsq(X_demeaned, y_demeaned, rcond=None)[0]
beta_fa = float(beta[0])
# Recover sector intercepts
sector_intercepts = {}
for s in sector_indices:
sector_intercepts[s] = float(sector_y_means[s] - beta_fa * sector_X_means[s][0])
national_intercept = float(np.mean(list(sector_intercepts.values())))
# R-squared
y_pred = X[:, 0] * beta_fa
for i, s in enumerate(sectors):
y_pred[i] += sector_intercepts[s]
ss_res = np.sum((y - y_pred) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
r2 = 1 - ss_res / ss_tot
print(
f" {type_group:<15s}: n={len(t_df):>9,} β_fa={beta_fa:.4f} "
f"R²={r2:.4f} sectors={len(sector_intercepts):,}"
)
return beta_fa, sector_intercepts, sector_counts, national_intercept
def shrink_intercepts(
sector_intercepts: dict[str, float],
sector_counts: dict[str, int],
) -> dict[str, float]:
"""Hierarchical shrinkage: sector -> district -> area -> national."""
national = float(np.mean(list(sector_intercepts.values())))
sector_to_dist: dict[str, str] = {}
dist_to_area: dict[str, str] = {}
for s in sector_intercepts:
d, a = hierarchy_keys(s)
sector_to_dist[s] = d
dist_to_area[d] = a
# Area-level intercepts (weighted mean of sectors in area)
area_vals: dict[str, list[tuple[float, int]]] = {}
for s, val in sector_intercepts.items():
d = sector_to_dist[s]
a = dist_to_area[d]
area_vals.setdefault(a, []).append((val, sector_counts.get(s, 0)))
area_intercepts: dict[str, float] = {}
area_counts: dict[str, int] = {}
for a, entries in area_vals.items():
total_n = sum(n for _, n in entries)
if total_n > 0:
area_intercepts[a] = sum(v * n for v, n in entries) / total_n
else:
area_intercepts[a] = sum(v for v, _ in entries) / len(entries)
area_counts[a] = total_n
# District-level intercepts
dist_vals: dict[str, list[tuple[float, int]]] = {}
for s, val in sector_intercepts.items():
d = sector_to_dist[s]
dist_vals.setdefault(d, []).append((val, sector_counts.get(s, 0)))
dist_intercepts: dict[str, float] = {}
dist_counts: dict[str, int] = {}
for d, entries in dist_vals.items():
total_n = sum(n for _, n in entries)
if total_n > 0:
dist_intercepts[d] = sum(v * n for v, n in entries) / total_n
else:
dist_intercepts[d] = sum(v for v, _ in entries) / len(entries)
dist_counts[d] = total_n
# Shrink: area -> national
area_shrunk: dict[str, float] = {}
for a, val in area_intercepts.items():
n = area_counts[a]
w = n / (n + SHRINKAGE_K)
area_shrunk[a] = w * val + (1 - w) * national
# Shrink: district -> area
dist_shrunk: dict[str, float] = {}
for d, val in dist_intercepts.items():
a = dist_to_area[d]
parent = area_shrunk.get(a, national)
n = dist_counts[d]
w = n / (n + SHRINKAGE_K)
dist_shrunk[d] = w * val + (1 - w) * parent
# Shrink: sector -> district
result: dict[str, float] = {}
for s, val in sector_intercepts.items():
d = sector_to_dist[s]
parent = dist_shrunk.get(d, national)
n = sector_counts.get(s, 0)
w = n / (n + SHRINKAGE_K)
result[s] = w * val + (1 - w) * parent
return result
def spatial_smooth_intercepts(
sector_intercepts: dict[str, float],
centroids: dict[str, tuple[float, float]],
sector_counts: dict[str, int],
) -> dict[str, float]:
"""Blend sparse sector intercepts with K nearest neighbors."""
sectors_with_coords = [s for s in sector_intercepts if s in centroids]
if len(sectors_with_coords) < SPATIAL_NEIGHBORS + 1:
return sector_intercepts
coords = np.array([centroids[s] for s in sectors_with_coords])
mean_lat = np.mean(coords[:, 0])
scale = np.cos(np.radians(mean_lat))
scaled_coords = np.column_stack([coords[:, 0], coords[:, 1] * scale])
tree = KDTree(scaled_coords)
result = dict(sector_intercepts)
for i, sec in enumerate(sectors_with_coords):
n = sector_counts.get(sec, 0)
self_w = n / (n + SPATIAL_BLEND_K)
if self_w > 0.95:
continue
dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1)
neighbor_dists = dists[1:]
neighbor_idxs = idxs[1:]
inv_dists = []
neighbor_vals = []
for d, j in zip(neighbor_dists, neighbor_idxs):
ns = sectors_with_coords[j]
if d > 0 and ns in sector_intercepts:
inv_dists.append(1.0 / d)
neighbor_vals.append(sector_intercepts[ns])
if not neighbor_vals:
continue
total_inv = sum(inv_dists)
nbr_w = 1.0 - self_w
blended = self_w * sector_intercepts[sec]
for val, iw in zip(neighbor_vals, inv_dists):
blended += nbr_w * (iw / total_inv) * val
result[sec] = blended
return result
def main():
parser = argparse.ArgumentParser(description="Train cross-sectional hedonic model")
parser.add_argument(
"--input", type=Path, required=True, help="Path to wide.parquet"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output hedonic_model.json"
)
args = parser.parse_args()
df = load_training_data(args.input)
centroids = extract_centroids(args.input)
print("\nTraining per-type models...")
type_models = {}
total_sectors = 0
for tg in TYPE_GROUPS:
beta_fa, raw_intercepts, sector_counts, national = train_type_model(df, tg)
shrunk = shrink_intercepts(raw_intercepts, sector_counts)
smoothed = spatial_smooth_intercepts(shrunk, centroids, sector_counts)
total_sectors += len(smoothed)
type_models[tg] = {
"beta_fa": beta_fa,
"sector_intercepts": smoothed,
"national_intercept": national,
}
# Output
args.output.parent.mkdir(parents=True, exist_ok=True)
with open(args.output, "w") as f:
json.dump({"type_models": type_models}, f, indent=2)
size_kb = args.output.stat().st_size / 1024
print(f"\nWrote {args.output} ({size_kb:.0f} KB)")
print(f" {len(TYPE_GROUPS)} type models, {total_sectors:,} total sector intercepts")
if __name__ == "__main__":
main()

View file

@ -223,7 +223,6 @@ def _build_wide(
)
.drop(
"inspection_date",
"floor_height",
"_bedrooms",
"LSOA name (2021)",
"Local Authority District code (2024)",
@ -276,6 +275,7 @@ def _build_wide(
"shrink_swell_risk": "Shrink-swell risk",
"soluble_rocks_risk": "Soluble rocks risk",
"median_monthly_rent": "Estimated monthly rent",
"floor_height": "Interior height (m)",
}
)
)

View file

@ -9,45 +9,60 @@ Output: backtest_results.parquet with predictions vs actuals.
"""
import argparse
import json
from pathlib import Path
import numpy as np
import polars as pl
CURRENT_YEAR = 2025
from pipeline.transform._price_utils import (
CURRENT_YEAR,
HEDONIC_COLUMNS,
sector_expr,
type_group_expr,
)
TEST_YEAR_MIN = 2022
TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
def type_group_expr():
return (
pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
.when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
.when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
.otherwise(pl.lit(None))
.alias("type_group")
)
def extract_test_set(input_path: Path) -> pl.DataFrame:
def extract_test_set(
input_path: Path, include_hedonic_cols: bool = False
) -> pl.DataFrame:
"""Extract test pairs: second-to-last sale as input, last sale as ground truth."""
print("Loading test set...")
cols = ["Postcode", "historical_prices", "Property type"]
if include_hedonic_cols:
for c in HEDONIC_COLUMNS:
if c not in cols:
cols.append(c)
df = (
pl.scan_parquet(input_path)
.select("Postcode", "historical_prices", "Property type")
.select(cols)
.filter(
pl.col("Postcode").is_not_null(),
pl.col("historical_prices").list.len() >= 2,
)
.with_columns(
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
sector_expr(),
type_group_expr(),
# Last sale (ground truth)
pl.col("historical_prices").list.last().struct.field("year").alias("actual_year"),
pl.col("historical_prices").list.last().struct.field("price").alias("actual_price"),
pl.col("historical_prices")
.list.last()
.struct.field("year")
.alias("actual_year"),
pl.col("historical_prices")
.list.last()
.struct.field("price")
.alias("actual_price"),
# Second-to-last sale (input)
pl.col("historical_prices").list.get(-2).struct.field("year").alias("input_year"),
pl.col("historical_prices").list.get(-2).struct.field("price").alias("input_price"),
pl.col("historical_prices")
.list.get(-2)
.struct.field("year")
.alias("input_year"),
pl.col("historical_prices")
.list.get(-2)
.struct.field("price")
.alias("input_price"),
)
.filter(
pl.col("actual_year") >= TEST_YEAR_MIN,
@ -71,7 +86,9 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
# Join type-specific index at input year
test = test.join(
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_in_typed")),
idx_typed.select(
"sector", "type_group", "year", pl.col("log_index").alias("li_in_typed")
),
left_on=["sector", "type_group", "input_year"],
right_on=["sector", "type_group", "year"],
how="left",
@ -85,7 +102,12 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
)
# Join type-specific index at actual year
test = test.join(
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_act_typed")),
idx_typed.select(
"sector",
"type_group",
"year",
pl.col("log_index").alias("li_act_typed"),
),
left_on=["sector", "type_group", "actual_year"],
right_on=["sector", "type_group", "year"],
how="left",
@ -99,19 +121,27 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
)
test = test.with_columns(
pl.col("li_in_typed").fill_null(pl.col("li_in_all")).alias("log_index_input"),
pl.col("li_act_typed").fill_null(pl.col("li_act_all")).alias("log_index_actual"),
pl.col("li_in_typed")
.fill_null(pl.col("li_in_all"))
.alias("log_index_input"),
pl.col("li_act_typed")
.fill_null(pl.col("li_act_all"))
.alias("log_index_actual"),
)
else:
# Unstratified index
test = test.join(
index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
index.select(
"sector", "year", pl.col("log_index").alias("log_index_input")
),
left_on=["sector", "input_year"],
right_on=["sector", "year"],
how="left",
)
test = test.join(
index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
index.select(
"sector", "year", pl.col("log_index").alias("log_index_actual")
),
left_on=["sector", "actual_year"],
right_on=["sector", "year"],
how="left",
@ -121,7 +151,9 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
(
pl.col("input_price").cast(pl.Float64)
* (pl.col("log_index_actual") - pl.col("log_index_input")).exp()
).fill_null(pl.col("input_price").cast(pl.Float64)).alias("predicted"),
)
.fill_null(pl.col("input_price").cast(pl.Float64))
.alias("predicted"),
)
return test
@ -150,7 +182,15 @@ def print_metrics_table(metrics_by_stage: dict):
print("BACKTEST RESULTS")
print("=" * 55)
metric_names = ["MdAPE (%)", "% within 10%", "% within 20%", "% within 30%", "MAE (£)", "Mean signed error (£)", "n"]
metric_names = [
"MdAPE (%)",
"% within 10%",
"% within 20%",
"% within 30%",
"MAE (£)",
"Mean signed error (£)",
"n",
]
stages = list(metrics_by_stage.keys())
header = f"{'Metric':<25s}"
@ -176,20 +216,37 @@ def print_metrics_table(metrics_by_stage: dict):
def main():
parser = argparse.ArgumentParser(description="Backtest price estimation model")
parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
parser.add_argument("--index", type=Path, required=True, help="Path to price_index.parquet")
parser.add_argument("--output", type=Path, required=True, help="Output backtest_results.parquet")
parser.add_argument(
"--input", type=Path, required=True, help="Path to wide.parquet"
)
parser.add_argument(
"--index", type=Path, required=True, help="Path to price_index.parquet"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output backtest_results.parquet"
)
parser.add_argument(
"--hedonic-model",
type=Path,
default=None,
help="Path to hedonic_model.json (optional)",
)
args = parser.parse_args()
index = pl.read_parquet(args.index)
has_type_group = "type_group" in index.columns
if has_type_group:
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
f"{index['type_group'].n_unique()} type groups")
print(
f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
f"{index['type_group'].n_unique()} type groups"
)
else:
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
print(
f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors"
)
test = extract_test_set(args.input)
has_hedonic = args.hedonic_model is not None
test = extract_test_set(args.input, include_hedonic_cols=has_hedonic)
print("\nPredicting with price index...")
test = predict(test, index)
@ -197,19 +254,126 @@ def main():
# Compute and print metrics
actual = test["actual_price"].to_numpy().astype(np.float64)
metrics = {
"Naive": compute_metrics(actual, test["input_price"].to_numpy().astype(np.float64)),
"Index": compute_metrics(actual, test["predicted"].to_numpy().astype(np.float64)),
"Naive": compute_metrics(
actual, test["input_price"].to_numpy().astype(np.float64)
),
"Index": compute_metrics(
actual, test["predicted"].to_numpy().astype(np.float64)
),
}
# Hedonic blending
if has_hedonic:
print("\nApplying hedonic blending...")
with open(args.hedonic_model) as f:
model = json.load(f)
type_models = model["type_models"]
# Identify eligible rows for hedonic estimate
hedonic_mask = (
pl.col("Total floor area (sqm)").is_not_null()
& (pl.col("Total floor area (sqm)") > 0)
& pl.col("type_group").is_not_null()
)
eligible_mask = test.select(hedonic_mask).to_series()
eligible = test.filter(eligible_mask)
if len(eligible) > 0:
log_fa = np.log(
np.maximum(
eligible["Total floor area (sqm)"].to_numpy().astype(np.float64),
1.0,
)
)
sectors = eligible["sector"].to_list()
types = eligible["type_group"].to_list()
# Per-type hedonic prediction
log_hedonic = np.empty(len(eligible))
for i in range(len(eligible)):
tm = type_models.get(types[i])
if tm is None:
log_hedonic[i] = np.nan
continue
alpha = tm["sector_intercepts"].get(
sectors[i], tm["national_intercept"]
)
log_hedonic[i] = tm["beta_fa"] * log_fa[i] + alpha
valid = np.isfinite(log_hedonic)
# Hold years: input_year to actual_year (simulating real prediction)
input_years = eligible["input_year"].to_numpy().astype(np.float64)
actual_years = eligible["actual_year"].to_numpy().astype(np.float64)
hold_years = np.maximum(actual_years - input_years, 0.0)
log_index_pred = np.log(
np.maximum(eligible["predicted"].to_numpy().astype(np.float64), 1.0)
)
# Sweep tau values (only on valid hedonic rows)
tau_values = [5.0, 10.0, 15.0, 20.0, 30.0]
actual_eligible = eligible["actual_price"].to_numpy().astype(np.float64)
best_tau = 15.0
best_mdape = float("inf")
print(f"\n tau sweep ({valid.sum():,} eligible properties):")
for tau in tau_values:
blend_w = hold_years / (hold_years + tau)
log_blended = np.where(
valid,
(1 - blend_w) * log_index_pred + blend_w * log_hedonic,
log_index_pred,
)
blended = np.exp(log_blended)
m = compute_metrics(actual_eligible, blended)
marker = ""
if m["MdAPE (%)"] < best_mdape:
best_mdape = m["MdAPE (%)"]
best_tau = tau
marker = " <-- best"
print(
f" tau={tau:>4.0f}: MdAPE={m['MdAPE (%)']:>5.1f}%, "
f"within 10%={m['% within 10%']:>5.1f}%{marker}"
)
print(f"\n Best tau = {best_tau}")
# Compute blended predictions with best tau for full test set
blend_w = hold_years / (hold_years + best_tau)
log_blended = np.where(
valid,
(1 - blend_w) * log_index_pred + blend_w * log_hedonic,
log_index_pred,
)
blended_eligible = np.exp(log_blended)
# Merge back: for non-eligible rows, use index prediction
blended_all = test["predicted"].to_numpy().astype(np.float64).copy()
eligible_indices = eligible_mask.arg_true()
for i, idx in enumerate(eligible_indices):
blended_all[idx] = blended_eligible[i]
test = test.with_columns(
pl.Series("blended", blended_all, dtype=pl.Float64),
)
metrics["Blended"] = compute_metrics(actual, blended_all)
print_metrics_table(metrics)
# Save results
result = test.select(
"Postcode", "sector",
"input_year", "input_price",
"actual_year", "actual_price",
result_cols = [
"Postcode",
"sector",
"input_year",
"input_price",
"actual_year",
"actual_price",
"predicted",
)
]
if "blended" in test.columns:
result_cols.append("blended")
result = test.select(result_cols)
result.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024)

View file

@ -4,32 +4,56 @@ Joins the precomputed repeat-sales price index (from price_index.py) with each
property's last known sale to produce an inflation-adjusted current price estimate.
Uses type-stratified index when available, falling back to "All" type.
Optionally applies renovation premiums from renovation_premium.py: for properties
with post-sale renovation events, the estimated price is adjusted upward based on
data-driven per-area premiums with time decay.
Modifies wide.parquet in-place, adding the "Estimated current price" column.
"""
import argparse
import json
import math
from pathlib import Path
import numpy as np
import polars as pl
CURRENT_YEAR = 2025
TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
from pipeline.transform._price_utils import (
CURRENT_YEAR,
sector_expr,
type_group_expr,
)
def type_group_expr():
return (
pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
.when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
.when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
.otherwise(pl.lit(None))
.alias("type_group")
)
HALF_LIFE = 10.0
DECAY_RATE = math.log(2) / HALF_LIFE
def main():
parser = argparse.ArgumentParser(description="Augment wide.parquet with estimated current prices")
parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet (modified in-place)")
parser.add_argument("--index", type=Path, required=True, help="Path to price_index.parquet")
parser = argparse.ArgumentParser(
description="Augment wide.parquet with estimated current prices"
)
parser.add_argument(
"--input",
type=Path,
required=True,
help="Path to wide.parquet (modified in-place)",
)
parser.add_argument(
"--index", type=Path, required=True, help="Path to price_index.parquet"
)
parser.add_argument(
"--renovation-premium",
type=Path,
default=None,
help="Path to renovation_premium.parquet (optional)",
)
parser.add_argument(
"--hedonic-model",
type=Path,
default=None,
help="Path to hedonic_model.json (optional)",
)
args = parser.parse_args()
print("Loading wide.parquet...")
@ -49,7 +73,7 @@ def main():
)
df = df.with_columns(
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("_sector"),
sector_expr().alias("_sector"),
pl.col("Date of last transaction").dt.year().alias("_sale_year"),
type_group_expr().alias("_type_group"),
)
@ -57,10 +81,14 @@ def main():
index = pl.read_parquet(args.index)
has_type_group = "type_group" in index.columns
if has_type_group:
print(f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
f"{index['type_group'].n_unique()} type groups")
print(
f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
f"{index['type_group'].n_unique()} type groups"
)
else:
print(f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors (unstratified)")
print(
f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors (unstratified)"
)
print("\nApplying repeat-sales index...")
@ -70,49 +98,63 @@ def main():
# Join type-specific index at sale year
df = df.join(
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("log_idx_sale_typed")),
idx_typed.select(
"sector",
"type_group",
"year",
pl.col("log_index").alias("log_idx_sale_typed"),
),
left_on=["_sector", "_type_group", "_sale_year"],
right_on=["sector", "type_group", "year"],
how="left",
)
# Join "All" index at sale year
df = df.join(
idx_all.select("sector", "year", pl.col("log_index").alias("log_idx_sale_all")),
idx_all.select(
"sector", "year", pl.col("log_index").alias("log_idx_sale_all")
),
left_on=["_sector", "_sale_year"],
right_on=["sector", "year"],
how="left",
)
# Join type-specific index at current year
df = df.join(
idx_typed.filter(pl.col("year") == CURRENT_YEAR)
.select("sector", "type_group", pl.col("log_index").alias("log_idx_cur_typed")),
idx_typed.filter(pl.col("year") == CURRENT_YEAR).select(
"sector", "type_group", pl.col("log_index").alias("log_idx_cur_typed")
),
left_on=["_sector", "_type_group"],
right_on=["sector", "type_group"],
how="left",
)
# Join "All" index at current year
df = df.join(
idx_all.filter(pl.col("year") == CURRENT_YEAR)
.select("sector", pl.col("log_index").alias("log_idx_cur_all")),
idx_all.filter(pl.col("year") == CURRENT_YEAR).select(
"sector", pl.col("log_index").alias("log_idx_cur_all")
),
left_on="_sector",
right_on="sector",
how="left",
)
df = df.with_columns(
pl.col("log_idx_sale_typed").fill_null(pl.col("log_idx_sale_all")).alias("_log_index_sale"),
pl.col("log_idx_cur_typed").fill_null(pl.col("log_idx_cur_all")).alias("_log_index_current"),
pl.col("log_idx_sale_typed")
.fill_null(pl.col("log_idx_sale_all"))
.alias("_log_index_sale"),
pl.col("log_idx_cur_typed")
.fill_null(pl.col("log_idx_cur_all"))
.alias("_log_index_current"),
)
else:
df = df.join(
index.select("sector", "year", pl.col("log_index").alias("_log_index_sale")),
index.select(
"sector", "year", pl.col("log_index").alias("_log_index_sale")
),
left_on=["_sector", "_sale_year"],
right_on=["sector", "year"],
how="left",
)
index_current = (
index.filter(pl.col("year") == CURRENT_YEAR)
.select("sector", pl.col("log_index").alias("_log_index_current"))
index_current = index.filter(pl.col("year") == CURRENT_YEAR).select(
"sector", pl.col("log_index").alias("_log_index_current")
)
df = df.join(index_current, left_on="_sector", right_on="sector", how="left")
@ -127,6 +169,224 @@ def main():
.alias("Estimated current price"),
)
n_adjusted = df.filter(has_price & pl.col("_log_index_sale").is_not_null()).height
n_with_price = df.filter(has_price).height
print(
f" {n_adjusted:,} of {n_with_price:,} properties adjusted by index ({n_adjusted / max(n_with_price, 1) * 100:.1f}%)"
)
# Apply hedonic blending if model provided
if args.hedonic_model is not None:
print("\nApplying hedonic blending...")
with open(args.hedonic_model) as f:
model = json.load(f)
type_models = model["type_models"]
tau = model.get("tau", 15.0)
print(f" tau = {tau}, {len(type_models)} type models")
# Add type_group for per-type lookup
df = df.with_columns(type_group_expr())
hedonic_mask = (
has_price
& pl.col("Estimated current price").is_not_null()
& pl.col("Total floor area (sqm)").is_not_null()
& (pl.col("Total floor area (sqm)") > 0)
& pl.col("type_group").is_not_null()
)
eligible = df.filter(hedonic_mask)
if len(eligible) > 0:
log_fa = np.log(
np.maximum(
eligible["Total floor area (sqm)"].to_numpy().astype(np.float64),
1.0,
)
)
sectors = eligible["_sector"].to_list()
types = eligible["type_group"].to_list()
# Per-type hedonic prediction
log_hedonic = np.empty(len(eligible))
for i in range(len(eligible)):
tm = type_models.get(types[i])
if tm is None:
log_hedonic[i] = np.nan
continue
alpha = tm["sector_intercepts"].get(
sectors[i], tm["national_intercept"]
)
log_hedonic[i] = tm["beta_fa"] * log_fa[i] + alpha
valid = np.isfinite(log_hedonic)
# Hold years and blend weight
sale_years = eligible["_sale_year"].to_numpy().astype(np.float64)
hold_years = np.maximum(CURRENT_YEAR - sale_years, 0.0)
blend_w = hold_years / (hold_years + tau)
# Blend in log space
log_index_est = np.log(
eligible["Estimated current price"].to_numpy().astype(np.float64)
)
log_blended = np.where(
valid,
(1 - blend_w) * log_index_est + blend_w * log_hedonic,
log_index_est,
)
blended_prices = np.exp(log_blended)
# Write back into df
eligible_indices = df.select(hedonic_mask).to_series().arg_true()
price_arr = df["Estimated current price"].to_numpy().astype(np.float64)
for i, idx in enumerate(eligible_indices):
price_arr[idx] = blended_prices[i]
df = df.with_columns(
pl.Series("Estimated current price", price_arr, dtype=pl.Float64),
)
n_blended = int(valid.sum())
avg_w = float(np.mean(blend_w[valid]))
print(
f" {n_blended:,} properties with hedonic blending (avg blend weight: {avg_w:.3f})"
)
else:
print(" No eligible properties for hedonic blending")
# Apply renovation premiums if provided
if args.renovation_premium is not None:
print("\nApplying renovation premiums...")
reno_prem = pl.read_parquet(args.renovation_premium)
print(f" Loaded {len(reno_prem):,} premium rows")
# Find properties with post-sale renovation events
has_reno = (
pl.col("renovation_history").is_not_null()
& (pl.col("renovation_history").list.len() > 0)
& pl.col("Estimated current price").is_not_null()
)
# Explode renovation events, filter to post-sale only
reno_rows = (
df.lazy()
.filter(has_reno)
.select("_sector", "_type_group", "_sale_year", "renovation_history")
.with_row_index("_row_idx")
.explode("renovation_history")
.with_columns(
pl.col("renovation_history").struct.field("year").alias("_event_year"),
pl.col("renovation_history").struct.field("event").alias("_event_type"),
)
.filter(pl.col("_event_year") > pl.col("_sale_year"))
.collect()
)
if len(reno_rows) > 0:
# Take most recent event per (row, event_type)
latest = (
reno_rows.lazy()
.group_by("_row_idx", "_event_type", "_sector", "_type_group")
.agg(pl.col("_event_year").max().alias("_event_year"))
.collect()
)
# Compute time-decayed premium
latest = latest.with_columns(
(-DECAY_RATE * (CURRENT_YEAR - pl.col("_event_year")).cast(pl.Float64))
.exp()
.alias("_decay"),
)
# Join with renovation_premium.parquet — try typed first, fall back to "All"
rp_typed = reno_prem.filter(pl.col("type_group") != "All")
rp_all = reno_prem.filter(pl.col("type_group") == "All")
latest = (
latest.join(
rp_typed.select(
"sector",
"type_group",
"event_type",
pl.col("log_premium").alias("_lp_typed"),
),
left_on=["_sector", "_type_group", "_event_type"],
right_on=["sector", "type_group", "event_type"],
how="left",
)
.join(
rp_all.select(
"sector", "event_type", pl.col("log_premium").alias("_lp_all")
),
left_on=["_sector", "_event_type"],
right_on=["sector", "event_type"],
how="left",
)
.with_columns(
pl.col("_lp_typed")
.fill_null(pl.col("_lp_all"))
.fill_null(0.0)
.alias("_log_premium"),
)
)
# Compute total decayed log premium per property
per_property = (
latest.lazy()
.with_columns(
(pl.col("_log_premium") * pl.col("_decay")).alias("_decayed_lp"),
)
.group_by("_row_idx")
.agg(pl.col("_decayed_lp").sum().alias("_reno_log_premium"))
.collect()
)
# We need to map _row_idx back to the main df. Re-derive the row indices.
# _row_idx was generated from filtered rows — we need the actual df row indices.
reno_mask = df.select(has_reno).to_series()
actual_indices = reno_mask.arg_true()
# Build a mapping: _row_idx -> actual df row
idx_map = per_property.with_columns(
pl.col("_row_idx")
.map_elements(
lambda i: int(actual_indices[i]),
return_dtype=pl.UInt32,
)
.alias("_df_row"),
)
# Create a full-length column of zeros, then fill in premium values
reno_log_prem = [0.0] * len(df)
for row in idx_map.iter_rows(named=True):
reno_log_prem[row["_df_row"]] = row["_reno_log_premium"]
df = df.with_columns(
pl.Series("_reno_log_premium", reno_log_prem, dtype=pl.Float64),
)
# Apply: multiply estimated price by exp(reno_log_premium) where premium > 0
df = df.with_columns(
pl.when(pl.col("_reno_log_premium") != 0.0)
.then(
pl.col("Estimated current price")
* pl.col("_reno_log_premium").exp()
)
.otherwise(pl.col("Estimated current price"))
.alias("Estimated current price"),
)
n_with_premium = idx_map.height
avg_multiplier = math.exp(
per_property["_reno_log_premium"]
.filter(per_property["_reno_log_premium"] != 0.0)
.mean()
)
print(f" {n_with_premium:,} properties with renovation premium applied")
print(
f" Average premium multiplier: {avg_multiplier:.3f} ({avg_multiplier - 1:.1%} uplift)"
)
else:
print(" No properties with post-sale renovation events")
# Derive estimated price per sqm where both estimated price and floor area exist
df = df.with_columns(
(pl.col("Estimated current price") / pl.col("Total floor area (sqm)"))
@ -135,20 +395,19 @@ def main():
.alias("Est. price per sqm"),
)
n_adjusted = df.filter(
has_price & pl.col("_log_index_sale").is_not_null()
).height
n_with_price = df.filter(has_price).height
print(f" {n_adjusted:,} of {n_with_price:,} properties adjusted by index ({n_adjusted / max(n_with_price, 1) * 100:.1f}%)")
# Drop all temporary columns
temp_cols = [c for c in df.columns if c.startswith("_") or c.startswith("log_idx_")]
# Also drop hedonic-derived column if it was added
if "type_group" in df.columns:
temp_cols.append("type_group")
df = df.drop(temp_cols)
df.write_parquet(args.input)
size_mb = args.input.stat().st_size / (1024 * 1024)
print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
print(f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')")
print(
f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
)
if __name__ == "__main__":

View file

@ -19,66 +19,38 @@ from scipy.sparse.linalg import lsqr
from scipy.spatial import KDTree
from tqdm import tqdm
from pipeline.transform._price_utils import (
CURRENT_YEAR,
SHRINKAGE_K,
TYPE_GROUPS,
build_hedonic_features,
extract_centroids,
hierarchy_keys,
sector_expr,
type_group_expr,
)
# --- Constants ---
MIN_PAIRS = 5
SHRINKAGE_K = 50
OUTLIER_THRESHOLD = 3.0 # hard pre-filter; Huber handles the rest
HUBER_K = 1.345
IRLS_ITERATIONS = 5
SPATIAL_NEIGHBORS = 5
SPATIAL_BLEND_K = 30
CURRENT_YEAR = 2025
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"]
TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
AGE_BREAKS = [1900, 1930, 1950, 1967, 1983, 2000, 2010]
AGE_LABELS = ["pre-1900", "1900-1929", "1930-1949", "1950-1966", "1967-1982", "1983-1999", "2000-2009", "2010+"]
def type_group_expr():
"""Polars expression: Property type → type_group."""
return (
pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
.when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
.when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
.otherwise(pl.lit(None))
.alias("type_group")
)
def age_band_expr():
"""Polars expression: Construction age (UInt16 year) → age band string."""
expr = pl.when(pl.col("Construction age").is_null()).then(pl.lit(None))
for i, brk in enumerate(AGE_BREAKS):
expr = expr.when(pl.col("Construction age") < brk).then(pl.lit(AGE_LABELS[i]))
return expr.otherwise(pl.lit(AGE_LABELS[-1])).alias("age_band")
def sector_expr():
"""Polars expression: Postcode → sector (drop last 2 chars, strip)."""
return pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector")
def hierarchy_keys(sector: str) -> tuple[str, str]:
"""Return (district, area) for a sector string."""
district = sector.rsplit(" ", 1)[0] if " " in sector else sector
area = ""
for ch in district:
if ch.isalpha():
area += ch
else:
break
return district, area
# --- Pair extraction ---
def extract_pairs(input_path: Path) -> pl.DataFrame:
print("Extracting repeat-sale pairs...")
df = (
pl.scan_parquet(input_path)
.select("Postcode", "historical_prices", "Property type")
.filter(pl.col("Postcode").is_not_null(), pl.col("historical_prices").list.len() >= 2)
.filter(
pl.col("Postcode").is_not_null(),
pl.col("historical_prices").list.len() >= 2,
)
.with_columns(sector_expr(), type_group_expr())
.collect()
)
@ -87,7 +59,9 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
pairs = (
df.lazy()
.with_columns(
pl.col("historical_prices").list.slice(0, pl.col("historical_prices").list.len() - 1).alias("from_txn"),
pl.col("historical_prices")
.list.slice(0, pl.col("historical_prices").list.len() - 1)
.alias("from_txn"),
pl.col("historical_prices").list.slice(1).alias("to_txn"),
)
.explode("from_txn", "to_txn")
@ -98,10 +72,18 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
pl.col("to_txn").struct.field("price").alias("price2"),
)
.select("sector", "type_group", "year1", "price1", "year2", "price2")
.filter(pl.col("price1") > 0, pl.col("price2") > 0, pl.col("year2") > pl.col("year1"))
.filter(
pl.col("price1") > 0,
pl.col("price2") > 0,
pl.col("year2") > pl.col("year1"),
)
.with_columns(
(pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64)).log().alias("log_ratio"),
(1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias("weight"),
(pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64))
.log()
.alias("log_ratio"),
(1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias(
"weight"
),
)
.filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
.collect()
@ -118,31 +100,14 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
return pairs
# --- Sector centroids ---
def extract_centroids(input_path: Path) -> dict[str, tuple[float, float]]:
print("Computing sector centroids...")
df = (
pl.scan_parquet(input_path)
.select("Postcode", "lat", "lon")
.filter(pl.col("Postcode").is_not_null(), pl.col("lat").is_not_null())
.with_columns(sector_expr())
.group_by("sector")
.agg(pl.col("lat").mean(), pl.col("lon").mean())
.collect()
)
centroids = {}
for row in df.iter_rows(named=True):
centroids[row["sector"]] = (row["lat"], row["lon"])
print(f" {len(centroids):,} sector centroids")
return centroids
# --- Robust IRLS solver ---
def solve_robust_index(
years1: np.ndarray, years2: np.ndarray,
log_ratios: np.ndarray, base_weights: np.ndarray,
years1: np.ndarray,
years2: np.ndarray,
log_ratios: np.ndarray,
base_weights: np.ndarray,
) -> dict[int, float]:
"""IRLS Huber M-estimation for the Case-Shiller repeat-sales model."""
n = len(years1)
@ -205,11 +170,16 @@ def solve_robust_index(
def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
"""Solve robust indices for each group. Returns (indices, n_pairs) dicts."""
groups = pairs.group_by(group_col).agg(
pl.col("year1"), pl.col("year2"), pl.col("log_ratio"), pl.col("weight"),
pl.col("year1"),
pl.col("year2"),
pl.col("log_ratio"),
pl.col("weight"),
)
indices = {}
n_pairs = {}
for row in tqdm(groups.iter_rows(named=True), total=len(groups), desc=f" {group_col}"):
for row in tqdm(
groups.iter_rows(named=True), total=len(groups), desc=f" {group_col}"
):
key = row[group_col]
y1 = np.array(row["year1"], dtype=np.int32)
y2 = np.array(row["year2"], dtype=np.int32)
@ -224,28 +194,28 @@ def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
# --- Hedonic model ---
def compute_hedonic_index(input_path: Path, min_year: int, max_year: int) -> dict[int, float]:
def compute_hedonic_index(
input_path: Path, min_year: int, max_year: int
) -> dict[int, float]:
"""Two-step hedonic index: regress log(price) on features, average residual by year."""
print("Computing hedonic index...")
df = (
pl.scan_parquet(input_path)
.select(
"Last known price", "Date of last transaction", "Property type",
"Total floor area (sqm)", "Current energy rating",
"Number of bedrooms & living rooms", "Construction age",
"Last known price",
"Date of last transaction",
"Property type",
"Total floor area (sqm)",
)
.filter(
pl.col("Last known price").is_not_null(),
pl.col("Total floor area (sqm)").is_not_null(),
pl.col("Total floor area (sqm)") > 0,
pl.col("Current energy rating").is_in(["A", "B", "C", "D", "E", "F", "G"]),
pl.col("Number of bedrooms & living rooms").is_not_null(),
pl.col("Construction age").is_not_null(),
)
.with_columns(
pl.col("Date of last transaction").dt.year().alias("sale_year"),
type_group_expr(),
age_band_expr(),
)
.filter(
pl.col("type_group").is_not_null(),
@ -261,29 +231,9 @@ def compute_hedonic_index(input_path: Path, min_year: int, max_year: int) -> dic
log_price = np.log(df["Last known price"].to_numpy().astype(np.float64))
sale_years = df["sale_year"].to_numpy()
# Build feature matrix
parts = []
# log(floor_area)
fa = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
parts.append(np.log(np.maximum(fa, 1.0)).reshape(-1, 1))
# Type dummies (ref: Detached)
tg = df["type_group"].to_numpy()
for t in ["Terraced", "Semi-Detached", "Flats"]:
parts.append((tg == t).astype(np.float32).reshape(-1, 1))
# EPC dummies (ref: D)
epc = df["Current energy rating"].to_numpy()
for r in ["A", "B", "C", "E", "F", "G"]:
parts.append((epc == r).astype(np.float32).reshape(-1, 1))
# Rooms
parts.append(df["Number of bedrooms & living rooms"].to_numpy().astype(np.float32).reshape(-1, 1))
# Age band dummies (ref: pre-1900)
ab = df["age_band"].to_numpy()
for band in AGE_LABELS[1:]:
parts.append((ab == band).astype(np.float32).reshape(-1, 1))
# Intercept
parts.append(np.ones((len(df), 1), dtype=np.float32))
F = np.hstack(parts)
# Build feature matrix (18 hedonic features + intercept)
X = build_hedonic_features(df)
F = np.hstack([X, np.ones((len(df), 1), dtype=np.float32)])
print(f" Feature matrix: {F.shape[0]:,} × {F.shape[1]}")
# Step 1: regress log(price) on features → quality score
@ -303,12 +253,15 @@ def compute_hedonic_index(input_path: Path, min_year: int, max_year: int) -> dic
for y in hedonic:
hedonic[y] -= base
print(f" Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}")
print(
f" Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}"
)
return hedonic
# --- Shrinkage ---
def shrink_index(raw: dict, parent: dict, n_pairs: int, k: int = SHRINKAGE_K) -> dict:
w = n_pairs / (n_pairs + k)
result = {}
@ -320,9 +273,18 @@ def shrink_index(raw: dict, parent: dict, n_pairs: int, k: int = SHRINKAGE_K) ->
def apply_shrinkage(
sector_idx, sector_n, district_idx, district_n,
area_idx, area_n, national_idx, national_n,
hedonic_idx, all_sectors, sector_to_dist, dist_to_area,
sector_idx,
sector_n,
district_idx,
district_n,
area_idx,
area_n,
national_idx,
national_n,
hedonic_idx,
all_sectors,
sector_to_dist,
dist_to_area,
):
"""Top-down hierarchical shrinkage: national→hedonic, area→national, etc."""
# National → hedonic
@ -361,8 +323,11 @@ def apply_shrinkage(
# --- Spatial smoothing ---
def spatial_smooth(
sector_indices: dict, centroids: dict, n_pairs_map: dict,
sector_indices: dict,
centroids: dict,
n_pairs_map: dict,
) -> dict:
"""Blend sparse sector indices with K nearest neighbors."""
# Build coordinate arrays for sectors with centroids
@ -420,6 +385,7 @@ def spatial_smooth(
# --- Forward fill ---
def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
filled = {}
last = 0.0
@ -432,8 +398,11 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
# --- Main ---
def main():
parser = argparse.ArgumentParser(description="Build improved repeat-sales price index")
parser = argparse.ArgumentParser(
description="Build improved repeat-sales price index"
)
parser.add_argument("--input", type=Path, required=True)
parser.add_argument("--output", type=Path, required=True)
args = parser.parse_args()
@ -474,8 +443,10 @@ def main():
# National
np_arrs = typed.select("year1", "year2", "log_ratio", "weight")
national_idx = solve_robust_index(
np_arrs["year1"].to_numpy(), np_arrs["year2"].to_numpy(),
np_arrs["log_ratio"].to_numpy(), np_arrs["weight"].to_numpy(),
np_arrs["year1"].to_numpy(),
np_arrs["year2"].to_numpy(),
np_arrs["log_ratio"].to_numpy(),
np_arrs["weight"].to_numpy(),
)
national_n = len(typed)
print(f" National: {len(national_idx)} years")
@ -485,14 +456,25 @@ def main():
area_idx, area_n = compute_indices_for_level(typed, "area")
district_idx, district_n = compute_indices_for_level(typed, "district")
sector_idx, sector_n = compute_indices_for_level(typed, "sector")
print(f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors")
print(
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
)
# Shrinkage
print(" Applying shrinkage...")
sector_shrunk = apply_shrinkage(
sector_idx, sector_n, district_idx, district_n,
area_idx, area_n, national_idx, national_n,
hedonic_idx, all_sectors, sector_to_dist, dist_to_area,
sector_idx,
sector_n,
district_idx,
district_n,
area_idx,
area_n,
national_idx,
national_n,
hedonic_idx,
all_sectors,
sector_to_dist,
dist_to_area,
)
# Spatial smoothing
@ -519,15 +501,22 @@ def main():
result = pl.DataFrame(
rows,
schema={"sector": pl.String, "type_group": pl.String, "year": pl.Int32,
"log_index": pl.Float64, "n_pairs": pl.Int64},
schema={
"sector": pl.String,
"type_group": pl.String,
"year": pl.Int32,
"log_index": pl.Float64,
"n_pairs": pl.Int64,
},
orient="row",
).sort("type_group", "sector", "year")
result.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024)
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
print(f" {result['sector'].n_unique():,} sectors × {len(all_type_groups)} types × {max_year - min_year + 1} years = {len(result):,} rows")
print(
f" {result['sector'].n_unique():,} sectors × {len(all_type_groups)} types × {max_year - min_year + 1} years = {len(result):,} rows"
)
if __name__ == "__main__":

View file

@ -0,0 +1,572 @@
"""Estimate per-area renovation premiums from repeat-sale residuals.
For each repeat-sale pair, computes the residual after removing the price-index
predicted return. Pairs where renovation events occurred between sales should have
systematically higher residuals. A WLS regression estimates the log-premium per
event type, with hierarchical shrinkage and spatial smoothing.
Output: renovation_premium.parquet sector × type_group × event_type log_premium
"""
import argparse
import math
from pathlib import Path
import numpy as np
import polars as pl
from scipy.spatial import KDTree
from pipeline.transform._price_utils import (
SHRINKAGE_K,
TYPE_GROUPS,
extract_centroids,
hierarchy_keys,
sector_expr,
type_group_expr,
)
HALF_LIFE = 10.0
DECAY_RATE = math.log(2) / HALF_LIFE
OUTLIER_THRESHOLD = 3.0
MIN_PAIRS = 10
SPATIAL_NEIGHBORS = 5
SPATIAL_BLEND_K = 30
EVENT_TYPES = ["Extension", "Renovation", "Remodeling"]
def extract_pairs_with_events(input_path: Path, index_path: Path) -> pl.DataFrame:
"""Extract repeat-sale pairs with renovation events and index residuals."""
print("Extracting repeat-sale pairs with renovation events...")
df = (
pl.scan_parquet(input_path)
.select("Postcode", "historical_prices", "Property type", "renovation_history")
.filter(
pl.col("Postcode").is_not_null(),
pl.col("historical_prices").list.len() >= 2,
)
.with_columns(sector_expr(), type_group_expr())
.collect()
)
print(f" {len(df):,} properties with 2+ transactions")
# Build consecutive pairs
pairs = (
df.lazy()
.with_columns(
pl.col("historical_prices")
.list.slice(0, pl.col("historical_prices").list.len() - 1)
.alias("from_txn"),
pl.col("historical_prices").list.slice(1).alias("to_txn"),
)
.explode("from_txn", "to_txn")
.with_columns(
pl.col("from_txn").struct.field("year").alias("year1"),
pl.col("from_txn").struct.field("price").alias("price1"),
pl.col("to_txn").struct.field("year").alias("year2"),
pl.col("to_txn").struct.field("price").alias("price2"),
)
.select(
"sector",
"type_group",
"year1",
"price1",
"year2",
"price2",
"renovation_history",
)
.filter(
pl.col("price1") > 0,
pl.col("price2") > 0,
pl.col("year2") > pl.col("year1"),
)
.with_columns(
(pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64))
.log()
.alias("log_ratio"),
)
.filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
.collect()
)
print(f" {len(pairs):,} repeat-sale pairs")
# Join price index to compute residuals
index = pl.read_parquet(index_path)
has_type_group = "type_group" in index.columns
if has_type_group:
idx_typed = index.filter(pl.col("type_group") != "All")
idx_all = index.filter(pl.col("type_group") == "All")
# Join at year1
pairs = pairs.join(
idx_typed.select(
"sector", "type_group", "year", pl.col("log_index").alias("li1_typed")
),
left_on=["sector", "type_group", "year1"],
right_on=["sector", "type_group", "year"],
how="left",
).join(
idx_all.select("sector", "year", pl.col("log_index").alias("li1_all")),
left_on=["sector", "year1"],
right_on=["sector", "year"],
how="left",
)
# Join at year2
pairs = pairs.join(
idx_typed.select(
"sector", "type_group", "year", pl.col("log_index").alias("li2_typed")
),
left_on=["sector", "type_group", "year2"],
right_on=["sector", "type_group", "year"],
how="left",
).join(
idx_all.select("sector", "year", pl.col("log_index").alias("li2_all")),
left_on=["sector", "year2"],
right_on=["sector", "year"],
how="left",
)
pairs = pairs.with_columns(
(pl.col("li1_typed").fill_null(pl.col("li1_all"))).alias("_li1"),
(pl.col("li2_typed").fill_null(pl.col("li2_all"))).alias("_li2"),
)
else:
pairs = pairs.join(
index.select("sector", "year", pl.col("log_index").alias("_li1")),
left_on=["sector", "year1"],
right_on=["sector", "year"],
how="left",
).join(
index.select("sector", "year", pl.col("log_index").alias("_li2")),
left_on=["sector", "year2"],
right_on=["sector", "year"],
how="left",
)
# Compute residual = log_ratio - (index2 - index1)
pairs = pairs.with_columns(
(
pl.col("log_ratio")
- (pl.col("_li2").fill_null(0.0) - pl.col("_li1").fill_null(0.0))
).alias("residual"),
(1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias(
"weight"
),
)
# For each pair, compute time-decayed renovation indicators
# Use row index for unique identification (composite keys aren't unique per pair)
pairs = pairs.with_row_index("_pair_idx")
for et in EVENT_TYPES:
col_name = f"has_{et.lower()}"
pairs = pairs.with_columns(pl.lit(0.0).alias(col_name))
# Process properties that have renovation history
has_reno = pairs.filter(
pl.col("renovation_history").is_not_null()
& (pl.col("renovation_history").list.len() > 0)
)
if len(has_reno) > 0:
reno_exploded = (
has_reno.select("_pair_idx", "year1", "year2", "renovation_history")
.explode("renovation_history")
.with_columns(
pl.col("renovation_history").struct.field("year").alias("event_year"),
pl.col("renovation_history").struct.field("event").alias("event_type"),
)
# Only events between the two sales
.filter(
(pl.col("event_year") > pl.col("year1"))
& (pl.col("event_year") <= pl.col("year2"))
)
)
if len(reno_exploded) > 0:
# For each pair + event type, take the most recent event
latest_events = reno_exploded.group_by(
"_pair_idx", "event_type", "year2"
).agg(pl.col("event_year").max().alias("latest_event_year"))
# Compute time-decayed indicator: exp(-decay_rate * (year2 - event_year))
latest_events = latest_events.with_columns(
(
-DECAY_RATE
* (pl.col("year2") - pl.col("latest_event_year")).cast(pl.Float64)
)
.exp()
.alias("decayed_indicator"),
)
# Pivot to wide format using _pair_idx for unique join
for et in EVENT_TYPES:
et_data = latest_events.filter(pl.col("event_type") == et)
if len(et_data) > 0:
col_name = f"has_{et.lower()}"
pairs = (
pairs.join(
et_data.select(
"_pair_idx",
pl.col("decayed_indicator").alias(f"_{col_name}"),
),
on="_pair_idx",
how="left",
)
.with_columns(
pl.col(f"_{col_name}").fill_null(0.0).alias(col_name),
)
.drop(f"_{col_name}")
)
pairs = pairs.drop("_pair_idx")
# Add hierarchy columns
pairs = pairs.with_columns(
pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
).with_columns(
pl.col("district").str.replace(r"\d.*$", "").alias("area"),
)
# Count reno pairs
reno_mask = (
(pl.col("has_extension") > 0)
| (pl.col("has_renovation") > 0)
| (pl.col("has_remodeling") > 0)
)
n_reno = pairs.filter(reno_mask).height
print(
f" {n_reno:,} pairs with renovation events ({n_reno / len(pairs) * 100:.1f}%)"
)
# Drop temporary columns from index join + renovation_history (no longer needed)
temp_cols = [
c
for c in pairs.columns
if c.startswith("_li") or c.startswith("li1_") or c.startswith("li2_")
]
pairs = pairs.drop(temp_cols + ["renovation_history"])
return pairs
def wls_regression(
residuals: np.ndarray,
weights: np.ndarray,
X: np.ndarray,
) -> np.ndarray:
"""Weighted least squares: residual ~ X (with intercept column in X).
Uses sqrt(weights) scaling to avoid building a full N×N diagonal matrix.
"""
sqrt_w = np.sqrt(weights)[:, np.newaxis]
Xw = X * sqrt_w
yw = residuals * sqrt_w.ravel()
try:
betas = np.linalg.lstsq(Xw, yw, rcond=None)[0]
except np.linalg.LinAlgError:
betas = np.zeros(X.shape[1])
return betas
def compute_premiums_for_group(df: pl.DataFrame) -> dict[str, float]:
"""Run WLS regression for a group, return {event_type: log_premium}."""
n = len(df)
if n < MIN_PAIRS:
return {}
residuals = df["residual"].to_numpy().astype(np.float64)
weights = df["weight"].to_numpy().astype(np.float64)
# Build design matrix: intercept + 3 event indicators
X = np.column_stack(
[
np.ones(n),
df["has_extension"].to_numpy().astype(np.float64),
df["has_renovation"].to_numpy().astype(np.float64),
df["has_remodeling"].to_numpy().astype(np.float64),
]
)
# Check if we have any renovation pairs in this group
reno_sum = X[:, 1:].sum()
if reno_sum < 1.0:
return {}
betas = wls_regression(residuals, weights, X)
# betas[0] is intercept, betas[1:4] are the premiums
return {
"Extension": float(betas[1]),
"Renovation": float(betas[2]),
"Remodeling": float(betas[3]),
}
def compute_premiums_for_level(
pairs: pl.DataFrame, group_col: str
) -> tuple[dict, dict]:
"""Compute premiums per group at a given hierarchy level.
Returns (premiums, n_reno_pairs) dicts keyed by group value.
premiums[key] = {event_type: log_premium}
"""
groups = pairs.group_by(group_col)
premiums = {}
n_reno_pairs = {}
for key, group_df in groups:
key_val = key[0]
result = compute_premiums_for_group(group_df)
if result:
premiums[key_val] = result
# Count pairs with any reno indicator
reno_mask = (
(group_df["has_extension"].to_numpy() > 0)
| (group_df["has_renovation"].to_numpy() > 0)
| (group_df["has_remodeling"].to_numpy() > 0)
)
n_reno_pairs[key_val] = int(reno_mask.sum())
return premiums, n_reno_pairs
def shrink_premium(
raw: dict[str, float], parent: dict[str, float], n: int
) -> dict[str, float]:
"""Shrink raw premiums toward parent level."""
w = n / (n + SHRINKAGE_K)
result = {}
for et in EVENT_TYPES:
r = raw.get(et, parent.get(et, 0.0))
p = parent.get(et, raw.get(et, 0.0))
result[et] = w * r + (1 - w) * p
return result
def apply_shrinkage(
sector_prem,
sector_n,
district_prem,
district_n,
area_prem,
area_n,
national_prem,
national_n,
all_sectors,
sector_to_dist,
dist_to_area,
):
"""Top-down hierarchical shrinkage for premiums."""
# Area -> national
area_shrunk = {}
for area, prem in area_prem.items():
area_shrunk[area] = shrink_premium(prem, national_prem, area_n.get(area, 0))
# District -> area
district_shrunk = {}
for dist, prem in district_prem.items():
a = dist_to_area.get(dist, "")
parent = area_shrunk.get(a, national_prem)
district_shrunk[dist] = shrink_premium(prem, parent, district_n.get(dist, 0))
# Sector -> district
sector_shrunk = {}
for sec, prem in sector_prem.items():
d = sector_to_dist.get(sec, "")
parent = district_shrunk.get(d, national_prem)
sector_shrunk[sec] = shrink_premium(prem, parent, sector_n.get(sec, 0))
# Fill missing sectors
for sec in all_sectors:
if sec not in sector_shrunk:
d = sector_to_dist.get(sec, "")
a = dist_to_area.get(d, "")
sector_shrunk[sec] = district_shrunk.get(
d, area_shrunk.get(a, national_prem)
)
return sector_shrunk
def spatial_smooth(
sector_premiums: dict[str, dict[str, float]],
centroids: dict[str, tuple[float, float]],
n_reno_map: dict[str, int],
) -> dict[str, dict[str, float]]:
"""Blend sparse sector premiums with K nearest neighbors."""
sectors_with_coords = [s for s in sector_premiums if s in centroids]
if len(sectors_with_coords) < SPATIAL_NEIGHBORS + 1:
return sector_premiums
coords = np.array([centroids[s] for s in sectors_with_coords])
mean_lat = np.mean(coords[:, 0])
scale = np.cos(np.radians(mean_lat))
scaled_coords = np.column_stack([coords[:, 0], coords[:, 1] * scale])
tree = KDTree(scaled_coords)
result = dict(sector_premiums)
for i, sec in enumerate(sectors_with_coords):
n = n_reno_map.get(sec, 0)
self_w = n / (n + SPATIAL_BLEND_K)
if self_w > 0.95:
continue
dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1)
neighbor_dists = dists[1:]
neighbor_idxs = idxs[1:]
inv_dists = []
neighbor_prems = []
for d, j in zip(neighbor_dists, neighbor_idxs):
ns = sectors_with_coords[j]
if d > 0 and ns in sector_premiums:
inv_dists.append(1.0 / d)
neighbor_prems.append(sector_premiums[ns])
if not neighbor_prems:
continue
total_inv = sum(inv_dists)
nbr_w = 1.0 - self_w
ws = [iw / total_inv * nbr_w for iw in inv_dists]
blended = {}
for et in EVENT_TYPES:
val = self_w * sector_premiums[sec].get(et, 0.0)
for np_dict, w in zip(neighbor_prems, ws):
val += w * np_dict.get(et, 0.0)
blended[et] = val
result[sec] = blended
return result
def main():
parser = argparse.ArgumentParser(
description="Estimate renovation premiums from repeat-sale residuals"
)
parser.add_argument(
"--input", type=Path, required=True, help="Path to wide.parquet"
)
parser.add_argument(
"--index", type=Path, required=True, help="Path to price_index.parquet"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output renovation_premium.parquet"
)
args = parser.parse_args()
pairs = extract_pairs_with_events(args.input, args.index)
centroids = extract_centroids(args.input)
# Precompute hierarchy
all_sectors = pairs["sector"].unique().to_list()
sector_to_dist = {}
dist_to_area = {}
for s in all_sectors:
d, a = hierarchy_keys(s)
sector_to_dist[s] = d
dist_to_area[d] = a
all_type_groups = ["All"] + TYPE_GROUPS
rows = []
for tg in all_type_groups:
print(f"\n--- {tg} ---")
typed = pairs if tg == "All" else pairs.filter(pl.col("type_group") == tg)
if len(typed) < MIN_PAIRS:
print(f" Skipping (only {len(typed)} pairs)")
continue
print(f" {len(typed):,} pairs")
# National
national_prem = compute_premiums_for_group(typed)
national_reno = typed.filter(
(pl.col("has_extension") > 0)
| (pl.col("has_renovation") > 0)
| (pl.col("has_remodeling") > 0)
).height
if not national_prem:
print(" No renovation pairs at national level, skipping")
continue
print(
" National premiums: "
+ ", ".join(
f"{et}: {v:.4f} ({math.exp(v) - 1:.1%})"
for et, v in national_prem.items()
)
)
# Per-level
print(" Computing per-level premiums:")
area_prem, area_n = compute_premiums_for_level(typed, "area")
district_prem, district_n = compute_premiums_for_level(typed, "district")
sector_prem, sector_n = compute_premiums_for_level(typed, "sector")
print(
f" {len(area_prem)} areas, {len(district_prem)} districts, {len(sector_prem)} sectors with data"
)
# Shrinkage
print(" Applying shrinkage...")
sector_shrunk = apply_shrinkage(
sector_prem,
sector_n,
district_prem,
district_n,
area_prem,
area_n,
national_prem,
national_reno,
all_sectors,
sector_to_dist,
dist_to_area,
)
# Spatial smoothing
print(" Spatial smoothing...")
sector_smoothed = spatial_smooth(sector_shrunk, centroids, sector_n)
# Collect rows
for sec in all_sectors:
prem = sector_smoothed.get(sec, national_prem)
n = sector_n.get(sec, 0)
for et in EVENT_TYPES:
rows.append((sec, tg, et, prem.get(et, 0.0), n))
result = pl.DataFrame(
rows,
schema={
"sector": pl.String,
"type_group": pl.String,
"event_type": pl.String,
"log_premium": pl.Float64,
"n_reno_pairs": pl.Int64,
},
orient="row",
).sort("type_group", "sector", "event_type")
result.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024)
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
print(
f" {result['sector'].n_unique():,} sectors x {len(all_type_groups)} types x {len(EVENT_TYPES)} events = {len(result):,} rows"
)
# Print summary statistics
print("\nNational premium summary:")
national = (
result.filter(pl.col("type_group") == "All")
.group_by("event_type")
.agg(
pl.col("log_premium").mean().alias("mean_log_premium"),
)
)
for row in national.iter_rows(named=True):
et = row["event_type"]
lp = row["mean_log_premium"]
print(f" {et}: log_premium={lp:.4f} ({math.exp(lp) - 1:.1%} price uplift)")
if __name__ == "__main__":
main()