This commit is contained in:
Andras Schmelczer 2026-02-15 22:39:49 +00:00
parent 03445188ea
commit 524580eb25
102 changed files with 36625 additions and 1295 deletions

View file

@ -0,0 +1,417 @@
"""Add online buy/rent listings to wide.parquet as new rows.
Matches online listings to existing historical rows by postcode + fuzzy address,
carrying over historical prices and area-level data for matched properties.
Unmatched listings get area data from any same-postcode row in wide.
Modifies wide.parquet in-place, adding:
- A `Listing status` column to all rows ("Historical sale" / "For sale" / "For rent")
- New columns: Asking price, Asking rent (monthly), Bedrooms, Bathrooms,
Listing date, Property sub-type, Listing URL, Price qualifier
"""
import argparse
import re
from concurrent.futures import ProcessPoolExecutor
from os import cpu_count
from pathlib import Path
import polars as pl
from thefuzz import fuzz
from tqdm import tqdm
from pipeline.utils.fuzzy_join import _numbers_compatible
# Punctuation that _normalize replaces with a space: comma, full stop, hyphen.
_NORMALIZE_RE = re.compile(r"[,.\-]")
# Any run of whitespace; _normalize collapses each run to a single space.
_WHITESPACE_RE = re.compile(r"\s+")
# Property-specific columns. _build_online_rows copies these onto a listing row
# only from the historical row the listing was fuzzy-matched to; the
# same-postcode fallback row is never used for them. Names missing from
# wide.parquet's schema are skipped at copy time.
_PROPERTY_COLUMNS = [
"Last known price",
"Date of last transaction",
"historical_prices",
"renovation_history",
"Construction age",
"Is construction date approximate",
"Current energy rating",
"Potential energy rating",
"Address per EPC",
"Interior height (m)",
"Number of bedrooms & living rooms",
"Price per sqm",
"Estimated current price",
"Est. price per sqm",
]
# Area-level columns. _build_online_rows copies these from the matched
# historical row when there is one, otherwise from the first wide row that
# shares the listing's postcode. Names missing from wide.parquet's schema are
# skipped at copy time.
_AREA_COLUMNS = [
"Public transport to Bank (mins)",
"Cycling to Bank (mins)",
"Public transport to Fitzrovia (mins)",
"Cycling to Fitzrovia (mins)",
"Income Score (rate)",
"Employment Score (rate)",
"Education, Skills and Training Score",
"Health Deprivation and Disability Score",
"Living Environment Score",
"Indoors Sub-domain Score",
"Outdoors Sub-domain Score",
"% Asian",
"% Black",
"% Mixed",
"% White",
"% Other",
"Estimated monthly rent",
"Criminal damage and arson (avg/yr)",
"Violence and sexual offences (avg/yr)",
"Drugs (avg/yr)",
"Anti-social behaviour (avg/yr)",
"Public order (avg/yr)",
"Other crime (avg/yr)",
"Burglary (avg/yr)",
"Vehicle crime (avg/yr)",
"Theft from the person (avg/yr)",
"Possession of weapons (avg/yr)",
"Other theft (avg/yr)",
"Shoplifting (avg/yr)",
"Bicycle theft (avg/yr)",
"Robbery (avg/yr)",
"Serious crime (avg/yr)",
"Minor crime (avg/yr)",
"Number of restaurants within 2km",
"Number of grocery shops and supermarkets within 2km",
"Number of parks within 2km",
"Number of public transport stations within 2km",
"Noise (dB)",
"Good+ primary schools within 5km",
"Good+ secondary schools within 5km",
"Max available download speed (Mbps)",
"Collapsible deposits risk",
"Compressible ground risk",
"Landslide risk",
"Running sand risk",
"Shrink-swell risk",
"Soluble rocks risk",
"Environmental risk",
]
def _normalize(s: str) -> str:
    """Return *s* upper-cased, with ``,``/``.``/``-`` as spaces and whitespace collapsed."""
    without_punctuation = _NORMALIZE_RE.sub(" ", s.upper())
    single_spaced = _WHITESPACE_RE.sub(" ", without_punctuation)
    return single_spaced.strip()
def _score_bucket(
    args: tuple[list[tuple[int, str]], list[tuple[int, str]]],
) -> list[tuple[int, int, int]]:
    """Score every wide/online address pair that shares one postcode.

    *args* is a single ``(wide_entries, online_entries)`` tuple so the function
    can be handed to ``executor.map``; each entry is ``(row_index, address)``.
    Pairs rejected by ``_numbers_compatible`` are left out. Returns
    ``(score, online_idx, wide_idx)`` tuples in wide-major order.
    """
    historical, listings = args
    return [
        (fuzz.token_sort_ratio(hist_address, listing_address), listing_idx, hist_idx)
        for hist_idx, hist_address in historical
        for listing_idx, listing_address in listings
        if _numbers_compatible(hist_address, listing_address)
    ]
def _load_online(buy_path: Path, rent_path: Path) -> pl.DataFrame:
    """Read the buy and rent parquets into one frame of online listings.

    Every row is tagged in ``_channel`` with "For sale" or "For rent". Two
    derived columns are added: ``Asking price`` (the raw ``price`` on sale rows,
    null otherwise) and ``Asking rent (monthly)`` (``price`` converted to a
    whole monthly amount according to ``price_frequency`` on rent rows, null
    otherwise).
    """

    def tagged(path: Path, channel: str) -> pl.LazyFrame:
        return pl.scan_parquet(path).with_columns(pl.lit(channel).alias("_channel"))

    listings = pl.concat(
        [tagged(buy_path, "For sale"), tagged(rent_path, "For rent")]
    ).collect()

    frequency = pl.col("price_frequency")
    amount = pl.col("price").cast(pl.Float64)
    # Any frequency not listed here (monthly, unspecified) is taken as monthly.
    rent_per_month = (
        pl.when(frequency == "weekly")
        .then(amount * 52.0 / 12.0)
        .when(frequency == "yearly")
        .then(amount / 12.0)
        .when(frequency == "daily")
        .then(amount * 365.25 / 12.0)
        .when(frequency == "quarterly")
        .then(amount / 3.0)
        .otherwise(amount)
        .round(0)
        .cast(pl.Int64)
    )

    channel = pl.col("_channel")
    asking_price = (
        pl.when(channel == "For sale").then(pl.col("price")).otherwise(None)
    )
    asking_rent = pl.when(channel == "For rent").then(rent_per_month).otherwise(None)
    return listings.with_columns(
        asking_price.alias("Asking price"),
        asking_rent.alias("Asking rent (monthly)"),
    )
def _bucket_by_postcode(
    postcodes: pl.Series,
    addresses: pl.Series,
    keep=None,
) -> dict[str, list[tuple[int, str]]]:
    """Group rows as postcode -> [(row_idx, normalized_address)].

    Rows with a null postcode or address are skipped. Postcodes are stripped and
    upper-cased. When *keep* is given, only postcodes contained in it are
    bucketed, so addresses in irrelevant postcodes are never normalised.
    """
    buckets: dict[str, list[tuple[int, str]]] = {}
    # Materialise each column once; indexing a Series row by row from Python
    # pays a library call per element.
    rows = zip(postcodes.to_list(), addresses.to_list())
    for row_idx, (postcode, address) in enumerate(rows):
        if postcode is None or address is None:
            continue
        key = postcode.strip().upper()
        if keep is not None and key not in keep:
            continue
        buckets.setdefault(key, []).append((row_idx, _normalize(address)))
    return buckets


def _match_online_to_wide(
    wide: pl.DataFrame,
    online: pl.DataFrame,
) -> dict[int, int]:
    """Match online listings to wide rows by postcode + fuzzy address.

    Candidate pairs are scored per postcode in worker processes, then assigned
    greedily from the highest score down so each listing and each wide row is
    used at most once.

    Returns dict mapping online row index → wide row index.
    """
    online_by_postcode = _bucket_by_postcode(online["postcode"], online["address"])
    # Only postcodes that have at least one listing can ever produce a pair.
    wide_by_postcode = _bucket_by_postcode(
        wide["Postcode"],
        wide["Address per Property Register"],
        keep=online_by_postcode.keys(),
    )

    # Build tasks: only postcodes present in both
    tasks = [
        (wide_by_postcode[pc], online_entries)
        for pc, online_entries in online_by_postcode.items()
        if pc in wide_by_postcode
    ]
    if not tasks:
        # Nothing to score; avoid spinning up a process pool.
        return {}

    # Score in parallel
    all_pairs: list[tuple[int, int, int]] = []
    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
        for pairs in tqdm(
            executor.map(_score_bucket, tasks, chunksize=64),
            total=len(tasks),
            desc="Matching online listings",
        ):
            all_pairs.extend(pairs)
    del tasks, wide_by_postcode, online_by_postcode

    # Greedy assignment: best score first, one-to-one. Ties on score go to the
    # lower online index.
    all_pairs.sort(key=lambda t: (t[0], -t[1]), reverse=True)
    matches: dict[int, int] = {}  # online_idx → wide_idx
    matched_wide: set[int] = set()
    for _score, online_idx, wide_idx in all_pairs:
        if online_idx in matches or wide_idx in matched_wide:
            continue
        matches[online_idx] = wide_idx
        matched_wide.add(wide_idx)
    return matches
def _build_postcode_area_lookup(wide: pl.DataFrame) -> dict[str, int]:
    """Build postcode → first row index for area data fallback.

    Postcodes are stripped and upper-cased; rows with a null postcode are
    ignored. When a postcode occurs on several rows the lowest row index wins.
    """
    lookup: dict[str, int] = {}
    # One bulk conversion instead of a Series lookup per row.
    for row_idx, postcode in enumerate(wide["Postcode"].to_list()):
        if postcode is not None:
            lookup.setdefault(postcode.strip().upper(), row_idx)
    return lookup
def _build_online_rows(
    wide: pl.DataFrame,
    online: pl.DataFrame,
    matches: dict[int, int],
    postcode_lookup: dict[str, int],
) -> pl.DataFrame:
    """Build a DataFrame of online listing rows with all wide.parquet columns.

    Each listing becomes one row holding:
    - identity/location columns taken directly from the listing,
    - the nine listing columns ("Listing status", "Asking price", ...),
    - ``_PROPERTY_COLUMNS`` copied from the matched wide row (``matches``), and
    - ``_AREA_COLUMNS`` copied from the matched wide row or, failing that, from
      the row ``postcode_lookup`` gives for the listing's postcode.
    All other wide columns are null. Column order is wide's columns followed by
    the listing columns.

    Raises KeyError if wide lacks one of the directly mapped columns.
    """
    # Function-scope import so the block does not rely on the file's import list.
    from datetime import datetime

    def parse_listing_date(raw):
        """Return a naive datetime for *raw*, or None when it cannot be parsed."""
        if raw is None:
            return None
        if isinstance(raw, datetime):
            # Already parsed upstream; only drop the timezone like the str path.
            return raw.replace(tzinfo=None)
        try:
            # "Z" is rewritten because older fromisoformat rejects it.
            parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
        except (ValueError, TypeError):
            # Best effort: an unparseable date leaves the cell null.
            return None
        return parsed.replace(tzinfo=None)

    wide_schema = wide.schema
    n = online.height

    listing_dtypes = {
        "Listing status": pl.String,
        "Asking price": pl.Int64,
        "Asking rent (monthly)": pl.Int64,
        "Bedrooms": pl.Int32,
        "Bathrooms": pl.Int32,
        "Listing date": pl.Datetime("us"),
        "Property sub-type": pl.String,
        "Listing URL": pl.String,
        "Price qualifier": pl.String,
    }

    # Start with every wide column null, then fill what the listing provides.
    columns: dict[str, list] = {col: [None] * n for col in wide_schema}

    # Direct mappings from online listing (wide column <- online column).
    direct_sources = {
        "Address per Property Register": "address",
        "Postcode": "postcode",
        "lat": "latitude",
        "lon": "longitude",
        "Property type": "property_type",
        "Leashold/Freehold": "tenure",
        "Total floor area (sqm)": "floorspace_sqm",
    }
    for wide_col, online_col in direct_sources.items():
        if wide_col not in wide_schema:
            raise KeyError(wide_col)
        # Whole-column conversion replaces a Series lookup per cell.
        columns[wide_col] = online[online_col].to_list()

    # New listing columns.
    listing_sources = {
        "Listing status": "_channel",
        "Asking price": "Asking price",
        "Asking rent (monthly)": "Asking rent (monthly)",
        "Bedrooms": "bedrooms",
        "Bathrooms": "bathrooms",
        "Property sub-type": "property_sub_type",
        "Listing URL": "url",
        "Price qualifier": "price_qualifier",
    }
    for listing_col, online_col in listing_sources.items():
        columns[listing_col] = online[online_col].to_list()
    columns["Listing date"] = [
        parse_listing_date(raw) for raw in online["first_visible_date"].to_list()
    ]

    # Decide, per listing, which wide row supplies property and area data.
    property_rows: list[int] = []
    property_sources: list[int] = []
    area_rows: list[int] = []
    area_sources: list[int] = []
    for i, postcode in enumerate(columns["Postcode"]):
        matched_wide_idx = matches.get(i)
        area_source_idx = matched_wide_idx
        if area_source_idx is None and postcode:
            area_source_idx = postcode_lookup.get(postcode.strip().upper())
        if matched_wide_idx is not None:
            property_rows.append(i)
            property_sources.append(matched_wide_idx)
        if area_source_idx is not None:
            area_rows.append(i)
            area_sources.append(area_source_idx)

    def carry(col_names, rows, sources):
        """Copy *col_names* from wide rows *sources* into listing rows *rows*."""
        if not rows:
            return
        for col in col_names:
            if col not in wide_schema:
                continue
            # Gather only the needed wide rows; wide itself may be very large.
            values = wide[col].gather(sources).to_list()
            target = columns[col]
            for row, value in zip(rows, values):
                target[row] = value

    # Property-specific columns come from the matched row only.
    carry(_PROPERTY_COLUMNS, property_rows, property_sources)
    # Area columns come from the matched row or the same-postcode fallback.
    carry(_AREA_COLUMNS, area_rows, area_sources)

    # Build DataFrame with correct types. Listing columns already present in
    # wide (file produced by an earlier run) are emitted once, with the listing
    # dtype, instead of twice.
    series_list = [
        pl.Series(col_name, columns[col_name], dtype=dtype)
        for col_name, dtype in wide_schema.items()
        if col_name not in listing_dtypes
    ]
    series_list.extend(
        pl.Series(col_name, columns[col_name], dtype=dtype)
        for col_name, dtype in listing_dtypes.items()
    )
    return pl.DataFrame(series_list)
def main():
    """Append online buy/rent listings to wide.parquet, rewriting it in place.

    Steps: load wide and the listings, fuzzy-match listings to historical rows,
    build one new row per listing, tag existing rows as "Historical sale", and
    write the concatenation back to ``--input``.

    The script is safe to re-run on its own output: listing rows and listing
    columns written by an earlier run are removed before matching, so listings
    are neither duplicated nor relabelled as historical sales.
    """
    parser = argparse.ArgumentParser(
        description="Add online buy/rent listings to wide.parquet"
    )
    parser.add_argument(
        "--input",
        type=Path,
        required=True,
        help="wide.parquet path (modified in-place)",
    )
    parser.add_argument(
        "--buy", type=Path, required=True, help="rightmove_buy.parquet path"
    )
    parser.add_argument(
        "--rent", type=Path, required=True, help="rightmove_rent.parquet path"
    )
    args = parser.parse_args()

    # Columns this script adds next to "Listing status"; null on historical rows.
    listing_columns = {
        "Asking price": pl.Int64,
        "Asking rent (monthly)": pl.Int64,
        "Bedrooms": pl.Int32,
        "Bathrooms": pl.Int32,
        "Listing date": pl.Datetime("us"),
        "Property sub-type": pl.String,
        "Listing URL": pl.String,
        "Price qualifier": pl.String,
    }

    print("Loading wide.parquet...")
    wide = pl.read_parquet(args.input)
    print(f" {wide.height} rows, {wide.width} columns")

    # Undo a previous run so the rest of the script sees historical rows only.
    if "Listing status" in wide.columns:
        rows_before = wide.height
        wide = wide.filter(pl.col("Listing status") == "Historical sale")
        print(
            f" Removed {rows_before - wide.height} listing rows from a previous run"
        )
    wide = wide.drop(
        [c for c in ("Listing status", *listing_columns) if c in wide.columns]
    )

    print("Loading online listings...")
    online = _load_online(args.buy, args.rent)
    n_buy = online.filter(pl.col("_channel") == "For sale").height
    n_rent = online.filter(pl.col("_channel") == "For rent").height
    print(f" {online.height} online listings ({n_buy} buy, {n_rent} rent)")

    print("Matching online listings to historical rows...")
    matches = _match_online_to_wide(wide, online)
    print(f" {len(matches)} online listings matched to historical rows")

    print("Building postcode area lookup...")
    postcode_lookup = _build_postcode_area_lookup(wide)

    print("Building online listing rows...")
    online_rows = _build_online_rows(wide, online, matches, postcode_lookup)
    print(f" {online_rows.height} online rows built")

    # Add Listing status + new columns to existing wide rows
    wide = wide.with_columns(
        pl.lit("Historical sale").alias("Listing status"),
        *(
            pl.lit(None, dtype=dtype).alias(name)
            for name, dtype in listing_columns.items()
        ),
    )

    # Concat
    result = pl.concat([wide, online_rows], how="diagonal_relaxed")
    print(f"Final: {result.height} rows, {result.width} columns")

    # Verify
    status_counts = (
        result["Listing status"].value_counts().sort("count", descending=True)
    )
    print(f"Listing status distribution:\n{status_counts}")

    result.write_parquet(args.input)
    size_mb = args.input.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.input} ({size_mb:.1f} MB)")


if __name__ == "__main__":
    main()

View file

@ -7,6 +7,7 @@ from ..utils import fuzzy_join_on_postcode
pl.Config.set_tbl_cols(-1)
RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
MIN_PRICE = 50_000
def main():
@ -117,7 +118,9 @@ def main():
.collect()
)
event_counts = events["renovation_history"].explode().struct.field("event").value_counts()
event_counts = (
events["renovation_history"].explode().struct.field("event").value_counts()
)
print(f"Renovation events: {events.height} properties with events")
print(event_counts)
@ -159,6 +162,7 @@ def main():
"old_new",
)
.filter(pl.col("pp_property_type") != "Other")
.filter(pl.col("price") >= MIN_PRICE)
.with_columns(
pl.concat_str(
[pl.col("saon"), pl.col("paon"), pl.col("street")],
@ -171,6 +175,7 @@ def main():
.agg(
pl.struct(
pl.col("date_of_transfer").dt.year().alias("year"),
pl.col("date_of_transfer").dt.month().cast(pl.UInt8).alias("month"),
"price",
).alias("historical_prices"),
pl.col("pp_property_type").last(),

View file

@ -3,7 +3,8 @@ import argparse
import polars as pl
from pathlib import Path
MIN_PRICE = 10_000
from pipeline.utils.postcode_mapping import build_postcode_mapping
MIN_FLOOR_AREA_M2 = 10
@ -45,20 +46,23 @@ def _build_wide(
rental_prices_path: Path,
) -> pl.DataFrame:
"""Build the wide dataframe by joining epc_pp with all auxiliary data."""
wide = (
pl.scan_parquet(epc_pp_path)
.filter(
pl.col("total_floor_area").is_null()
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
)
.filter(
pl.col("latest_price").is_null() | (pl.col("latest_price") >= MIN_PRICE)
)
wide = pl.scan_parquet(epc_pp_path).filter(
pl.col("total_floor_area").is_null()
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
)
# Remap terminated postcodes to nearest active successor
postcode_mapping = build_postcode_mapping(arcgis_path)
wide = wide.join(
postcode_mapping.lazy(), left_on="postcode", right_on="old_postcode", how="left"
).with_columns(
pl.coalesce("new_postcode", "postcode").alias("postcode"),
).drop("new_postcode")
arcgis = (
pl.scan_parquet(arcgis_path)
.filter(pl.col("ctry") == "E92000001") # England only
.filter(pl.col("doterm").is_null()) # Active postcodes only
.select(
pl.col("pcds").alias("postcode"),
"lat",
@ -67,7 +71,7 @@ def _build_wide(
"oa21",
)
)
wide = wide.join(arcgis, on="postcode", how="full", coalesce=True)
wide = wide.join(arcgis, on="postcode", how="left")
wide = _join_journey_times(wide, journey_times_bank_path, "Bank")
wide = _join_journey_times(wide, journey_times_fitzrovia_path, "Fitzrovia")
@ -147,11 +151,6 @@ def _build_wide(
.with_columns(
pl.max_horizontal(*noise_cols).alias("noise_lden_db"),
)
.with_columns(
pl.col("noise_lden_db")
.fill_null(pl.col("noise_lden_db").min())
.alias("noise_lden_db"),
)
.select("postcode", "noise_lden_db")
)
wide = wide.join(noise, on="postcode", how="left")
@ -181,7 +180,7 @@ def _build_wide(
.group_by("bb_postcode")
.agg(pl.col("max_download_speed").max())
)
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left").drop("bb_postcode")
geosure = pl.scan_parquet(geosure_path)
wide = wide.join(geosure, on="postcode", how="left")

View file

@ -8,11 +8,12 @@ import polars as pl
from pipeline.utils.poi_counts import count_pois_per_postcode
# POI category groups for proximity counting
# POI category groups for proximity counting.
# Names must match the friendly names produced by transform_poi.py / naptan.py.
POI_GROUPS = {
"restaurants": ["Restaurant", "Fast Food"],
"groceries": ["Greengrocer", "Grocery Shop", "Supermarket", "Convenience Store"],
"parks": ["Park", "Garden", "Nature Reserve"],
"groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
"parks": ["Park"],
"public_transport": [
"Metro or Tram stop",
"Rail station",

View file

@ -0,0 +1,292 @@
"""Backtest price estimation on held-out recent sales.
Uses temporal holdout: index built from pairs before TEST_YEAR_MIN only.
Test set: properties with 2+ sales where the last sale >= TEST_YEAR_MIN.
Evaluates: Naive vs Index vs kNN vs Blended.
"""
import argparse
from pathlib import Path
import numpy as np
import polars as pl
from pipeline.transform.price_estimation.index import build_index
from pipeline.transform.price_estimation.knn import (
KNN_BLEND_WEIGHT,
build_knn_pool,
knn_median_psm,
)
from pipeline.transform.price_estimation.utils import (
CURRENT_YEAR,
MAX_LOG_ADJUSTMENT,
compute_seasonal_factors,
interpolate_log_index,
sector_expr,
type_group_expr,
)
# First calendar year of the holdout period: the index and seasonal factors are
# built from data before this year, and a test pair's last sale must fall in or
# after it.
TEST_YEAR_MIN = 2022
def extract_test_set(input_path: Path) -> pl.DataFrame:
    """Collect the backtest pairs from the parquet at *input_path*.

    For every property with a postcode and at least two recorded sales, the
    last sale becomes the ground truth (``actual_*``) and the one before it the
    model input (``input_*``). A pair is kept when the last sale is in
    ``TEST_YEAR_MIN`` or later, both prices are positive and the last sale is
    strictly later than the input sale.
    """
    print("Loading test set...")

    history = pl.col("historical_prices")
    sales = (
        ("actual", history.list.last()),  # ground truth
        ("input", history.list.get(-2)),  # model input
    )
    sale_fields = [
        sale.struct.field(field).alias(f"{prefix}_{field}")
        for prefix, sale in sales
        for field in ("year", "month", "price")
    ]
    # January maps to .0, December to 11/12 of a year.
    frac_years = [
        (
            pl.col(f"{prefix}_year").cast(pl.Float64)
            + (pl.col(f"{prefix}_month").cast(pl.Float64) - 1.0) / 12.0
        ).alias(f"{prefix}_frac_year")
        for prefix in ("actual", "input")
    ]

    candidates = pl.scan_parquet(input_path).filter(
        pl.col("Postcode").is_not_null(),
        history.list.len() >= 2,
    )
    candidates = candidates.with_columns(
        sector_expr(),
        type_group_expr(),
        *sale_fields,
    )
    candidates = candidates.with_columns(*frac_years)
    candidates = candidates.filter(
        pl.col("actual_year") >= TEST_YEAR_MIN,
        pl.col("input_price") > 0,
        pl.col("actual_price") > 0,
        pl.col("actual_frac_year") > pl.col("input_frac_year"),
    )
    df = candidates.collect()

    print(f" {len(df):,} test pairs (last sale {TEST_YEAR_MIN}-{CURRENT_YEAR})")
    return df
def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
    """Add a ``predicted`` column holding the index-based price estimate.

    The log index is interpolated at the input and actual sale dates. The input
    price is scaled by the exponential of their difference (clipped to
    +/- ``MAX_LOG_ADJUSTMENT``) and by ``_seasonal_adj``. Rows where that
    product is null fall back to the unadjusted input price.
    """
    lookups = (
        ("input_frac_year", "log_index_input"),
        ("actual_frac_year", "log_index_actual"),
    )
    for year_col, out_col in lookups:
        test = interpolate_log_index(
            index, test, "sector", "type_group", year_col, out_col
        )

    base_price = pl.col("input_price").cast(pl.Float64)
    log_change = pl.col("log_index_actual") - pl.col("log_index_input")
    growth = log_change.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT).exp()
    estimate = base_price * growth * pl.col("_seasonal_adj")

    return test.with_columns(
        estimate.fill_null(pl.col("input_price").cast(pl.Float64)).alias("predicted"),
    )
def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
    """Summarise prediction error over the pairs where both values are usable.

    A pair is used only when both values are finite and strictly positive.

    Args:
        actual: True prices (array-like of numbers).
        predicted: Estimated prices, same length as *actual*.

    Returns:
        Dict with median absolute percentage error, share of predictions within
        10/20/30 % of the actual price, mean absolute error, mean signed error
        (predicted - actual) and ``n``, the number of pairs used. When no pair
        is usable every metric is NaN and ``n`` is 0.
    """
    # Accept plain sequences as well as arrays; boolean masking needs ndarrays.
    actual = np.asarray(actual, dtype=np.float64)
    predicted = np.asarray(predicted, dtype=np.float64)

    valid = (
        np.isfinite(predicted) & np.isfinite(actual) & (actual > 0) & (predicted > 0)
    )
    actual = actual[valid]
    predicted = predicted[valid]

    if actual.size == 0:
        # np.median/np.mean on an empty array warn and return NaN; return the
        # same NaNs explicitly and quietly.
        nan = float("nan")
        return {
            "MdAPE (%)": nan,
            "% within 10%": nan,
            "% within 20%": nan,
            "% within 30%": nan,
            "MAE (£)": nan,
            "Mean signed error (£)": nan,
            "n": 0,
        }

    signed_err = predicted - actual
    ape = np.abs(signed_err) / actual
    return {
        "MdAPE (%)": float(np.median(ape) * 100),
        "% within 10%": float(np.mean(ape <= 0.10) * 100),
        "% within 20%": float(np.mean(ape <= 0.20) * 100),
        "% within 30%": float(np.mean(ape <= 0.30) * 100),
        "MAE (£)": float(np.mean(np.abs(signed_err))),
        "Mean signed error (£)": float(np.mean(signed_err)),
        "n": int(len(actual)),
    }
def print_metrics_table(metrics_by_stage: dict):
    """Print one column of metrics per stage as a fixed-width text table.

    Args:
        metrics_by_stage: Maps a stage name to its metrics dict. Each dict must
            contain every key listed in ``metric_names`` below.

    Every stage column is ``col_w`` characters wide (a separating space plus a
    right-aligned cell), so header, count, percentage and £ rows line up.
    """
    stages = list(metrics_by_stage.keys())
    col_w = 15
    width = 25 + col_w * len(stages)
    print("\n" + "=" * width)
    print(f"BACKTEST RESULTS (holdout: sales >= {TEST_YEAR_MIN})")
    print("=" * width)
    metric_names = [
        "MdAPE (%)",
        "% within 10%",
        "% within 20%",
        "% within 30%",
        "MAE (£)",
        "Mean signed error (£)",
        "n",
    ]
    header = f"{'Metric':<25s}"
    for stage in stages:
        header += f" {stage:>{col_w - 1}s}"
    print(header)
    print("-" * width)
    for metric in metric_names:
        row = f"{metric:<25s}"
        for stage in stages:
            val = metrics_by_stage[stage][metric]
            if metric == "n":
                row += f" {val:>{col_w - 1},d}"
            elif "£" in metric:
                # No unit suffix here, so the number takes the full cell width
                # (the percentage branch gives one character to "%").
                row += f" {val:>{col_w - 1},.0f}"
            else:
                row += f" {val:>{col_w - 2}.1f}%"
        print(row)
    print("=" * width)
def main():
    """Run the backtest and write per-pair results to ``--output``.

    The price index, seasonal factors and kNN pool are all built from data
    before ``TEST_YEAR_MIN``. Each test pair is then predicted four ways (naive
    carry-forward, index, kNN, blended), the metrics are printed, and the
    per-pair predictions are saved as parquet.
    """
    parser = argparse.ArgumentParser(description="Backtest price estimation model")
    parser.add_argument(
        "--input", type=Path, required=True, help="Path to wide.parquet"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output backtest_results.parquet"
    )
    args = parser.parse_args()

    # Build index from pre-test data only (temporal holdout)
    print(f"Building price index (pairs with year2 < {TEST_YEAR_MIN})...")
    index = build_index(args.input, max_pair_year=TEST_YEAR_MIN)
    print(
        f"\nHoldout index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
        f"{index['type_group'].n_unique()} type groups"
    )

    # Compute seasonal factors from pre-test data only
    seasonal = compute_seasonal_factors(args.input, max_sale_year=TEST_YEAR_MIN)
    months = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    ]
    print(
        f"Seasonal factors: {', '.join(f'{m}={f:.3f}' for m, f in zip(months, seasonal))}"
    )

    test = extract_test_set(args.input)

    # Compute seasonal adjustment for each test pair; a missing month is
    # treated as June.
    input_months = test["input_month"].fill_null(6).to_numpy().astype(np.int32)
    actual_months = test["actual_month"].fill_null(6).to_numpy().astype(np.int32)
    seasonal_adj = seasonal[actual_months - 1] / seasonal[input_months - 1]
    test = test.with_columns(
        pl.Series("_seasonal_adj", seasonal_adj, dtype=pl.Float64),
    )

    print("\nPredicting with price index...")
    test = predict(test, index)

    # --- kNN ---
    ref_fy = float(TEST_YEAR_MIN)
    trees = build_knn_pool(args.input, index, ref_fy, max_sale_year=TEST_YEAR_MIN)

    # Interpolate log_index at reference year for temporal adjustment
    test = test.with_columns(pl.lit(ref_fy).alias("_ref_fy"))
    test = interpolate_log_index(
        index, test, "sector", "type_group", "_ref_fy", "_log_index_ref"
    )

    lat = test["lat"].cast(pl.Float64).to_numpy()
    lon = test["lon"].cast(pl.Float64).to_numpy()
    tg = test["type_group"].to_numpy()
    # A missing floor area becomes 0, which zeroes the kNN estimate and so
    # excludes it from the "valid kNN" masks below.
    fa = test["Total floor area (sqm)"].cast(pl.Float64).fill_null(0.0).to_numpy()

    print("\nComputing kNN estimates...")
    knn_psm = knn_median_psm(trees, lat, lon, tg)

    # Temporal adjustment: pool PSM is at ref, adjust to actual
    log_idx_actual = test["log_index_actual"].to_numpy().astype(np.float64)
    log_idx_ref = test["_log_index_ref"].to_numpy().astype(np.float64)
    temporal_adj = np.where(
        np.isfinite(log_idx_actual) & np.isfinite(log_idx_ref),
        np.exp(log_idx_actual - log_idx_ref),
        1.0,
    )
    knn_est = knn_psm * fa * temporal_adj

    n_knn = int((np.isfinite(knn_est) & (knn_est > 0)).sum())
    # max(..., 1) keeps an empty test set from raising ZeroDivisionError.
    knn_share = n_knn / max(len(test), 1) * 100
    print(f" kNN estimates: {n_knn:,} of {len(test):,} ({knn_share:.1f}%)")

    # Blend: (1-w)*index + w*kNN where both available
    index_est = test["predicted"].to_numpy().astype(np.float64)
    knn_valid = np.isfinite(knn_est) & (knn_est > 0)
    blended = np.where(
        knn_valid & np.isfinite(index_est),
        (1 - KNN_BLEND_WEIGHT) * index_est + KNN_BLEND_WEIGHT * knn_est,
        np.where(np.isfinite(index_est), index_est, knn_est),
    )

    actual = test["actual_price"].to_numpy().astype(np.float64)
    metrics = {
        "Naive": compute_metrics(
            actual, test["input_price"].to_numpy().astype(np.float64)
        ),
        "Index": compute_metrics(actual, index_est),
        "kNN": compute_metrics(actual, knn_est),
        "Blended": compute_metrics(actual, blended),
    }
    print_metrics_table(metrics)

    # Save results
    result = test.select(
        "Postcode",
        "sector",
        "input_year",
        "input_frac_year",
        "input_price",
        "actual_year",
        "actual_frac_year",
        "actual_price",
        "predicted",
    ).with_columns(
        pl.Series("knn_predicted", knn_est, dtype=pl.Float64),
        pl.Series("blended", blended, dtype=pl.Float64),
    )
    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
    print(f" {len(result):,} rows")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,204 @@
"""Augment wide.parquet with estimated current prices.
For properties with a known prior sale, applies the repeat-sales price index
to adjust the last known price to the current date, then blends with kNN
estimates from nearby recently-sold properties. Includes:
- Capping extreme index adjustments
- Seasonal month-of-sale adjustment
- kNN spatial blending
Modifies wide.parquet in-place.
"""
import argparse
from pathlib import Path
import numpy as np
import polars as pl
from pipeline.transform.price_estimation.knn import (
KNN_BLEND_WEIGHT,
build_knn_pool,
knn_median_psm,
)
from pipeline.transform.price_estimation.utils import (
CURRENT_FRAC_YEAR,
CURRENT_MONTH,
MAX_LOG_ADJUSTMENT,
compute_seasonal_factors,
interpolate_log_index,
sector_expr,
type_group_expr,
)
def main():
    """Add "Estimated current price" and "Est. price per sqm" to wide.parquet.

    Rows with a last known price, postcode and transaction date get an
    index-adjusted estimate (capped log adjustment times a seasonal factor).
    Where a positive, finite kNN estimate also exists the two are blended with
    ``KNN_BLEND_WEIGHT``. The file at ``--input`` is rewritten in place.
    """
    parser = argparse.ArgumentParser(
        description="Augment wide.parquet with estimated current prices"
    )
    parser.add_argument(
        "--input",
        type=Path,
        required=True,
        help="Path to wide.parquet (modified in-place)",
    )
    parser.add_argument(
        "--index", type=Path, required=True, help="Path to price_index.parquet"
    )
    args = parser.parse_args()

    print("Loading wide.parquet...")
    df = pl.read_parquet(args.input)
    print(f" {len(df):,} rows, {len(df.columns)} columns")

    # Outputs of an earlier run are discarded so they can be recomputed.
    outputs = ("Estimated current price", "Est. price per sqm")
    for stale in [name for name in outputs if name in df.columns]:
        df = df.drop(stale)

    # Seasonal factors, one per calendar month.
    seasonal = compute_seasonal_factors(args.input)
    months = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    ]
    factor_text = ", ".join(f"{m}={f:.3f}" for m, f in zip(months, seasonal))
    print(f" Seasonal factors: {factor_text}")

    # Seasonal adjustment is seasonal[current_month] / seasonal[sale_month];
    # rows without a sale date are treated as sold in June.
    last_sale = pl.col("Date of last transaction")
    sale_month = (
        df["Date of last transaction"].dt.month().fill_null(6).to_numpy()
    ).astype(np.int32)
    seasonal_adj = seasonal[CURRENT_MONTH - 1] / seasonal[sale_month - 1]

    # Helper columns; everything prefixed with "_" is dropped before writing.
    sale_frac_year = (
        last_sale.dt.year().cast(pl.Float64)
        + (last_sale.dt.month().cast(pl.Float64) - 1.0) / 12.0
    )
    df = df.with_columns(
        sector_expr().alias("_sector"),
        sale_frac_year.alias("_sale_frac_year"),
        type_group_expr().alias("_type_group"),
        pl.lit(CURRENT_FRAC_YEAR).alias("_current_frac_year"),
        pl.Series("_seasonal_adj", seasonal_adj, dtype=pl.Float64),
    )

    index = pl.read_parquet(args.index)
    print(
        f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
        f"{index['type_group'].n_unique()} type groups"
    )

    print("\nApplying repeat-sales index with fractional year interpolation...")
    interpolations = (
        ("_sale_frac_year", "_log_index_sale_interp"),
        ("_current_frac_year", "_log_index_current_interp"),
    )
    for year_col, out_col in interpolations:
        df = interpolate_log_index(
            index, df, "_sector", "_type_group", year_col, out_col
        )

    # Index-adjusted estimate with capped log change and seasonal adjustment.
    has_price = (
        pl.col("Last known price").is_not_null()
        & pl.col("Postcode").is_not_null()
        & last_sale.is_not_null()
    )
    log_change = pl.col("_log_index_current_interp") - pl.col("_log_index_sale_interp")
    index_estimate = (
        pl.col("Last known price").cast(pl.Float64)
        * log_change.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT).exp()
        * pl.col("_seasonal_adj")
    )
    df = df.with_columns(
        pl.when(has_price)
        .then(index_estimate)
        .otherwise(pl.lit(None))
        .alias("Estimated current price"),
    )

    estimate = pl.col("Estimated current price")
    n_estimated = df.filter(estimate.is_not_null()).height
    n_with_price = df.filter(has_price).height
    print(
        f" {n_estimated:,} of {n_with_price:,} properties estimated "
        f"({n_estimated / max(n_with_price, 1) * 100:.1f}%)"
    )

    # --- kNN blending ---
    print("\nBuilding kNN estimates...")
    trees = build_knn_pool(args.input, index, CURRENT_FRAC_YEAR)

    lat = df["lat"].cast(pl.Float64).to_numpy()
    lon = df["lon"].cast(pl.Float64).to_numpy()
    tg = df["_type_group"].fill_null("").to_numpy()
    fa = df["Total floor area (sqm)"].cast(pl.Float64).fill_null(0.0).to_numpy()

    knn_psm = knn_median_psm(trees, lat, lon, tg)
    # No temporal adjustment: the pool reference date is the current date.
    knn_est = knn_psm * fa
    df = df.with_columns(
        pl.Series("_knn_est", knn_est, dtype=pl.Float64),
    )

    # Weighted average where a usable kNN value exists; otherwise the index
    # estimate is kept unchanged (and stays null where it was null).
    knn = pl.col("_knn_est")
    knn_usable = knn.is_not_null() & knn.is_finite() & (knn > 0)
    has_estimate = estimate.is_not_null()
    df = df.with_columns(
        pl.when(has_estimate & knn_usable)
        .then((1 - KNN_BLEND_WEIGHT) * estimate + KNN_BLEND_WEIGHT * knn)
        .when(has_estimate)
        .then(estimate)
        .otherwise(pl.lit(None))
        .alias("Estimated current price"),
    )

    n_blended = df.filter(knn_usable & estimate.is_not_null()).height
    print(f" kNN blended: {n_blended:,} of {n_estimated:,} estimates")

    # Price per sqm; non-strict cast turns unrepresentable values into null.
    per_sqm = (estimate / pl.col("Total floor area (sqm)")).round(0)
    df = df.with_columns(
        per_sqm.cast(pl.Int32, strict=False).alias("Est. price per sqm"),
    )

    # Drop all temporary columns
    temp_cols = [c for c in df.columns if c.startswith(("_", "log_idx_"))]
    df = df.drop(temp_cols)

    df.write_parquet(args.input)
    size_mb = args.input.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
    print(
        f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
    )


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,465 @@
"""Hierarchical repeat-sales price index.
Stratified by property type and postcode sector, with IRLS Huber regression,
hierarchical shrinkage (sector → district → area → national → hedonic),
and KD-tree spatial smoothing for sparse sectors.
Output: price_index.parquet (sector x type_group x year -> log_index)
"""
import argparse
from pathlib import Path
import numpy as np
import polars as pl
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import lsqr
from tqdm import tqdm
from pipeline.transform.price_estimation.shrinkage import (
blend_dicts,
hierarchical_shrinkage,
shrink_dicts,
spatial_smooth,
)
from pipeline.transform.price_estimation.utils import (
CURRENT_YEAR,
TYPE_GROUPS,
build_hedonic_features,
extract_centroids,
hierarchy_keys,
sector_expr,
type_group_expr,
)
# NOTE(review): presumably the minimum number of repeat-sale pairs a stratum
# needs before it is fitted on its own; the use site is outside this view, confirm.
MIN_PAIRS = 5
# extract_pairs drops pairs whose |log(price2 / price1)| exceeds this value.
OUTLIER_THRESHOLD = 3.0 # hard pre-filter; Huber handles the rest
# NOTE(review): presumably the Huber tuning constant for the IRLS regression
# named in the module docstring; confirm against the fitting code.
HUBER_K = 1.345
# NOTE(review): presumably the number of IRLS reweighting passes; confirm.
IRLS_ITERATIONS = 5
def extract_pairs(input_path: Path, max_year2: int | None = None) -> pl.DataFrame:
    """Build the table of consecutive repeat-sale pairs.

    Every property with a postcode and at least two entries in
    `historical_prices` yields one row per adjacent pair of transactions.
    Rows carry the sector, type group, both years and prices, fractional
    sale years, the log price ratio, a weight of 1/sqrt(elapsed years), and
    the derived `district` and `area` columns.

    Pairs with a non-positive price, a non-increasing date, or an absolute
    log ratio above OUTLIER_THRESHOLD are discarded. When `max_year2` is
    given, only pairs whose second sale year is strictly below it are kept
    (temporal holdout for backtesting).
    """
    print("Extracting repeat-sale pairs...")
    history = pl.col("historical_prices")
    multi_sale = (
        pl.scan_parquet(input_path)
        .select("Postcode", "historical_prices", "Property type")
        .filter(pl.col("Postcode").is_not_null(), history.list.len() >= 2)
        .with_columns(sector_expr(), type_group_expr())
        .collect()
    )
    print(f" {len(multi_sale):,} properties with 2+ transactions")

    def txn_fields(txn_col: str, suffix: str) -> list[pl.Expr]:
        # Unpack year/month/price from one transaction struct column.
        return [
            pl.col(txn_col).struct.field(name).alias(f"{name}{suffix}")
            for name in ("year", "month", "price")
        ]

    def frac_year(suffix: str) -> pl.Expr:
        # January maps to .0, December to ~.917.
        year = pl.col(f"year{suffix}").cast(pl.Float64)
        month = pl.col(f"month{suffix}").cast(pl.Float64)
        return (year + (month - 1.0) / 12.0).alias(f"frac_year{suffix}")

    elapsed = pl.col("frac_year2") - pl.col("frac_year1")
    price_ratio = pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64)

    pairs = (
        multi_sale.lazy()
        # Align each transaction with its successor: drop the last entry for
        # the "from" side and the first entry for the "to" side.
        .with_columns(
            history.list.slice(0, history.list.len() - 1).alias("from_txn"),
            history.list.slice(1).alias("to_txn"),
        )
        .explode("from_txn", "to_txn")
        .with_columns(*txn_fields("from_txn", "1"), *txn_fields("to_txn", "2"))
        .with_columns(frac_year("1"), frac_year("2"))
        .select(
            "sector",
            "type_group",
            "year1",
            "price1",
            "year2",
            "price2",
            "frac_year1",
            "frac_year2",
        )
        .filter(
            pl.col("price1") > 0,
            pl.col("price2") > 0,
            pl.col("frac_year2") > pl.col("frac_year1"),
        )
        .with_columns(
            price_ratio.log().alias("log_ratio"),
            (1.0 / elapsed.cast(pl.Float64).sqrt()).alias("weight"),
        )
        .filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
        .collect()
    )

    if max_year2 is not None:
        pairs = pairs.filter(pl.col("year2") < max_year2)

    # Hierarchy columns: district = sector minus its trailing digit group,
    # area = district minus everything from its first digit onwards.
    district = pl.col("sector").str.replace(r"\s+\d+$", "").alias("district")
    area = pl.col("district").str.replace(r"\d.*$", "").alias("area")
    pairs = pairs.with_columns(district).with_columns(area)

    print(f" {len(pairs):,} pairs extracted")
    return pairs
def solve_robust_index(
    years1: np.ndarray,
    years2: np.ndarray,
    log_ratios: np.ndarray,
    base_weights: np.ndarray,
) -> dict[int, float]:
    """Fit a repeat-sales log index by iteratively reweighted least squares.

    Each pair contributes one equation `beta[year2] - beta[year1] = log_ratio`.
    The earliest year is the base and is pinned to 0.0, so it has no column.
    Rows are multiplied by their weight before each sparse solve, and after
    each solve the weights are rescaled with a Huber factor based on the
    absolute residual.

    Returns `{year: log_index}` including the base year, or `{}` when there
    are fewer than MIN_PAIRS pairs or only a single distinct year.
    """
    n_obs = len(years1)
    if n_obs < MIN_PAIRS:
        return {}

    # union1d returns sorted unique values, so element 0 is the base year.
    distinct_years = np.union1d(years1, years2)
    base_year = int(distinct_years[0])
    n_params = len(distinct_years) - 1
    if n_params == 0:
        return {}

    # Column of each endpoint; the base year maps to -1 (no column).
    end_col = np.searchsorted(distinct_years, years2) - 1
    start_col = np.searchsorted(distinct_years, years1) - 1
    has_end = end_col >= 0
    has_start = start_col >= 0

    # Sparsity pattern is fixed; only the values change between iterations.
    row_idx = np.concatenate([np.flatnonzero(has_end), np.flatnonzero(has_start)])
    col_idx = np.concatenate([end_col[has_end], start_col[has_start]])
    signs = np.concatenate([np.ones(has_end.sum()), -np.ones(has_start.sum())])

    weights = base_weights.copy()
    for _ in range(IRLS_ITERATIONS):
        design = csc_matrix(
            (signs * weights[row_idx], (row_idx, col_idx)), shape=(n_obs, n_params)
        )
        target = log_ratios * weights
        betas = lsqr(design, target, atol=1e-10, btol=1e-10)[0]

        # Fitted log ratio = beta[year2] - beta[year1], base year counting as 0.
        # Index -1 is harmless here because np.where masks those entries out.
        fitted = np.where(has_end, betas[end_col], 0.0) - np.where(
            has_start, betas[start_col], 0.0
        )
        abs_resid = np.abs(log_ratios - fitted)

        # NOTE(review): the Huber factor is applied as a row multiplier (so it
        # enters the objective squared) and residuals are not divided by a
        # robust scale estimate — confirm both are intended.
        huber_factor = np.where(
            abs_resid <= HUBER_K, 1.0, HUBER_K / np.maximum(abs_resid, 1e-10)
        )
        weights = base_weights * huber_factor

    index = {base_year: 0.0}
    index.update(
        {int(year): float(beta) for year, beta in zip(distinct_years[1:], betas)}
    )
    return index
def compute_indices_for_level(pairs: pl.DataFrame, group_col: str) -> tuple[dict, dict]:
    """Solve one robust index per distinct value of `group_col`.

    Returns `(indices, n_pairs)`: the solved index per group and the number
    of pairs behind it. Groups for which the solver returns an empty index
    appear in neither dict.
    """
    pair_columns = ("year1", "year2", "log_ratio", "weight")
    grouped = pairs.group_by(group_col).agg([pl.col(name) for name in pair_columns])

    indices: dict = {}
    n_pairs: dict = {}
    progress = tqdm(
        grouped.iter_rows(named=True), total=len(grouped), desc=f" {group_col}"
    )
    for record in progress:
        start_years = np.array(record["year1"], dtype=np.int32)
        end_years = np.array(record["year2"], dtype=np.int32)
        ratios = np.array(record["log_ratio"], dtype=np.float64)
        pair_weights = np.array(record["weight"], dtype=np.float64)

        solved = solve_robust_index(start_years, end_years, ratios, pair_weights)
        if not solved:
            continue
        group_key = record[group_col]
        indices[group_key] = solved
        n_pairs[group_key] = len(start_years)
    return indices, n_pairs
def compute_hedonic_index(
    input_path: Path,
    min_year: int,
    max_year: int,
    max_sale_year: int | None = None,
) -> dict[int, float]:
    """Quality-adjusted hedonic index: regress log(price) on features, average residual by year.

    Used as the ultimate shrinkage fallback for the repeat-sales index.
    Only rows with a positive last known price, a positive floor area, a
    recognised type group and a sale year in range are used.

    If max_sale_year is set, only sales before that year are used
    (backtesting holdout).

    Returns `{year: mean residual}` shifted so that `min_year` is 0 when it
    is present, or `{}` when no usable sales exist.
    """
    effective_max = max_sale_year - 1 if max_sale_year is not None else max_year
    print("Computing hedonic index...")
    df = (
        pl.scan_parquet(input_path)
        .select(
            "Last known price",
            "Date of last transaction",
            "Property type",
            "Total floor area (sqm)",
        )
        .filter(
            pl.col("Last known price").is_not_null(),
            # log() of a zero or negative price would put -inf/NaN into the
            # regression target and corrupt every coefficient.
            pl.col("Last known price") > 0,
            pl.col("Total floor area (sqm)").is_not_null(),
            pl.col("Total floor area (sqm)") > 0,
        )
        .with_columns(
            pl.col("Date of last transaction").dt.year().alias("sale_year"),
            type_group_expr(),
        )
        .filter(
            pl.col("type_group").is_not_null(),
            pl.col("sale_year").is_not_null(),
            pl.col("sale_year") >= min_year,
            pl.col("sale_year") <= effective_max,
        )
        .collect()
    )
    print(f" {len(df):,} complete cases for hedonic model")
    if df.is_empty():
        # Nothing to regress on; callers treat an empty dict as "no hedonic signal".
        return {}

    # Target
    log_price = np.log(df["Last known price"].to_numpy().astype(np.float64))
    sale_years = df["sale_year"].to_numpy()

    # Build feature matrix (5 hedonic features + intercept)
    X = build_hedonic_features(df)
    F = np.hstack([X, np.ones((len(df), 1), dtype=np.float32)]).astype(np.float64)
    print(f" Feature matrix: {F.shape[0]:,} x {F.shape[1]}")

    # Step 1: regress log(price) on features -> quality score
    betas = np.linalg.lstsq(F, log_price, rcond=None)[0]
    residuals = log_price - F @ betas

    # Step 2: average residual by year = hedonic index
    hedonic = {}
    for y in range(min_year, max_year + 1):
        mask = sale_years == y
        if mask.any():
            hedonic[y] = float(np.mean(residuals[mask]))
    if not hedonic:
        return {}

    # Normalize: min_year = 0 (left unshifted if min_year itself has no sales)
    base = hedonic.get(min_year, 0.0)
    for y in hedonic:
        hedonic[y] -= base

    print(
        f" Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}"
    )
    return hedonic
# How many years before the last known year are included in the trend fit.
EXTRAPOLATION_YEARS = 3


def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
    """Return a gap-free copy of `index` covering `min_year` onwards.

    Years up to the last known one carry the most recent known value
    forward (0.0 before the first known year). Years after it, up to
    `max_year`, follow a straight line through the last known value whose
    slope is fitted to the known points of the trailing window; with fewer
    than two such points the last value is simply repeated. An empty index
    yields 0.0 for every year.
    """
    if not index:
        return dict.fromkeys(range(min_year, max_year + 1), 0.0)

    known_years = sorted(index)
    final_year = known_years[-1]

    filled = {}
    carried = 0.0
    for year in range(min_year, final_year + 1):
        carried = index.get(year, carried)
        filled[year] = carried

    if final_year >= max_year:
        return filled

    anchor = index[final_year]
    future_years = range(final_year + 1, max_year + 1)
    window = [y for y in known_years if y >= final_year - EXTRAPOLATION_YEARS]
    if len(window) < 2:
        # Not enough points for a trend: hold the last value flat.
        for year in future_years:
            filled[year] = anchor
        return filled

    xs = np.array(window, dtype=np.float64)
    ys = np.array([index[y] for y in window], dtype=np.float64)
    slope = np.polyfit(xs, ys, 1)[0]
    for year in future_years:
        filled[year] = anchor + slope * (year - final_year)
    return filled
def build_index(input_path: Path, max_pair_year: int | None = None) -> pl.DataFrame:
    """Build the full price index from raw data.

    For "All" and each type group: solve national, area, district and sector
    indices, shrink national toward the hedonic index, shrink down the
    hierarchy, spatially smooth, and forward-fill every sector.

    If max_pair_year is set, only pairs before that year are used
    (backtesting holdout). The index is still forward-filled to CURRENT_YEAR,
    including for type groups that fall back to the hedonic index.

    Returns one row per (sector, type_group, year) with `log_index` and
    `n_pairs`, sorted by type_group, sector, year.

    Raises:
        ValueError: if no repeat-sale pairs could be extracted.
    """
    pairs = extract_pairs(input_path, max_year2=max_pair_year)
    if pairs.is_empty():
        # Without pairs there is no base year; fail with a clear message
        # instead of int(None) further down.
        raise ValueError(f"No repeat-sale pairs found in {input_path}")
    centroids = extract_centroids(input_path)

    min_year = int(pairs["year1"].min())
    max_year = CURRENT_YEAR

    hedonic_idx = compute_hedonic_index(
        input_path, min_year, max_year, max_sale_year=max_pair_year
    )
    # Gap-free version used when a type group has too few pairs; the raw
    # hedonic index can lack years (e.g. everything after a holdout cut-off).
    hedonic_filled = forward_fill(hedonic_idx, min_year, max_year)

    # Precompute hierarchy
    all_sectors = pairs["sector"].unique().to_list()
    sector_to_dist = {}
    dist_to_area = {}
    for s in all_sectors:
        d, a = hierarchy_keys(s)
        sector_to_dist[s] = d
        dist_to_area[d] = a

    # Process each type group + "All"
    all_type_groups = ["All"] + TYPE_GROUPS
    final = {}  # {type_group: {sector: {year: log_index}}}
    final_n = {}  # {type_group: {sector: n_pairs}}

    for tg in all_type_groups:
        print(f"\n--- {tg} ---")
        typed = pairs if tg == "All" else pairs.filter(pl.col("type_group") == tg)
        if len(typed) < MIN_PAIRS:
            print(f" Skipping (only {len(typed)} pairs)")
            final[tg] = {s: dict(hedonic_filled) for s in all_sectors}
            final_n[tg] = {s: 0 for s in all_sectors}
            continue
        print(f" {len(typed):,} pairs")

        # National
        np_arrs = typed.select("year1", "year2", "log_ratio", "weight")
        national_idx = solve_robust_index(
            np_arrs["year1"].to_numpy(),
            np_arrs["year2"].to_numpy(),
            np_arrs["log_ratio"].to_numpy(),
            np_arrs["weight"].to_numpy(),
        )
        national_n = len(typed)
        print(f" National: {len(national_idx)} years")

        # Area, district, sector
        print(" Computing per-level indices:")
        area_idx, area_n = compute_indices_for_level(typed, "area")
        district_idx, district_n = compute_indices_for_level(typed, "district")
        sector_idx, sector_n = compute_indices_for_level(typed, "sector")
        print(
            f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
        )

        # Shrinkage: national -> hedonic first, then hierarchical
        print(" Applying shrinkage...")
        national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
        sector_shrunk = hierarchical_shrinkage(
            sector_idx,
            sector_n,
            district_idx,
            district_n,
            area_idx,
            area_n,
            national_shrunk,
            all_sectors,
            sector_to_dist,
            dist_to_area,
            shrink_dicts,
        )

        # Spatial smoothing
        print(" Spatial smoothing...")
        sector_smoothed = spatial_smooth(
            sector_shrunk, centroids, sector_n, blend_dicts
        )

        # Forward fill
        for sec in all_sectors:
            sector_smoothed[sec] = forward_fill(
                sector_smoothed.get(sec, hedonic_idx), min_year, max_year
            )
        final[tg] = sector_smoothed
        final_n[tg] = sector_n

    # Assemble output
    print("\nAssembling output...")
    rows = [
        (sec, tg, year, log_idx, final_n[tg].get(sec, 0))
        for tg in all_type_groups
        for sec in all_sectors
        for year, log_idx in final[tg][sec].items()
    ]
    return pl.DataFrame(
        rows,
        schema={
            "sector": pl.String,
            "type_group": pl.String,
            "year": pl.Int32,
            "log_index": pl.Float64,
            "n_pairs": pl.Int64,
        },
        orient="row",
    ).sort("type_group", "sector", "year")
def main():
    """CLI entry point: build the index from --input and write it to --output."""
    parser = argparse.ArgumentParser(
        description="Build improved repeat-sales price index"
    )
    for flag in ("--input", "--output"):
        parser.add_argument(flag, type=Path, required=True)
    args = parser.parse_args()

    index = build_index(args.input)
    index.write_parquet(args.output)

    megabytes = args.output.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.output} ({megabytes:.1f} MB)")

    n_sectors = index["sector"].n_unique()
    n_types = index["type_group"].n_unique()
    n_years = index["year"].n_unique()
    print(
        f" {n_sectors:,} sectors x {n_types} types x {n_years} years = {len(index):,} rows"
    )


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,161 @@
"""kNN price estimation using nearby recently-sold properties.
For each target property, finds k nearest sold properties of the same type,
computes the median index-adjusted price-per-sqm, and multiplies by the
target's floor area to produce an estimate.
"""
from pathlib import Path
import numpy as np
import polars as pl
from scipy.spatial import KDTree
from pipeline.transform.price_estimation.utils import (
TYPE_GROUPS,
interpolate_log_index,
sector_expr,
type_group_expr,
)
# Default number of nearest neighbours whose adjusted price-per-sqm is medianed.
KNN_K = 20
# Minimum pool size per type group: smaller groups get no KD-tree and no estimate.
KNN_MIN_NEIGHBORS = 5
# Not used in this module. NOTE(review): presumably the weight given to the kNN
# estimate when a caller blends it with the index-based estimate — confirm there.
KNN_BLEND_WEIGHT = 0.35
def _scale_coords(lat: np.ndarray, lon: np.ndarray) -> np.ndarray:
"""Equirectangular projection: scale lon by cos(lat) for approximate distances."""
return np.column_stack([lat, lon * np.cos(np.radians(lat))])
def build_knn_pool(
    input_path: Path,
    index: pl.DataFrame,
    ref_frac_year: float,
    max_sale_year: int | None = None,
) -> dict[str, tuple[KDTree, np.ndarray]]:
    """Build one KD-tree of sold properties per type group.

    Pool members need coordinates, a positive floor area, a positive last
    known price, a postcode, a sale date and a recognised type group. Each
    member's price-per-sqm is moved from its sale date to `ref_frac_year`
    using the interpolated log index. When `max_sale_year` is given, only
    sales from earlier years enter the pool.

    Returns a dict mapping type_group -> (KDTree over scaled lat/lon,
    adjusted price-per-sqm array in the same row order). Type groups with
    fewer than KNN_MIN_NEIGHBORS members are left out.
    """
    print("Building kNN pool...")
    sale_date = pl.col("Date of last transaction")
    floor_area = pl.col("Total floor area (sqm)")
    price = pl.col("Last known price")

    candidates = (
        pl.scan_parquet(input_path)
        .select(
            "Postcode",
            "Property type",
            "lat",
            "lon",
            "Total floor area (sqm)",
            "Last known price",
            "Date of last transaction",
        )
        .filter(
            pl.col("lat").is_not_null(),
            pl.col("lon").is_not_null(),
            floor_area.is_not_null(),
            floor_area > 0,
            price.is_not_null(),
            price > 0,
            pl.col("Postcode").is_not_null(),
            sale_date.is_not_null(),
        )
    )
    if max_sale_year is not None:
        candidates = candidates.filter(sale_date.dt.year() < max_sale_year)

    # Fractional sale year: January -> .0, December -> ~.917.
    sale_frac_year = (
        sale_date.dt.year().cast(pl.Float64)
        + (sale_date.dt.month().cast(pl.Float64) - 1.0) / 12.0
    )
    pool = candidates.with_columns(
        sector_expr(),
        type_group_expr(),
        sale_frac_year.alias("_sale_fy"),
        pl.lit(ref_frac_year).alias("_ref_fy"),
    ).collect()
    pool = pool.filter(pl.col("type_group").is_not_null())
    print(f" {len(pool):,} pool properties with lat/lon, floor area, price")

    # Log index at the sale date and at the reference date.
    for frac_year_col, out_col in (("_sale_fy", "_li_sale"), ("_ref_fy", "_li_ref")):
        pool = interpolate_log_index(
            index, pool, "sector", "type_group", frac_year_col, out_col
        )

    # adjusted_psm = price / floor_area * exp(log_index_ref - log_index_sale)
    growth = (pl.col("_li_ref") - pl.col("_li_sale")).exp()
    adjusted_psm = price.cast(pl.Float64) / floor_area.cast(pl.Float64) * growth
    pool = pool.with_columns(adjusted_psm.alias("_adj_psm")).filter(
        pl.col("_adj_psm").is_not_null(),
        pl.col("_adj_psm").is_finite(),
        pl.col("_adj_psm") > 0,
    )
    print(f" {len(pool):,} after index adjustment")

    trees: dict[str, tuple[KDTree, np.ndarray]] = {}
    for group in TYPE_GROUPS:
        members = pool.filter(pl.col("type_group") == group)
        size = len(members)
        if size < KNN_MIN_NEIGHBORS:
            continue
        lat = members["lat"].to_numpy().astype(np.float64)
        lon = members["lon"].to_numpy().astype(np.float64)
        psm = members["_adj_psm"].to_numpy().astype(np.float64)
        trees[group] = (KDTree(_scale_coords(lat, lon)), psm)
        print(f" {group}: {size:,}")
    return trees
def knn_median_psm(
    trees: dict[str, tuple[KDTree, np.ndarray]],
    lat: np.ndarray,
    lon: np.ndarray,
    type_groups: np.ndarray,
    k: int = KNN_K,
) -> np.ndarray:
    """Median adjusted price-per-sqm of the k nearest pool properties per target.

    Targets are matched only against the tree of their own type group, and
    values are expressed at the reference date the pool was built for.
    The result is NaN for targets with non-finite coordinates, a type group
    that has no tree, or a pool offering fewer than KNN_MIN_NEIGHBORS
    neighbours.
    """
    medians = np.full(len(lat), np.nan)
    for group, (tree, pool_psm) in trees.items():
        eligible = (type_groups == group) & np.isfinite(lat) & np.isfinite(lon)
        targets = np.flatnonzero(eligible)
        if targets.size == 0:
            continue

        # Never ask for more neighbours than the pool holds.
        n_neighbors = min(k, len(pool_psm))
        if n_neighbors < KNN_MIN_NEIGHBORS:
            continue

        query_points = _scale_coords(lat[targets], lon[targets])
        _, neighbor_idx = tree.query(query_points, k=n_neighbors)
        if neighbor_idx.ndim == 1:
            # k == 1 returns a flat array; restore the (targets, k) shape.
            neighbor_idx = neighbor_idx.reshape(-1, 1)
        medians[targets] = np.nanmedian(pool_psm[neighbor_idx], axis=1)
    return medians

View file

@ -0,0 +1,140 @@
"""Hierarchical shrinkage and spatial smoothing for sector-level estimates."""
from typing import Callable, TypeVar
import numpy as np
from scipy.spatial import KDTree
from pipeline.transform.price_estimation.utils import SHRINKAGE_K
# Generic per-sector value type shared by the helpers in this module; the
# caller-supplied shrink/blend functions define how such values combine.
V = TypeVar("V")
# Number of nearest neighbouring sectors blended into each sparse sector.
SPATIAL_NEIGHBORS = 5
# Pseudo-count for spatial blending: a sector keeps n / (n + SPATIAL_BLEND_K)
# of its own value.
SPATIAL_BLEND_K = 30
def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
    """Pull each value of `raw` toward `parent` with weight n / (n + SHRINKAGE_K).

    The result covers the union of both key sets. A key present on only one
    side takes that side's value for both terms, so it passes through
    unchanged. Works for any dict keyed by year or category.
    """
    own_weight = n / (n + SHRINKAGE_K)
    shrunk = {}
    for key in set(raw) | set(parent):
        if key in raw and key in parent:
            own, prior = raw[key], parent[key]
        elif key in raw:
            own = prior = raw[key]
        else:
            own = prior = parent[key]
        shrunk[key] = own_weight * own + (1 - own_weight) * prior
    return shrunk
def hierarchical_shrinkage(
    sector_vals: dict[str, V],
    sector_n: dict[str, int],
    district_vals: dict[str, V],
    district_n: dict[str, int],
    area_vals: dict[str, V],
    area_n: dict[str, int],
    top_level: V,
    all_sectors: list[str],
    sector_to_dist: dict[str, str],
    dist_to_area: dict[str, str],
    shrink_fn: Callable[[V, V, int], V],
) -> dict[str, V]:
    """Shrink values level by level: area -> top, district -> area, sector -> district.

    `shrink_fn(raw, parent, n)` blends a raw value toward its parent given
    the raw value's sample count. Whenever the natural parent has no value,
    `top_level` stands in for it.

    Every sector in `all_sectors` that has no value of its own receives its
    district's shrunk value, else its area's, else `top_level`.
    """
    area_shrunk = {
        area: shrink_fn(value, top_level, area_n[area])
        for area, value in area_vals.items()
    }

    district_shrunk = {
        dist: shrink_fn(
            value,
            area_shrunk.get(dist_to_area.get(dist, ""), top_level),
            district_n[dist],
        )
        for dist, value in district_vals.items()
    }

    sector_shrunk = {
        sector: shrink_fn(
            value,
            district_shrunk.get(sector_to_dist.get(sector, ""), top_level),
            sector_n[sector],
        )
        for sector, value in sector_vals.items()
    }

    # Sectors without their own estimate inherit the nearest available ancestor.
    for sector in all_sectors:
        if sector in sector_shrunk:
            continue
        dist = sector_to_dist.get(sector, "")
        inherited = area_shrunk.get(dist_to_area.get(dist, ""), top_level)
        sector_shrunk[sector] = district_shrunk.get(dist, inherited)
    return sector_shrunk
def spatial_smooth(
    sector_values: dict[str, V],
    centroids: dict[str, tuple[float, float]],
    counts: dict[str, int],
    blend_fn: Callable[[V, list[V], float, list[float]], V],
) -> dict[str, V]:
    """Blend thinly-observed sectors with their nearest neighbouring sectors.

    A sector keeps a share n / (n + SPATIAL_BLEND_K) of its own value; the
    remainder is split among its SPATIAL_NEIGHBORS nearest sectors in
    proportion to inverse distance. Sectors whose own share exceeds 0.95,
    or that have no centroid, are returned unchanged. Neighbour values are
    always read from the input, never from already-smoothed results.

    With too few located sectors to fill a neighbourhood the input dict is
    returned as-is.
    """
    located = [s for s in sector_values if s in centroids]
    if len(located) <= SPATIAL_NEIGHBORS:
        return sector_values

    latlon = np.array([centroids[s] for s in located])
    # Scale longitude by cos(mean latitude) so distances are roughly Euclidean.
    lon_scale = np.cos(np.radians(np.mean(latlon[:, 0])))
    points = np.column_stack([latlon[:, 0], latlon[:, 1] * lon_scale])
    tree = KDTree(points)

    smoothed = dict(sector_values)
    for pos, sector in enumerate(located):
        n_obs = counts.get(sector, 0)
        own_share = n_obs / (n_obs + SPATIAL_BLEND_K)
        if own_share > 0.95:
            continue  # enough data of its own

        dists, idxs = tree.query(points[pos], k=SPATIAL_NEIGHBORS + 1)
        # The first hit is treated as the query point itself and dropped;
        # zero-distance neighbours are skipped to avoid dividing by zero.
        usable = [
            (1.0 / dist, sector_values[located[j]])
            for dist, j in zip(dists[1:], idxs[1:])
            if dist > 0 and located[j] in sector_values
        ]
        if not usable:
            continue

        inverse_dists = [item[0] for item in usable]
        neighbor_vals = [item[1] for item in usable]
        total_inverse = sum(inverse_dists)
        neighbor_share = 1.0 - own_share
        neighbor_ws = [inv / total_inverse * neighbor_share for inv in inverse_dists]
        smoothed[sector] = blend_fn(
            sector_values[sector], neighbor_vals, own_share, neighbor_ws
        )
    return smoothed
def blend_dicts(
    self_val: dict, neighbor_vals: list[dict], self_w: float, neighbor_ws: list[float]
) -> dict:
    """Blend dict values by weighted sum across the union of all keys.

    When every contributor has a key, the result is exactly
    `self_w * self_val[k] + sum(w_i * neighbor_i[k])`.

    When some contributors lack a key, they are left out and the remaining
    weights are rescaled to the original total, rather than counting the
    missing value as 0.0 (which would drag the blend toward zero). If the
    contributors that do have the key carry no positive weight, their plain
    mean is used.
    """
    contributors = [(self_val, self_w), *zip(neighbor_vals, neighbor_ws)]
    total_w = sum(weight for _, weight in contributors)

    all_keys: set = set(self_val)
    for nv in neighbor_vals:
        all_keys |= set(nv)

    result = {}
    for k in all_keys:
        weighted = 0.0
        present_w = 0.0
        present_vals = []
        for source, weight in contributors:
            if k in source:
                weighted += weight * source[k]
                present_w += weight
                present_vals.append(source[k])

        if len(present_vals) == len(contributors):
            # Everyone contributes: plain weighted sum, no rescaling.
            result[k] = weighted
        elif not present_vals:
            # Key only in a neighbour that has no matching weight.
            result[k] = 0.0
        elif present_w > 0:
            result[k] = weighted * total_w / present_w
        else:
            result[k] = sum(present_vals) / len(present_vals)
    return result

View file

@ -0,0 +1,233 @@
"""Shared utilities for price estimation modules."""
from datetime import date
from pathlib import Path
import numpy as np
import polars as pl
# Snapshot "today" once at import so the three constants below always agree.
_today = date.today()
# Derived from the clock rather than hard-coded: a fixed year would fall behind
# CURRENT_FRAC_YEAR after a year boundary, so indices forward-filled to
# CURRENT_YEAR would no longer cover the reference date.
CURRENT_YEAR = _today.year
# Year plus elapsed whole months as a fraction (January -> .0).
CURRENT_FRAC_YEAR = _today.year + (_today.month - 1) / 12
CURRENT_MONTH = _today.month
# Cap on log(index_ratio) to prevent wild estimates from thin sectors
MAX_LOG_ADJUSTMENT = 3.0 # ~20x max price change
# "Property type" values that type_group_expr() maps to the "Terraced" group.
TERRACE_TYPES = [
    "Mid-Terrace",
    "End-Terrace",
    "Enclosed Mid-Terrace",
    "Enclosed End-Terrace",
    "Terraced",
]
# "Property type" values that type_group_expr() maps to the "Flats" group.
FLAT_TYPES = ["Flats/Maisonettes", "Flat", "Maisonette"]
# All type groups type_group_expr() can produce; any other property type maps to null.
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats", "Bungalow"]
# Pseudo-count for shrinkage weights of the form n / (n + SHRINKAGE_K).
SHRINKAGE_K = 50
def type_group_expr():
    """Polars expression mapping "Property type" to a `type_group` column.

    Terrace variants collapse to "Terraced", flat/maisonette variants to
    "Flats"; "Bungalow", "Detached" and "Semi-Detached" keep their name.
    Anything else becomes null.
    """
    prop_type = pl.col("Property type")
    keeps_own_name = prop_type.is_in(["Detached", "Semi-Detached"])
    grouped = (
        pl.when(prop_type.is_in(TERRACE_TYPES))
        .then(pl.lit("Terraced"))
        .when(prop_type.is_in(FLAT_TYPES))
        .then(pl.lit("Flats"))
        .when(prop_type == "Bungalow")
        .then(pl.lit("Bungalow"))
        .when(keeps_own_name)
        .then(prop_type)
        .otherwise(pl.lit(None))
    )
    return grouped.alias("type_group")
def sector_expr():
    """Polars expression deriving a `sector` column from "Postcode".

    Removes the last two characters of the postcode and strips surrounding
    whitespace from what remains.
    """
    postcode = pl.col("Postcode")
    kept_length = postcode.str.len_chars() - 2
    return postcode.str.slice(0, kept_length).str.strip_chars().alias("sector")
def hierarchy_keys(sector: str) -> tuple[str, str]:
    """Split a sector string into its `(district, area)` ancestors.

    The district is everything before the last space (the whole string when
    there is no space); the area is the district's leading run of letters.
    """
    # rsplit leaves a space-free string untouched, so no separate check is needed.
    district = sector.rsplit(" ", 1)[0]
    first_non_letter = next(
        (pos for pos, ch in enumerate(district) if not ch.isalpha()),
        len(district),
    )
    return district, district[:first_non_letter]
# Type groups that get their own dummy column; "Detached" is the reference level.
NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats", "Bungalow"]


def build_hedonic_features(df: pl.DataFrame) -> np.ndarray:
    """Return the hedonic design matrix for `df`, without an intercept.

    Column 0 is log(floor area), with areas below 1 clamped to 1 first;
    columns 1-4 are 0/1 indicators for each entry of NON_REF_TYPES, so a
    detached property has all four set to 0.
    """
    floor_area = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
    log_area = np.log(np.maximum(floor_area, 1.0))
    groups = df["type_group"].to_numpy()
    dummies = [(groups == label).astype(np.float32) for label in NON_REF_TYPES]
    return np.column_stack([log_area, *dummies])
def interpolate_log_index(
    index: pl.DataFrame,
    df: pl.DataFrame,
    sector_col: str,
    type_col: str,
    frac_year_col: str,
    output_alias: str,
) -> pl.DataFrame:
    """Add `output_alias`: the log index linearly interpolated at a fractional year.

    The index is looked up at the floor and the ceiling of `frac_year_col`
    and the two values are mixed by the fractional part, e.g. 2019.75 gives
    0.25 * idx_2019 + 0.75 * idx_2020. If only one of the two lookups finds
    a value, that value is used alone; if neither does, the result is null.
    All temporary columns are dropped before returning.
    """
    prefix = f"_{output_alias}"
    lo_value, hi_value = f"{prefix}_floor", f"{prefix}_ceil"
    lo_year, hi_year = f"{prefix}_floor_year", f"{prefix}_ceil_year"
    frac_name = f"{prefix}_frac"

    moment = pl.col(frac_year_col)
    df = df.with_columns(
        moment.floor().cast(pl.Int32).alias(lo_year),
        moment.ceil().cast(pl.Int32).alias(hi_year),
        (moment - moment.floor()).alias(frac_name),
    )

    for year_name, value_name in ((lo_year, lo_value), (hi_year, hi_value)):
        df = join_type_stratified_index(
            df, index, sector_col, type_col, year_name, value_name
        )

    lo, hi, frac = pl.col(lo_value), pl.col(hi_value), pl.col(frac_name)
    # Interpolate: (1-frac)*floor + frac*ceil, with fallbacks
    interpolated = (
        pl.when(lo.is_not_null() & hi.is_not_null())
        .then((1.0 - frac) * lo + frac * hi)
        .when(lo.is_not_null())
        .then(lo)
        .when(hi.is_not_null())
        .then(hi)
        .otherwise(pl.lit(None))
    )
    return df.with_columns(interpolated.alias(output_alias)).drop(
        lo_value, hi_value, lo_year, hi_year, frac_name
    )
def extract_centroids(input_path) -> dict[str, tuple[float, float]]:
    """Compute mean lat/lon per postcode sector.

    Only rows with a postcode and BOTH coordinates contribute, so the two
    means are taken over the same rows and no sector ends up with a missing
    longitude.

    Returns a dict mapping sector -> (mean lat, mean lon).
    """
    print("Computing sector centroids...")
    df = (
        pl.scan_parquet(input_path)
        .select("Postcode", "lat", "lon")
        .filter(
            pl.col("Postcode").is_not_null(),
            pl.col("lat").is_not_null(),
            # A null lon would skew the centroid, or leave a None coordinate
            # for the spatial code to trip over.
            pl.col("lon").is_not_null(),
        )
        .with_columns(sector_expr())
        .group_by("sector")
        .agg(pl.col("lat").mean(), pl.col("lon").mean())
        .collect()
    )
    centroids = {
        row["sector"]: (row["lat"], row["lon"]) for row in df.iter_rows(named=True)
    }
    print(f" {len(centroids):,} sector centroids")
    return centroids
def join_type_stratified_index(
    df: pl.DataFrame,
    index: pl.DataFrame,
    sector_col: str,
    type_col: str,
    year_col: str,
    output_alias: str,
) -> pl.DataFrame:
    """Attach the log index for each row's sector, type group and year as `output_alias`.

    The type-specific index value is preferred; where it is missing, the
    value from the pooled "All" index for the same sector and year is used.
    Both lookups are left joins, so rows without any match keep a null.
    """
    typed_name = f"_{output_alias}_typed"
    pooled_name = f"_{output_alias}_all"

    typed_lookup = index.filter(pl.col("type_group") != "All").select(
        "sector", "type_group", "year", pl.col("log_index").alias(typed_name)
    )
    pooled_lookup = index.filter(pl.col("type_group") == "All").select(
        "sector", "year", pl.col("log_index").alias(pooled_name)
    )

    with_typed = df.join(
        typed_lookup,
        left_on=[sector_col, type_col, year_col],
        right_on=["sector", "type_group", "year"],
        how="left",
    )
    with_both = with_typed.join(
        pooled_lookup,
        left_on=[sector_col, year_col],
        right_on=["sector", "year"],
        how="left",
    )

    chosen = pl.col(typed_name).fill_null(pl.col(pooled_name))
    return with_both.with_columns(chosen.alias(output_alias)).drop(
        typed_name, pooled_name
    )
def compute_seasonal_factors(
    input_path: Path, max_sale_year: int | None = None
) -> np.ndarray:
    """Compute 12 multiplicative monthly price factors from price-per-sqm.

    Detrends by normalizing median £/sqm within each year, then averages
    across years. Always returns an array of exactly 12 factors
    (index 0 = January), normalized so mean = 1.0. A month with no usable
    data gets a neutral factor of 1.0 before normalization, so the
    remaining months keep their positions.

    If max_sale_year is set, only sales before that year are used.
    """
    query = (
        pl.scan_parquet(input_path)
        .select("Last known price", "Total floor area (sqm)", "Date of last transaction")
        .filter(
            pl.col("Last known price").is_not_null(),
            pl.col("Last known price") > 0,
            pl.col("Total floor area (sqm)").is_not_null(),
            pl.col("Total floor area (sqm)") > 0,
            pl.col("Date of last transaction").is_not_null(),
        )
        .with_columns(
            (
                pl.col("Last known price").cast(pl.Float64)
                / pl.col("Total floor area (sqm)").cast(pl.Float64)
            ).alias("psm"),
            pl.col("Date of last transaction").dt.month().alias("month"),
            pl.col("Date of last transaction").dt.year().alias("year"),
        )
    )
    if max_sale_year is not None:
        query = query.filter(pl.col("year") < max_sale_year)

    monthly = (
        query.group_by("year", "month")
        .agg(pl.col("psm").median().alias("median_psm"))
        .with_columns(
            pl.col("median_psm").mean().over("year").alias("year_mean"),
        )
        .with_columns(
            (pl.col("median_psm") / pl.col("year_mean")).alias("ratio"),
        )
        .group_by("month")
        .agg(pl.col("ratio").mean().alias("factor"))
        .sort("month")
        .collect()
    )

    # Place each factor by its month number instead of relying on all twelve
    # rows being present; otherwise a missing month would shift later ones.
    factors = np.ones(12, dtype=np.float64)
    for month, factor in zip(monthly["month"].to_list(), monthly["factor"].to_list()):
        if month is not None and factor is not None and np.isfinite(factor):
            factors[int(month) - 1] = float(factor)
    return factors / factors.mean()

View file

@ -100,6 +100,7 @@ DROP_CATEGORIES = {
"building/entrance",
"building/entry",
"building/farm",
"building/farm_auxiliary",
"building/garage",
"building/garages",
"building/greenhouse",