lmao
This commit is contained in:
parent
03445188ea
commit
524580eb25
102 changed files with 36625 additions and 1295 deletions
417
pipeline/transform/add_online_listings.py
Normal file
417
pipeline/transform/add_online_listings.py
Normal file
|
|
@ -0,0 +1,417 @@
|
|||
"""Add online buy/rent listings to wide.parquet as new rows.
|
||||
|
||||
Matches online listings to existing historical rows by postcode + fuzzy address,
|
||||
carrying over historical prices and area-level data for matched properties.
|
||||
Unmatched listings get area data from any same-postcode row in wide.
|
||||
|
||||
Modifies wide.parquet in-place, adding:
|
||||
- A `Listing status` column to all rows ("Historical sale" / "For sale" / "For rent")
|
||||
- New columns: Asking price, Asking rent (monthly), Bedrooms, Bathrooms,
|
||||
Listing date, Property sub-type, Listing URL, Price qualifier
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from os import cpu_count
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
from thefuzz import fuzz
|
||||
from tqdm import tqdm
|
||||
|
||||
from pipeline.utils.fuzzy_join import _numbers_compatible
|
||||
|
||||
_NORMALIZE_RE = re.compile(r"[,.\-]")
|
||||
_WHITESPACE_RE = re.compile(r"\s+")
|
||||
|
||||
# Columns that are property-specific (carried from matched historical row only)
|
||||
_PROPERTY_COLUMNS = [
|
||||
"Last known price",
|
||||
"Date of last transaction",
|
||||
"historical_prices",
|
||||
"renovation_history",
|
||||
"Construction age",
|
||||
"Is construction date approximate",
|
||||
"Current energy rating",
|
||||
"Potential energy rating",
|
||||
"Address per EPC",
|
||||
"Interior height (m)",
|
||||
"Number of bedrooms & living rooms",
|
||||
"Price per sqm",
|
||||
"Estimated current price",
|
||||
"Est. price per sqm",
|
||||
]
|
||||
|
||||
# Columns that are area-level (carried from matched row, or any same-postcode row)
|
||||
_AREA_COLUMNS = [
|
||||
"Public transport to Bank (mins)",
|
||||
"Cycling to Bank (mins)",
|
||||
"Public transport to Fitzrovia (mins)",
|
||||
"Cycling to Fitzrovia (mins)",
|
||||
"Income Score (rate)",
|
||||
"Employment Score (rate)",
|
||||
"Education, Skills and Training Score",
|
||||
"Health Deprivation and Disability Score",
|
||||
"Living Environment Score",
|
||||
"Indoors Sub-domain Score",
|
||||
"Outdoors Sub-domain Score",
|
||||
"% Asian",
|
||||
"% Black",
|
||||
"% Mixed",
|
||||
"% White",
|
||||
"% Other",
|
||||
"Estimated monthly rent",
|
||||
"Criminal damage and arson (avg/yr)",
|
||||
"Violence and sexual offences (avg/yr)",
|
||||
"Drugs (avg/yr)",
|
||||
"Anti-social behaviour (avg/yr)",
|
||||
"Public order (avg/yr)",
|
||||
"Other crime (avg/yr)",
|
||||
"Burglary (avg/yr)",
|
||||
"Vehicle crime (avg/yr)",
|
||||
"Theft from the person (avg/yr)",
|
||||
"Possession of weapons (avg/yr)",
|
||||
"Other theft (avg/yr)",
|
||||
"Shoplifting (avg/yr)",
|
||||
"Bicycle theft (avg/yr)",
|
||||
"Robbery (avg/yr)",
|
||||
"Serious crime (avg/yr)",
|
||||
"Minor crime (avg/yr)",
|
||||
"Number of restaurants within 2km",
|
||||
"Number of grocery shops and supermarkets within 2km",
|
||||
"Number of parks within 2km",
|
||||
"Number of public transport stations within 2km",
|
||||
"Noise (dB)",
|
||||
"Good+ primary schools within 5km",
|
||||
"Good+ secondary schools within 5km",
|
||||
"Max available download speed (Mbps)",
|
||||
"Collapsible deposits risk",
|
||||
"Compressible ground risk",
|
||||
"Landslide risk",
|
||||
"Running sand risk",
|
||||
"Shrink-swell risk",
|
||||
"Soluble rocks risk",
|
||||
"Environmental risk",
|
||||
]
|
||||
|
||||
|
||||
def _normalize(s: str) -> str:
    """Canonicalise an address string before fuzzy comparison.

    The text is upper-cased, every character matched by ``_NORMALIZE_RE``
    (comma, full stop, hyphen) is replaced with a space, each whitespace run
    is collapsed to a single space, and the result is trimmed at both ends.
    """
    without_punctuation = _NORMALIZE_RE.sub(" ", s.upper())
    single_spaced = _WHITESPACE_RE.sub(" ", without_punctuation)
    return single_spaced.strip()
|
||||
|
||||
|
||||
def _score_bucket(
    args: tuple[list[tuple[int, str]], list[tuple[int, str]]],
) -> list[tuple[int, int, int]]:
    """Score every wide/online address pair of one postcode bucket.

    Args:
        args: A single ``(wide_entries, online_entries)`` tuple, where each
            entry is ``(row_index, address)``. Both halves travel in one
            argument so the function can be mapped over a list of buckets.

    Returns:
        One ``(score, online_idx, wide_idx)`` tuple per pair accepted by
        ``_numbers_compatible``, in wide-major order. ``score`` is whatever
        ``fuzz.token_sort_ratio`` returns for the two addresses.
    """
    wide_entries, online_entries = args
    return [
        (fuzz.token_sort_ratio(wide_address, online_address), online_idx, wide_idx)
        for wide_idx, wide_address in wide_entries
        for online_idx, online_address in online_entries
        if _numbers_compatible(wide_address, online_address)
    ]
|
||||
|
||||
|
||||
def _load_online(buy_path: Path, rent_path: Path) -> pl.DataFrame:
    """Read the buy and rent listing parquets into one tagged DataFrame.

    Each source is tagged with a ``_channel`` column ("For sale" /
    "For rent"), the two are concatenated, and two derived columns are added:

    - ``Asking price``: the raw ``price`` for sale listings, null otherwise.
    - ``Asking rent (monthly)``: for rent listings, ``price`` converted to a
      whole monthly amount according to ``price_frequency``; null otherwise.
    """
    tagged_sources = [
        pl.scan_parquet(path).with_columns(pl.lit(channel).alias("_channel"))
        for path, channel in ((buy_path, "For sale"), (rent_path, "For rent"))
    ]
    listings = pl.concat(tagged_sources).collect()

    frequency = listings["price_frequency"]
    amount = listings["price"].cast(pl.Float64)

    # Convert every supported billing period to a per-month figure. Any other
    # frequency value (including a missing one) is treated as already monthly.
    rent_per_month = (
        pl.when(frequency == "weekly")
        .then(amount * 52.0 / 12.0)
        .when(frequency == "yearly")
        .then(amount / 12.0)
        .when(frequency == "daily")
        .then(amount * 365.25 / 12.0)
        .when(frequency == "quarterly")
        .then(amount / 3.0)
        .otherwise(amount)
        .round(0)
        .cast(pl.Int64)
    )

    channel = pl.col("_channel")
    asking_price = (
        pl.when(channel == "For sale")
        .then(pl.col("price"))
        .otherwise(None)
        .alias("Asking price")
    )
    asking_rent = (
        pl.when(channel == "For rent")
        .then(rent_per_month)
        .otherwise(None)
        .alias("Asking rent (monthly)")
    )
    return listings.with_columns(asking_price, asking_rent)
|
||||
|
||||
|
||||
def _group_by_postcode(
    postcodes: pl.Series,
    addresses: pl.Series,
) -> dict[str, list[tuple[int, str]]]:
    """Bucket ``(row_index, normalized_address)`` entries by upper-cased postcode."""
    buckets: dict[str, list[tuple[int, str]]] = {}
    rows = zip(postcodes.to_list(), addresses.to_list())
    for row_idx, (postcode, address) in enumerate(rows):
        # A row missing either key cannot take part in matching.
        if postcode is None or address is None:
            continue
        buckets.setdefault(postcode.strip().upper(), []).append(
            (row_idx, _normalize(address))
        )
    return buckets


def _match_online_to_wide(
    wide: pl.DataFrame,
    online: pl.DataFrame,
) -> dict[int, int]:
    """Match online listings to wide rows by postcode + fuzzy address.

    Rows are bucketed by stripped, upper-cased postcode; only postcodes
    present in both frames are scored. Every candidate pair inside a bucket
    is scored by ``_score_bucket`` in a process pool, then pairs are assigned
    greedily from the highest score down so that each online listing and each
    wide row is used at most once.

    Args:
        wide: Frame with ``Postcode`` and ``Address per Property Register``.
        online: Frame with ``postcode`` and ``address``.

    Returns:
        Dict mapping online row index → wide row index.
    """
    wide_by_postcode = _group_by_postcode(
        wide["Postcode"], wide["Address per Property Register"]
    )
    online_by_postcode = _group_by_postcode(online["postcode"], online["address"])

    # One task per postcode that occurs on both sides.
    tasks = [
        (wide_by_postcode[pc], online_entries)
        for pc, online_entries in online_by_postcode.items()
        if pc in wide_by_postcode
    ]

    # Score in parallel; executor.map yields results in task order.
    all_pairs: list[tuple[int, int, int]] = []
    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
        for pairs in tqdm(
            executor.map(_score_bucket, tasks, chunksize=64),
            total=len(tasks),
            desc="Matching online listings",
        ):
            all_pairs.extend(pairs)

    # The buckets are no longer needed; release them before sorting.
    del tasks, wide_by_postcode, online_by_postcode

    # Greedy assignment: best score first; ties go to the lower online index.
    all_pairs.sort(key=lambda t: (t[0], -t[1]), reverse=True)

    matches: dict[int, int] = {}  # online_idx → wide_idx
    matched_wide: set[int] = set()

    for _score, online_idx, wide_idx in all_pairs:
        if online_idx in matches or wide_idx in matched_wide:
            continue
        matches[online_idx] = wide_idx
        matched_wide.add(wide_idx)

    return matches
|
||||
|
||||
|
||||
def _build_postcode_area_lookup(wide: pl.DataFrame) -> dict[str, int]:
    """Map each postcode to the index of the first wide row that carries it.

    Postcodes are stripped and upper-cased before being used as keys; rows
    with a null postcode are ignored. The result serves as the area-data
    fallback for listings that have no address match.
    """
    first_row_by_postcode: dict[str, int] = {}
    for row_idx, postcode in enumerate(wide["Postcode"]):
        if postcode is None:
            continue
        # setdefault keeps the earliest row index seen for a postcode.
        first_row_by_postcode.setdefault(postcode.strip().upper(), row_idx)
    return first_row_by_postcode
|
||||
|
||||
|
||||
def _build_online_rows(
    wide: pl.DataFrame,
    online: pl.DataFrame,
    matches: dict[int, int],
    postcode_lookup: dict[str, int],
) -> pl.DataFrame:
    """Build a DataFrame of online listing rows with all wide.parquet columns.

    Every online listing becomes one row that has all columns of ``wide``
    (null unless filled below) followed by the nine listing-specific columns.

    Args:
        wide: Existing wide frame; supplies the schema and the carried data.
        online: Listings as returned by ``_load_online``.
        matches: online row index → wide row index of the matched property.
        postcode_lookup: upper-cased postcode → wide row index, used as the
            area-data source for listings without a match.

    Returns:
        A frame with ``online.height`` rows: the wide columns in schema order,
        then the new listing columns.
    """
    # Imported once here rather than on every loop iteration.
    from datetime import datetime

    wide_schema = wide.schema
    n = online.height

    # Listing-specific columns appended after the wide columns, with dtypes.
    new_column_types = {
        "Listing status": pl.String,
        "Asking price": pl.Int64,
        "Asking rent (monthly)": pl.Int64,
        "Bedrooms": pl.Int32,
        "Bathrooms": pl.Int32,
        "Listing date": pl.Datetime("us"),
        "Property sub-type": pl.String,
        "Listing URL": pl.String,
        "Price qualifier": pl.String,
    }

    # Start with every column entirely null.
    columns: dict[str, list] = {col: [None] * n for col in wide_schema}
    for col in new_column_types:
        columns[col] = [None] * n

    # (output column, online column) pairs copied over verbatim.
    # NOTE(review): "Leashold/Freehold" must match the existing wide column
    # name exactly — confirm before correcting the spelling.
    direct_mappings = [
        ("Address per Property Register", "address"),
        ("Postcode", "postcode"),
        ("lat", "latitude"),
        ("lon", "longitude"),
        ("Property type", "property_type"),
        ("Leashold/Freehold", "tenure"),
        ("Total floor area (sqm)", "floorspace_sqm"),
        ("Listing status", "_channel"),
        ("Asking price", "Asking price"),
        ("Asking rent (monthly)", "Asking rent (monthly)"),
        ("Bedrooms", "bedrooms"),
        ("Bathrooms", "bathrooms"),
        ("Property sub-type", "property_sub_type"),
        ("Listing URL", "url"),
        ("Price qualifier", "price_qualifier"),
    ]
    for target, source in direct_mappings:
        # Slice assignment keeps the KeyError for a target missing from wide.
        columns[target][:] = online[source].to_list()

    # Parse listing dates; values that cannot be parsed stay null (best effort).
    listing_dates = columns["Listing date"]
    for i, fvd in enumerate(online["first_visible_date"].to_list()):
        if fvd is None:
            continue
        try:
            dt = datetime.fromisoformat(fvd.replace("Z", "+00:00"))
            listing_dates[i] = dt.replace(tzinfo=None)
        except (ValueError, TypeError):
            pass

    # Fetch the carried source columns once instead of once per row.
    property_sources = [
        (col, wide[col]) for col in _PROPERTY_COLUMNS if col in wide_schema
    ]
    area_sources = [(col, wide[col]) for col in _AREA_COLUMNS if col in wide_schema]

    for i, postcode in enumerate(columns["Postcode"]):
        matched_wide_idx = matches.get(i)

        # Area data comes from the matched row, else any same-postcode row.
        area_source_idx = matched_wide_idx
        if area_source_idx is None and postcode:
            area_source_idx = postcode_lookup.get(postcode.strip().upper())

        # Property-specific data is only carried over from a real match.
        if matched_wide_idx is not None:
            for col, series in property_sources:
                columns[col][i] = series[matched_wide_idx]

        if area_source_idx is not None:
            for col, series in area_sources:
                columns[col][i] = series[area_source_idx]

    # Build typed Series: wide columns first, then the new listing columns.
    series_list = [
        pl.Series(col_name, columns[col_name], dtype=dtype)
        for col_name, dtype in wide_schema.items()
    ]
    series_list.extend(
        pl.Series(col_name, columns[col_name], dtype=dtype)
        for col_name, dtype in new_column_types.items()
    )

    return pl.DataFrame(series_list)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: append online buy/rent listings to wide.parquet.

    Reads ``--input`` (wide.parquet), ``--buy`` and ``--rent``, matches the
    listings to historical rows, builds one new row per listing, tags every
    row with a ``Listing status`` and writes the result back to ``--input``.

    The script is safe to re-run on a file it has already augmented: rows and
    columns added by an earlier run are removed before listings are appended.
    """
    parser = argparse.ArgumentParser(
        description="Add online buy/rent listings to wide.parquet"
    )
    parser.add_argument(
        "--input",
        type=Path,
        required=True,
        help="wide.parquet path (modified in-place)",
    )
    parser.add_argument(
        "--buy", type=Path, required=True, help="rightmove_buy.parquet path"
    )
    parser.add_argument(
        "--rent", type=Path, required=True, help="rightmove_rent.parquet path"
    )
    args = parser.parse_args()

    # Listing-only columns; null on historical rows.
    listing_columns = {
        "Asking price": pl.Int64,
        "Asking rent (monthly)": pl.Int64,
        "Bedrooms": pl.Int32,
        "Bathrooms": pl.Int32,
        "Listing date": pl.Datetime("us"),
        "Property sub-type": pl.String,
        "Listing URL": pl.String,
        "Price qualifier": pl.String,
    }

    print("Loading wide.parquet...")
    wide = pl.read_parquet(args.input)
    print(f" {wide.height} rows, {wide.width} columns")

    # Re-run safety: the output overwrites the input, so a second run sees the
    # columns and listing rows added by the first. Strip them, otherwise the
    # new columns would be duplicated and old listings relabelled.
    if "Listing status" in wide.columns:
        wide = wide.filter(pl.col("Listing status") == "Historical sale")
        stale = ["Listing status", *listing_columns]
        wide = wide.drop([col for col in stale if col in wide.columns])
        print(
            f" Removed previously added listings: {wide.height} historical rows remain"
        )

    print("Loading online listings...")
    online = _load_online(args.buy, args.rent)
    n_buy = online.filter(pl.col("_channel") == "For sale").height
    n_rent = online.filter(pl.col("_channel") == "For rent").height
    print(f" {online.height} online listings ({n_buy} buy, {n_rent} rent)")

    print("Matching online listings to historical rows...")
    matches = _match_online_to_wide(wide, online)
    print(f" {len(matches)} online listings matched to historical rows")

    print("Building postcode area lookup...")
    postcode_lookup = _build_postcode_area_lookup(wide)

    print("Building online listing rows...")
    online_rows = _build_online_rows(wide, online, matches, postcode_lookup)
    print(f" {online_rows.height} online rows built")

    # Add Listing status + null listing columns to the existing wide rows.
    wide = wide.with_columns(
        pl.lit("Historical sale").alias("Listing status"),
        *(
            pl.lit(None, dtype=dtype).alias(name)
            for name, dtype in listing_columns.items()
        ),
    )

    result = pl.concat([wide, online_rows], how="diagonal_relaxed")
    print(f"Final: {result.height} rows, {result.width} columns")

    # Print the status distribution as a sanity check.
    status_counts = (
        result["Listing status"].value_counts().sort("count", descending=True)
    )
    print(f"Listing status distribution:\n{status_counts}")

    result.write_parquet(args.input)
    size_mb = args.input.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.input} ({size_mb:.1f} MB)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -7,6 +7,7 @@ from ..utils import fuzzy_join_on_postcode
|
|||
pl.Config.set_tbl_cols(-1)
|
||||
|
||||
RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
|
||||
MIN_PRICE = 50_000
|
||||
|
||||
|
||||
def main():
|
||||
|
|
@ -117,7 +118,9 @@ def main():
|
|||
.collect()
|
||||
)
|
||||
|
||||
event_counts = events["renovation_history"].explode().struct.field("event").value_counts()
|
||||
event_counts = (
|
||||
events["renovation_history"].explode().struct.field("event").value_counts()
|
||||
)
|
||||
print(f"Renovation events: {events.height} properties with events")
|
||||
print(event_counts)
|
||||
|
||||
|
|
@ -159,6 +162,7 @@ def main():
|
|||
"old_new",
|
||||
)
|
||||
.filter(pl.col("pp_property_type") != "Other")
|
||||
.filter(pl.col("price") >= MIN_PRICE)
|
||||
.with_columns(
|
||||
pl.concat_str(
|
||||
[pl.col("saon"), pl.col("paon"), pl.col("street")],
|
||||
|
|
@ -171,6 +175,7 @@ def main():
|
|||
.agg(
|
||||
pl.struct(
|
||||
pl.col("date_of_transfer").dt.year().alias("year"),
|
||||
pl.col("date_of_transfer").dt.month().cast(pl.UInt8).alias("month"),
|
||||
"price",
|
||||
).alias("historical_prices"),
|
||||
pl.col("pp_property_type").last(),
|
||||
|
|
|
|||
|
|
@ -3,7 +3,8 @@ import argparse
|
|||
import polars as pl
|
||||
from pathlib import Path
|
||||
|
||||
MIN_PRICE = 10_000
|
||||
from pipeline.utils.postcode_mapping import build_postcode_mapping
|
||||
|
||||
MIN_FLOOR_AREA_M2 = 10
|
||||
|
||||
|
||||
|
|
@ -45,20 +46,23 @@ def _build_wide(
|
|||
rental_prices_path: Path,
|
||||
) -> pl.DataFrame:
|
||||
"""Build the wide dataframe by joining epc_pp with all auxiliary data."""
|
||||
wide = (
|
||||
pl.scan_parquet(epc_pp_path)
|
||||
.filter(
|
||||
pl.col("total_floor_area").is_null()
|
||||
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
|
||||
)
|
||||
.filter(
|
||||
pl.col("latest_price").is_null() | (pl.col("latest_price") >= MIN_PRICE)
|
||||
)
|
||||
wide = pl.scan_parquet(epc_pp_path).filter(
|
||||
pl.col("total_floor_area").is_null()
|
||||
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
|
||||
)
|
||||
|
||||
# Remap terminated postcodes to nearest active successor
|
||||
postcode_mapping = build_postcode_mapping(arcgis_path)
|
||||
wide = wide.join(
|
||||
postcode_mapping.lazy(), left_on="postcode", right_on="old_postcode", how="left"
|
||||
).with_columns(
|
||||
pl.coalesce("new_postcode", "postcode").alias("postcode"),
|
||||
).drop("new_postcode")
|
||||
|
||||
arcgis = (
|
||||
pl.scan_parquet(arcgis_path)
|
||||
.filter(pl.col("ctry") == "E92000001") # England only
|
||||
.filter(pl.col("doterm").is_null()) # Active postcodes only
|
||||
.select(
|
||||
pl.col("pcds").alias("postcode"),
|
||||
"lat",
|
||||
|
|
@ -67,7 +71,7 @@ def _build_wide(
|
|||
"oa21",
|
||||
)
|
||||
)
|
||||
wide = wide.join(arcgis, on="postcode", how="full", coalesce=True)
|
||||
wide = wide.join(arcgis, on="postcode", how="left")
|
||||
|
||||
wide = _join_journey_times(wide, journey_times_bank_path, "Bank")
|
||||
wide = _join_journey_times(wide, journey_times_fitzrovia_path, "Fitzrovia")
|
||||
|
|
@ -147,11 +151,6 @@ def _build_wide(
|
|||
.with_columns(
|
||||
pl.max_horizontal(*noise_cols).alias("noise_lden_db"),
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("noise_lden_db")
|
||||
.fill_null(pl.col("noise_lden_db").min())
|
||||
.alias("noise_lden_db"),
|
||||
)
|
||||
.select("postcode", "noise_lden_db")
|
||||
)
|
||||
wide = wide.join(noise, on="postcode", how="left")
|
||||
|
|
@ -181,7 +180,7 @@ def _build_wide(
|
|||
.group_by("bb_postcode")
|
||||
.agg(pl.col("max_download_speed").max())
|
||||
)
|
||||
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
|
||||
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left").drop("bb_postcode")
|
||||
|
||||
geosure = pl.scan_parquet(geosure_path)
|
||||
wide = wide.join(geosure, on="postcode", how="left")
|
||||
|
|
|
|||
|
|
@ -8,11 +8,12 @@ import polars as pl
|
|||
from pipeline.utils.poi_counts import count_pois_per_postcode
|
||||
|
||||
|
||||
# POI category groups for proximity counting
|
||||
# POI category groups for proximity counting.
|
||||
# Names must match the friendly names produced by transform_poi.py / naptan.py.
|
||||
POI_GROUPS = {
|
||||
"restaurants": ["Restaurant", "Fast Food"],
|
||||
"groceries": ["Greengrocer", "Grocery Shop", "Supermarket", "Convenience Store"],
|
||||
"parks": ["Park", "Garden", "Nature Reserve"],
|
||||
"groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
|
||||
"parks": ["Park"],
|
||||
"public_transport": [
|
||||
"Metro or Tram stop",
|
||||
"Rail station",
|
||||
|
|
|
|||
0
pipeline/transform/price_estimation/__init__.py
Normal file
0
pipeline/transform/price_estimation/__init__.py
Normal file
292
pipeline/transform/price_estimation/backtest.py
Normal file
292
pipeline/transform/price_estimation/backtest.py
Normal file
|
|
@ -0,0 +1,292 @@
|
|||
"""Backtest price estimation on held-out recent sales.
|
||||
|
||||
Uses temporal holdout: index built from pairs before TEST_YEAR_MIN only.
|
||||
Test set: properties with 2+ sales where the last sale >= TEST_YEAR_MIN.
|
||||
Evaluates: Naive vs Index vs kNN vs Blended.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.price_estimation.index import build_index
|
||||
from pipeline.transform.price_estimation.knn import (
|
||||
KNN_BLEND_WEIGHT,
|
||||
build_knn_pool,
|
||||
knn_median_psm,
|
||||
)
|
||||
from pipeline.transform.price_estimation.utils import (
|
||||
CURRENT_YEAR,
|
||||
MAX_LOG_ADJUSTMENT,
|
||||
compute_seasonal_factors,
|
||||
interpolate_log_index,
|
||||
sector_expr,
|
||||
type_group_expr,
|
||||
)
|
||||
|
||||
TEST_YEAR_MIN = 2022
|
||||
|
||||
|
||||
def extract_test_set(input_path: Path) -> pl.DataFrame:
    """Extract test pairs: second-to-last sale as input, last sale as ground truth.

    Only rows with a postcode and at least two entries in
    ``historical_prices`` are considered. A pair is kept when the last sale
    is in ``TEST_YEAR_MIN`` or later, both prices are positive, and the last
    sale is strictly later than the input sale.

    Returns:
        The collected frame with the added columns ``actual_*`` / ``input_*``
        (``year``, ``month``, ``price``, ``frac_year``) plus whatever
        ``sector_expr()`` and ``type_group_expr()`` add.
    """
    print("Loading test set...")

    sales = pl.col("historical_prices")
    last_sale = sales.list.last()
    previous_sale = sales.list.get(-2)

    def sale_fields(sale: pl.Expr, prefix: str) -> list[pl.Expr]:
        # year / month / price of one sale, aliased as "<prefix>_<field>".
        return [
            sale.struct.field(name).alias(f"{prefix}_{name}")
            for name in ("year", "month", "price")
        ]

    def frac_year(prefix: str) -> pl.Expr:
        # Year plus month offset (January = .0), aliased "<prefix>_frac_year".
        year = pl.col(f"{prefix}_year").cast(pl.Float64)
        month = pl.col(f"{prefix}_month").cast(pl.Float64)
        return (year + (month - 1.0) / 12.0).alias(f"{prefix}_frac_year")

    df = (
        pl.scan_parquet(input_path)
        .filter(
            pl.col("Postcode").is_not_null(),
            sales.list.len() >= 2,
        )
        .with_columns(
            sector_expr(),
            type_group_expr(),
            *sale_fields(last_sale, "actual"),  # ground truth
            *sale_fields(previous_sale, "input"),  # model input
        )
        .with_columns(frac_year("actual"), frac_year("input"))
        .filter(
            pl.col("actual_year") >= TEST_YEAR_MIN,
            pl.col("input_price") > 0,
            pl.col("actual_price") > 0,
            pl.col("actual_frac_year") > pl.col("input_frac_year"),
        )
        .collect()
    )
    print(f" {len(df):,} test pairs (last sale {TEST_YEAR_MIN}-{CURRENT_YEAR})")
    return df
|
||||
|
||||
|
||||
def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
    """Index-based prediction with interpolation, capping, and seasonal adjustment.

    Adds ``log_index_input`` and ``log_index_actual`` via
    ``interpolate_log_index`` and a ``predicted`` column equal to::

        input_price * exp(clip(log_index_actual - log_index_input)) * _seasonal_adj

    where the log difference is clipped to ``±MAX_LOG_ADJUSTMENT``. Where
    that product is null, ``predicted`` falls back to ``input_price``.
    ``test`` must already contain a ``_seasonal_adj`` column.
    """
    for frac_year_col, log_index_col in (
        ("input_frac_year", "log_index_input"),
        ("actual_frac_year", "log_index_actual"),
    ):
        test = interpolate_log_index(
            index, test, "sector", "type_group", frac_year_col, log_index_col
        )

    base_price = pl.col("input_price").cast(pl.Float64)
    log_growth = (pl.col("log_index_actual") - pl.col("log_index_input")).clip(
        -MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT
    )
    index_estimate = base_price * log_growth.exp() * pl.col("_seasonal_adj")

    return test.with_columns(
        index_estimate.fill_null(base_price).alias("predicted"),
    )
|
||||
|
||||
|
||||
def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
    """Compute accuracy metrics for predictions against actual prices.

    Only pairs where both values are finite and strictly positive are scored.

    Args:
        actual: Ground-truth prices.
        predicted: Predicted prices, aligned element-wise with ``actual``.

    Returns:
        Dict with the keys ``"MdAPE (%)"``, ``"% within 10%"``,
        ``"% within 20%"``, ``"% within 30%"``, ``"MAE (£)"``,
        ``"Mean signed error (£)"`` (floats) and ``"n"`` (number of scored
        pairs). When no pair is valid, every float metric is NaN and ``n`` is 0.
    """
    actual = np.asarray(actual, dtype=np.float64)
    predicted = np.asarray(predicted, dtype=np.float64)

    valid = (
        np.isfinite(predicted) & np.isfinite(actual) & (actual > 0) & (predicted > 0)
    )
    actual = actual[valid]
    predicted = predicted[valid]

    if actual.size == 0:
        # Nothing to score: return NaNs explicitly instead of reducing empty
        # arrays, which makes numpy emit "mean of empty slice" warnings.
        nan = float("nan")
        return {
            "MdAPE (%)": nan,
            "% within 10%": nan,
            "% within 20%": nan,
            "% within 30%": nan,
            "MAE (£)": nan,
            "Mean signed error (£)": nan,
            "n": 0,
        }

    signed_err = predicted - actual
    ape = np.abs(signed_err) / actual  # absolute percentage error as a fraction

    return {
        "MdAPE (%)": float(np.median(ape) * 100),
        "% within 10%": float(np.mean(ape <= 0.10) * 100),
        "% within 20%": float(np.mean(ape <= 0.20) * 100),
        "% within 30%": float(np.mean(ape <= 0.30) * 100),
        "MAE (£)": float(np.mean(np.abs(signed_err))),
        "Mean signed error (£)": float(np.mean(signed_err)),
        "n": int(len(actual)),
    }
|
||||
|
||||
|
||||
def print_metrics_table(metrics_by_stage: dict):
    """Print one table row per metric with one right-aligned column per stage.

    Args:
        metrics_by_stage: Stage name → metrics dict as produced by
            ``compute_metrics``; stages are printed in the dict's own order.
    """
    stages = list(metrics_by_stage)
    col_w = 15
    width = 25 + col_w * len(stages)
    rule = "=" * width

    def cell(metric: str, val) -> str:
        # Count cells are integers, £ cells whole numbers, the rest percents.
        if metric == "n":
            return f" {val:>{col_w - 1},d}"
        if "£" in metric:
            # NOTE(review): this cell is 1 + (col_w - 2) characters wide, one
            # less than the other cells — confirm the alignment is intended.
            return f" {val:>{col_w - 2},.0f}"
        return f" {val:>{col_w - 2}.1f}%"

    print("\n" + rule)
    print(f"BACKTEST RESULTS (holdout: sales >= {TEST_YEAR_MIN})")
    print(rule)

    header_cells = "".join(f" {stage:>{col_w - 1}s}" for stage in stages)
    print(f"{'Metric':<25s}" + header_cells)
    print("-" * width)

    for metric in (
        "MdAPE (%)",
        "% within 10%",
        "% within 20%",
        "% within 30%",
        "MAE (£)",
        "Mean signed error (£)",
        "n",
    ):
        value_cells = "".join(
            cell(metric, metrics_by_stage[stage][metric]) for stage in stages
        )
        print(f"{metric:<25s}" + value_cells)

    print(rule)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: backtest the price estimators on held-out sales.

    Builds the price index and seasonal factors with ``TEST_YEAR_MIN`` as the
    cut-off, predicts each test pair's last sale price with the index, a kNN
    price-per-sqm estimate and a weighted blend of both, prints a metrics
    table (including the naive "input price" baseline) and writes the
    per-pair results to ``--output``.
    """
    parser = argparse.ArgumentParser(description="Backtest price estimation model")
    parser.add_argument(
        "--input", type=Path, required=True, help="Path to wide.parquet"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output backtest_results.parquet"
    )
    args = parser.parse_args()

    # Index restricted by max_pair_year=TEST_YEAR_MIN (temporal holdout).
    print(f"Building price index (pairs with year2 < {TEST_YEAR_MIN})...")
    index = build_index(args.input, max_pair_year=TEST_YEAR_MIN)
    n_sectors = index["sector"].n_unique()
    n_type_groups = index["type_group"].n_unique()
    print(
        f"\nHoldout index: {len(index):,} rows, {n_sectors:,} sectors, "
        f"{n_type_groups} type groups"
    )

    # Seasonal factors restricted by max_sale_year=TEST_YEAR_MIN.
    seasonal = compute_seasonal_factors(args.input, max_sale_year=TEST_YEAR_MIN)
    months = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    ]
    factor_text = ", ".join(f"{m}={f:.3f}" for m, f in zip(months, seasonal))
    print(f"Seasonal factors: {factor_text}")

    test = extract_test_set(args.input)

    def month_index(column: str) -> np.ndarray:
        # Month numbers as int32; a missing month is treated as June.
        return test[column].fill_null(6).to_numpy().astype(np.int32)

    # Seasonal adjustment per pair: factor of sale month / factor of input month.
    input_months = month_index("input_month")
    actual_months = month_index("actual_month")
    seasonal_adj = seasonal[actual_months - 1] / seasonal[input_months - 1]
    test = test.with_columns(
        pl.Series("_seasonal_adj", seasonal_adj, dtype=pl.Float64),
    )

    print("\nPredicting with price index...")
    test = predict(test, index)

    # --- kNN ---
    ref_fy = float(TEST_YEAR_MIN)
    trees = build_knn_pool(args.input, index, ref_fy, max_sale_year=TEST_YEAR_MIN)

    # Log index at the reference year, needed for the temporal adjustment.
    test = test.with_columns(pl.lit(ref_fy).alias("_ref_fy"))
    test = interpolate_log_index(
        index, test, "sector", "type_group", "_ref_fy", "_log_index_ref"
    )

    lat = test["lat"].cast(pl.Float64).to_numpy()
    lon = test["lon"].cast(pl.Float64).to_numpy()
    tg = test["type_group"].to_numpy()
    # A missing floor area becomes 0, which yields a non-positive kNN estimate
    # that the validity mask below excludes.
    fa = test["Total floor area (sqm)"].cast(pl.Float64).fill_null(0.0).to_numpy()

    print("\nComputing kNN estimates...")
    knn_psm = knn_median_psm(trees, lat, lon, tg)

    # Move the pool's price per sqm from the reference year to the sale date;
    # no adjustment where either log index is unavailable.
    log_idx_actual = test["log_index_actual"].to_numpy().astype(np.float64)
    log_idx_ref = test["_log_index_ref"].to_numpy().astype(np.float64)
    both_indexed = np.isfinite(log_idx_actual) & np.isfinite(log_idx_ref)
    temporal_adj = np.where(both_indexed, np.exp(log_idx_actual - log_idx_ref), 1.0)
    knn_est = knn_psm * fa * temporal_adj

    knn_valid = np.isfinite(knn_est) & (knn_est > 0)
    n_knn = int(knn_valid.sum())
    print(f" kNN estimates: {n_knn:,} of {len(test):,} ({n_knn / len(test) * 100:.1f}%)")

    # Blend: (1-w)*index + w*kNN where both are available, else whichever exists.
    index_est = test["predicted"].to_numpy().astype(np.float64)
    index_valid = np.isfinite(index_est)
    blended = np.where(
        knn_valid & index_valid,
        (1 - KNN_BLEND_WEIGHT) * index_est + KNN_BLEND_WEIGHT * knn_est,
        np.where(index_valid, index_est, knn_est),
    )

    actual = test["actual_price"].to_numpy().astype(np.float64)
    naive_est = test["input_price"].to_numpy().astype(np.float64)

    metrics = {
        "Naive": compute_metrics(actual, naive_est),
        "Index": compute_metrics(actual, index_est),
        "kNN": compute_metrics(actual, knn_est),
        "Blended": compute_metrics(actual, blended),
    }
    print_metrics_table(metrics)

    # Save per-pair results.
    result = test.select(
        "Postcode",
        "sector",
        "input_year",
        "input_frac_year",
        "input_price",
        "actual_year",
        "actual_frac_year",
        "actual_price",
        "predicted",
    ).with_columns(
        pl.Series("knn_predicted", knn_est, dtype=pl.Float64),
        pl.Series("blended", blended, dtype=pl.Float64),
    )

    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
    print(f" {len(result):,} rows")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
204
pipeline/transform/price_estimation/estimate.py
Normal file
204
pipeline/transform/price_estimation/estimate.py
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
"""Augment wide.parquet with estimated current prices.
|
||||
|
||||
For properties with a known prior sale, applies the repeat-sales price index
|
||||
to adjust the last known price to the current date, then blends with kNN
|
||||
estimates from nearby recently-sold properties. Includes:
|
||||
- Capping extreme index adjustments
|
||||
- Seasonal month-of-sale adjustment
|
||||
- kNN spatial blending
|
||||
|
||||
Modifies wide.parquet in-place.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.price_estimation.knn import (
|
||||
KNN_BLEND_WEIGHT,
|
||||
build_knn_pool,
|
||||
knn_median_psm,
|
||||
)
|
||||
from pipeline.transform.price_estimation.utils import (
|
||||
CURRENT_FRAC_YEAR,
|
||||
CURRENT_MONTH,
|
||||
MAX_LOG_ADJUSTMENT,
|
||||
compute_seasonal_factors,
|
||||
interpolate_log_index,
|
||||
sector_expr,
|
||||
type_group_expr,
|
||||
)
|
||||
|
||||
|
||||
def main():
    """Add estimated current prices to wide.parquet, overwriting it in place.

    Steps, in order:

    1. Load the parquet given by ``--input`` and drop estimate columns left
       over from a previous run.
    2. Roll each row's last known price forward using the price index given
       by ``--index``: the difference of the interpolated log index between
       the current date and the sale date is clipped to
       ``+/-MAX_LOG_ADJUSTMENT``, exponentiated, and multiplied by a
       month-of-sale seasonal ratio.
    3. Where a finite, positive kNN estimate exists, replace the index
       estimate with a weighted average of the two (``KNN_BLEND_WEIGHT`` on
       the kNN side).
    4. Derive ``Est. price per sqm``, drop temporary columns and write the
       frame back to ``--input``.
    """
    parser = argparse.ArgumentParser(
        description="Augment wide.parquet with estimated current prices"
    )
    parser.add_argument(
        "--input",
        type=Path,
        required=True,
        help="Path to wide.parquet (modified in-place)",
    )
    parser.add_argument(
        "--index", type=Path, required=True, help="Path to price_index.parquet"
    )
    args = parser.parse_args()

    print("Loading wide.parquet...")
    df = pl.read_parquet(args.input)
    print(f" {len(df):,} rows, {len(df.columns)} columns")

    # Drop existing estimated columns if re-running
    stale = [
        c
        for c in ("Estimated current price", "Est. price per sqm")
        if c in df.columns
    ]
    if stale:
        df = df.drop(stale)

    # Compute seasonal factors
    # NOTE(review): `seasonal` is indexed below with an integer array, so it is
    # presumably a 12-element numpy array (one factor per month) - confirm.
    seasonal = compute_seasonal_factors(args.input)
    months = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    ]
    print(
        f" Seasonal factors: {', '.join(f'{m}={f:.3f}' for m, f in zip(months, seasonal))}"
    )

    # Build seasonal adjustment: seasonal[current_month] / seasonal[sale_month]
    # Rows without a sale date are given month 6 so the array stays dense; such
    # rows never receive an estimate because `has_price` below excludes them.
    sale_month = (
        df["Date of last transaction"]
        .dt.month()
        .fill_null(6)
        .to_numpy()
        .astype(np.int32)
    )
    seasonal_adj = seasonal[CURRENT_MONTH - 1] / seasonal[sale_month - 1]

    # Derive helper columns (all prefixed with "_" so they are dropped at the end)
    df = df.with_columns(
        sector_expr().alias("_sector"),
        (
            pl.col("Date of last transaction").dt.year().cast(pl.Float64)
            + (pl.col("Date of last transaction").dt.month().cast(pl.Float64) - 1.0)
            / 12.0
        ).alias("_sale_frac_year"),
        type_group_expr().alias("_type_group"),
        pl.lit(CURRENT_FRAC_YEAR).alias("_current_frac_year"),
        pl.Series("_seasonal_adj", seasonal_adj, dtype=pl.Float64),
    )

    index = pl.read_parquet(args.index)
    print(
        f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
        f"{index['type_group'].n_unique()} type groups"
    )

    print("\nApplying repeat-sales index with fractional year interpolation...")

    df = interpolate_log_index(
        index, df, "_sector", "_type_group", "_sale_frac_year", "_log_index_sale_interp"
    )
    df = interpolate_log_index(
        index,
        df,
        "_sector",
        "_type_group",
        "_current_frac_year",
        "_log_index_current_interp",
    )

    # Compute index-adjusted estimate with cap and seasonal adjustment
    has_price = (
        pl.col("Last known price").is_not_null()
        & pl.col("Postcode").is_not_null()
        & pl.col("Date of last transaction").is_not_null()
    )
    log_adjustment = (
        pl.col("_log_index_current_interp") - pl.col("_log_index_sale_interp")
    ).clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)

    df = df.with_columns(
        pl.when(has_price)
        .then(
            pl.col("Last known price").cast(pl.Float64)
            * log_adjustment.exp()
            * pl.col("_seasonal_adj")
        )
        .otherwise(pl.lit(None))
        .alias("Estimated current price"),
    )

    has_estimate = pl.col("Estimated current price").is_not_null()
    n_estimated = df.filter(has_estimate).height
    n_with_price = df.filter(has_price).height
    print(
        f" {n_estimated:,} of {n_with_price:,} properties estimated "
        f"({n_estimated / max(n_with_price, 1) * 100:.1f}%)"
    )

    # --- kNN blending ---
    print("\nBuilding kNN estimates...")
    trees = build_knn_pool(args.input, index, CURRENT_FRAC_YEAR)

    lat = df["lat"].cast(pl.Float64).to_numpy()
    lon = df["lon"].cast(pl.Float64).to_numpy()
    tg = df["_type_group"].fill_null("").to_numpy()
    # Missing floor area becomes 0.0, which makes the kNN estimate 0 and
    # therefore unusable under the "> 0" test below.
    fa = df["Total floor area (sqm)"].cast(pl.Float64).fill_null(0.0).to_numpy()

    knn_psm = knn_median_psm(trees, lat, lon, tg)
    knn_est = knn_psm * fa  # No temporal adj: ref == current

    df = df.with_columns(
        pl.Series("_knn_est", knn_est, dtype=pl.Float64),
    )

    knn_usable = (
        pl.col("_knn_est").is_not_null()
        & pl.col("_knn_est").is_finite()
        & (pl.col("_knn_est") > 0)
    )

    # Blend: where kNN available, use weighted average; else keep index.
    # Rows with no index estimate stay null either way.
    df = df.with_columns(
        pl.when(has_estimate & knn_usable)
        .then(
            (1 - KNN_BLEND_WEIGHT) * pl.col("Estimated current price")
            + KNN_BLEND_WEIGHT * pl.col("_knn_est")
        )
        .otherwise(pl.col("Estimated current price"))
        .alias("Estimated current price"),
    )

    n_blended = df.filter(knn_usable & has_estimate).height
    print(f" kNN blended: {n_blended:,} of {n_estimated:,} estimates")

    # Derive estimated price per sqm where both estimated price and floor area exist
    df = df.with_columns(
        (pl.col("Estimated current price") / pl.col("Total floor area (sqm)"))
        .round(0)
        .cast(pl.Int32, strict=False)
        .alias("Est. price per sqm"),
    )

    # Drop all temporary columns
    temp_cols = [c for c in df.columns if c.startswith("_") or c.startswith("log_idx_")]
    df = df.drop(temp_cols)

    df.write_parquet(args.input)
    size_mb = args.input.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
    print(
        f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
    )


if __name__ == "__main__":
    main()
|
||||
465
pipeline/transform/price_estimation/index.py
Normal file
465
pipeline/transform/price_estimation/index.py
Normal file
|
|
@ -0,0 +1,465 @@
|
|||
"""Hierarchical repeat-sales price index.
|
||||
|
||||
Stratified by property type and postcode sector, with IRLS Huber regression,
|
||||
hierarchical shrinkage (sector → district → area → national → hedonic),
|
||||
and KD-tree spatial smoothing for sparse sectors.
|
||||
|
||||
Output: price_index.parquet — sector x type_group x year -> log_index
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
from scipy.sparse import csc_matrix
|
||||
from scipy.sparse.linalg import lsqr
|
||||
from tqdm import tqdm
|
||||
|
||||
from pipeline.transform.price_estimation.shrinkage import (
|
||||
blend_dicts,
|
||||
hierarchical_shrinkage,
|
||||
shrink_dicts,
|
||||
spatial_smooth,
|
||||
)
|
||||
from pipeline.transform.price_estimation.utils import (
|
||||
CURRENT_YEAR,
|
||||
TYPE_GROUPS,
|
||||
build_hedonic_features,
|
||||
extract_centroids,
|
||||
hierarchy_keys,
|
||||
sector_expr,
|
||||
type_group_expr,
|
||||
)
|
||||
|
||||
# Minimum number of repeat-sale pairs needed before an index is solved.
MIN_PAIRS = 5
# Pairs whose |log(price2 / price1)| exceeds this are discarded up front.
OUTLIER_THRESHOLD = 3.0  # hard pre-filter; Huber handles the rest
# Residual magnitude above which the Huber reweighting starts down-weighting.
HUBER_K = 1.345
# Number of reweighted least-squares passes.
IRLS_ITERATIONS = 5
|
||||
|
||||
|
||||
def extract_pairs(input_path: Path, max_year2: int | None = None) -> pl.DataFrame:
    """Extract consecutive repeat-sale pairs.

    Every property with at least two entries in ``historical_prices`` yields
    one pair per adjacent couple of transactions. Pairs are kept only when
    both prices are positive, the second sale is strictly later than the
    first (at month resolution) and ``|log(price2 / price1)|`` does not
    exceed ``OUTLIER_THRESHOLD``.

    If max_year2 is set, only pairs where year2 < max_year2 are included
    (for temporal holdout in backtesting).

    Returns a frame with columns ``sector``, ``type_group``, ``year1``,
    ``price1``, ``year2``, ``price2``, ``frac_year1``, ``frac_year2``,
    ``log_ratio``, ``weight`` (``1 / sqrt(frac_year2 - frac_year1)``),
    ``district`` and ``area``.
    """
    print("Extracting repeat-sale pairs...")
    df = (
        pl.scan_parquet(input_path)
        .select("Postcode", "historical_prices", "Property type")
        .filter(
            pl.col("Postcode").is_not_null(),
            pl.col("historical_prices").list.len() >= 2,
        )
        .with_columns(sector_expr(), type_group_expr())
        .collect()
    )
    print(f" {len(df):,} properties with 2+ transactions")

    query = (
        df.lazy()
        # Align transaction i with transaction i+1: drop the last element for
        # the "from" side and the first element for the "to" side.
        .with_columns(
            pl.col("historical_prices")
            .list.slice(0, pl.col("historical_prices").list.len() - 1)
            .alias("from_txn"),
            pl.col("historical_prices").list.slice(1).alias("to_txn"),
        )
        .explode("from_txn", "to_txn")
        .with_columns(
            pl.col("from_txn").struct.field("year").alias("year1"),
            pl.col("from_txn").struct.field("month").alias("month1"),
            pl.col("from_txn").struct.field("price").alias("price1"),
            pl.col("to_txn").struct.field("year").alias("year2"),
            pl.col("to_txn").struct.field("month").alias("month2"),
            pl.col("to_txn").struct.field("price").alias("price2"),
        )
        .with_columns(
            (
                pl.col("year1").cast(pl.Float64)
                + (pl.col("month1").cast(pl.Float64) - 1.0) / 12.0
            ).alias("frac_year1"),
            (
                pl.col("year2").cast(pl.Float64)
                + (pl.col("month2").cast(pl.Float64) - 1.0) / 12.0
            ).alias("frac_year2"),
        )
        .select(
            "sector",
            "type_group",
            "year1",
            "price1",
            "year2",
            "price2",
            "frac_year1",
            "frac_year2",
        )
        .filter(
            pl.col("price1") > 0,
            pl.col("price2") > 0,
            pl.col("frac_year2") > pl.col("frac_year1"),
        )
        .with_columns(
            (pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64))
            .log()
            .alias("log_ratio"),
            (
                1.0
                / (pl.col("frac_year2") - pl.col("frac_year1"))
                .cast(pl.Float64)
                .sqrt()
            ).alias("weight"),
        )
        .filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
    )

    # Temporal holdout: applied inside the lazy query so excluded rows are
    # never materialised.
    if max_year2 is not None:
        query = query.filter(pl.col("year2") < max_year2)

    pairs = query.collect()

    # Add hierarchy columns: district = sector without its trailing digit group,
    # area = district truncated at its first digit.
    pairs = pairs.with_columns(
        pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
    ).with_columns(
        pl.col("district").str.replace(r"\d.*$", "").alias("area"),
    )

    print(f" {len(pairs):,} pairs extracted")
    return pairs
|
||||
|
||||
|
||||
def solve_robust_index(
    years1: np.ndarray,
    years2: np.ndarray,
    log_ratios: np.ndarray,
    base_weights: np.ndarray,
) -> dict[int, float]:
    """IRLS Huber M-estimation for the Case-Shiller repeat-sales model.

    Each pair contributes one equation
    ``beta[year2] - beta[year1] = log_ratio``. The earliest year present is
    the base: it has no column in the design matrix and is reported as 0.0.

    Args:
        years1: Year of the first sale of each pair.
        years2: Year of the second sale of each pair.
        log_ratios: ``log(price2 / price1)`` for each pair.
        base_weights: Per-pair row weight applied before robust reweighting.

    Returns:
        ``{year: log_index}`` for every year seen in either array, or ``{}``
        when there are fewer than ``MIN_PAIRS`` pairs or only a single
        distinct year.
    """
    n = len(years1)
    if n < MIN_PAIRS:
        return {}

    all_years = np.union1d(years1, years2)
    min_year = int(all_years.min())

    # One unknown per year, except the base year which is pinned to zero.
    col = 0
    year_to_col = {}
    for y in all_years:
        iy = int(y)
        if iy != min_year:
            year_to_col[iy] = col
            col += 1
    n_cols = len(year_to_col)
    if n_cols == 0:
        return {}

    # Vectorized column index mapping (-1 marks the base year: no column)
    col2 = np.full(n, -1, dtype=np.int32)
    col1 = np.full(n, -1, dtype=np.int32)
    for year, c in year_to_col.items():
        col2[years2 == year] = c
        col1[years1 == year] = c

    # Sparse matrix structure (fixed across iterations): +1 in the year2
    # column, -1 in the year1 column of each row.
    mask2 = col2 >= 0
    mask1 = col1 >= 0
    rows_arr = np.concatenate([np.where(mask2)[0], np.where(mask1)[0]])
    cols_arr = np.concatenate([col2[mask2], col1[mask1]])
    signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])

    weights = base_weights.copy()

    for _ in range(IRLS_ITERATIONS):
        # Weighted least squares by scaling each row (and its target).
        # NOTE(review): because rows are scaled by `weights`, each squared
        # residual enters the objective with weights**2, so the Huber factor
        # below is effectively squared; residuals are also compared with
        # HUBER_K without being standardised. Confirm both are intended.
        data = signs_arr * weights[rows_arr]
        A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
        b = log_ratios * weights
        betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]

        # Residuals
        predicted = np.zeros(n)
        predicted[mask2] += betas[col2[mask2]]
        predicted[mask1] -= betas[col1[mask1]]
        residuals = log_ratios - predicted

        # Huber reweighting
        abs_r = np.abs(residuals)
        huber_w = np.where(abs_r <= HUBER_K, 1.0, HUBER_K / np.maximum(abs_r, 1e-10))
        weights = base_weights * huber_w

    index = {min_year: 0.0}
    for year, c in year_to_col.items():
        index[year] = float(betas[c])
    return index
|
||||
|
||||
|
||||
def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
    """Solve robust indices for each group. Returns (indices, n_pairs) dicts.

    Args:
        pairs: Repeat-sale pairs with ``year1``, ``year2``, ``log_ratio`` and
            ``weight`` columns plus the grouping column.
        group_col: Column to group by (e.g. ``"sector"``).

    Returns:
        ``(indices, n_pairs)`` where ``indices[key]`` is the solved
        ``{year: log_index}`` dict and ``n_pairs[key]`` the number of pairs
        in that group. Groups for which no index could be solved appear in
        neither dict.
    """
    groups = pairs.group_by(group_col).agg(
        pl.col("year1"),
        pl.col("year2"),
        pl.col("log_ratio"),
        pl.col("weight"),
    )
    indices = {}
    n_pairs = {}
    for row in tqdm(
        groups.iter_rows(named=True), total=len(groups), desc=f" {group_col}"
    ):
        key = row[group_col]
        y1 = np.array(row["year1"], dtype=np.int32)
        y2 = np.array(row["year2"], dtype=np.int32)
        lr = np.array(row["log_ratio"], dtype=np.float64)
        w = np.array(row["weight"], dtype=np.float64)
        idx = solve_robust_index(y1, y2, lr, w)
        if idx:
            indices[key] = idx
            n_pairs[key] = len(y1)
    return indices, n_pairs
|
||||
|
||||
|
||||
def compute_hedonic_index(
    input_path: Path,
    min_year: int,
    max_year: int,
    max_sale_year: int | None = None,
) -> dict[int, float]:
    """Quality-adjusted hedonic index: regress log(price) on features, average residual by year.

    Used as the ultimate shrinkage fallback for the repeat-sales index.
    If max_sale_year is set, only sales before that year are used (backtesting holdout).

    Args:
        input_path: Parquet file with price, sale date, property type and
            floor area columns.
        min_year: First sale year to include; also the normalisation base.
        max_year: Last year to report (and last sale year when
            ``max_sale_year`` is not given).
        max_sale_year: Optional exclusive upper bound on the sale year.

    Returns:
        ``{year: mean residual - mean residual at min_year}`` for every year
        in ``[min_year, max_year]`` that has at least one sale. Empty when no
        row survives the filters.
    """
    effective_max = max_sale_year - 1 if max_sale_year is not None else max_year
    print("Computing hedonic index...")
    df = (
        pl.scan_parquet(input_path)
        .select(
            "Last known price",
            "Date of last transaction",
            "Property type",
            "Total floor area (sqm)",
        )
        .filter(
            pl.col("Last known price").is_not_null(),
            # log() of a zero/negative price is -inf/NaN and would corrupt the
            # least-squares fit for every row.
            pl.col("Last known price") > 0,
            pl.col("Total floor area (sqm)").is_not_null(),
            pl.col("Total floor area (sqm)") > 0,
        )
        .with_columns(
            pl.col("Date of last transaction").dt.year().alias("sale_year"),
            type_group_expr(),
        )
        .filter(
            pl.col("type_group").is_not_null(),
            pl.col("sale_year").is_not_null(),
            pl.col("sale_year") >= min_year,
            pl.col("sale_year") <= effective_max,
        )
        .collect()
    )
    print(f" {len(df):,} complete cases for hedonic model")

    if len(df) == 0:
        # Nothing to regress on; callers treat an empty dict as "no fallback".
        return {}

    # Target
    log_price = np.log(df["Last known price"].to_numpy().astype(np.float64))
    sale_years = df["sale_year"].to_numpy()

    # Build feature matrix (hedonic features + intercept column)
    X = build_hedonic_features(df)
    F = np.hstack([X, np.ones((len(df), 1), dtype=np.float32)])
    print(f" Feature matrix: {F.shape[0]:,} x {F.shape[1]}")

    # Step 1: regress log(price) on features -> quality score
    F64 = F.astype(np.float64)
    betas = np.linalg.lstsq(F64, log_price, rcond=None)[0]
    quality_score = F64 @ betas
    residuals = log_price - quality_score

    # Step 2: average residual by year = hedonic index
    hedonic = {}
    for y in range(min_year, max_year + 1):
        mask = sale_years == y
        if mask.sum() > 0:
            hedonic[y] = float(np.mean(residuals[mask]))

    # Normalize: min_year = 0 (left unshifted when min_year has no sales)
    base = hedonic.get(min_year, 0.0)
    for y in hedonic:
        hedonic[y] -= base

    print(
        f" Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}"
    )
    return hedonic
|
||||
|
||||
|
||||
# How many years before the last known year feed the extrapolation slope.
EXTRAPOLATION_YEARS = 3


def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
    """Forward-fill missing years, with linear extrapolation beyond last known year.

    Args:
        index: ``{year: value}``; may have gaps.
        min_year: First year of the output.
        max_year: Last year the output must reach.

    Returns:
        A new dict. Years from ``min_year`` to the last year in ``index``
        carry the most recent known value (0.0 before the first known year).
        Years after that up to ``max_year`` continue from the last known
        value along a least-squares line fitted to the known points in the
        final ``EXTRAPOLATION_YEARS`` years, or stay flat when fewer than two
        such points exist. An empty ``index`` gives 0.0 for every year.
    """
    if not index:
        return {y: 0.0 for y in range(min_year, max_year + 1)}

    known_years = sorted(index)
    last_known_year = known_years[-1]

    # Forward fill up to last known year
    filled = {}
    carried = 0.0
    for y in range(min_year, last_known_year + 1):
        carried = index.get(y, carried)
        filled[y] = carried

    # Linear extrapolation beyond last known year
    if last_known_year < max_year:
        anchor = index[last_known_year]
        future = range(last_known_year + 1, max_year + 1)
        recent = [
            (y, index[y])
            for y in known_years
            if y >= last_known_year - EXTRAPOLATION_YEARS
        ]
        if len(recent) >= 2:
            years_arr = np.array([r[0] for r in recent], dtype=np.float64)
            vals_arr = np.array([r[1] for r in recent], dtype=np.float64)
            slope = np.polyfit(years_arr, vals_arr, 1)[0]
            for y in future:
                filled[y] = anchor + slope * (y - last_known_year)
        else:
            # A single point gives no trend: hold the last value.
            for y in future:
                filled[y] = anchor

    return filled
|
||||
|
||||
|
||||
def build_index(input_path: Path, max_pair_year: int | None = None) -> pl.DataFrame:
    """Build the full price index from raw data.

    If max_pair_year is set, only pairs before that year are used (backtesting holdout).
    The index is still forward-filled to CURRENT_YEAR.

    For "All" and each entry of ``TYPE_GROUPS`` the function solves national,
    area, district and sector indices, shrinks them down the hierarchy
    (national is first shrunk toward the hedonic index), spatially smooths
    the sector values and forward-fills every sector to ``CURRENT_YEAR``.

    Returns:
        Frame with columns ``sector``, ``type_group``, ``year``,
        ``log_index`` and ``n_pairs``, sorted by type group, sector, year.

    Raises:
        ValueError: If no repeat-sale pairs could be extracted.
    """
    pairs = extract_pairs(input_path, max_year2=max_pair_year)
    if len(pairs) == 0:
        # Without pairs there is no base year to anchor the index on.
        raise ValueError("No repeat-sale pairs found; cannot build a price index")
    centroids = extract_centroids(input_path)

    min_year = int(pairs["year1"].min())
    max_year = CURRENT_YEAR

    hedonic_idx = compute_hedonic_index(
        input_path, min_year, max_year, max_sale_year=max_pair_year
    )

    # Precompute hierarchy
    all_sectors = pairs["sector"].unique().to_list()
    sector_to_dist = {}
    dist_to_area = {}
    for s in all_sectors:
        d, a = hierarchy_keys(s)
        sector_to_dist[s] = d
        dist_to_area[d] = a

    # Process each type group + "All"
    all_type_groups = ["All"] + TYPE_GROUPS
    final = {}  # {type_group: {sector: {year: log_index}}}
    final_n = {}  # {type_group: {sector: n_pairs}}

    for tg in all_type_groups:
        print(f"\n--- {tg} ---")
        typed = pairs if tg == "All" else pairs.filter(pl.col("type_group") == tg)
        if len(typed) < MIN_PAIRS:
            print(f" Skipping (only {len(typed)} pairs)")
            # Fall back to the hedonic index, forward-filled so this type
            # group spans the same years as the ones solved below.
            fallback = forward_fill(hedonic_idx, min_year, max_year)
            final[tg] = {s: dict(fallback) for s in all_sectors}
            final_n[tg] = {s: 0 for s in all_sectors}
            continue

        print(f" {len(typed):,} pairs")

        # National
        np_arrs = typed.select("year1", "year2", "log_ratio", "weight")
        national_idx = solve_robust_index(
            np_arrs["year1"].to_numpy(),
            np_arrs["year2"].to_numpy(),
            np_arrs["log_ratio"].to_numpy(),
            np_arrs["weight"].to_numpy(),
        )
        national_n = len(typed)
        print(f" National: {len(national_idx)} years")

        # Area, district, sector
        print(" Computing per-level indices:")
        area_idx, area_n = compute_indices_for_level(typed, "area")
        district_idx, district_n = compute_indices_for_level(typed, "district")
        sector_idx, sector_n = compute_indices_for_level(typed, "sector")
        print(
            f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
        )

        # Shrinkage: national -> hedonic first, then hierarchical
        print(" Applying shrinkage...")
        national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
        sector_shrunk = hierarchical_shrinkage(
            sector_idx,
            sector_n,
            district_idx,
            district_n,
            area_idx,
            area_n,
            national_shrunk,
            all_sectors,
            sector_to_dist,
            dist_to_area,
            shrink_dicts,
        )

        # Spatial smoothing
        print(" Spatial smoothing...")
        sector_smoothed = spatial_smooth(
            sector_shrunk, centroids, sector_n, blend_dicts
        )

        # Forward fill
        for sec in all_sectors:
            sector_smoothed[sec] = forward_fill(
                sector_smoothed.get(sec, hedonic_idx), min_year, max_year
            )

        final[tg] = sector_smoothed
        final_n[tg] = sector_n

    # Assemble output
    print("\nAssembling output...")
    rows = []
    for tg in all_type_groups:
        for sec in all_sectors:
            n = final_n[tg].get(sec, 0)
            for year, log_idx in final[tg][sec].items():
                rows.append((sec, tg, year, log_idx, n))

    return pl.DataFrame(
        rows,
        schema={
            "sector": pl.String,
            "type_group": pl.String,
            "year": pl.Int32,
            "log_index": pl.Float64,
            "n_pairs": pl.Int64,
        },
        orient="row",
    ).sort("type_group", "sector", "year")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: build the price index and write it to parquet.

    Reads the parquet given by ``--input``, calls ``build_index`` on it and
    writes the result to ``--output``, then prints the file size and the
    sector / type / year breakdown.
    """
    parser = argparse.ArgumentParser(
        description="Build improved repeat-sales price index"
    )
    parser.add_argument("--input", type=Path, required=True)
    parser.add_argument("--output", type=Path, required=True)
    args = parser.parse_args()

    result = build_index(args.input)

    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
    print(
        f" {result['sector'].n_unique():,} sectors x {result['type_group'].n_unique()} types x {result['year'].n_unique()} years = {len(result):,} rows"
    )


if __name__ == "__main__":
    main()
|
||||
161
pipeline/transform/price_estimation/knn.py
Normal file
161
pipeline/transform/price_estimation/knn.py
Normal file
|
|
@ -0,0 +1,161 @@
|
|||
"""kNN price estimation using nearby recently-sold properties.
|
||||
|
||||
For each target property, finds k nearest sold properties of the same type,
|
||||
computes the median index-adjusted price-per-sqm, and multiplies by the
|
||||
target's floor area to produce an estimate.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
from scipy.spatial import KDTree
|
||||
|
||||
from pipeline.transform.price_estimation.utils import (
|
||||
TYPE_GROUPS,
|
||||
interpolate_log_index,
|
||||
sector_expr,
|
||||
type_group_expr,
|
||||
)
|
||||
|
||||
# Default number of neighbours queried per target property.
KNN_K = 20
# Below this many pool properties (or neighbours) no kNN value is produced.
KNN_MIN_NEIGHBORS = 5
# Weight given to the kNN estimate when it is blended with another estimate.
KNN_BLEND_WEIGHT = 0.35
|
||||
|
||||
|
||||
def _scale_coords(lat: np.ndarray, lon: np.ndarray) -> np.ndarray:
    """Equirectangular projection: scale lon by cos(lat) for approximate distances.

    Args:
        lat: Latitudes in degrees.
        lon: Longitudes in degrees, same length as ``lat``.

    Returns:
        Array of shape ``(len(lat), 2)`` whose columns are ``lat`` and
        ``lon * cos(lat)``.
    """
    scaled_lon = lon * np.cos(np.radians(lat))
    return np.column_stack([lat, scaled_lon])
|
||||
|
||||
|
||||
def build_knn_pool(
    input_path: Path,
    index: pl.DataFrame,
    ref_frac_year: float,
    max_sale_year: int | None = None,
) -> dict[str, tuple[KDTree, np.ndarray]]:
    """Build per-type_group KD-trees of index-adjusted price-per-sqm.

    Adjusts all pool properties' sale prices to ref_frac_year using the index,
    then builds a KD-tree per type_group for nearest-neighbor queries.

    Args:
        input_path: Parquet file with location, floor area, price and sale
            date columns.
        index: Price index passed through to ``interpolate_log_index``.
        ref_frac_year: Fractional year the pool prices are adjusted to.
        max_sale_year: If given, only sales from earlier years enter the pool.

    Returns dict mapping type_group -> (KDTree over scaled lat/lon, adjusted_psm array).
    Type groups with fewer than ``KNN_MIN_NEIGHBORS`` pool rows are omitted.
    """
    print("Building kNN pool...")
    query = (
        pl.scan_parquet(input_path)
        .select(
            "Postcode",
            "Property type",
            "lat",
            "lon",
            "Total floor area (sqm)",
            "Last known price",
            "Date of last transaction",
        )
        .filter(
            pl.col("lat").is_not_null(),
            pl.col("lon").is_not_null(),
            pl.col("Total floor area (sqm)").is_not_null(),
            pl.col("Total floor area (sqm)") > 0,
            pl.col("Last known price").is_not_null(),
            pl.col("Last known price") > 0,
            pl.col("Postcode").is_not_null(),
            pl.col("Date of last transaction").is_not_null(),
        )
    )
    if max_sale_year is not None:
        query = query.filter(
            pl.col("Date of last transaction").dt.year() < max_sale_year
        )

    sale_date = pl.col("Date of last transaction")
    pool = (
        query.with_columns(
            sector_expr(),
            type_group_expr(),
            (
                sale_date.dt.year().cast(pl.Float64)
                + (sale_date.dt.month().cast(pl.Float64) - 1.0) / 12.0
            ).alias("_sale_fy"),
            pl.lit(ref_frac_year).alias("_ref_fy"),
        ).collect()
    )
    pool = pool.filter(pl.col("type_group").is_not_null())
    print(f" {len(pool):,} pool properties with lat/lon, floor area, price")

    # Interpolate log_index at sale date and reference date
    pool = interpolate_log_index(
        index, pool, "sector", "type_group", "_sale_fy", "_li_sale"
    )
    pool = interpolate_log_index(
        index, pool, "sector", "type_group", "_ref_fy", "_li_ref"
    )

    # adjusted_psm = price / floor_area * exp(log_index_ref - log_index_sale)
    pool = pool.with_columns(
        (
            pl.col("Last known price").cast(pl.Float64)
            / pl.col("Total floor area (sqm)").cast(pl.Float64)
            * (pl.col("_li_ref") - pl.col("_li_sale")).exp()
        ).alias("_adj_psm")
    ).filter(
        pl.col("_adj_psm").is_not_null(),
        pl.col("_adj_psm").is_finite(),
        pl.col("_adj_psm") > 0,
    )
    print(f" {len(pool):,} after index adjustment")

    # Build per-type KD-trees
    trees: dict[str, tuple[KDTree, np.ndarray]] = {}
    for tg in TYPE_GROUPS:
        sub = pool.filter(pl.col("type_group") == tg)
        n = len(sub)
        if n < KNN_MIN_NEIGHBORS:
            continue
        lat = sub["lat"].to_numpy().astype(np.float64)
        lon = sub["lon"].to_numpy().astype(np.float64)
        psm = sub["_adj_psm"].to_numpy().astype(np.float64)
        trees[tg] = (KDTree(_scale_coords(lat, lon)), psm)
        print(f" {tg}: {n:,}")

    return trees
|
||||
|
||||
|
||||
def knn_median_psm(
    trees: dict[str, tuple[KDTree, np.ndarray]],
    lat: np.ndarray,
    lon: np.ndarray,
    type_groups: np.ndarray,
    k: int = KNN_K,
) -> np.ndarray:
    """Return median adjusted-PSM of k nearest neighbours for each target.

    PSM is at the reference date used when building the pool.
    NaN where not computable (missing coords, unknown type, too few neighbors).

    Args:
        trees: ``type_group -> (KDTree, psm array)`` as built by
            ``build_knn_pool``.
        lat: Target latitudes; non-finite entries are skipped.
        lon: Target longitudes; non-finite entries are skipped.
        type_groups: Type group of each target, compared with the keys of
            ``trees``.
        k: Neighbours to query; capped at the size of the pool.

    Returns:
        Float array with one value per target.
    """
    n = len(lat)
    result = np.full(n, np.nan)
    has_coords = np.isfinite(lat) & np.isfinite(lon)

    for tg, (tree, psm) in trees.items():
        idx = np.where((type_groups == tg) & has_coords)[0]
        if len(idx) == 0:
            continue

        actual_k = min(k, len(psm))
        if actual_k < KNN_MIN_NEIGHBORS:
            continue

        coords = _scale_coords(lat[idx], lon[idx])
        _, nn_idx = tree.query(coords, k=actual_k)
        # KDTree.query drops the neighbour axis when k == 1; restore it so the
        # median is always taken along axis 1.
        if nn_idx.ndim == 1:
            nn_idx = nn_idx.reshape(-1, 1)

        result[idx] = np.nanmedian(psm[nn_idx], axis=1)

    return result
|
||||
140
pipeline/transform/price_estimation/shrinkage.py
Normal file
140
pipeline/transform/price_estimation/shrinkage.py
Normal file
|
|
@ -0,0 +1,140 @@
|
|||
"""Hierarchical shrinkage and spatial smoothing for sector-level estimates."""
|
||||
|
||||
from typing import Callable, TypeVar
|
||||
|
||||
import numpy as np
|
||||
from scipy.spatial import KDTree
|
||||
|
||||
from pipeline.transform.price_estimation.utils import SHRINKAGE_K
|
||||
|
||||
# Type of the value stored per sector/district/area (opaque to this module).
V = TypeVar("V")

# Number of nearest sectors blended into a sparse sector.
SPATIAL_NEIGHBORS = 5
# A sector keeps weight n / (n + SPATIAL_BLEND_K) for its own value.
SPATIAL_BLEND_K = 30
|
||||
|
||||
|
||||
def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
    """Shrink dict values toward parent using n/(n+k) weighting.

    Works for any dict keyed by year or category.

    Args:
        raw: Estimate to be shrunk.
        parent: Estimate to shrink toward.
        n: Sample size behind ``raw``; the raw weight is
            ``n / (n + SHRINKAGE_K)``.

    Returns:
        New dict over the union of keys. A key present in only one input
        takes that input's value on both sides, so it passes through as is.
    """
    w = n / (n + SHRINKAGE_K)
    result = {}
    for key in set(raw) | set(parent):
        # Each side falls back to the other one when it lacks the key.
        r = raw.get(key, parent.get(key, 0.0))
        p = parent.get(key, raw.get(key, 0.0))
        result[key] = w * r + (1 - w) * p
    return result
|
||||
|
||||
|
||||
def hierarchical_shrinkage(
    sector_vals: dict[str, V],
    sector_n: dict[str, int],
    district_vals: dict[str, V],
    district_n: dict[str, int],
    area_vals: dict[str, V],
    area_n: dict[str, int],
    top_level: V,
    all_sectors: list[str],
    sector_to_dist: dict[str, str],
    dist_to_area: dict[str, str],
    shrink_fn: Callable[[V, V, int], V],
) -> dict[str, V]:
    """Top-down hierarchical shrinkage: area->top, district->area, sector->district.

    `top_level` is the ultimate fallback value (e.g. national shrunk toward hedonic,
    or just national). `shrink_fn(raw, parent, n)` blends raw toward parent.

    Args:
        sector_vals, district_vals, area_vals: Raw value per key at each level.
        sector_n, district_n, area_n: Sample size per key; must contain every
            key of the matching ``*_vals`` dict.
        top_level: Parent used when a level above has no value.
        all_sectors: Every sector that must appear in the result.
        sector_to_dist: Sector -> district lookup.
        dist_to_area: District -> area lookup.
        shrink_fn: Blending function.

    Returns:
        Value per sector. Sectors without a raw value inherit the shrunk
        district value, else the shrunk area value, else ``top_level``.
    """
    # Area -> top level
    area_shrunk = {
        area: shrink_fn(val, top_level, area_n[area])
        for area, val in area_vals.items()
    }

    # District -> area
    district_shrunk = {}
    for dist, val in district_vals.items():
        parent = area_shrunk.get(dist_to_area.get(dist, ""), top_level)
        district_shrunk[dist] = shrink_fn(val, parent, district_n[dist])

    # Sector -> district
    sector_shrunk = {}
    for sec, val in sector_vals.items():
        parent = district_shrunk.get(sector_to_dist.get(sec, ""), top_level)
        sector_shrunk[sec] = shrink_fn(val, parent, sector_n[sec])

    # Fill sectors without their own values
    for sec in all_sectors:
        if sec in sector_shrunk:
            continue
        d = sector_to_dist.get(sec, "")
        a = dist_to_area.get(d, "")
        sector_shrunk[sec] = district_shrunk.get(d, area_shrunk.get(a, top_level))

    return sector_shrunk
|
||||
|
||||
|
||||
def spatial_smooth(
    sector_values: dict[str, V],
    centroids: dict[str, tuple[float, float]],
    counts: dict[str, int],
    blend_fn: Callable[[V, list[V], float, list[float]], V],
) -> dict[str, V]:
    """Blend sparse sector values with K nearest neighbors via KDTree.

    Args:
        sector_values: Value per sector.
        centroids: Sector -> coordinate pair; the first element is used as
            latitude, the second as longitude.
        counts: Sample size per sector (missing means 0). A sector keeps
            weight ``n / (n + SPATIAL_BLEND_K)`` for its own value and is
            left untouched when that weight exceeds 0.95.
        blend_fn: ``blend_fn(self_val, neighbor_vals, self_w, neighbor_ws)``
            combining a sector with its neighbours.

    Returns:
        ``sector_values`` itself when fewer than ``SPATIAL_NEIGHBORS + 1``
        sectors have a centroid; otherwise a new dict in which sparse
        sectors are blended with their neighbours' original values using
        inverse-distance weights.
    """
    sectors_with_coords = [s for s in sector_values if s in centroids]
    if len(sectors_with_coords) < SPATIAL_NEIGHBORS + 1:
        return sector_values

    coords = np.array([centroids[s] for s in sectors_with_coords])
    # Scale longitude by cos(mean_lat) for approximate Euclidean distance
    mean_lat = np.mean(coords[:, 0])
    scale = np.cos(np.radians(mean_lat))
    scaled_coords = np.column_stack([coords[:, 0], coords[:, 1] * scale])
    tree = KDTree(scaled_coords)

    result = dict(sector_values)
    for i, sec in enumerate(sectors_with_coords):
        n = counts.get(sec, 0)
        self_w = n / (n + SPATIAL_BLEND_K)
        if self_w > 0.95:
            continue  # enough data, skip smoothing

        dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1)
        # Skip self (index 0, distance ~0)
        inv_dists = []
        neighbor_vals = []
        for d, j in zip(dists[1:], idxs[1:]):
            ns = sectors_with_coords[j]
            # Zero-distance hits are ignored: they cannot be inverse-weighted.
            if d > 0 and ns in sector_values:
                inv_dists.append(1.0 / d)
                neighbor_vals.append(sector_values[ns])

        if not neighbor_vals:
            continue

        # Neighbours share the weight the sector does not keep for itself.
        total_inv = sum(inv_dists)
        nbr_w = 1.0 - self_w
        neighbor_ws = [iw / total_inv * nbr_w for iw in inv_dists]

        result[sec] = blend_fn(sector_values[sec], neighbor_vals, self_w, neighbor_ws)

    return result
|
||||
|
||||
|
||||
def blend_dicts(
    self_val: dict, neighbor_vals: list[dict], self_w: float, neighbor_ws: list[float]
) -> dict:
    """Blend dict values by weighted sum across all keys.

    Args:
        self_val: The sector's own ``{key: value}`` dict.
        neighbor_vals: One dict per neighbour.
        self_w: Weight applied to ``self_val``.
        neighbor_ws: Weight per neighbour, aligned with ``neighbor_vals``.

    Returns:
        New dict over the union of all keys. A key absent from a contributor
        counts as 0.0 for that contributor; weights are not renormalised.
    """
    all_keys: set = set(self_val)
    for nv in neighbor_vals:
        all_keys |= set(nv)

    weighted_neighbors = list(zip(neighbor_vals, neighbor_ws))
    result = {}
    for k in all_keys:
        total = self_w * self_val.get(k, 0.0)
        for nv, w in weighted_neighbors:
            total += w * nv.get(k, 0.0)
        result[k] = total
    return result
|
||||
233
pipeline/transform/price_estimation/utils.py
Normal file
233
pipeline/transform/price_estimation/utils.py
Normal file
|
|
@ -0,0 +1,233 @@
|
|||
"""Shared utilities for price estimation modules."""
|
||||
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
# Calendar reference points. All three are derived from a single
# `date.today()` call at import time so they can never disagree with each
# other (a hard-coded year would drift away from CURRENT_FRAC_YEAR as soon as
# the calendar year rolls over).
_today = date.today()
CURRENT_YEAR = _today.year
# Year plus the fraction elapsed at the start of the current month
# (January -> +0.0, July -> +0.5).
CURRENT_FRAC_YEAR = _today.year + (_today.month - 1) / 12
CURRENT_MONTH = _today.month

# Cap on log(index_ratio) to prevent wild estimates from thin sectors
MAX_LOG_ADJUSTMENT = 3.0  # ~20x max price change (exp(3) ~= 20.1)

# Raw "Property type" labels that type_group_expr() collapses to "Terraced".
TERRACE_TYPES = [
    "Mid-Terrace",
    "End-Terrace",
    "Enclosed Mid-Terrace",
    "Enclosed End-Terrace",
    "Terraced",
]
# Raw "Property type" labels that type_group_expr() collapses to "Flats".
FLAT_TYPES = ["Flats/Maisonettes", "Flat", "Maisonette"]
# The coarse type groups that type_group_expr() can produce.
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats", "Bungalow"]
# NOTE(review): not used in this part of the file; presumably a shrinkage
# strength for blending thin-sample estimates -- confirm against callers.
SHRINKAGE_K = 50
|
||||
|
||||
|
||||
def type_group_expr():
    """Polars expression collapsing "Property type" into a `type_group`.

    Labels in TERRACE_TYPES become "Terraced", labels in FLAT_TYPES become
    "Flats", and "Bungalow", "Detached" and "Semi-Detached" keep their own
    name. Any other value yields null. The expression is aliased to
    "type_group".
    """
    prop_type = pl.col("Property type")
    is_terrace = prop_type.is_in(TERRACE_TYPES)
    is_flat = prop_type.is_in(FLAT_TYPES)
    is_bungalow = prop_type == "Bungalow"
    keeps_own_name = prop_type.is_in(["Detached", "Semi-Detached"])

    # Branches are tested in this order; the first match wins.
    mapping = pl.when(is_terrace).then(pl.lit("Terraced"))
    mapping = mapping.when(is_flat).then(pl.lit("Flats"))
    mapping = mapping.when(is_bungalow).then(pl.lit("Bungalow"))
    mapping = mapping.when(keeps_own_name).then(prop_type)
    return mapping.otherwise(pl.lit(None)).alias("type_group")
|
||||
|
||||
|
||||
def sector_expr():
    """Polars expression deriving the postcode sector from "Postcode".

    Drops the final two characters of the postcode and strips surrounding
    whitespace from what remains (e.g. "SW1A 1AA" -> "SW1A 1"). The
    expression is aliased to "sector".
    """
    postcode = pl.col("Postcode")
    # Number of leading characters to keep: everything except the last two.
    keep_len = postcode.str.len_chars() - 2
    trimmed = postcode.str.slice(0, keep_len)
    return trimmed.str.strip_chars().alias("sector")
|
||||
|
||||
|
||||
def hierarchy_keys(sector: str) -> tuple[str, str]:
    """Split a sector string into its (district, area) parents.

    The district is everything before the last space of ``sector`` (or the
    whole string when it contains no space). The area is the leading run of
    alphabetic characters of the district.

    >>> hierarchy_keys("SW1A 1")
    ('SW1A', 'SW')
    """
    head, sep, _ = sector.rpartition(" ")
    district = head if sep else sector

    # Index of the first non-letter; the whole district if it is all letters.
    cut = next(
        (pos for pos, ch in enumerate(district) if not ch.isalpha()),
        len(district),
    )
    return district, district[:cut]
|
||||
|
||||
|
||||
# Type groups that get their own dummy column; "Detached" is the reference
# level and therefore has no column.
NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats", "Bungalow"]


def build_hedonic_features(df: pl.DataFrame) -> np.ndarray:
    """Assemble the hedonic design matrix for ``df``.

    Column 0 is log(floor area) taken from "Total floor area (sqm)";
    columns 1-4 are 0/1 indicators for each entry of NON_REF_TYPES, computed
    from the "type_group" column. All columns are float32.

    Returns:
        Array of shape (len(df), 1 + len(NON_REF_TYPES)).
    """
    floor_area = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
    # Areas below 1 sqm are clamped to 1 so the log is never negative.
    log_area = np.log(np.maximum(floor_area, 1.0))

    groups = df["type_group"].to_numpy()
    dummies = [(groups == name).astype(np.float32) for name in NON_REF_TYPES]

    return np.column_stack([log_area, *dummies])
|
||||
|
||||
|
||||
def interpolate_log_index(
    index: pl.DataFrame,
    df: pl.DataFrame,
    sector_col: str,
    type_col: str,
    frac_year_col: str,
    output_alias: str,
) -> pl.DataFrame:
    """Attach a linearly interpolated ``log_index`` for a fractional year.

    The index is looked up (via ``join_type_stratified_index``) at the two
    whole years bracketing ``frac_year_col`` and blended by the fractional
    part. For 2019.75 the result is 0.25 * idx_2019 + 0.75 * idx_2020.
    If only one of the two bracketing values exists it is used as-is; if
    neither exists the result is null.

    Args:
        index: Price index frame passed through to
            ``join_type_stratified_index``.
        df: Frame to enrich.
        sector_col: Column of ``df`` holding the sector.
        type_col: Column of ``df`` holding the type group.
        frac_year_col: Column of ``df`` holding the fractional year.
        output_alias: Name of the column to add.

    Returns:
        ``df`` with ``output_alias`` added; all temporary helper columns
        are dropped again.
    """
    prefix = f"_{output_alias}"
    lo_val, hi_val = f"{prefix}_floor", f"{prefix}_ceil"
    lo_year, hi_year = f"{prefix}_floor_year", f"{prefix}_ceil_year"
    weight = f"{prefix}_frac"

    frac_year = pl.col(frac_year_col)
    year_below = frac_year.floor()
    df = df.with_columns(
        year_below.cast(pl.Int32).alias(lo_year),
        frac_year.ceil().cast(pl.Int32).alias(hi_year),
        (frac_year - year_below).alias(weight),
    )

    # Same lookup twice: first for the lower bracketing year, then the upper.
    for year_name, value_name in ((lo_year, lo_val), (hi_year, hi_val)):
        df = join_type_stratified_index(
            df, index, sector_col, type_col, year_name, value_name
        )

    lo, hi, w = pl.col(lo_val), pl.col(hi_val), pl.col(weight)
    has_lo, has_hi = lo.is_not_null(), hi.is_not_null()
    blended = (1.0 - w) * lo + w * hi

    # Interpolate when both ends exist, otherwise fall back to whichever
    # single end is available.
    interpolated = (
        pl.when(has_lo & has_hi)
        .then(blended)
        .when(has_lo)
        .then(lo)
        .when(has_hi)
        .then(hi)
        .otherwise(pl.lit(None))
        .alias(output_alias)
    )

    return df.with_columns(interpolated).drop(
        lo_val, hi_val, lo_year, hi_year, weight
    )
|
||||
|
||||
|
||||
def extract_centroids(input_path) -> dict[str, tuple[float, float]]:
    """Compute mean lat/lon per postcode sector.

    Reads "Postcode", "lat" and "lon" from the parquet file at
    ``input_path``, derives the sector with ``sector_expr()`` and averages
    the coordinates of every row in each sector.

    Rows missing the postcode or *either* coordinate are ignored. Requiring
    both coordinates keeps the two means based on the same rows and ensures
    no sector ends up with a ``None`` longitude.

    Args:
        input_path: Path of the parquet file to scan.

    Returns:
        Mapping of sector -> (mean latitude, mean longitude).
    """
    print("Computing sector centroids...")
    per_sector = (
        pl.scan_parquet(input_path)
        .select("Postcode", "lat", "lon")
        .filter(
            pl.col("Postcode").is_not_null(),
            pl.col("lat").is_not_null(),
            pl.col("lon").is_not_null(),
        )
        .with_columns(sector_expr())
        .group_by("sector")
        .agg(pl.col("lat").mean(), pl.col("lon").mean())
        .collect()
    )
    centroids = {
        row["sector"]: (row["lat"], row["lon"])
        for row in per_sector.iter_rows(named=True)
    }
    print(f" {len(centroids):,} sector centroids")
    return centroids
|
||||
|
||||
|
||||
def join_type_stratified_index(
    df: pl.DataFrame,
    index: pl.DataFrame,
    sector_col: str,
    type_col: str,
    year_col: str,
    output_alias: str,
) -> pl.DataFrame:
    """Look up ``log_index`` per row, preferring the type-specific value.

    ``index`` is split into rows whose "type_group" is "All" and the rest.
    Each row of ``df`` is left-joined to the type-specific rows on
    (sector, type group, year) and to the "All" rows on (sector, year).
    The type-specific value is used when present, otherwise the "All"
    value.

    Args:
        df: Frame to enrich.
        index: Frame with "sector", "type_group", "year" and "log_index".
        sector_col: Column of ``df`` matched against "sector".
        type_col: Column of ``df`` matched against "type_group".
        year_col: Column of ``df`` matched against "year".
        output_alias: Name of the column to add.

    Returns:
        ``df`` with ``output_alias`` added.
    """
    typed_name = f"_{output_alias}_typed"
    all_name = f"_{output_alias}_all"

    typed_lookup = index.filter(pl.col("type_group") != "All").select(
        "sector", "type_group", "year", pl.col("log_index").alias(typed_name)
    )
    overall_lookup = index.filter(pl.col("type_group") == "All").select(
        "sector", "year", pl.col("log_index").alias(all_name)
    )

    with_typed = df.join(
        typed_lookup,
        left_on=[sector_col, type_col, year_col],
        right_on=["sector", "type_group", "year"],
        how="left",
    )
    with_both = with_typed.join(
        overall_lookup,
        left_on=[sector_col, year_col],
        right_on=["sector", "year"],
        how="left",
    )

    # Type-specific value wins; the "All" value only fills the gaps.
    resolved = pl.col(typed_name).fill_null(pl.col(all_name)).alias(output_alias)
    return with_both.with_columns(resolved).drop(typed_name, all_name)
|
||||
|
||||
|
||||
def compute_seasonal_factors(
    input_path: Path, max_sale_year: int | None = None
) -> np.ndarray:
    """Compute 12 multiplicative monthly price factors from price-per-sqm.

    Detrends by normalizing median £/sqm within each year (each month's
    median divided by the mean of that year's monthly medians), then
    averages those ratios across years.

    Args:
        input_path: Parquet file with "Last known price",
            "Total floor area (sqm)" and "Date of last transaction".
        max_sale_year: When given, only sales from years strictly before
            this one are used.

    Returns:
        Array of exactly 12 factors (index 0 = January), normalized so
        mean = 1.0. A month with no usable sales gets a neutral factor of
        1.0 before normalization, so the array always has one slot per
        calendar month.
    """
    query = (
        pl.scan_parquet(input_path)
        .select("Last known price", "Total floor area (sqm)", "Date of last transaction")
        .filter(
            pl.col("Last known price").is_not_null(),
            pl.col("Last known price") > 0,
            pl.col("Total floor area (sqm)").is_not_null(),
            pl.col("Total floor area (sqm)") > 0,
            pl.col("Date of last transaction").is_not_null(),
        )
        .with_columns(
            (
                pl.col("Last known price").cast(pl.Float64)
                / pl.col("Total floor area (sqm)").cast(pl.Float64)
            ).alias("psm"),
            pl.col("Date of last transaction").dt.month().alias("month"),
            pl.col("Date of last transaction").dt.year().alias("year"),
        )
    )
    if max_sale_year is not None:
        query = query.filter(pl.col("year") < max_sale_year)

    monthly = (
        query.group_by("year", "month")
        .agg(pl.col("psm").median().alias("median_psm"))
        .with_columns(
            pl.col("median_psm").mean().over("year").alias("year_mean"),
        )
        .with_columns(
            (pl.col("median_psm") / pl.col("year_mean")).alias("ratio"),
        )
        .group_by("month")
        .agg(pl.col("ratio").mean().alias("factor"))
        .sort("month")
        .collect()
    )

    # Write each factor into the slot of its own month rather than relying
    # on row position: if a month is absent from the data, positional use
    # would return fewer than 12 values and shift later months.
    factors = np.ones(12, dtype=np.float64)
    month_idx = monthly["month"].to_numpy().astype(np.int64) - 1
    factors[month_idx] = monthly["factor"].to_numpy().astype(np.float64)
    return factors / factors.mean()
|
||||
|
|
@ -100,6 +100,7 @@ DROP_CATEGORIES = {
|
|||
"building/entrance",
|
||||
"building/entry",
|
||||
"building/farm",
|
||||
"building/farm_auxiliary",
|
||||
"building/garage",
|
||||
"building/garages",
|
||||
"building/greenhouse",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue