This commit is contained in:
Andras Schmelczer 2026-02-18 21:22:15 +00:00
parent 524580eb25
commit ffe080adef
82 changed files with 2652 additions and 2956 deletions

View file

@ -1,417 +0,0 @@
"""Add online buy/rent listings to wide.parquet as new rows.
Matches online listings to existing historical rows by postcode + fuzzy address,
carrying over historical prices and area-level data for matched properties.
Unmatched listings get area data from any same-postcode row in wide.
Modifies wide.parquet in-place, adding:
- A `Listing status` column to all rows ("Historical sale" / "For sale" / "For rent")
- New columns: Asking price, Asking rent (monthly), Bedrooms, Bathrooms,
Listing date, Property sub-type, Listing URL, Price qualifier
"""
import argparse
import re
from concurrent.futures import ProcessPoolExecutor
from os import cpu_count
from pathlib import Path
import polars as pl
from thefuzz import fuzz
from tqdm import tqdm
from pipeline.utils.fuzzy_join import _numbers_compatible
_NORMALIZE_RE = re.compile(r"[,.\-]")
_WHITESPACE_RE = re.compile(r"\s+")
# Columns that are property-specific (carried from matched historical row only)
_PROPERTY_COLUMNS = [
"Last known price",
"Date of last transaction",
"historical_prices",
"renovation_history",
"Construction age",
"Is construction date approximate",
"Current energy rating",
"Potential energy rating",
"Address per EPC",
"Interior height (m)",
"Number of bedrooms & living rooms",
"Price per sqm",
"Estimated current price",
"Est. price per sqm",
]
# Columns that are area-level (carried from matched row, or any same-postcode row)
_AREA_COLUMNS = [
"Public transport to Bank (mins)",
"Cycling to Bank (mins)",
"Public transport to Fitzrovia (mins)",
"Cycling to Fitzrovia (mins)",
"Income Score (rate)",
"Employment Score (rate)",
"Education, Skills and Training Score",
"Health Deprivation and Disability Score",
"Living Environment Score",
"Indoors Sub-domain Score",
"Outdoors Sub-domain Score",
"% Asian",
"% Black",
"% Mixed",
"% White",
"% Other",
"Estimated monthly rent",
"Criminal damage and arson (avg/yr)",
"Violence and sexual offences (avg/yr)",
"Drugs (avg/yr)",
"Anti-social behaviour (avg/yr)",
"Public order (avg/yr)",
"Other crime (avg/yr)",
"Burglary (avg/yr)",
"Vehicle crime (avg/yr)",
"Theft from the person (avg/yr)",
"Possession of weapons (avg/yr)",
"Other theft (avg/yr)",
"Shoplifting (avg/yr)",
"Bicycle theft (avg/yr)",
"Robbery (avg/yr)",
"Serious crime (avg/yr)",
"Minor crime (avg/yr)",
"Number of restaurants within 2km",
"Number of grocery shops and supermarkets within 2km",
"Number of parks within 2km",
"Number of public transport stations within 2km",
"Noise (dB)",
"Good+ primary schools within 5km",
"Good+ secondary schools within 5km",
"Max available download speed (Mbps)",
"Collapsible deposits risk",
"Compressible ground risk",
"Landslide risk",
"Running sand risk",
"Shrink-swell risk",
"Soluble rocks risk",
"Environmental risk",
]
def _normalize(s: str) -> str:
return _WHITESPACE_RE.sub(" ", _NORMALIZE_RE.sub(" ", s.upper())).strip()
def _score_bucket(
args: tuple[list[tuple[int, str]], list[tuple[int, str]]],
) -> list[tuple[int, int, int]]:
"""Score all address pairs within a single postcode bucket."""
wide_entries, online_entries = args
pairs = []
for wide_idx, wide_address in wide_entries:
for online_idx, online_address in online_entries:
if not _numbers_compatible(wide_address, online_address):
continue
score = fuzz.token_sort_ratio(wide_address, online_address)
pairs.append((score, online_idx, wide_idx))
return pairs
def _load_online(buy_path: Path, rent_path: Path) -> pl.DataFrame:
"""Load buy + rent parquets, tag with channel, normalize rent to monthly."""
buy = pl.scan_parquet(buy_path).with_columns(
pl.lit("For sale").alias("_channel"),
)
rent = pl.scan_parquet(rent_path).with_columns(
pl.lit("For rent").alias("_channel"),
)
online = pl.concat([buy, rent]).collect()
# Normalize rent prices to monthly
freq = online["price_frequency"]
price = online["price"].cast(pl.Float64)
monthly_price = (
pl.when(freq == "weekly")
.then(price * 52.0 / 12.0)
.when(freq == "yearly")
.then(price / 12.0)
.when(freq == "daily")
.then(price * 365.25 / 12.0)
.when(freq == "quarterly")
.then(price / 3.0)
.otherwise(price) # monthly, not specified
.round(0)
.cast(pl.Int64)
)
online = online.with_columns(
pl.when(pl.col("_channel") == "For sale")
.then(pl.col("price"))
.otherwise(None)
.alias("Asking price"),
pl.when(pl.col("_channel") == "For rent")
.then(monthly_price)
.otherwise(None)
.alias("Asking rent (monthly)"),
)
return online
def _match_online_to_wide(
wide: pl.DataFrame,
online: pl.DataFrame,
) -> dict[int, int]:
"""Match online listings to wide rows by postcode + fuzzy address.
Returns dict mapping online row index wide row index.
"""
# Build postcode → [(row_idx, normalized_address)] for wide
wide_postcodes = wide["Postcode"]
wide_addresses = wide["Address per Property Register"]
wide_by_postcode: dict[str, list[tuple[int, str]]] = {}
for i in range(wide.height):
pc = wide_postcodes[i]
addr = wide_addresses[i]
if pc is not None and addr is not None:
pc_upper = pc.strip().upper()
wide_by_postcode.setdefault(pc_upper, []).append((i, _normalize(addr)))
# Build postcode → [(row_idx, normalized_address)] for online
online_postcodes = online["postcode"]
online_addresses = online["address"]
online_by_postcode: dict[str, list[tuple[int, str]]] = {}
for i in range(online.height):
pc = online_postcodes[i]
addr = online_addresses[i]
if pc is not None and addr is not None:
pc_upper = pc.strip().upper()
online_by_postcode.setdefault(pc_upper, []).append((i, _normalize(addr)))
# Build tasks: only postcodes present in both
tasks = [
(wide_by_postcode[pc], online_entries)
for pc, online_entries in online_by_postcode.items()
if pc in wide_by_postcode
]
# Score in parallel
all_pairs: list[tuple[int, int, int]] = []
with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
for pairs in tqdm(
executor.map(_score_bucket, tasks, chunksize=64),
total=len(tasks),
desc="Matching online listings",
):
all_pairs.extend(pairs)
del tasks, wide_by_postcode, online_by_postcode
# Greedy assignment: best score first, one-to-one
all_pairs.sort(key=lambda t: (t[0], -t[1]), reverse=True)
matches: dict[int, int] = {} # online_idx → wide_idx
matched_wide: set[int] = set()
for _score, online_idx, wide_idx in all_pairs:
if online_idx in matches or wide_idx in matched_wide:
continue
matches[online_idx] = wide_idx
matched_wide.add(wide_idx)
return matches
def _build_postcode_area_lookup(wide: pl.DataFrame) -> dict[str, int]:
"""Build postcode → first row index for area data fallback."""
postcodes = wide["Postcode"]
lookup: dict[str, int] = {}
for i in range(wide.height):
pc = postcodes[i]
if pc is not None:
pc_upper = pc.strip().upper()
if pc_upper not in lookup:
lookup[pc_upper] = i
return lookup
def _build_online_rows(
wide: pl.DataFrame,
online: pl.DataFrame,
matches: dict[int, int],
postcode_lookup: dict[str, int],
) -> pl.DataFrame:
"""Build a DataFrame of online listing rows with all wide.parquet columns."""
wide_schema = wide.schema
n = online.height
# Initialize all columns as null lists
columns: dict[str, list] = {col: [None] * n for col in wide_schema}
# Add new columns
columns["Listing status"] = [None] * n
columns["Asking price"] = [None] * n
columns["Asking rent (monthly)"] = [None] * n
columns["Bedrooms"] = [None] * n
columns["Bathrooms"] = [None] * n
columns["Listing date"] = [None] * n
columns["Property sub-type"] = [None] * n
columns["Listing URL"] = [None] * n
columns["Price qualifier"] = [None] * n
for i in range(n):
# Direct mappings from online listing
columns["Address per Property Register"][i] = online["address"][i]
columns["Postcode"][i] = online["postcode"][i]
columns["lat"][i] = online["latitude"][i]
columns["lon"][i] = online["longitude"][i]
columns["Property type"][i] = online["property_type"][i]
columns["Leashold/Freehold"][i] = online["tenure"][i]
columns["Total floor area (sqm)"][i] = online["floorspace_sqm"][i]
# New columns
columns["Listing status"][i] = online["_channel"][i]
columns["Asking price"][i] = online["Asking price"][i]
columns["Asking rent (monthly)"][i] = online["Asking rent (monthly)"][i]
columns["Bedrooms"][i] = online["bedrooms"][i]
columns["Bathrooms"][i] = online["bathrooms"][i]
columns["Property sub-type"][i] = online["property_sub_type"][i]
columns["Listing URL"][i] = online["url"][i]
columns["Price qualifier"][i] = online["price_qualifier"][i]
# Parse listing date
fvd = online["first_visible_date"][i]
if fvd is not None:
try:
from datetime import datetime
dt = datetime.fromisoformat(fvd.replace("Z", "+00:00"))
columns["Listing date"][i] = dt.replace(tzinfo=None)
except (ValueError, TypeError):
pass
# Determine source row for carried data
matched_wide_idx = matches.get(i)
postcode = online["postcode"][i]
pc_upper = postcode.strip().upper() if postcode else None
area_source_idx = matched_wide_idx
if area_source_idx is None and pc_upper is not None:
area_source_idx = postcode_lookup.get(pc_upper)
# Copy property-specific columns from matched row only
if matched_wide_idx is not None:
for col in _PROPERTY_COLUMNS:
if col in wide_schema:
columns[col][i] = wide[col][matched_wide_idx]
# Copy area columns from matched row or same-postcode fallback
if area_source_idx is not None:
for col in _AREA_COLUMNS:
if col in wide_schema:
columns[col][i] = wide[col][area_source_idx]
# Build DataFrame with correct types
series_list = []
for col_name, dtype in wide_schema.items():
series_list.append(pl.Series(col_name, columns[col_name], dtype=dtype))
# New columns with their types
series_list.append(
pl.Series("Listing status", columns["Listing status"], dtype=pl.String)
)
series_list.append(
pl.Series("Asking price", columns["Asking price"], dtype=pl.Int64)
)
series_list.append(
pl.Series(
"Asking rent (monthly)", columns["Asking rent (monthly)"], dtype=pl.Int64
)
)
series_list.append(pl.Series("Bedrooms", columns["Bedrooms"], dtype=pl.Int32))
series_list.append(pl.Series("Bathrooms", columns["Bathrooms"], dtype=pl.Int32))
series_list.append(
pl.Series("Listing date", columns["Listing date"], dtype=pl.Datetime("us"))
)
series_list.append(
pl.Series("Property sub-type", columns["Property sub-type"], dtype=pl.String)
)
series_list.append(
pl.Series("Listing URL", columns["Listing URL"], dtype=pl.String)
)
series_list.append(
pl.Series("Price qualifier", columns["Price qualifier"], dtype=pl.String)
)
return pl.DataFrame(series_list)
def main():
parser = argparse.ArgumentParser(
description="Add online buy/rent listings to wide.parquet"
)
parser.add_argument(
"--input",
type=Path,
required=True,
help="wide.parquet path (modified in-place)",
)
parser.add_argument(
"--buy", type=Path, required=True, help="rightmove_buy.parquet path"
)
parser.add_argument(
"--rent", type=Path, required=True, help="rightmove_rent.parquet path"
)
args = parser.parse_args()
print("Loading wide.parquet...")
wide = pl.read_parquet(args.input)
print(f" {wide.height} rows, {wide.width} columns")
print("Loading online listings...")
online = _load_online(args.buy, args.rent)
print(
f" {online.height} online listings ({online.filter(pl.col('_channel') == 'For sale').height} buy, {online.filter(pl.col('_channel') == 'For rent').height} rent)"
)
print("Matching online listings to historical rows...")
matches = _match_online_to_wide(wide, online)
print(f" {len(matches)} online listings matched to historical rows")
print("Building postcode area lookup...")
postcode_lookup = _build_postcode_area_lookup(wide)
print("Building online listing rows...")
online_rows = _build_online_rows(wide, online, matches, postcode_lookup)
print(f" {online_rows.height} online rows built")
# Add Listing status + new columns to existing wide rows
wide = wide.with_columns(
pl.lit("Historical sale").alias("Listing status"),
pl.lit(None, dtype=pl.Int64).alias("Asking price"),
pl.lit(None, dtype=pl.Int64).alias("Asking rent (monthly)"),
pl.lit(None, dtype=pl.Int32).alias("Bedrooms"),
pl.lit(None, dtype=pl.Int32).alias("Bathrooms"),
pl.lit(None, dtype=pl.Datetime("us")).alias("Listing date"),
pl.lit(None, dtype=pl.String).alias("Property sub-type"),
pl.lit(None, dtype=pl.String).alias("Listing URL"),
pl.lit(None, dtype=pl.String).alias("Price qualifier"),
)
# Concat
result = pl.concat([wide, online_rows], how="diagonal_relaxed")
print(f"Final: {result.height} rows, {result.width} columns")
# Verify
status_counts = (
result["Listing status"].value_counts().sort("count", descending=True)
)
print(f"Listing status distribution:\n{status_counts}")
result.write_parquet(args.input)
size_mb = args.input.stat().st_size / (1024 * 1024)
print(f"Wrote {args.input} ({size_mb:.1f} MB)")
if __name__ == "__main__":
main()

View file

@ -30,7 +30,69 @@ def _join_journey_times(
return wide.join(journey_times, on="postcode", how="left")
def _build_wide(
_AREA_COLUMNS = [
"Postcode",
"lat",
"lon",
# Transport
"Public transport to Bank (mins)",
"Cycling to Bank (mins)",
"Public transport to Fitzrovia (mins)",
"Cycling to Fitzrovia (mins)",
# Deprivation
"Income Score (rate)",
"Employment Score (rate)",
"Education, Skills and Training Score",
"Health Deprivation and Disability Score",
"Living Environment Score",
"Indoors Sub-domain Score",
"Outdoors Sub-domain Score",
# Ethnicity
"% Asian",
"% Black",
"% Mixed",
"% White",
"% Other",
# Crime
"Anti-social behaviour (avg/yr)",
"Violence and sexual offences (avg/yr)",
"Criminal damage and arson (avg/yr)",
"Burglary (avg/yr)",
"Vehicle crime (avg/yr)",
"Robbery (avg/yr)",
"Other theft (avg/yr)",
"Shoplifting (avg/yr)",
"Drugs (avg/yr)",
"Possession of weapons (avg/yr)",
"Public order (avg/yr)",
"Bicycle theft (avg/yr)",
"Theft from the person (avg/yr)",
"Other crime (avg/yr)",
"Serious crime (avg/yr)",
"Minor crime (avg/yr)",
# Amenities
"Number of restaurants within 2km",
"Number of grocery shops and supermarkets within 2km",
"Number of parks within 2km",
"Number of public transport stations within 2km",
# Environment
"Noise (dB)",
"Max available download speed (Mbps)",
# Schools
"Good+ primary schools within 5km",
"Good+ secondary schools within 5km",
# GeoSure
"Environmental risk",
"Collapsible deposits risk",
"Compressible ground risk",
"Landslide risk",
"Running sand risk",
"Shrink-swell risk",
"Soluble rocks risk",
]
def _build(
epc_pp_path: Path,
arcgis_path: Path,
iod_path: Path,
@ -44,8 +106,11 @@ def _build_wide(
broadband_path: Path,
geosure_path: Path,
rental_prices_path: Path,
) -> pl.DataFrame:
"""Build the wide dataframe by joining epc_pp with all auxiliary data."""
) -> tuple[pl.DataFrame, pl.DataFrame]:
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
Returns (postcode_df, properties_df).
"""
wide = pl.scan_parquet(epc_pp_path).filter(
pl.col("total_floor_area").is_null()
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
@ -180,7 +245,7 @@ def _build_wide(
.group_by("bb_postcode")
.agg(pl.col("max_download_speed").max())
)
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left").drop("bb_postcode")
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
geosure = pl.scan_parquet(geosure_path)
wide = wide.join(geosure, on="postcode", how="left")
@ -280,7 +345,18 @@ def _build_wide(
)
print("Collecting with streaming engine...")
return wide.collect(engine="streaming")
df = wide.collect(engine="streaming")
# Split into postcode-level and property-level dataframes
area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
postcode_df = df.select(area_cols).group_by("Postcode").first()
print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
properties_df = df.select(property_cols)
print(f"Property rows: {properties_df.height}")
return postcode_df, properties_df
def main():
@ -356,11 +432,14 @@ def main():
help="ONS rental prices by LA and bedroom count parquet file",
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
"--output-postcodes", type=Path, required=True, help="Output postcode parquet file path"
)
parser.add_argument(
"--output-properties", type=Path, required=True, help="Output properties parquet file path"
)
args = parser.parse_args()
wide = _build_wide(
postcode_df, properties_df = _build(
epc_pp_path=args.epc_pp,
arcgis_path=args.arcgis,
iod_path=args.iod,
@ -376,13 +455,17 @@ def main():
rental_prices_path=args.rental_prices,
)
print(f"Columns: {wide.columns}")
print(f"Rows: {wide.height}")
print(f"\nPostcode columns: {postcode_df.columns}")
print(f"Postcode rows: {postcode_df.height}")
postcode_df.write_parquet(args.output_postcodes)
size_mb = args.output_postcodes.stat().st_size / (1024 * 1024)
print(f"Wrote {args.output_postcodes} ({size_mb:.1f} MB)")
wide.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024)
print(f"Wrote {args.output} ({size_mb:.1f} MB)")
print(f"\nProperty columns: {properties_df.columns}")
print(f"Property rows: {properties_df.height}")
properties_df.write_parquet(args.output_properties)
size_mb = args.output_properties.stat().st_size / (1024 * 1024)
print(f"Wrote {args.output_properties} ({size_mb:.1f} MB)")
if __name__ == "__main__":

View file

@ -20,7 +20,6 @@ from pipeline.transform.price_estimation.knn import (
from pipeline.transform.price_estimation.utils import (
CURRENT_YEAR,
MAX_LOG_ADJUSTMENT,
compute_seasonal_factors,
interpolate_log_index,
sector_expr,
type_group_expr,
@ -91,7 +90,7 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
"""Index-based prediction with interpolation, capping, and seasonal adjustment."""
"""Index-based prediction with interpolation and capping."""
test = interpolate_log_index(
index, test, "sector", "type_group", "input_frac_year", "log_index_input"
)
@ -105,7 +104,6 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
* (pl.col("log_index_actual") - pl.col("log_index_input"))
.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
.exp()
* pl.col("_seasonal_adj")
)
.fill_null(pl.col("input_price").cast(pl.Float64))
.alias("predicted"),
@ -175,7 +173,10 @@ def print_metrics_table(metrics_by_stage: dict):
def main():
parser = argparse.ArgumentParser(description="Backtest price estimation model")
parser.add_argument(
"--input", type=Path, required=True, help="Path to wide.parquet"
"--input", type=Path, required=True, help="Path to properties.parquet"
)
parser.add_argument(
"--postcodes", type=Path, required=True, help="Path to postcode.parquet (for lat/lon)"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output backtest_results.parquet"
@ -184,38 +185,28 @@ def main():
# Build index from pre-test data only (temporal holdout)
print(f"Building price index (pairs with year2 < {TEST_YEAR_MIN})...")
index = build_index(args.input, max_pair_year=TEST_YEAR_MIN)
index = build_index(args.input, max_pair_year=TEST_YEAR_MIN, postcodes_path=args.postcodes)
print(
f"\nHoldout index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
f"{index['type_group'].n_unique()} type groups"
)
# Compute seasonal factors from pre-test data only
seasonal = compute_seasonal_factors(args.input, max_sale_year=TEST_YEAR_MIN)
months = [
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]
print(
f"Seasonal factors: {', '.join(f'{m}={f:.3f}' for m, f in zip(months, seasonal))}"
)
test = extract_test_set(args.input)
# Compute seasonal adjustment for each test pair
input_months = test["input_month"].fill_null(6).to_numpy().astype(np.int32)
actual_months = test["actual_month"].fill_null(6).to_numpy().astype(np.int32)
seasonal_adj = seasonal[actual_months - 1] / seasonal[input_months - 1]
test = test.with_columns(
pl.Series("_seasonal_adj", seasonal_adj, dtype=pl.Float64),
)
# Join lat/lon from postcode.parquet (properties.parquet no longer has them)
postcodes = pl.read_parquet(args.postcodes).select("Postcode", "lat", "lon")
test = test.join(postcodes, on="Postcode", how="left")
print("\nPredicting with price index...")
test = predict(test, index)
# --- kNN ---
ref_fy = float(TEST_YEAR_MIN)
trees = build_knn_pool(args.input, index, ref_fy, max_sale_year=TEST_YEAR_MIN)
# Pass joined LazyFrame (with lat/lon) instead of raw properties path
pool_lf = pl.scan_parquet(args.input).join(
postcodes.lazy(), on="Postcode", how="left"
)
trees = build_knn_pool(pool_lf, index, ref_fy, max_sale_year=TEST_YEAR_MIN)
# Interpolate log_index at reference year for temporal adjustment
test = test.with_columns(pl.lit(ref_fy).alias("_ref_fy"))

View file

@ -1,19 +1,18 @@
"""Augment wide.parquet with estimated current prices.
"""Augment properties.parquet with estimated current prices.
For properties with a known prior sale, applies the repeat-sales price index
to adjust the last known price to the current date, then blends with kNN
estimates from nearby recently-sold properties. Includes:
- Capping extreme index adjustments
- Seasonal month-of-sale adjustment
- kNN spatial blending
Modifies wide.parquet in-place.
Modifies properties.parquet in-place. Temporarily joins postcode.parquet
for lat/lon needed by kNN, then drops those columns before writing.
"""
import argparse
from pathlib import Path
import numpy as np
import polars as pl
from pipeline.transform.price_estimation.knn import (
@ -23,9 +22,7 @@ from pipeline.transform.price_estimation.knn import (
)
from pipeline.transform.price_estimation.utils import (
CURRENT_FRAC_YEAR,
CURRENT_MONTH,
MAX_LOG_ADJUSTMENT,
compute_seasonal_factors,
interpolate_log_index,
sector_expr,
type_group_expr,
@ -34,48 +31,39 @@ from pipeline.transform.price_estimation.utils import (
def main():
parser = argparse.ArgumentParser(
description="Augment wide.parquet with estimated current prices"
description="Augment properties.parquet with estimated current prices"
)
parser.add_argument(
"--input",
"--properties",
type=Path,
required=True,
help="Path to wide.parquet (modified in-place)",
help="Path to properties.parquet (modified in-place)",
)
parser.add_argument(
"--postcodes",
type=Path,
required=True,
help="Path to postcode.parquet (for lat/lon needed by kNN)",
)
parser.add_argument(
"--index", type=Path, required=True, help="Path to price_index.parquet"
)
args = parser.parse_args()
print("Loading wide.parquet...")
df = pl.read_parquet(args.input)
print("Loading properties.parquet...")
df = pl.read_parquet(args.properties)
print(f" {len(df):,} rows, {len(df.columns)} columns")
# Join lat/lon from postcode.parquet for kNN spatial queries
postcodes = pl.read_parquet(args.postcodes).select("Postcode", "lat", "lon")
df = df.join(postcodes, on="Postcode", how="left")
print(f" Joined lat/lon from {len(postcodes):,} postcodes")
# Drop existing estimated columns if re-running
for col in ["Estimated current price", "Est. price per sqm"]:
if col in df.columns:
df = df.drop(col)
# Compute seasonal factors
seasonal = compute_seasonal_factors(args.input)
months = [
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]
print(
f" Seasonal factors: {', '.join(f'{m}={f:.3f}' for m, f in zip(months, seasonal))}"
)
# Build seasonal adjustment: seasonal[current_month] / seasonal[sale_month]
sale_month = (
df["Date of last transaction"]
.dt.month()
.fill_null(6)
.to_numpy()
.astype(np.int32)
)
seasonal_adj = seasonal[CURRENT_MONTH - 1] / seasonal[sale_month - 1]
# Derive helper columns
df = df.with_columns(
sector_expr().alias("_sector"),
@ -86,7 +74,6 @@ def main():
).alias("_sale_frac_year"),
type_group_expr().alias("_type_group"),
pl.lit(CURRENT_FRAC_YEAR).alias("_current_frac_year"),
pl.Series("_seasonal_adj", seasonal_adj, dtype=pl.Float64),
)
index = pl.read_parquet(args.index)
@ -109,7 +96,7 @@ def main():
"_log_index_current_interp",
)
# Compute index-adjusted estimate with cap and seasonal adjustment
# Compute index-adjusted estimate with cap
has_price = (
pl.col("Last known price").is_not_null()
& pl.col("Postcode").is_not_null()
@ -125,7 +112,6 @@ def main():
)
.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
.exp()
* pl.col("_seasonal_adj")
)
.otherwise(pl.lit(None))
.alias("Estimated current price"),
@ -140,7 +126,7 @@ def main():
# --- kNN blending ---
print("\nBuilding kNN estimates...")
trees = build_knn_pool(args.input, index, CURRENT_FRAC_YEAR)
trees = build_knn_pool(df.lazy(), index, CURRENT_FRAC_YEAR)
lat = df["lat"].cast(pl.Float64).to_numpy()
lon = df["lon"].cast(pl.Float64).to_numpy()
@ -188,13 +174,13 @@ def main():
.alias("Est. price per sqm"),
)
# Drop all temporary columns
# Drop all temporary columns and joined lat/lon (those belong in postcode.parquet)
temp_cols = [c for c in df.columns if c.startswith("_") or c.startswith("log_idx_")]
df = df.drop(temp_cols)
df = df.drop(temp_cols).drop("lat", "lon")
df.write_parquet(args.input)
size_mb = args.input.stat().st_size / (1024 * 1024)
print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
df.write_parquet(args.properties)
size_mb = args.properties.stat().st_size / (1024 * 1024)
print(f"\nWrote {args.properties} ({size_mb:.1f} MB)")
print(
f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
)

View file

@ -328,14 +328,19 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
return filled
def build_index(input_path: Path, max_pair_year: int | None = None) -> pl.DataFrame:
def build_index(
input_path: Path,
max_pair_year: int | None = None,
postcodes_path: Path | None = None,
) -> pl.DataFrame:
"""Build the full price index from raw data.
If max_pair_year is set, only pairs before that year are used (backtesting holdout).
The index is still forward-filled to CURRENT_YEAR.
postcodes_path: if provided, lat/lon are read from this file instead of input_path.
"""
pairs = extract_pairs(input_path, max_year2=max_pair_year)
centroids = extract_centroids(input_path)
centroids = extract_centroids(postcodes_path or input_path)
min_year = int(pairs["year1"].min())
max_year = CURRENT_YEAR
@ -448,10 +453,12 @@ def main():
description="Build improved repeat-sales price index"
)
parser.add_argument("--input", type=Path, required=True)
parser.add_argument("--postcodes", type=Path, required=True,
help="Path to postcode.parquet (for lat/lon centroids)")
parser.add_argument("--output", type=Path, required=True)
args = parser.parse_args()
result = build_index(args.input)
result = build_index(args.input, postcodes_path=args.postcodes)
result.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024)

View file

@ -29,7 +29,7 @@ def _scale_coords(lat: np.ndarray, lon: np.ndarray) -> np.ndarray:
def build_knn_pool(
input_path: Path,
source: Path | pl.LazyFrame,
index: pl.DataFrame,
ref_frac_year: float,
max_sale_year: int | None = None,
@ -42,8 +42,9 @@ def build_knn_pool(
Returns dict mapping type_group -> (KDTree over scaled lat/lon, adjusted_psm array).
"""
print("Building kNN pool...")
lf = pl.scan_parquet(source) if isinstance(source, Path) else source
query = (
pl.scan_parquet(input_path)
lf
.select(
"Postcode",
"Property type",

View file

@ -1,7 +1,6 @@
"""Shared utilities for price estimation modules."""
from datetime import date
from pathlib import Path
import numpy as np
import polars as pl
@ -9,7 +8,6 @@ import polars as pl
CURRENT_YEAR = 2026
_today = date.today()
CURRENT_FRAC_YEAR = _today.year + (_today.month - 1) / 12
CURRENT_MONTH = _today.month
# Cap on log(index_ratio) to prevent wild estimates from thin sectors
MAX_LOG_ADJUSTMENT = 3.0 # ~20x max price change
@ -181,53 +179,3 @@ def join_type_stratified_index(
).drop(_typed, _all)
return df
def compute_seasonal_factors(
input_path: Path, max_sale_year: int | None = None
) -> np.ndarray:
"""Compute 12 multiplicative monthly price factors from price-per-sqm.
Detrends by normalizing median £/sqm within each year, then averages
across years. Returns array of 12 factors (index 0 = January).
Normalized so mean = 1.0.
"""
query = (
pl.scan_parquet(input_path)
.select("Last known price", "Total floor area (sqm)", "Date of last transaction")
.filter(
pl.col("Last known price").is_not_null(),
pl.col("Last known price") > 0,
pl.col("Total floor area (sqm)").is_not_null(),
pl.col("Total floor area (sqm)") > 0,
pl.col("Date of last transaction").is_not_null(),
)
.with_columns(
(
pl.col("Last known price").cast(pl.Float64)
/ pl.col("Total floor area (sqm)").cast(pl.Float64)
).alias("psm"),
pl.col("Date of last transaction").dt.month().alias("month"),
pl.col("Date of last transaction").dt.year().alias("year"),
)
)
if max_sale_year is not None:
query = query.filter(pl.col("year") < max_sale_year)
monthly = (
query.group_by("year", "month")
.agg(pl.col("psm").median().alias("median_psm"))
.with_columns(
pl.col("median_psm").mean().over("year").alias("year_mean"),
)
.with_columns(
(pl.col("median_psm") / pl.col("year_mean")).alias("ratio"),
)
.group_by("month")
.agg(pl.col("ratio").mean().alias("factor"))
.sort("month")
.collect()
)
factors = monthly["factor"].to_numpy().astype(np.float64)
return factors / factors.mean()