changes
This commit is contained in:
parent
524580eb25
commit
ffe080adef
82 changed files with 2652 additions and 2956 deletions
|
|
@ -1,417 +0,0 @@
|
|||
"""Add online buy/rent listings to wide.parquet as new rows.
|
||||
|
||||
Matches online listings to existing historical rows by postcode + fuzzy address,
|
||||
carrying over historical prices and area-level data for matched properties.
|
||||
Unmatched listings get area data from any same-postcode row in wide.
|
||||
|
||||
Modifies wide.parquet in-place, adding:
|
||||
- A `Listing status` column to all rows ("Historical sale" / "For sale" / "For rent")
|
||||
- New columns: Asking price, Asking rent (monthly), Bedrooms, Bathrooms,
|
||||
Listing date, Property sub-type, Listing URL, Price qualifier
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from os import cpu_count
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
from thefuzz import fuzz
|
||||
from tqdm import tqdm
|
||||
|
||||
from pipeline.utils.fuzzy_join import _numbers_compatible
|
||||
|
||||
_NORMALIZE_RE = re.compile(r"[,.\-]")
|
||||
_WHITESPACE_RE = re.compile(r"\s+")
|
||||
|
||||
# Columns that are property-specific (carried from matched historical row only)
|
||||
_PROPERTY_COLUMNS = [
|
||||
"Last known price",
|
||||
"Date of last transaction",
|
||||
"historical_prices",
|
||||
"renovation_history",
|
||||
"Construction age",
|
||||
"Is construction date approximate",
|
||||
"Current energy rating",
|
||||
"Potential energy rating",
|
||||
"Address per EPC",
|
||||
"Interior height (m)",
|
||||
"Number of bedrooms & living rooms",
|
||||
"Price per sqm",
|
||||
"Estimated current price",
|
||||
"Est. price per sqm",
|
||||
]
|
||||
|
||||
# Columns that are area-level (carried from matched row, or any same-postcode row)
|
||||
_AREA_COLUMNS = [
|
||||
"Public transport to Bank (mins)",
|
||||
"Cycling to Bank (mins)",
|
||||
"Public transport to Fitzrovia (mins)",
|
||||
"Cycling to Fitzrovia (mins)",
|
||||
"Income Score (rate)",
|
||||
"Employment Score (rate)",
|
||||
"Education, Skills and Training Score",
|
||||
"Health Deprivation and Disability Score",
|
||||
"Living Environment Score",
|
||||
"Indoors Sub-domain Score",
|
||||
"Outdoors Sub-domain Score",
|
||||
"% Asian",
|
||||
"% Black",
|
||||
"% Mixed",
|
||||
"% White",
|
||||
"% Other",
|
||||
"Estimated monthly rent",
|
||||
"Criminal damage and arson (avg/yr)",
|
||||
"Violence and sexual offences (avg/yr)",
|
||||
"Drugs (avg/yr)",
|
||||
"Anti-social behaviour (avg/yr)",
|
||||
"Public order (avg/yr)",
|
||||
"Other crime (avg/yr)",
|
||||
"Burglary (avg/yr)",
|
||||
"Vehicle crime (avg/yr)",
|
||||
"Theft from the person (avg/yr)",
|
||||
"Possession of weapons (avg/yr)",
|
||||
"Other theft (avg/yr)",
|
||||
"Shoplifting (avg/yr)",
|
||||
"Bicycle theft (avg/yr)",
|
||||
"Robbery (avg/yr)",
|
||||
"Serious crime (avg/yr)",
|
||||
"Minor crime (avg/yr)",
|
||||
"Number of restaurants within 2km",
|
||||
"Number of grocery shops and supermarkets within 2km",
|
||||
"Number of parks within 2km",
|
||||
"Number of public transport stations within 2km",
|
||||
"Noise (dB)",
|
||||
"Good+ primary schools within 5km",
|
||||
"Good+ secondary schools within 5km",
|
||||
"Max available download speed (Mbps)",
|
||||
"Collapsible deposits risk",
|
||||
"Compressible ground risk",
|
||||
"Landslide risk",
|
||||
"Running sand risk",
|
||||
"Shrink-swell risk",
|
||||
"Soluble rocks risk",
|
||||
"Environmental risk",
|
||||
]
|
||||
|
||||
|
||||
def _normalize(s: str) -> str:
|
||||
return _WHITESPACE_RE.sub(" ", _NORMALIZE_RE.sub(" ", s.upper())).strip()
|
||||
|
||||
|
||||
def _score_bucket(
|
||||
args: tuple[list[tuple[int, str]], list[tuple[int, str]]],
|
||||
) -> list[tuple[int, int, int]]:
|
||||
"""Score all address pairs within a single postcode bucket."""
|
||||
wide_entries, online_entries = args
|
||||
pairs = []
|
||||
for wide_idx, wide_address in wide_entries:
|
||||
for online_idx, online_address in online_entries:
|
||||
if not _numbers_compatible(wide_address, online_address):
|
||||
continue
|
||||
score = fuzz.token_sort_ratio(wide_address, online_address)
|
||||
pairs.append((score, online_idx, wide_idx))
|
||||
return pairs
|
||||
|
||||
|
||||
def _load_online(buy_path: Path, rent_path: Path) -> pl.DataFrame:
|
||||
"""Load buy + rent parquets, tag with channel, normalize rent to monthly."""
|
||||
buy = pl.scan_parquet(buy_path).with_columns(
|
||||
pl.lit("For sale").alias("_channel"),
|
||||
)
|
||||
rent = pl.scan_parquet(rent_path).with_columns(
|
||||
pl.lit("For rent").alias("_channel"),
|
||||
)
|
||||
|
||||
online = pl.concat([buy, rent]).collect()
|
||||
|
||||
# Normalize rent prices to monthly
|
||||
freq = online["price_frequency"]
|
||||
price = online["price"].cast(pl.Float64)
|
||||
monthly_price = (
|
||||
pl.when(freq == "weekly")
|
||||
.then(price * 52.0 / 12.0)
|
||||
.when(freq == "yearly")
|
||||
.then(price / 12.0)
|
||||
.when(freq == "daily")
|
||||
.then(price * 365.25 / 12.0)
|
||||
.when(freq == "quarterly")
|
||||
.then(price / 3.0)
|
||||
.otherwise(price) # monthly, not specified
|
||||
.round(0)
|
||||
.cast(pl.Int64)
|
||||
)
|
||||
|
||||
online = online.with_columns(
|
||||
pl.when(pl.col("_channel") == "For sale")
|
||||
.then(pl.col("price"))
|
||||
.otherwise(None)
|
||||
.alias("Asking price"),
|
||||
pl.when(pl.col("_channel") == "For rent")
|
||||
.then(monthly_price)
|
||||
.otherwise(None)
|
||||
.alias("Asking rent (monthly)"),
|
||||
)
|
||||
|
||||
return online
|
||||
|
||||
|
||||
def _match_online_to_wide(
|
||||
wide: pl.DataFrame,
|
||||
online: pl.DataFrame,
|
||||
) -> dict[int, int]:
|
||||
"""Match online listings to wide rows by postcode + fuzzy address.
|
||||
|
||||
Returns dict mapping online row index → wide row index.
|
||||
"""
|
||||
# Build postcode → [(row_idx, normalized_address)] for wide
|
||||
wide_postcodes = wide["Postcode"]
|
||||
wide_addresses = wide["Address per Property Register"]
|
||||
|
||||
wide_by_postcode: dict[str, list[tuple[int, str]]] = {}
|
||||
for i in range(wide.height):
|
||||
pc = wide_postcodes[i]
|
||||
addr = wide_addresses[i]
|
||||
if pc is not None and addr is not None:
|
||||
pc_upper = pc.strip().upper()
|
||||
wide_by_postcode.setdefault(pc_upper, []).append((i, _normalize(addr)))
|
||||
|
||||
# Build postcode → [(row_idx, normalized_address)] for online
|
||||
online_postcodes = online["postcode"]
|
||||
online_addresses = online["address"]
|
||||
|
||||
online_by_postcode: dict[str, list[tuple[int, str]]] = {}
|
||||
for i in range(online.height):
|
||||
pc = online_postcodes[i]
|
||||
addr = online_addresses[i]
|
||||
if pc is not None and addr is not None:
|
||||
pc_upper = pc.strip().upper()
|
||||
online_by_postcode.setdefault(pc_upper, []).append((i, _normalize(addr)))
|
||||
|
||||
# Build tasks: only postcodes present in both
|
||||
tasks = [
|
||||
(wide_by_postcode[pc], online_entries)
|
||||
for pc, online_entries in online_by_postcode.items()
|
||||
if pc in wide_by_postcode
|
||||
]
|
||||
|
||||
# Score in parallel
|
||||
all_pairs: list[tuple[int, int, int]] = []
|
||||
with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
|
||||
for pairs in tqdm(
|
||||
executor.map(_score_bucket, tasks, chunksize=64),
|
||||
total=len(tasks),
|
||||
desc="Matching online listings",
|
||||
):
|
||||
all_pairs.extend(pairs)
|
||||
|
||||
del tasks, wide_by_postcode, online_by_postcode
|
||||
|
||||
# Greedy assignment: best score first, one-to-one
|
||||
all_pairs.sort(key=lambda t: (t[0], -t[1]), reverse=True)
|
||||
|
||||
matches: dict[int, int] = {} # online_idx → wide_idx
|
||||
matched_wide: set[int] = set()
|
||||
|
||||
for _score, online_idx, wide_idx in all_pairs:
|
||||
if online_idx in matches or wide_idx in matched_wide:
|
||||
continue
|
||||
matches[online_idx] = wide_idx
|
||||
matched_wide.add(wide_idx)
|
||||
|
||||
return matches
|
||||
|
||||
|
||||
def _build_postcode_area_lookup(wide: pl.DataFrame) -> dict[str, int]:
|
||||
"""Build postcode → first row index for area data fallback."""
|
||||
postcodes = wide["Postcode"]
|
||||
lookup: dict[str, int] = {}
|
||||
for i in range(wide.height):
|
||||
pc = postcodes[i]
|
||||
if pc is not None:
|
||||
pc_upper = pc.strip().upper()
|
||||
if pc_upper not in lookup:
|
||||
lookup[pc_upper] = i
|
||||
return lookup
|
||||
|
||||
|
||||
def _build_online_rows(
|
||||
wide: pl.DataFrame,
|
||||
online: pl.DataFrame,
|
||||
matches: dict[int, int],
|
||||
postcode_lookup: dict[str, int],
|
||||
) -> pl.DataFrame:
|
||||
"""Build a DataFrame of online listing rows with all wide.parquet columns."""
|
||||
wide_schema = wide.schema
|
||||
n = online.height
|
||||
|
||||
# Initialize all columns as null lists
|
||||
columns: dict[str, list] = {col: [None] * n for col in wide_schema}
|
||||
|
||||
# Add new columns
|
||||
columns["Listing status"] = [None] * n
|
||||
columns["Asking price"] = [None] * n
|
||||
columns["Asking rent (monthly)"] = [None] * n
|
||||
columns["Bedrooms"] = [None] * n
|
||||
columns["Bathrooms"] = [None] * n
|
||||
columns["Listing date"] = [None] * n
|
||||
columns["Property sub-type"] = [None] * n
|
||||
columns["Listing URL"] = [None] * n
|
||||
columns["Price qualifier"] = [None] * n
|
||||
|
||||
for i in range(n):
|
||||
# Direct mappings from online listing
|
||||
columns["Address per Property Register"][i] = online["address"][i]
|
||||
columns["Postcode"][i] = online["postcode"][i]
|
||||
columns["lat"][i] = online["latitude"][i]
|
||||
columns["lon"][i] = online["longitude"][i]
|
||||
columns["Property type"][i] = online["property_type"][i]
|
||||
columns["Leashold/Freehold"][i] = online["tenure"][i]
|
||||
columns["Total floor area (sqm)"][i] = online["floorspace_sqm"][i]
|
||||
|
||||
# New columns
|
||||
columns["Listing status"][i] = online["_channel"][i]
|
||||
columns["Asking price"][i] = online["Asking price"][i]
|
||||
columns["Asking rent (monthly)"][i] = online["Asking rent (monthly)"][i]
|
||||
columns["Bedrooms"][i] = online["bedrooms"][i]
|
||||
columns["Bathrooms"][i] = online["bathrooms"][i]
|
||||
columns["Property sub-type"][i] = online["property_sub_type"][i]
|
||||
columns["Listing URL"][i] = online["url"][i]
|
||||
columns["Price qualifier"][i] = online["price_qualifier"][i]
|
||||
|
||||
# Parse listing date
|
||||
fvd = online["first_visible_date"][i]
|
||||
if fvd is not None:
|
||||
try:
|
||||
from datetime import datetime
|
||||
|
||||
dt = datetime.fromisoformat(fvd.replace("Z", "+00:00"))
|
||||
columns["Listing date"][i] = dt.replace(tzinfo=None)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
# Determine source row for carried data
|
||||
matched_wide_idx = matches.get(i)
|
||||
postcode = online["postcode"][i]
|
||||
pc_upper = postcode.strip().upper() if postcode else None
|
||||
area_source_idx = matched_wide_idx
|
||||
if area_source_idx is None and pc_upper is not None:
|
||||
area_source_idx = postcode_lookup.get(pc_upper)
|
||||
|
||||
# Copy property-specific columns from matched row only
|
||||
if matched_wide_idx is not None:
|
||||
for col in _PROPERTY_COLUMNS:
|
||||
if col in wide_schema:
|
||||
columns[col][i] = wide[col][matched_wide_idx]
|
||||
|
||||
# Copy area columns from matched row or same-postcode fallback
|
||||
if area_source_idx is not None:
|
||||
for col in _AREA_COLUMNS:
|
||||
if col in wide_schema:
|
||||
columns[col][i] = wide[col][area_source_idx]
|
||||
|
||||
# Build DataFrame with correct types
|
||||
series_list = []
|
||||
for col_name, dtype in wide_schema.items():
|
||||
series_list.append(pl.Series(col_name, columns[col_name], dtype=dtype))
|
||||
|
||||
# New columns with their types
|
||||
series_list.append(
|
||||
pl.Series("Listing status", columns["Listing status"], dtype=pl.String)
|
||||
)
|
||||
series_list.append(
|
||||
pl.Series("Asking price", columns["Asking price"], dtype=pl.Int64)
|
||||
)
|
||||
series_list.append(
|
||||
pl.Series(
|
||||
"Asking rent (monthly)", columns["Asking rent (monthly)"], dtype=pl.Int64
|
||||
)
|
||||
)
|
||||
series_list.append(pl.Series("Bedrooms", columns["Bedrooms"], dtype=pl.Int32))
|
||||
series_list.append(pl.Series("Bathrooms", columns["Bathrooms"], dtype=pl.Int32))
|
||||
series_list.append(
|
||||
pl.Series("Listing date", columns["Listing date"], dtype=pl.Datetime("us"))
|
||||
)
|
||||
series_list.append(
|
||||
pl.Series("Property sub-type", columns["Property sub-type"], dtype=pl.String)
|
||||
)
|
||||
series_list.append(
|
||||
pl.Series("Listing URL", columns["Listing URL"], dtype=pl.String)
|
||||
)
|
||||
series_list.append(
|
||||
pl.Series("Price qualifier", columns["Price qualifier"], dtype=pl.String)
|
||||
)
|
||||
|
||||
return pl.DataFrame(series_list)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Add online buy/rent listings to wide.parquet"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="wide.parquet path (modified in-place)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--buy", type=Path, required=True, help="rightmove_buy.parquet path"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--rent", type=Path, required=True, help="rightmove_rent.parquet path"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
print("Loading wide.parquet...")
|
||||
wide = pl.read_parquet(args.input)
|
||||
print(f" {wide.height} rows, {wide.width} columns")
|
||||
|
||||
print("Loading online listings...")
|
||||
online = _load_online(args.buy, args.rent)
|
||||
print(
|
||||
f" {online.height} online listings ({online.filter(pl.col('_channel') == 'For sale').height} buy, {online.filter(pl.col('_channel') == 'For rent').height} rent)"
|
||||
)
|
||||
|
||||
print("Matching online listings to historical rows...")
|
||||
matches = _match_online_to_wide(wide, online)
|
||||
print(f" {len(matches)} online listings matched to historical rows")
|
||||
|
||||
print("Building postcode area lookup...")
|
||||
postcode_lookup = _build_postcode_area_lookup(wide)
|
||||
|
||||
print("Building online listing rows...")
|
||||
online_rows = _build_online_rows(wide, online, matches, postcode_lookup)
|
||||
print(f" {online_rows.height} online rows built")
|
||||
|
||||
# Add Listing status + new columns to existing wide rows
|
||||
wide = wide.with_columns(
|
||||
pl.lit("Historical sale").alias("Listing status"),
|
||||
pl.lit(None, dtype=pl.Int64).alias("Asking price"),
|
||||
pl.lit(None, dtype=pl.Int64).alias("Asking rent (monthly)"),
|
||||
pl.lit(None, dtype=pl.Int32).alias("Bedrooms"),
|
||||
pl.lit(None, dtype=pl.Int32).alias("Bathrooms"),
|
||||
pl.lit(None, dtype=pl.Datetime("us")).alias("Listing date"),
|
||||
pl.lit(None, dtype=pl.String).alias("Property sub-type"),
|
||||
pl.lit(None, dtype=pl.String).alias("Listing URL"),
|
||||
pl.lit(None, dtype=pl.String).alias("Price qualifier"),
|
||||
)
|
||||
|
||||
# Concat
|
||||
result = pl.concat([wide, online_rows], how="diagonal_relaxed")
|
||||
print(f"Final: {result.height} rows, {result.width} columns")
|
||||
|
||||
# Verify
|
||||
status_counts = (
|
||||
result["Listing status"].value_counts().sort("count", descending=True)
|
||||
)
|
||||
print(f"Listing status distribution:\n{status_counts}")
|
||||
|
||||
result.write_parquet(args.input)
|
||||
size_mb = args.input.stat().st_size / (1024 * 1024)
|
||||
print(f"Wrote {args.input} ({size_mb:.1f} MB)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -30,7 +30,69 @@ def _join_journey_times(
|
|||
return wide.join(journey_times, on="postcode", how="left")
|
||||
|
||||
|
||||
def _build_wide(
|
||||
_AREA_COLUMNS = [
|
||||
"Postcode",
|
||||
"lat",
|
||||
"lon",
|
||||
# Transport
|
||||
"Public transport to Bank (mins)",
|
||||
"Cycling to Bank (mins)",
|
||||
"Public transport to Fitzrovia (mins)",
|
||||
"Cycling to Fitzrovia (mins)",
|
||||
# Deprivation
|
||||
"Income Score (rate)",
|
||||
"Employment Score (rate)",
|
||||
"Education, Skills and Training Score",
|
||||
"Health Deprivation and Disability Score",
|
||||
"Living Environment Score",
|
||||
"Indoors Sub-domain Score",
|
||||
"Outdoors Sub-domain Score",
|
||||
# Ethnicity
|
||||
"% Asian",
|
||||
"% Black",
|
||||
"% Mixed",
|
||||
"% White",
|
||||
"% Other",
|
||||
# Crime
|
||||
"Anti-social behaviour (avg/yr)",
|
||||
"Violence and sexual offences (avg/yr)",
|
||||
"Criminal damage and arson (avg/yr)",
|
||||
"Burglary (avg/yr)",
|
||||
"Vehicle crime (avg/yr)",
|
||||
"Robbery (avg/yr)",
|
||||
"Other theft (avg/yr)",
|
||||
"Shoplifting (avg/yr)",
|
||||
"Drugs (avg/yr)",
|
||||
"Possession of weapons (avg/yr)",
|
||||
"Public order (avg/yr)",
|
||||
"Bicycle theft (avg/yr)",
|
||||
"Theft from the person (avg/yr)",
|
||||
"Other crime (avg/yr)",
|
||||
"Serious crime (avg/yr)",
|
||||
"Minor crime (avg/yr)",
|
||||
# Amenities
|
||||
"Number of restaurants within 2km",
|
||||
"Number of grocery shops and supermarkets within 2km",
|
||||
"Number of parks within 2km",
|
||||
"Number of public transport stations within 2km",
|
||||
# Environment
|
||||
"Noise (dB)",
|
||||
"Max available download speed (Mbps)",
|
||||
# Schools
|
||||
"Good+ primary schools within 5km",
|
||||
"Good+ secondary schools within 5km",
|
||||
# GeoSure
|
||||
"Environmental risk",
|
||||
"Collapsible deposits risk",
|
||||
"Compressible ground risk",
|
||||
"Landslide risk",
|
||||
"Running sand risk",
|
||||
"Shrink-swell risk",
|
||||
"Soluble rocks risk",
|
||||
]
|
||||
|
||||
|
||||
def _build(
|
||||
epc_pp_path: Path,
|
||||
arcgis_path: Path,
|
||||
iod_path: Path,
|
||||
|
|
@ -44,8 +106,11 @@ def _build_wide(
|
|||
broadband_path: Path,
|
||||
geosure_path: Path,
|
||||
rental_prices_path: Path,
|
||||
) -> pl.DataFrame:
|
||||
"""Build the wide dataframe by joining epc_pp with all auxiliary data."""
|
||||
) -> tuple[pl.DataFrame, pl.DataFrame]:
|
||||
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
|
||||
|
||||
Returns (postcode_df, properties_df).
|
||||
"""
|
||||
wide = pl.scan_parquet(epc_pp_path).filter(
|
||||
pl.col("total_floor_area").is_null()
|
||||
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
|
||||
|
|
@ -180,7 +245,7 @@ def _build_wide(
|
|||
.group_by("bb_postcode")
|
||||
.agg(pl.col("max_download_speed").max())
|
||||
)
|
||||
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left").drop("bb_postcode")
|
||||
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
|
||||
|
||||
geosure = pl.scan_parquet(geosure_path)
|
||||
wide = wide.join(geosure, on="postcode", how="left")
|
||||
|
|
@ -280,7 +345,18 @@ def _build_wide(
|
|||
)
|
||||
|
||||
print("Collecting with streaming engine...")
|
||||
return wide.collect(engine="streaming")
|
||||
df = wide.collect(engine="streaming")
|
||||
|
||||
# Split into postcode-level and property-level dataframes
|
||||
area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
|
||||
postcode_df = df.select(area_cols).group_by("Postcode").first()
|
||||
print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
|
||||
|
||||
property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
|
||||
properties_df = df.select(property_cols)
|
||||
print(f"Property rows: {properties_df.height}")
|
||||
|
||||
return postcode_df, properties_df
|
||||
|
||||
|
||||
def main():
|
||||
|
|
@ -356,11 +432,14 @@ def main():
|
|||
help="ONS rental prices by LA and bedroom count parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output parquet file path"
|
||||
"--output-postcodes", type=Path, required=True, help="Output postcode parquet file path"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-properties", type=Path, required=True, help="Output properties parquet file path"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
wide = _build_wide(
|
||||
postcode_df, properties_df = _build(
|
||||
epc_pp_path=args.epc_pp,
|
||||
arcgis_path=args.arcgis,
|
||||
iod_path=args.iod,
|
||||
|
|
@ -376,13 +455,17 @@ def main():
|
|||
rental_prices_path=args.rental_prices,
|
||||
)
|
||||
|
||||
print(f"Columns: {wide.columns}")
|
||||
print(f"Rows: {wide.height}")
|
||||
print(f"\nPostcode columns: {postcode_df.columns}")
|
||||
print(f"Postcode rows: {postcode_df.height}")
|
||||
postcode_df.write_parquet(args.output_postcodes)
|
||||
size_mb = args.output_postcodes.stat().st_size / (1024 * 1024)
|
||||
print(f"Wrote {args.output_postcodes} ({size_mb:.1f} MB)")
|
||||
|
||||
wide.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
|
||||
print(f"Wrote {args.output} ({size_mb:.1f} MB)")
|
||||
print(f"\nProperty columns: {properties_df.columns}")
|
||||
print(f"Property rows: {properties_df.height}")
|
||||
properties_df.write_parquet(args.output_properties)
|
||||
size_mb = args.output_properties.stat().st_size / (1024 * 1024)
|
||||
print(f"Wrote {args.output_properties} ({size_mb:.1f} MB)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -20,7 +20,6 @@ from pipeline.transform.price_estimation.knn import (
|
|||
from pipeline.transform.price_estimation.utils import (
|
||||
CURRENT_YEAR,
|
||||
MAX_LOG_ADJUSTMENT,
|
||||
compute_seasonal_factors,
|
||||
interpolate_log_index,
|
||||
sector_expr,
|
||||
type_group_expr,
|
||||
|
|
@ -91,7 +90,7 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
|
|||
|
||||
|
||||
def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
||||
"""Index-based prediction with interpolation, capping, and seasonal adjustment."""
|
||||
"""Index-based prediction with interpolation and capping."""
|
||||
test = interpolate_log_index(
|
||||
index, test, "sector", "type_group", "input_frac_year", "log_index_input"
|
||||
)
|
||||
|
|
@ -105,7 +104,6 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
|||
* (pl.col("log_index_actual") - pl.col("log_index_input"))
|
||||
.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
|
||||
.exp()
|
||||
* pl.col("_seasonal_adj")
|
||||
)
|
||||
.fill_null(pl.col("input_price").cast(pl.Float64))
|
||||
.alias("predicted"),
|
||||
|
|
@ -175,7 +173,10 @@ def print_metrics_table(metrics_by_stage: dict):
|
|||
def main():
|
||||
parser = argparse.ArgumentParser(description="Backtest price estimation model")
|
||||
parser.add_argument(
|
||||
"--input", type=Path, required=True, help="Path to wide.parquet"
|
||||
"--input", type=Path, required=True, help="Path to properties.parquet"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--postcodes", type=Path, required=True, help="Path to postcode.parquet (for lat/lon)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output backtest_results.parquet"
|
||||
|
|
@ -184,38 +185,28 @@ def main():
|
|||
|
||||
# Build index from pre-test data only (temporal holdout)
|
||||
print(f"Building price index (pairs with year2 < {TEST_YEAR_MIN})...")
|
||||
index = build_index(args.input, max_pair_year=TEST_YEAR_MIN)
|
||||
index = build_index(args.input, max_pair_year=TEST_YEAR_MIN, postcodes_path=args.postcodes)
|
||||
print(
|
||||
f"\nHoldout index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
|
||||
f"{index['type_group'].n_unique()} type groups"
|
||||
)
|
||||
|
||||
# Compute seasonal factors from pre-test data only
|
||||
seasonal = compute_seasonal_factors(args.input, max_sale_year=TEST_YEAR_MIN)
|
||||
months = [
|
||||
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
|
||||
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
|
||||
]
|
||||
print(
|
||||
f"Seasonal factors: {', '.join(f'{m}={f:.3f}' for m, f in zip(months, seasonal))}"
|
||||
)
|
||||
|
||||
test = extract_test_set(args.input)
|
||||
|
||||
# Compute seasonal adjustment for each test pair
|
||||
input_months = test["input_month"].fill_null(6).to_numpy().astype(np.int32)
|
||||
actual_months = test["actual_month"].fill_null(6).to_numpy().astype(np.int32)
|
||||
seasonal_adj = seasonal[actual_months - 1] / seasonal[input_months - 1]
|
||||
test = test.with_columns(
|
||||
pl.Series("_seasonal_adj", seasonal_adj, dtype=pl.Float64),
|
||||
)
|
||||
# Join lat/lon from postcode.parquet (properties.parquet no longer has them)
|
||||
postcodes = pl.read_parquet(args.postcodes).select("Postcode", "lat", "lon")
|
||||
test = test.join(postcodes, on="Postcode", how="left")
|
||||
|
||||
print("\nPredicting with price index...")
|
||||
test = predict(test, index)
|
||||
|
||||
# --- kNN ---
|
||||
ref_fy = float(TEST_YEAR_MIN)
|
||||
trees = build_knn_pool(args.input, index, ref_fy, max_sale_year=TEST_YEAR_MIN)
|
||||
# Pass joined LazyFrame (with lat/lon) instead of raw properties path
|
||||
pool_lf = pl.scan_parquet(args.input).join(
|
||||
postcodes.lazy(), on="Postcode", how="left"
|
||||
)
|
||||
trees = build_knn_pool(pool_lf, index, ref_fy, max_sale_year=TEST_YEAR_MIN)
|
||||
|
||||
# Interpolate log_index at reference year for temporal adjustment
|
||||
test = test.with_columns(pl.lit(ref_fy).alias("_ref_fy"))
|
||||
|
|
|
|||
|
|
@ -1,19 +1,18 @@
|
|||
"""Augment wide.parquet with estimated current prices.
|
||||
"""Augment properties.parquet with estimated current prices.
|
||||
|
||||
For properties with a known prior sale, applies the repeat-sales price index
|
||||
to adjust the last known price to the current date, then blends with kNN
|
||||
estimates from nearby recently-sold properties. Includes:
|
||||
- Capping extreme index adjustments
|
||||
- Seasonal month-of-sale adjustment
|
||||
- kNN spatial blending
|
||||
|
||||
Modifies wide.parquet in-place.
|
||||
Modifies properties.parquet in-place. Temporarily joins postcode.parquet
|
||||
for lat/lon needed by kNN, then drops those columns before writing.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.price_estimation.knn import (
|
||||
|
|
@ -23,9 +22,7 @@ from pipeline.transform.price_estimation.knn import (
|
|||
)
|
||||
from pipeline.transform.price_estimation.utils import (
|
||||
CURRENT_FRAC_YEAR,
|
||||
CURRENT_MONTH,
|
||||
MAX_LOG_ADJUSTMENT,
|
||||
compute_seasonal_factors,
|
||||
interpolate_log_index,
|
||||
sector_expr,
|
||||
type_group_expr,
|
||||
|
|
@ -34,48 +31,39 @@ from pipeline.transform.price_estimation.utils import (
|
|||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Augment wide.parquet with estimated current prices"
|
||||
description="Augment properties.parquet with estimated current prices"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
"--properties",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Path to wide.parquet (modified in-place)",
|
||||
help="Path to properties.parquet (modified in-place)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--postcodes",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Path to postcode.parquet (for lat/lon needed by kNN)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--index", type=Path, required=True, help="Path to price_index.parquet"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
print("Loading wide.parquet...")
|
||||
df = pl.read_parquet(args.input)
|
||||
print("Loading properties.parquet...")
|
||||
df = pl.read_parquet(args.properties)
|
||||
print(f" {len(df):,} rows, {len(df.columns)} columns")
|
||||
|
||||
# Join lat/lon from postcode.parquet for kNN spatial queries
|
||||
postcodes = pl.read_parquet(args.postcodes).select("Postcode", "lat", "lon")
|
||||
df = df.join(postcodes, on="Postcode", how="left")
|
||||
print(f" Joined lat/lon from {len(postcodes):,} postcodes")
|
||||
|
||||
# Drop existing estimated columns if re-running
|
||||
for col in ["Estimated current price", "Est. price per sqm"]:
|
||||
if col in df.columns:
|
||||
df = df.drop(col)
|
||||
|
||||
# Compute seasonal factors
|
||||
seasonal = compute_seasonal_factors(args.input)
|
||||
months = [
|
||||
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
|
||||
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
|
||||
]
|
||||
print(
|
||||
f" Seasonal factors: {', '.join(f'{m}={f:.3f}' for m, f in zip(months, seasonal))}"
|
||||
)
|
||||
|
||||
# Build seasonal adjustment: seasonal[current_month] / seasonal[sale_month]
|
||||
sale_month = (
|
||||
df["Date of last transaction"]
|
||||
.dt.month()
|
||||
.fill_null(6)
|
||||
.to_numpy()
|
||||
.astype(np.int32)
|
||||
)
|
||||
seasonal_adj = seasonal[CURRENT_MONTH - 1] / seasonal[sale_month - 1]
|
||||
|
||||
# Derive helper columns
|
||||
df = df.with_columns(
|
||||
sector_expr().alias("_sector"),
|
||||
|
|
@ -86,7 +74,6 @@ def main():
|
|||
).alias("_sale_frac_year"),
|
||||
type_group_expr().alias("_type_group"),
|
||||
pl.lit(CURRENT_FRAC_YEAR).alias("_current_frac_year"),
|
||||
pl.Series("_seasonal_adj", seasonal_adj, dtype=pl.Float64),
|
||||
)
|
||||
|
||||
index = pl.read_parquet(args.index)
|
||||
|
|
@ -109,7 +96,7 @@ def main():
|
|||
"_log_index_current_interp",
|
||||
)
|
||||
|
||||
# Compute index-adjusted estimate with cap and seasonal adjustment
|
||||
# Compute index-adjusted estimate with cap
|
||||
has_price = (
|
||||
pl.col("Last known price").is_not_null()
|
||||
& pl.col("Postcode").is_not_null()
|
||||
|
|
@ -125,7 +112,6 @@ def main():
|
|||
)
|
||||
.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
|
||||
.exp()
|
||||
* pl.col("_seasonal_adj")
|
||||
)
|
||||
.otherwise(pl.lit(None))
|
||||
.alias("Estimated current price"),
|
||||
|
|
@ -140,7 +126,7 @@ def main():
|
|||
|
||||
# --- kNN blending ---
|
||||
print("\nBuilding kNN estimates...")
|
||||
trees = build_knn_pool(args.input, index, CURRENT_FRAC_YEAR)
|
||||
trees = build_knn_pool(df.lazy(), index, CURRENT_FRAC_YEAR)
|
||||
|
||||
lat = df["lat"].cast(pl.Float64).to_numpy()
|
||||
lon = df["lon"].cast(pl.Float64).to_numpy()
|
||||
|
|
@ -188,13 +174,13 @@ def main():
|
|||
.alias("Est. price per sqm"),
|
||||
)
|
||||
|
||||
# Drop all temporary columns
|
||||
# Drop all temporary columns and joined lat/lon (those belong in postcode.parquet)
|
||||
temp_cols = [c for c in df.columns if c.startswith("_") or c.startswith("log_idx_")]
|
||||
df = df.drop(temp_cols)
|
||||
df = df.drop(temp_cols).drop("lat", "lon")
|
||||
|
||||
df.write_parquet(args.input)
|
||||
size_mb = args.input.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
|
||||
df.write_parquet(args.properties)
|
||||
size_mb = args.properties.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.properties} ({size_mb:.1f} MB)")
|
||||
print(
|
||||
f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -328,14 +328,19 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
|
|||
return filled
|
||||
|
||||
|
||||
def build_index(input_path: Path, max_pair_year: int | None = None) -> pl.DataFrame:
|
||||
def build_index(
|
||||
input_path: Path,
|
||||
max_pair_year: int | None = None,
|
||||
postcodes_path: Path | None = None,
|
||||
) -> pl.DataFrame:
|
||||
"""Build the full price index from raw data.
|
||||
|
||||
If max_pair_year is set, only pairs before that year are used (backtesting holdout).
|
||||
The index is still forward-filled to CURRENT_YEAR.
|
||||
postcodes_path: if provided, lat/lon are read from this file instead of input_path.
|
||||
"""
|
||||
pairs = extract_pairs(input_path, max_year2=max_pair_year)
|
||||
centroids = extract_centroids(input_path)
|
||||
centroids = extract_centroids(postcodes_path or input_path)
|
||||
|
||||
min_year = int(pairs["year1"].min())
|
||||
max_year = CURRENT_YEAR
|
||||
|
|
@ -448,10 +453,12 @@ def main():
|
|||
description="Build improved repeat-sales price index"
|
||||
)
|
||||
parser.add_argument("--input", type=Path, required=True)
|
||||
parser.add_argument("--postcodes", type=Path, required=True,
|
||||
help="Path to postcode.parquet (for lat/lon centroids)")
|
||||
parser.add_argument("--output", type=Path, required=True)
|
||||
args = parser.parse_args()
|
||||
|
||||
result = build_index(args.input)
|
||||
result = build_index(args.input, postcodes_path=args.postcodes)
|
||||
|
||||
result.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ def _scale_coords(lat: np.ndarray, lon: np.ndarray) -> np.ndarray:
|
|||
|
||||
|
||||
def build_knn_pool(
|
||||
input_path: Path,
|
||||
source: Path | pl.LazyFrame,
|
||||
index: pl.DataFrame,
|
||||
ref_frac_year: float,
|
||||
max_sale_year: int | None = None,
|
||||
|
|
@ -42,8 +42,9 @@ def build_knn_pool(
|
|||
Returns dict mapping type_group -> (KDTree over scaled lat/lon, adjusted_psm array).
|
||||
"""
|
||||
print("Building kNN pool...")
|
||||
lf = pl.scan_parquet(source) if isinstance(source, Path) else source
|
||||
query = (
|
||||
pl.scan_parquet(input_path)
|
||||
lf
|
||||
.select(
|
||||
"Postcode",
|
||||
"Property type",
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
"""Shared utilities for price estimation modules."""
|
||||
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
|
@ -9,7 +8,6 @@ import polars as pl
|
|||
CURRENT_YEAR = 2026
|
||||
_today = date.today()
|
||||
CURRENT_FRAC_YEAR = _today.year + (_today.month - 1) / 12
|
||||
CURRENT_MONTH = _today.month
|
||||
|
||||
# Cap on log(index_ratio) to prevent wild estimates from thin sectors
|
||||
MAX_LOG_ADJUSTMENT = 3.0 # ~20x max price change
|
||||
|
|
@ -181,53 +179,3 @@ def join_type_stratified_index(
|
|||
).drop(_typed, _all)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def compute_seasonal_factors(
|
||||
input_path: Path, max_sale_year: int | None = None
|
||||
) -> np.ndarray:
|
||||
"""Compute 12 multiplicative monthly price factors from price-per-sqm.
|
||||
|
||||
Detrends by normalizing median £/sqm within each year, then averages
|
||||
across years. Returns array of 12 factors (index 0 = January).
|
||||
Normalized so mean = 1.0.
|
||||
"""
|
||||
query = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select("Last known price", "Total floor area (sqm)", "Date of last transaction")
|
||||
.filter(
|
||||
pl.col("Last known price").is_not_null(),
|
||||
pl.col("Last known price") > 0,
|
||||
pl.col("Total floor area (sqm)").is_not_null(),
|
||||
pl.col("Total floor area (sqm)") > 0,
|
||||
pl.col("Date of last transaction").is_not_null(),
|
||||
)
|
||||
.with_columns(
|
||||
(
|
||||
pl.col("Last known price").cast(pl.Float64)
|
||||
/ pl.col("Total floor area (sqm)").cast(pl.Float64)
|
||||
).alias("psm"),
|
||||
pl.col("Date of last transaction").dt.month().alias("month"),
|
||||
pl.col("Date of last transaction").dt.year().alias("year"),
|
||||
)
|
||||
)
|
||||
if max_sale_year is not None:
|
||||
query = query.filter(pl.col("year") < max_sale_year)
|
||||
|
||||
monthly = (
|
||||
query.group_by("year", "month")
|
||||
.agg(pl.col("psm").median().alias("median_psm"))
|
||||
.with_columns(
|
||||
pl.col("median_psm").mean().over("year").alias("year_mean"),
|
||||
)
|
||||
.with_columns(
|
||||
(pl.col("median_psm") / pl.col("year_mean")).alias("ratio"),
|
||||
)
|
||||
.group_by("month")
|
||||
.agg(pl.col("ratio").mean().alias("factor"))
|
||||
.sort("month")
|
||||
.collect()
|
||||
)
|
||||
|
||||
factors = monthly["factor"].to_numpy().astype(np.float64)
|
||||
return factors / factors.mean()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue