# perfect-postcode/pipeline/transform/add_online_listings.py
# 2026-02-15 22:39:54 +00:00
#
# 417 lines
# 14 KiB
# Python

"""Add online buy/rent listings to wide.parquet as new rows.
Matches online listings to existing historical rows by postcode + fuzzy address,
carrying over historical prices and area-level data for matched properties.
Unmatched listings get area data from any same-postcode row in wide.
Modifies wide.parquet in-place, adding:
- A `Listing status` column to all rows ("Historical sale" / "For sale" / "For rent")
- New columns: Asking price, Asking rent (monthly), Bedrooms, Bathrooms,
Listing date, Property sub-type, Listing URL, Price qualifier
"""
import argparse
import re
from concurrent.futures import ProcessPoolExecutor
from datetime import datetime
from os import cpu_count
from pathlib import Path

import polars as pl
from thefuzz import fuzz
from tqdm import tqdm

from pipeline.utils.fuzzy_join import _numbers_compatible
# Punctuation (comma, period, hyphen) that _normalize() replaces with a space
# before addresses are compared.
_NORMALIZE_RE = re.compile(r"[,.\-]")
# Runs of whitespace, collapsed to a single space by _normalize().
_WHITESPACE_RE = re.compile(r"\s+")

# Columns that are property-specific: copied onto a listing row only when the
# listing was matched to a historical row (never from the postcode fallback).
# Names absent from wide.parquet's schema are skipped when copying.
_PROPERTY_COLUMNS = [
    "Last known price",
    "Date of last transaction",
    "historical_prices",
    "renovation_history",
    "Construction age",
    "Is construction date approximate",
    "Current energy rating",
    "Potential energy rating",
    "Address per EPC",
    "Interior height (m)",
    "Number of bedrooms & living rooms",
    "Price per sqm",
    "Estimated current price",
    "Est. price per sqm",
]

# Columns that are area-level: copied from the matched historical row, or,
# when there is no match, from the first wide row sharing the listing's
# postcode. Names absent from wide.parquet's schema are skipped when copying.
_AREA_COLUMNS = [
    "Public transport to Bank (mins)",
    "Cycling to Bank (mins)",
    "Public transport to Fitzrovia (mins)",
    "Cycling to Fitzrovia (mins)",
    "Income Score (rate)",
    "Employment Score (rate)",
    "Education, Skills and Training Score",
    "Health Deprivation and Disability Score",
    "Living Environment Score",
    "Indoors Sub-domain Score",
    "Outdoors Sub-domain Score",
    "% Asian",
    "% Black",
    "% Mixed",
    "% White",
    "% Other",
    "Estimated monthly rent",
    "Criminal damage and arson (avg/yr)",
    "Violence and sexual offences (avg/yr)",
    "Drugs (avg/yr)",
    "Anti-social behaviour (avg/yr)",
    "Public order (avg/yr)",
    "Other crime (avg/yr)",
    "Burglary (avg/yr)",
    "Vehicle crime (avg/yr)",
    "Theft from the person (avg/yr)",
    "Possession of weapons (avg/yr)",
    "Other theft (avg/yr)",
    "Shoplifting (avg/yr)",
    "Bicycle theft (avg/yr)",
    "Robbery (avg/yr)",
    "Serious crime (avg/yr)",
    "Minor crime (avg/yr)",
    "Number of restaurants within 2km",
    "Number of grocery shops and supermarkets within 2km",
    "Number of parks within 2km",
    "Number of public transport stations within 2km",
    "Noise (dB)",
    "Good+ primary schools within 5km",
    "Good+ secondary schools within 5km",
    "Max available download speed (Mbps)",
    "Collapsible deposits risk",
    "Compressible ground risk",
    "Landslide risk",
    "Running sand risk",
    "Shrink-swell risk",
    "Soluble rocks risk",
    "Environmental risk",
]
def _normalize(s: str) -> str:
    """Return *s* upper-cased with punctuation and extra whitespace removed.

    Commas, periods and hyphens become spaces, runs of whitespace collapse
    to a single space, and leading/trailing whitespace is stripped.
    """
    without_punctuation = _NORMALIZE_RE.sub(" ", s.upper())
    single_spaced = _WHITESPACE_RE.sub(" ", without_punctuation)
    return single_spaced.strip()
def _score_bucket(
    args: tuple[list[tuple[int, str]], list[tuple[int, str]]],
) -> list[tuple[int, int, int]]:
    """Score every wide/online address pair in one postcode bucket.

    ``args`` is a single ``(wide_entries, online_entries)`` tuple so the
    function can be handed straight to ``executor.map``. Each entry is
    ``(row_index, normalized_address)``.

    Returns ``(score, online_idx, wide_idx)`` triples, one per pair that
    passes ``_numbers_compatible``; rejected pairs are never scored.
    """
    wide_entries, online_entries = args
    return [
        (fuzz.token_sort_ratio(w_addr, o_addr), o_idx, w_idx)
        for w_idx, w_addr in wide_entries
        for o_idx, o_addr in online_entries
        if _numbers_compatible(w_addr, o_addr)
    ]
def _load_online(buy_path: Path, rent_path: Path) -> pl.DataFrame:
    """Read the buy and rent listing parquets into one tagged DataFrame.

    Each source is tagged with a ``_channel`` column ("For sale" /
    "For rent") and the two are concatenated. Two columns are then added:

    - ``Asking price``: the raw ``price`` for sale listings, null otherwise.
    - ``Asking rent (monthly)``: for rent listings, ``price`` converted to a
      monthly figure according to ``price_frequency`` (weekly, yearly, daily
      or quarterly; any other value is taken as already monthly), rounded
      and cast to Int64; null for sale listings.
    """
    tagged_sources = [
        pl.scan_parquet(path).with_columns(pl.lit(channel).alias("_channel"))
        for path, channel in ((buy_path, "For sale"), (rent_path, "For rent"))
    ]
    listings = pl.concat(tagged_sources).collect()

    frequency = pl.col("price_frequency")
    amount = pl.col("price").cast(pl.Float64)

    # Convert whatever period the rent is quoted in to a monthly amount.
    rent_per_month = (
        pl.when(frequency == "weekly")
        .then(amount * 52.0 / 12.0)
        .when(frequency == "yearly")
        .then(amount / 12.0)
        .when(frequency == "daily")
        .then(amount * 365.25 / 12.0)
        .when(frequency == "quarterly")
        .then(amount / 3.0)
        .otherwise(amount)  # monthly, not specified
        .round(0)
        .cast(pl.Int64)
    )

    is_sale = pl.col("_channel") == "For sale"
    is_rent = pl.col("_channel") == "For rent"
    asking_price = pl.when(is_sale).then(pl.col("price")).otherwise(None)
    asking_rent = pl.when(is_rent).then(rent_per_month).otherwise(None)

    return listings.with_columns(
        asking_price.alias("Asking price"),
        asking_rent.alias("Asking rent (monthly)"),
    )
def _bucket_by_postcode(
    postcodes: pl.Series,
    addresses: pl.Series,
) -> dict[str, list[tuple[int, str]]]:
    """Group rows by postcode as ``(row index, normalized address)`` pairs.

    Rows with a null postcode or a null address are skipped. Postcodes are
    stripped and upper-cased so both data sources end up with the same keys.
    """
    buckets: dict[str, list[tuple[int, str]]] = {}
    # Convert each column to a list once instead of indexing the Series
    # row by row from Python.
    rows = zip(postcodes.to_list(), addresses.to_list())
    for row_idx, (postcode, address) in enumerate(rows):
        if postcode is None or address is None:
            continue
        key = postcode.strip().upper()
        buckets.setdefault(key, []).append((row_idx, _normalize(address)))
    return buckets


def _match_online_to_wide(
    wide: pl.DataFrame,
    online: pl.DataFrame,
) -> dict[int, int]:
    """Match online listings to wide rows by postcode + fuzzy address.

    Candidate pairs are only formed between rows sharing a postcode. Pairs
    are scored in worker processes, then assigned greedily from the highest
    score down so that each online listing and each wide row is used at most
    once.

    Returns dict mapping online row index → wide row index.
    """
    wide_by_postcode = _bucket_by_postcode(
        wide["Postcode"], wide["Address per Property Register"]
    )
    online_by_postcode = _bucket_by_postcode(online["postcode"], online["address"])

    # Build tasks: only postcodes present in both
    tasks = [
        (wide_by_postcode[pc], online_entries)
        for pc, online_entries in online_by_postcode.items()
        if pc in wide_by_postcode
    ]

    # Score in parallel
    all_pairs: list[tuple[int, int, int]] = []
    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
        for pairs in tqdm(
            executor.map(_score_bucket, tasks, chunksize=64),
            total=len(tasks),
            desc="Matching online listings",
        ):
            all_pairs.extend(pairs)
    # Free the per-postcode buckets before sorting the (potentially large)
    # list of scored pairs.
    del tasks, wide_by_postcode, online_by_postcode

    # Greedy assignment: best score first, one-to-one. Ties on score go to
    # the lower online index (hence the negated index under reverse=True).
    # NOTE(review): no minimum score is applied here, so any pair accepted by
    # _numbers_compatible can be matched — confirm that filter is strict
    # enough on its own.
    all_pairs.sort(key=lambda t: (t[0], -t[1]), reverse=True)
    matches: dict[int, int] = {}  # online_idx → wide_idx
    matched_wide: set[int] = set()
    for _score, online_idx, wide_idx in all_pairs:
        if online_idx in matches or wide_idx in matched_wide:
            continue
        matches[online_idx] = wide_idx
        matched_wide.add(wide_idx)
    return matches
def _build_postcode_area_lookup(wide: pl.DataFrame) -> dict[str, int]:
    """Build postcode → first row index for area data fallback.

    Postcodes are stripped and upper-cased before being used as keys; rows
    with a null postcode are ignored. When several rows share a postcode the
    lowest row index is kept.
    """
    lookup: dict[str, int] = {}
    # Iterate a plain list rather than indexing the Series once per row.
    for row_idx, postcode in enumerate(wide["Postcode"].to_list()):
        if postcode is not None:
            # setdefault keeps the first index seen for each postcode.
            lookup.setdefault(postcode.strip().upper(), row_idx)
    return lookup
def _parse_listing_date(value: object) -> "datetime | None":
    """Convert a listing's first-visible timestamp to a naive datetime.

    Accepts ISO-8601 strings (a "Z" suffix is rewritten to "+00:00" before
    parsing) and datetime objects. Timezone info is dropped without shifting
    the clock time. Returns None for None and for values that fail to parse.
    """
    if value is None:
        return None
    if isinstance(value, datetime):
        # The column may already be stored as a datetime; the string path
        # below would raise TypeError on it and silently lose the date.
        return value.replace(tzinfo=None)
    try:
        parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
    except (ValueError, TypeError):
        # Best effort: an unparseable date leaves the listing date null.
        return None
    return parsed.replace(tzinfo=None)


def _build_online_rows(
    wide: pl.DataFrame,
    online: pl.DataFrame,
    matches: dict[int, int],
    postcode_lookup: dict[str, int],
) -> pl.DataFrame:
    """Build a DataFrame of online listing rows with all wide.parquet columns.

    Args:
        wide: Existing historical rows; supplies the output schema and the
            values carried over to listings.
        online: Listings as returned by ``_load_online``.
        matches: Online row index → matched wide row index.
        postcode_lookup: Normalized postcode → wide row index, used as the
            area-data source for listings without a match.

    Returns:
        One row per online listing: every wide column (null unless filled
        from the listing or a source row) followed by the nine listing
        columns.

    Raises:
        KeyError: If a directly mapped wide column or a required online
            column is missing.
    """
    wide_schema = wide.schema
    n = online.height

    def online_values(name: str) -> list:
        # One conversion per column instead of one frame lookup per cell.
        return online[name].to_list()

    # Every existing wide column starts out entirely null.
    columns: dict[str, list] = {col: [None] * n for col in wide_schema}

    # Direct mappings from online listing: wide column ← online column.
    direct_mappings = (
        ("Address per Property Register", "address"),
        ("Postcode", "postcode"),
        ("lat", "latitude"),
        ("lon", "longitude"),
        ("Property type", "property_type"),
        # (sic) presumably the column's actual name in wide.parquet —
        # verify before "fixing" the spelling.
        ("Leashold/Freehold", "tenure"),
        ("Total floor area (sqm)", "floorspace_sqm"),
    )
    for wide_col, online_col in direct_mappings:
        # Index into the existing entry (rather than creating a new key) so
        # a wide column missing from the schema still raises KeyError.
        columns[wide_col][:] = online_values(online_col)

    # New listing columns, in output order: (name, dtype, values).
    listing_dates = [
        _parse_listing_date(raw) for raw in online_values("first_visible_date")
    ]
    new_columns = (
        ("Listing status", pl.String, online_values("_channel")),
        ("Asking price", pl.Int64, online_values("Asking price")),
        (
            "Asking rent (monthly)",
            pl.Int64,
            online_values("Asking rent (monthly)"),
        ),
        ("Bedrooms", pl.Int32, online_values("bedrooms")),
        ("Bathrooms", pl.Int32, online_values("bathrooms")),
        ("Listing date", pl.Datetime("us"), listing_dates),
        ("Property sub-type", pl.String, online_values("property_sub_type")),
        ("Listing URL", pl.String, online_values("url")),
        ("Price qualifier", pl.String, online_values("price_qualifier")),
    )

    # Resolve the carried-over wide columns once, outside the row loop.
    property_sources = [
        (col, wide[col]) for col in _PROPERTY_COLUMNS if col in wide_schema
    ]
    area_sources = [(col, wide[col]) for col in _AREA_COLUMNS if col in wide_schema]

    for i, postcode in enumerate(columns["Postcode"]):
        # Determine source row for carried data
        matched_wide_idx = matches.get(i)
        area_source_idx = matched_wide_idx
        if area_source_idx is None and postcode:
            # No address match: fall back to any row in the same postcode.
            area_source_idx = postcode_lookup.get(postcode.strip().upper())

        # Copy property-specific columns from matched row only
        if matched_wide_idx is not None:
            for col, series in property_sources:
                columns[col][i] = series[matched_wide_idx]

        # Copy area columns from matched row or same-postcode fallback
        if area_source_idx is not None:
            for col, series in area_sources:
                columns[col][i] = series[area_source_idx]

    # Build DataFrame with correct types: wide columns first, then the new
    # listing columns.
    series_list = [
        pl.Series(col_name, columns[col_name], dtype=dtype)
        for col_name, dtype in wide_schema.items()
    ]
    series_list.extend(
        pl.Series(col_name, values, dtype=dtype)
        for col_name, dtype, values in new_columns
    )
    return pl.DataFrame(series_list)
def main():
    """Command-line entry point: append online listings to wide.parquet.

    Reads the wide parquet and the buy/rent listing parquets, matches
    listings to historical rows, builds one new row per listing, tags the
    existing rows as "Historical sale", concatenates both sets and writes the
    result back over ``--input``.

    Raises:
        SystemExit: If ``--input`` already contains a ``Listing status``
            column, i.e. listings appear to have been added already.
    """
    parser = argparse.ArgumentParser(
        description="Add online buy/rent listings to wide.parquet"
    )
    parser.add_argument(
        "--input",
        type=Path,
        required=True,
        help="wide.parquet path (modified in-place)",
    )
    parser.add_argument(
        "--buy", type=Path, required=True, help="rightmove_buy.parquet path"
    )
    parser.add_argument(
        "--rent", type=Path, required=True, help="rightmove_rent.parquet path"
    )
    args = parser.parse_args()

    print("Loading wide.parquet...")
    wide = pl.read_parquet(args.input)
    print(f" {wide.height} rows, {wide.width} columns")

    # The file is modified in place, so a second run would see the listing
    # columns already present and fail on duplicate column names only after
    # the expensive matching step. Stop early with a clear message instead.
    if "Listing status" in wide.columns:
        raise SystemExit(
            f"{args.input} already has a 'Listing status' column; "
            "regenerate wide.parquet before adding online listings again."
        )

    print("Loading online listings...")
    online = _load_online(args.buy, args.rent)
    buy_count = online.filter(pl.col("_channel") == "For sale").height
    rent_count = online.filter(pl.col("_channel") == "For rent").height
    print(
        f" {online.height} online listings ({buy_count} buy, {rent_count} rent)"
    )

    print("Matching online listings to historical rows...")
    matches = _match_online_to_wide(wide, online)
    print(f" {len(matches)} online listings matched to historical rows")

    print("Building postcode area lookup...")
    postcode_lookup = _build_postcode_area_lookup(wide)

    print("Building online listing rows...")
    online_rows = _build_online_rows(wide, online, matches, postcode_lookup)
    print(f" {online_rows.height} online rows built")

    # Add Listing status + new columns to existing wide rows
    wide = wide.with_columns(
        pl.lit("Historical sale").alias("Listing status"),
        pl.lit(None, dtype=pl.Int64).alias("Asking price"),
        pl.lit(None, dtype=pl.Int64).alias("Asking rent (monthly)"),
        pl.lit(None, dtype=pl.Int32).alias("Bedrooms"),
        pl.lit(None, dtype=pl.Int32).alias("Bathrooms"),
        pl.lit(None, dtype=pl.Datetime("us")).alias("Listing date"),
        pl.lit(None, dtype=pl.String).alias("Property sub-type"),
        pl.lit(None, dtype=pl.String).alias("Listing URL"),
        pl.lit(None, dtype=pl.String).alias("Price qualifier"),
    )

    # Concat
    result = pl.concat([wide, online_rows], how="diagonal_relaxed")
    print(f"Final: {result.height} rows, {result.width} columns")

    # Verify
    status_counts = (
        result["Listing status"].value_counts().sort("count", descending=True)
    )
    print(f"Listing status distribution:\n{status_counts}")

    # Write to a sibling temp file and rename it over the input, so a failed
    # or interrupted write cannot leave a truncated wide.parquet behind.
    tmp_path = args.input.with_name(args.input.name + ".tmp")
    try:
        result.write_parquet(tmp_path)
    except BaseException:
        tmp_path.unlink(missing_ok=True)
        raise
    tmp_path.replace(args.input)

    size_mb = args.input.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.input} ({size_mb:.1f} MB)")


if __name__ == "__main__":
    main()