This commit is contained in:
Andras Schmelczer 2026-05-17 10:16:30 +01:00
parent 47d89f6fad
commit 017902b8e6
82 changed files with 331466 additions and 54841 deletions

View file

@@ -4,17 +4,14 @@ from pathlib import Path
import polars as pl
from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY
from transform import map_property_type, normalize_postcode, normalize_price
from constants import MAX_BEDROOMS
from transform import map_property_type, normalize_postcode
log = logging.getLogger("rightmove")
def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
"""Write properties list to parquet with server-ready column names.
channel: "buy" or "rent"
"""
def write_parquet(properties: list[dict], path: Path) -> None:
"""Write sale properties list to parquet with server-ready column names."""
if not properties:
log.warning("No properties to write to %s", path)
return
@@ -69,7 +66,7 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
listing_dates.append(dt)
except (ValueError, TypeError):
# Try additional date formats (OpenRent: "DD Month, YYYY", "Today")
# Try additional date formats used by scraped listing sources.
parsed = None
stripped = fvd.strip()
lower = stripped.lower()
@@ -93,35 +90,9 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
else:
listing_dates.append(None)
# Derive asking price / asking rent based on channel
# Zero prices indicate parsing failures or POA/auction listings — treat as null
if channel == "buy":
asking_prices = [p["price"] if p["price"] > 0 else None for p in properties]
asking_rents = [None] * len(properties)
listing_statuses = ["For sale"] * len(properties)
else:
asking_prices = [None] * len(properties)
# Normalize to monthly, then apply sanity bounds. Rents outside
# [MIN_RENT_MONTHLY, MAX_RENT_MONTHLY] are almost always total-stay
# pricing (short lets), annual rents mislabelled as monthly, or £0
# placeholders — null them out rather than polluting aggregates.
rent_outliers = 0
asking_rents = []
for p in properties:
monthly = normalize_price(p["price"], p["price_frequency"])
if monthly < MIN_RENT_MONTHLY or monthly > MAX_RENT_MONTHLY:
rent_outliers += 1
asking_rents.append(None)
else:
asking_rents.append(monthly)
if rent_outliers:
log.warning(
"Nulled %d rent outliers outside [£%d, £%d]/month",
rent_outliers,
MIN_RENT_MONTHLY,
MAX_RENT_MONTHLY,
)
listing_statuses = ["For rent"] * len(properties)
asking_prices = [p["price"] if p["price"] > 0 else None for p in properties]
listing_statuses = ["For sale"] * len(properties)
df = pl.DataFrame(
{
@@ -146,7 +117,6 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
"Listing date": listing_dates,
"Listing status": listing_statuses,
"Asking price": asking_prices,
"Asking rent (monthly)": asking_rents,
},
schema={
"Bedrooms": pl.Int32,
@@ -166,18 +136,15 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
"Listing date": pl.Datetime("us"),
"Listing status": pl.Utf8,
"Asking price": pl.Int64,
"Asking rent (monthly)": pl.Int64,
},
)
# Derive asking price per sqm for buy listings
if channel == "buy":
df = df.with_columns(
(pl.col("Asking price") / pl.col("Total floor area (sqm)"))
.round(0)
.cast(pl.Int32, strict=False)
.alias("Asking price per sqm"),
)
df = df.with_columns(
(pl.col("Asking price") / pl.col("Total floor area (sqm)"))
.round(0)
.cast(pl.Int32, strict=False)
.alias("Asking price per sqm"),
)
df.write_parquet(path)
log.info("Wrote %d properties to %s", len(df), path)