finder improvements

This commit is contained in:
Andras Schmelczer 2026-03-25 08:06:05 +00:00
parent 30055ab870
commit 3a3e249bdd
6 changed files with 225 additions and 39 deletions

View file

@@ -4,6 +4,7 @@ from pathlib import Path
import polars as pl
from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY
from transform import normalize_price
log = logging.getLogger("rightmove")
@@ -18,6 +19,30 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
log.warning("No properties to write to %s", path)
return
# Sanitize bedroom/bathroom counts — values above MAX_BEDROOMS are
# almost certainly prices or other numeric fields mis-parsed as bedrooms.
bad_count = 0
for p in properties:
for key in ("Bedrooms", "Bathrooms"):
val = p.get(key, 0) or 0
if val > MAX_BEDROOMS:
bad_count += 1
p[key] = None
# Recompute derived field after sanitization
beds = p.get("Bedrooms")
baths = p.get("Bathrooms")
if beds is None or baths is None:
p["Number of bedrooms & living rooms"] = None
else:
p["Number of bedrooms & living rooms"] = beds + baths
if bad_count:
log.warning(
"Sanitized %d properties with bedroom/bathroom counts > %d (set to null)",
bad_count,
MAX_BEDROOMS,
)
# Parse first_visible_date to datetime
listing_dates = []
for p in properties:
@@ -36,15 +61,33 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
listing_dates.append(None)
# Derive asking price / asking rent based on channel
# Zero prices indicate parsing failures or POA/auction listings — treat as null
if channel == "buy":
asking_prices = [p["price"] for p in properties]
asking_prices = [p["price"] if p["price"] > 0 else None for p in properties]
asking_rents = [None] * len(properties)
listing_statuses = ["For sale"] * len(properties)
else:
asking_prices = [None] * len(properties)
asking_rents = [
normalize_price(p["price"], p["price_frequency"]) for p in properties
]
# Normalize to monthly, then apply sanity bounds. Rents outside
# [MIN_RENT_MONTHLY, MAX_RENT_MONTHLY] are almost always total-stay
# pricing (short lets), annual rents mislabelled as monthly, or £0
# placeholders — null them out rather than polluting aggregates.
rent_outliers = 0
asking_rents = []
for p in properties:
monthly = normalize_price(p["price"], p["price_frequency"])
if monthly < MIN_RENT_MONTHLY or monthly > MAX_RENT_MONTHLY:
rent_outliers += 1
asking_rents.append(None)
else:
asking_rents.append(monthly)
if rent_outliers:
log.warning(
"Nulled %d rent outliers outside [£%d, £%d]/month",
rent_outliers,
MIN_RENT_MONTHLY,
MAX_RENT_MONTHLY,
)
listing_statuses = ["For rent"] * len(properties)
df = pl.DataFrame(