import logging
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Optional

import polars as pl

from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY
from transform import map_property_type, normalize_price

log = logging.getLogger("rightmove")

# Free-text formats tried when a listing date is not ISO 8601
# (OpenRent style: "DD Month, YYYY").
_FALLBACK_DATE_FORMATS = ("%d %B, %Y", "%d %B %Y")


def _sanitize_room_counts(properties: list[dict]) -> None:
    """Null implausible bedroom/bathroom counts in place and refresh the derived total."""
    # Values above MAX_BEDROOMS are almost certainly prices or other numeric
    # fields mis-parsed as bedrooms.
    # NOTE(review): bad_count is incremented once per offending *value*, so a
    # property with both fields out of range is counted twice even though the
    # log message says "properties" — confirm which is intended.
    bad_count = 0
    for p in properties:
        for key in ("Bedrooms", "Bathrooms"):
            val = p.get(key, 0) or 0
            if val > MAX_BEDROOMS:
                bad_count += 1
                p[key] = None

        # Recompute the derived field after sanitization so it never carries
        # a total built from a value that was just nulled.
        # NOTE(review): the column is named "bedrooms & living rooms" but is
        # computed as bedrooms + bathrooms — confirm this is intentional.
        beds = p.get("Bedrooms")
        baths = p.get("Bathrooms")
        if beds is None or baths is None:
            p["Number of bedrooms & living rooms"] = None
        else:
            p["Number of bedrooms & living rooms"] = beds + baths

    if bad_count:
        log.warning(
            "Sanitized %d properties with bedroom/bathroom counts > %d (set to null)",
            bad_count,
            MAX_BEDROOMS,
        )


def _remap_property_types(properties: list[dict]) -> None:
    """Re-derive "Property type" from "Property sub-type" in place."""
    # Uses the current type map, which retroactively fixes data scraped with
    # older versions of the map.
    remapped = 0
    for p in properties:
        sub_type = p.get("Property sub-type", "")
        if not sub_type or sub_type == "Unknown":
            continue
        new_type = map_property_type(sub_type)
        if new_type != p.get("Property type"):
            p["Property type"] = new_type
            remapped += 1
    if remapped:
        log.info("Re-mapped %d property types from sub-types", remapped)


def _parse_freeform_date(text: str, today: datetime) -> Optional[datetime]:
    """Parse non-ISO date text ("Today", "Tomorrow", "DD Month, YYYY")."""
    lower = text.lower()
    if lower == "today":
        return today
    if lower == "tomorrow":
        return today + timedelta(days=1)
    for fmt in _FALLBACK_DATE_FORMATS:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    return None


def _parse_listing_date(raw: object, today: datetime) -> Optional[datetime]:
    """Convert a raw ``first_visible_date`` value to a naive datetime, or None.

    Timezone-aware values are converted to UTC and stripped of tzinfo for
    consistent storage. Values that are neither ``str`` nor ``datetime``, and
    strings that match no known format, yield None instead of raising.
    """
    if isinstance(raw, datetime):
        dt = raw
    elif isinstance(raw, str):
        text = raw.strip()
        if not text:
            return None
        try:
            dt = datetime.fromisoformat(text.replace("Z", "+00:00"))
        except ValueError:
            return _parse_freeform_date(text, today)
    else:
        return None

    if dt.tzinfo is not None:
        dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
    return dt


def _parse_listing_dates(properties: list[dict]) -> list[Optional[datetime]]:
    """Build the "Listing date" column from each property's ``first_visible_date``."""
    # Resolve "today" once so every relative date in this run agrees, even if
    # the write straddles midnight.
    # NOTE(review): this is the machine's local midnight, whereas ISO dates
    # are stored as UTC — confirm whether relative dates should use UTC too.
    today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

    dates: list[Optional[datetime]] = []
    unparsed = 0
    for p in properties:
        raw = p.get("first_visible_date", "")
        parsed = _parse_listing_date(raw, today) if raw else None
        if raw and parsed is None:
            unparsed += 1
        dates.append(parsed)

    if unparsed:
        log.warning("Could not parse %d listing dates (set to null)", unparsed)
    return dates


def _sale_price(p: dict):
    """Return the asking price, or None when it is missing, null or not positive."""
    # Zero prices indicate parsing failures or POA/auction listings.
    price = p.get("price")
    if price is None or not price > 0:
        return None
    return price


def _monthly_rents(properties: list[dict]) -> list:
    """Normalize rents to monthly and null those outside the sanity bounds."""
    # Rents outside [MIN_RENT_MONTHLY, MAX_RENT_MONTHLY] are almost always
    # total-stay pricing (short lets), annual rents mislabelled as monthly,
    # or £0 placeholders — null them out rather than polluting aggregates.
    rents = []
    rent_outliers = 0
    for p in properties:
        monthly = normalize_price(p["price"], p["price_frequency"])
        if monthly < MIN_RENT_MONTHLY or monthly > MAX_RENT_MONTHLY:
            rent_outliers += 1
            rents.append(None)
        else:
            rents.append(monthly)
    if rent_outliers:
        log.warning(
            "Nulled %d rent outliers outside [£%d, £%d]/month",
            rent_outliers,
            MIN_RENT_MONTHLY,
            MAX_RENT_MONTHLY,
        )
    return rents


def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
    """Write properties list to parquet with server-ready column names.

    The dicts in ``properties`` are sanitized in place: out-of-range
    bedroom/bathroom counts are nulled, the derived room total is recomputed,
    and "Property type" is re-derived from "Property sub-type".

    Args:
        properties: Scraped property dicts. Nothing is written when empty.
        path: Destination parquet file.
        channel: "buy" or "rent". Any value other than "buy" is handled as
            a rental listing.
    """
    if not properties:
        log.warning("No properties to write to %s", path)
        return

    _sanitize_room_counts(properties)
    _remap_property_types(properties)
    listing_dates = _parse_listing_dates(properties)

    # Derive asking price / asking rent based on channel.
    row_count = len(properties)
    if channel == "buy":
        asking_prices = [_sale_price(p) for p in properties]
        asking_rents = [None] * row_count
        listing_statuses = ["For sale"] * row_count
    else:
        asking_prices = [None] * row_count
        asking_rents = _monthly_rents(properties)
        listing_statuses = ["For rent"] * row_count

    # Output columns in file order. Columns absent from `derived` below are
    # copied straight from the property dicts under the same key.
    schema = {
        "Bedrooms": pl.Int32,
        "Bathrooms": pl.Int32,
        "Number of bedrooms & living rooms": pl.Int32,
        "lon": pl.Float64,
        "lat": pl.Float64,
        "Postcode": pl.Utf8,
        "Address per Property Register": pl.Utf8,
        "Leasehold/Freehold": pl.Utf8,
        "Property type": pl.Utf8,
        "Property sub-type": pl.Utf8,
        "Price qualifier": pl.Utf8,
        "Total floor area (sqm)": pl.Float64,
        "Listing URL": pl.Utf8,
        "Listing features": pl.List(pl.Utf8),
        "Listing date": pl.Datetime("us"),
        "Listing status": pl.Utf8,
        "Asking price": pl.Int64,
        "Asking rent (monthly)": pl.Int64,
    }
    derived = {
        "Listing date": listing_dates,
        "Listing status": listing_statuses,
        "Asking price": asking_prices,
        "Asking rent (monthly)": asking_rents,
    }
    data = {
        name: derived[name] if name in derived else [p[name] for p in properties]
        for name in schema
    }
    df = pl.DataFrame(data, schema=schema)

    # Derive asking price per sqm for buy listings. The non-strict cast turns
    # values that do not fit Int32 (e.g. a division by a zero area) into
    # nulls instead of raising.
    if channel == "buy":
        df = df.with_columns(
            (pl.col("Asking price") / pl.col("Total floor area (sqm)"))
            .round(0)
            .cast(pl.Int32, strict=False)
            .alias("Asking price per sqm"),
        )

    df.write_parquet(path)
    log.info("Wrote %d properties to %s", len(df), path)