perfect-postcode/finder/storage.py
2026-05-17 10:16:30 +01:00

150 lines
5.9 KiB
Python

import logging
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Optional

import polars as pl

from constants import MAX_BEDROOMS
from transform import map_property_type, normalize_postcode
# Module-level logger; every message emitted from this file goes to the
# logger named "rightmove".
log = logging.getLogger("rightmove")
# Fallback formats for human-readable listing dates such as
# "17 May, 2026" and "17 May 2026".
_LISTING_DATE_FORMATS = ("%d %B, %Y", "%d %B %Y")


def _sanitize_room_counts(properties: list[dict]) -> int:
    """Null out implausible room counts and recompute the derived total.

    Mutates each property dict in place and returns the number of
    properties that had at least one count nulled.
    """
    affected = 0
    for prop in properties:
        touched = False
        for key in ("Bedrooms", "Bathrooms"):
            # Missing or None counts are treated as 0 for the comparison only.
            if (prop.get(key) or 0) > MAX_BEDROOMS:
                prop[key] = None
                touched = True
        if touched:
            affected += 1

        # The derived column is recomputed for every property (not only the
        # sanitized ones) so it always agrees with the two source columns.
        beds = prop.get("Bedrooms")
        baths = prop.get("Bathrooms")
        if beds is None or baths is None:
            prop["Number of bedrooms & living rooms"] = None
        else:
            prop["Number of bedrooms & living rooms"] = beds + baths
    return affected


def _remap_property_types(properties: list[dict]) -> int:
    """Re-derive "Property type" from "Property sub-type"; return the change count."""
    changed = 0
    for prop in properties:
        sub_type = prop.get("Property sub-type", "")
        if not sub_type or sub_type == "Unknown":
            continue
        mapped = map_property_type(sub_type)
        if mapped != prop.get("Property type"):
            prop["Property type"] = mapped
            changed += 1
    return changed


def _parse_listing_date(raw: object) -> Optional[datetime]:
    """Convert a scraped listing date into a naive datetime, or None.

    Accepts ISO-8601 strings (a trailing "Z" is understood as UTC), the
    words "today" / "tomorrow", and the formats in _LISTING_DATE_FORMATS.
    Timezone-aware values are converted to UTC and made naive. Anything
    else (empty values, unparseable text, unsupported types) yields None
    instead of raising.
    """
    if isinstance(raw, datetime):
        parsed = raw
    elif isinstance(raw, str):
        text = raw.strip()
        if not text:
            return None
        try:
            # fromisoformat() only accepts a literal "Z" suffix on newer
            # Python versions, so spell the offset out explicitly.
            parsed = datetime.fromisoformat(text.replace("Z", "+00:00"))
        except ValueError:
            lowered = text.lower()
            if lowered in ("today", "tomorrow"):
                # NOTE(review): this uses the machine's local date, whereas
                # ISO timestamps are normalised to UTC — confirm which
                # calendar the source site means by "today".
                midnight = datetime.now().replace(
                    hour=0, minute=0, second=0, microsecond=0
                )
                if lowered == "tomorrow":
                    midnight += timedelta(days=1)
                return midnight
            for fmt in _LISTING_DATE_FORMATS:
                try:
                    return datetime.strptime(text, fmt)
                except ValueError:
                    continue
            return None
    else:
        return None

    # Store everything as UTC-naive so the column has one consistent meaning.
    if parsed.tzinfo is not None:
        parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
    return parsed


def _positive_or_none(price: Optional[int]) -> Optional[int]:
    """Return *price* when it is a positive number, otherwise None."""
    if price is None or price <= 0:
        return None
    return price


def write_parquet(properties: list[dict], path: Path) -> None:
    """Write sale properties list to parquet with server-ready column names.

    The property dicts are modified in place before writing: room counts
    above MAX_BEDROOMS are nulled, "Number of bedrooms & living rooms" is
    recomputed, and "Property type" is re-derived from "Property sub-type".

    Args:
        properties: Scraped property records. Nothing is written (only a
            warning is logged) when the list is empty.
        path: Destination parquet file.

    Raises:
        KeyError: If a record lacks one of the required columns (all
            columns except "Bedrooms", "Bathrooms", "price" and
            "first_visible_date", which default to null).
    """
    if not properties:
        log.warning("No properties to write to %s", path)
        return

    # Sanitize bedroom/bathroom counts — values above MAX_BEDROOMS are
    # almost certainly prices or other numeric fields mis-parsed as bedrooms.
    bad_count = _sanitize_room_counts(properties)
    if bad_count:
        log.warning(
            "Sanitized %d properties with bedroom/bathroom counts > %d (set to null)",
            bad_count,
            MAX_BEDROOMS,
        )

    # Re-derive Property type from Property sub-type using the current type
    # map. This retroactively fixes data scraped with older map versions.
    remapped = _remap_property_types(properties)
    if remapped:
        log.info("Re-mapped %d property types from sub-types", remapped)

    listing_dates = [
        _parse_listing_date(p.get("first_visible_date")) for p in properties
    ]

    # Zero prices indicate parsing failures or POA/auction listings — treat
    # them (and missing prices) as null.
    asking_prices = [_positive_or_none(p.get("price")) for p in properties]
    listing_statuses = ["For sale"] * len(properties)

    df = pl.DataFrame(
        {
            # .get(): the sanitization step already tolerates records
            # without these keys, so the frame construction must too.
            "Bedrooms": [p.get("Bedrooms") for p in properties],
            "Bathrooms": [p.get("Bathrooms") for p in properties],
            "Number of bedrooms & living rooms": [
                p["Number of bedrooms & living rooms"] for p in properties
            ],
            "lon": [p["lon"] for p in properties],
            "lat": [p["lat"] for p in properties],
            "Postcode": [normalize_postcode(p["Postcode"]) for p in properties],
            "Address per Property Register": [
                p["Address per Property Register"] for p in properties
            ],
            "Leasehold/Freehold": [p["Leasehold/Freehold"] for p in properties],
            "Property type": [p["Property type"] for p in properties],
            "Property sub-type": [p["Property sub-type"] for p in properties],
            "Price qualifier": [p["Price qualifier"] for p in properties],
            "Total floor area (sqm)": [
                p["Total floor area (sqm)"] for p in properties
            ],
            "Listing URL": [p["Listing URL"] for p in properties],
            "Listing features": [p["Listing features"] for p in properties],
            "Listing date": listing_dates,
            "Listing status": listing_statuses,
            "Asking price": asking_prices,
        },
        schema={
            "Bedrooms": pl.Int32,
            "Bathrooms": pl.Int32,
            "Number of bedrooms & living rooms": pl.Int32,
            "lon": pl.Float64,
            "lat": pl.Float64,
            "Postcode": pl.Utf8,
            "Address per Property Register": pl.Utf8,
            "Leasehold/Freehold": pl.Utf8,
            "Property type": pl.Utf8,
            "Property sub-type": pl.Utf8,
            "Price qualifier": pl.Utf8,
            "Total floor area (sqm)": pl.Float64,
            "Listing URL": pl.Utf8,
            "Listing features": pl.List(pl.Utf8),
            "Listing date": pl.Datetime("us"),
            "Listing status": pl.Utf8,
            "Asking price": pl.Int64,
        },
    )

    # Non-strict cast: results that cannot be represented as Int32 become
    # null rather than raising.
    df = df.with_columns(
        (pl.col("Asking price") / pl.col("Total floor area (sqm)"))
        .round(0)
        .cast(pl.Int32, strict=False)
        .alias("Asking price per sqm"),
    )

    df.write_parquet(path)
    log.info("Wrote %d properties to %s", len(df), path)