Small fixes

This commit is contained in:
Andras Schmelczer 2026-03-28 09:29:56 +00:00
parent d93beb9201
commit 7591e5fc05
12 changed files with 198 additions and 14 deletions

View file

@ -5,7 +5,7 @@ from pathlib import Path
import polars as pl
from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY
from transform import normalize_price
from transform import map_property_type, normalize_price
log = logging.getLogger("rightmove")
@ -43,6 +43,19 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
MAX_BEDROOMS,
)
# Re-derive Property type from Property sub-type using current PROPERTY_TYPE_MAP.
# This retroactively fixes data scraped with older versions of the type map.
remapped = 0
for p in properties:
sub_type = p.get("Property sub-type", "")
if sub_type and sub_type != "Unknown":
new_type = map_property_type(sub_type)
if new_type != p.get("Property type"):
p["Property type"] = new_type
remapped += 1
if remapped:
log.info("Re-mapped %d property types from sub-types", remapped)
# Parse first_visible_date to datetime
listing_dates = []
for p in properties:
@ -56,7 +69,27 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
listing_dates.append(dt)
except (ValueError, TypeError):
listing_dates.append(None)
# Try additional date formats (OpenRent: "DD Month, YYYY", "DD Month YYYY", "Today", "Tomorrow")
parsed = None
stripped = fvd.strip()
lower = stripped.lower()
if lower == "today":
parsed = datetime.now().replace(
hour=0, minute=0, second=0, microsecond=0
)
elif lower == "tomorrow":
from datetime import timedelta
parsed = (
datetime.now() + timedelta(days=1)
).replace(hour=0, minute=0, second=0, microsecond=0)
else:
for fmt in ("%d %B, %Y", "%d %B %Y"):
try:
parsed = datetime.strptime(stripped, fmt)
break
except ValueError:
continue
listing_dates.append(parsed)
else:
listing_dates.append(None)