Add OpenRent

This commit is contained in:
Andras Schmelczer 2026-03-12 22:11:29 +00:00
parent 7e92bf112e
commit eae78df3ca
9 changed files with 1178 additions and 34 deletions

View file

@ -6,7 +6,7 @@ from dataclasses import dataclass, field
import polars as pl
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SCRAPE_HOMECOUK, SCRAPE_RIGHTMOVE, SEED
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SCRAPE_HOMECOUK, SCRAPE_OPENRENT, SCRAPE_RIGHTMOVE, SEED
from homecouk import CookiesExpiredError
from homecouk import load_cookies as load_homecouk_cookies
from homecouk import make_client as make_homecouk_client
@ -16,6 +16,7 @@ from metrics import (
cookie_refreshes_total,
cross_source_dedup_total,
homecouk_enabled,
openrent_enabled,
scrape_elapsed_seconds,
scrape_errors_total,
scrape_outcodes_done,
@ -23,6 +24,10 @@ from metrics import (
scrape_properties_total,
scrape_state,
)
from openrent import WafChallengeError
from openrent import load_cookies as load_openrent_cookies
from openrent import make_client as make_openrent_client
from openrent import search_outcode as openrent_search_outcode
from rightmove import resolve_outcode_id, search_outcode
from spatial import PostcodeSpatialIndex
from storage import write_parquet
@ -42,6 +47,7 @@ class ScrapeStatus:
# Per-source counts for current channel
rm_properties: int = 0
hk_properties: int = 0
or_properties: int = 0
errors: list[str] = field(default_factory=list)
started_at: float = 0.0
finished_at: float = 0.0
@ -64,6 +70,7 @@ def _sync_gauges() -> None:
ch = "buy" if status.channel == "BUY" else "rent"
scrape_properties_total.labels(channel=ch, source="rightmove").set(status.rm_properties)
scrape_properties_total.labels(channel=ch, source="homecouk").set(status.hk_properties)
scrape_properties_total.labels(channel=ch, source="openrent").set(status.or_properties)
if status.started_at:
end = status.finished_at if status.finished_at else time.time()
scrape_elapsed_seconds.set(end - status.started_at)
@ -102,6 +109,23 @@ def build_postcode_index() -> PostcodeSpatialIndex:
)
def build_postcode_coords() -> dict[str, tuple[float, float]]:
    """Return a postcode → (lat, lng) mapping for England postcodes.

    Reads the ``pcd``, ``ctry``, ``lat`` and ``long`` columns from the arcgis
    parquet file at ``ARCGIS_PATH``, keeps only rows whose ``ctry`` equals
    ``E92000001`` and discards rows where either coordinate is null.

    The OpenRent scraper uses this lookup to resolve coordinates from
    postcodes.
    """
    log.info("Building postcode coords lookup from %s", ARCGIS_PATH)

    frame = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
    is_england = pl.col("ctry") == "E92000001"
    england = frame.filter(is_england).drop_nulls(subset=["lat", "long"])

    postcodes = england.get_column("pcd").to_list()
    lats = england.get_column("lat").to_list()
    lngs = england.get_column("long").to_list()

    # Pair each postcode with its (lat, lng) tuple; if a postcode repeats,
    # the last occurrence wins, exactly as with per-row assignment.
    coords: dict[str, tuple[float, float]] = dict(zip(postcodes, zip(lats, lngs)))

    log.info("Postcode coords lookup: %d postcodes", len(coords))
    return coords
def _dedup_key(p: dict) -> tuple:
"""Composite key for cross-source deduplication: (postcode, bedrooms, price).
Two listings on different portals for the same physical property will share
@ -109,9 +133,13 @@ def _dedup_key(p: dict) -> tuple:
return (p.get("Postcode", ""), p.get("Bedrooms", 0), p.get("price", 0))
def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
def run_scrape(
outcodes: list[str],
pc_index: PostcodeSpatialIndex,
pc_coords: dict[str, tuple[float, float]] | None = None,
) -> None:
"""Main scrape loop — runs in background thread.
Scrapes Rightmove and (if configured) home.co.uk, merging into one dataset."""
Scrapes Rightmove, home.co.uk, and OpenRent, merging into one dataset."""
global status
with status_lock:
status.state = "running"
@ -126,8 +154,8 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
random.seed(SEED)
random.shuffle(shuffled)
if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK:
log.warning("Both SCRAPE_RIGHTMOVE and SCRAPE_HOMECOUK are disabled — nothing to do")
if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT:
log.warning("All scrapers disabled — nothing to do")
with status_lock:
status.state = "done"
status.finished_at = time.time()
@ -154,6 +182,26 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
log.info("home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)")
homecouk_enabled.set(0)
# OpenRent: must be enabled via SCRAPE_OPENRENT + cookies available
or_client = None
or_failed = False
if not SCRAPE_OPENRENT:
log.info("OpenRent scraping DISABLED (SCRAPE_OPENRENT=false)")
openrent_enabled.set(0)
else:
or_result = load_openrent_cookies()
or_client = make_openrent_client(*or_result) if or_result else None
if or_client:
log.info("OpenRent scraping ENABLED")
openrent_enabled.set(1)
else:
log.info("OpenRent scraping DISABLED (need FlareSolverr or OPENRENT_WAF_TOKEN)")
openrent_enabled.set(0)
# Build postcode coords if OpenRent is active and caller didn't provide them
if or_client and pc_coords is None:
pc_coords = build_postcode_coords()
try:
for channel_cfg in CHANNELS:
channel_name = channel_cfg["channel"]
@ -163,6 +211,8 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
rm_count = 0 # Rightmove properties this channel
hk_count = 0 # home.co.uk properties this channel
hk_dedup_count = 0 # home.co.uk skipped as cross-source duplicates
or_count = 0 # OpenRent properties this channel
or_dedup_count = 0 # OpenRent skipped as cross-source duplicates
with status_lock:
status.channel = channel_name
@ -170,6 +220,7 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
status.outcodes_total = len(shuffled)
status.rm_properties = 0
status.hk_properties = 0
status.or_properties = 0
log.info("=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled))
@ -245,6 +296,47 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
with status_lock:
status.errors.append(msg)
# --- OpenRent (RENT channel only) ---
if or_client and not or_failed and channel_name == "RENT":
try:
or_props = openrent_search_outcode(
or_client, outcode, pc_index, pc_coords,
)
for p in or_props:
pid = p["id"]
key = _dedup_key(p)
if pid in all_properties or key in seen_dedup_keys:
or_dedup_count += 1
cross_source_dedup_total.labels(channel="rent").inc()
continue
all_properties[pid] = p
seen_dedup_keys.add(key)
or_count += 1
if or_props:
log.info("OpenRent %s: +%d properties", outcode, len(or_props))
except WafChallengeError:
log.warning("OpenRent WAF cookies expired — attempting refresh via FlareSolverr")
or_client.close()
or_result = load_openrent_cookies()
if or_result:
or_client = make_openrent_client(*or_result)
log.info("OpenRent cookies refreshed, continuing")
cookie_refreshes_total.labels(result="success").inc()
else:
log.warning("Cookie refresh failed, disabling OpenRent for rest of scrape")
or_client = None
or_failed = True
openrent_enabled.set(0)
cookie_refreshes_total.labels(result="failure").inc()
with status_lock:
status.errors.append("OpenRent WAF cookies expired and refresh failed")
except Exception as e:
msg = f"Error scraping OpenRent {outcode}/{channel_name}: {e}"
log.error(msg)
scrape_errors_total.labels(source="openrent").inc()
with status_lock:
status.errors.append(msg)
with status_lock:
if channel_name == "BUY":
status.properties_buy = len(all_properties)
@ -252,10 +344,11 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
status.properties_rent = len(all_properties)
status.rm_properties = rm_count
status.hk_properties = hk_count
status.or_properties = or_count
_sync_gauges()
log.info("Outcode %s: total %d (rm: %d, hk: %d)",
outcode, len(all_properties), rm_count, hk_count)
log.info("Outcode %s: total %d (rm: %d, hk: %d, or: %d)",
outcode, len(all_properties), rm_count, hk_count, or_count)
if i < len(shuffled) - 1:
time.sleep(DELAY_BETWEEN_OUTCODES)
@ -273,8 +366,11 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
status.outcodes_done = len(shuffled)
_sync_gauges()
log.info("=== %s channel complete: %d unique (rm: %d, hk: %d, cross-dedup: %d) ===",
channel_name, len(deduped), rm_count, hk_count, hk_dedup_count)
log.info(
"=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, cross-dedup: %d) ===",
channel_name, len(deduped), rm_count, hk_count, or_count,
hk_dedup_count + or_dedup_count,
)
with status_lock:
status.state = "done"
@ -296,3 +392,5 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
client.close()
if hk_client:
hk_client.close()
if or_client:
or_client.close()