import logging
import random
import threading
import time
from dataclasses import dataclass, field

import httpx
import polars as pl

from constants import (
    ARCGIS_PATH,
    CHANNELS,
    DATA_DIR,
    DELAY_BETWEEN_OUTCODES,
    RELOAD_URL,
    SCRAPE_HOMECOUK,
    SCRAPE_OPENRENT,
    SCRAPE_RIGHTMOVE,
    SEED,
)
from homecouk import CookiesExpiredError
from homecouk import load_cookies as load_homecouk_cookies
from homecouk import make_client as make_homecouk_client
from homecouk import search_outcode as homecouk_search_outcode
from http_client import make_client
from metrics import (
    cookie_refreshes_total,
    cross_source_dedup_total,
    homecouk_enabled,
    openrent_enabled,
    scrape_elapsed_seconds,
    scrape_errors_total,
    scrape_outcodes_done,
    scrape_outcodes_total,
    scrape_properties_total,
    scrape_state,
)
from openrent import WafChallengeError
from openrent import load_cookies as load_openrent_cookies
from openrent import make_client as make_openrent_client
from openrent import search_outcode as openrent_search_outcode
from rightmove import resolve_outcode_id, search_outcode
from spatial import PostcodeSpatialIndex
from storage import write_parquet

log = logging.getLogger("rightmove")


@dataclass
class ScrapeStatus:
    """Mutable snapshot of scrape progress, updated by run_scrape()."""

    state: str = "idle"  # idle | running | done | error
    channel: str = ""
    outcode: str = ""
    outcodes_done: int = 0
    outcodes_total: int = 0
    properties_buy: int = 0
    properties_rent: int = 0
    # Per-source counts for current channel
    rm_properties: int = 0
    hk_properties: int = 0
    or_properties: int = 0
    errors: list[str] = field(default_factory=list)
    started_at: float = 0.0
    finished_at: float = 0.0


# Module-level status shared with whoever reports progress; guard every
# access with status_lock.
status = ScrapeStatus()
status_lock = threading.Lock()


def _sync_gauges() -> None:
    """Push current ScrapeStatus values into Prometheus gauges.

    Must hold status_lock.
    """
    # One gauge per state label; exactly the current state is set to 1.
    for state in ("idle", "running", "done", "error"):
        scrape_state.labels(state=state).set(1 if status.state == state else 0)

    scrape_outcodes_done.set(status.outcodes_done)
    scrape_outcodes_total.set(status.outcodes_total)

    # Total properties (all sources combined)
    scrape_properties_total.labels(channel="buy", source="total").set(
        status.properties_buy
    )
    scrape_properties_total.labels(channel="rent", source="total").set(
        status.properties_rent
    )

    # Per-source breakdown for current channel
    ch = "buy" if status.channel == "BUY" else "rent"
    scrape_properties_total.labels(channel=ch, source="rightmove").set(
        status.rm_properties
    )
    scrape_properties_total.labels(channel=ch, source="homecouk").set(
        status.hk_properties
    )
    scrape_properties_total.labels(channel=ch, source="openrent").set(
        status.or_properties
    )

    if status.started_at:
        # While still running (finished_at == 0) measure against "now".
        end = status.finished_at if status.finished_at else time.time()
        scrape_elapsed_seconds.set(end - status.started_at)
    else:
        scrape_elapsed_seconds.set(0)


def load_outcodes() -> list[str]:
    """Load England-only outcodes from arcgis parquet.

    Returns the sorted, de-duplicated outcodes extracted from the ``pcd``
    column of rows whose ``ctry`` is ``E92000001``.
    """
    log.info("Loading outcodes from %s", ARCGIS_PATH)
    df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
    england = df.filter(pl.col("ctry") == "E92000001")
    log.info("England postcodes: %d", len(england))

    outcodes = (
        england.select(
            pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode")
        )
        .drop_nulls()
        .get_column("outcode")
        .unique()
        .sort()
        .to_list()
    )
    log.info("Unique England outcodes: %d", len(outcodes))
    return outcodes


def build_postcode_index() -> PostcodeSpatialIndex:
    """Build spatial index from arcgis England postcodes.

    Rows with a null ``lat`` or ``long`` are dropped before indexing.
    """
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)
    df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
    england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(
        subset=["lat", "long"]
    )
    return PostcodeSpatialIndex(
        england.get_column("lat").to_list(),
        england.get_column("long").to_list(),
        england.get_column("pcd").to_list(),
    )


def build_postcode_coords() -> dict[str, tuple[float, float]]:
    """Build postcode → (lat, lng) lookup from arcgis England postcodes.

    Used by OpenRent scraper to resolve coordinates from postcodes.
    Rows with a null ``lat`` or ``long`` are dropped; if a postcode appears
    more than once, the last occurrence wins.
    """
    log.info("Building postcode coords lookup from %s", ARCGIS_PATH)
    df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
    england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(
        subset=["lat", "long"]
    )
    coords: dict[str, tuple[float, float]] = dict(
        zip(
            england.get_column("pcd").to_list(),
            zip(
                england.get_column("lat").to_list(),
                england.get_column("long").to_list(),
            ),
        )
    )
    log.info("Postcode coords lookup: %d postcodes", len(coords))
    return coords


def _fmt_elapsed(seconds: float) -> str:
    """Format seconds as e.g. '2h13m' or '5m32s'.

    Fractional seconds are truncated; the hour form is used once at least
    one full hour has elapsed.
    """
    h, rem = divmod(int(seconds), 3600)
    m, s = divmod(rem, 60)
    if h:
        return f"{h}h{m:02d}m"
    return f"{m}m{s:02d}s"


def _dedup_key(p: dict) -> tuple:
    """Composite key for cross-source deduplication: (postcode, bedrooms, price).

    Two listings on different portals for the same physical property will
    share these attributes even though their IDs differ.

    Missing fields fall back to ``""`` / ``0``.
    NOTE(review): listings that lack a postcode therefore collide on
    (beds, price) alone — confirm this is acceptable for the data sources.
    """
    return (p.get("Postcode", ""), p.get("Bedrooms", 0), p.get("price", 0))


def run_scrape(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]] | None = None,
) -> None:
    """Main scrape loop — runs in background thread.

    Scrapes Rightmove, home.co.uk, and OpenRent, merging into one dataset.

    For every channel in CHANNELS, each outcode is queried against every
    enabled source, listings are de-duplicated by id and by ``_dedup_key``,
    and the result is written to ``online_listings_{buy|rent}.parquet`` in
    DATA_DIR. Progress is mirrored into ``status`` and the Prometheus gauges.

    Args:
        outcodes: Outcodes to scrape; processed in a SEED-determined order.
        pc_index: Spatial index passed through to the source scrapers.
        pc_coords: Optional postcode → (lat, lng) lookup for OpenRent; built
            on demand when OpenRent is active and this is None.

    Per-outcode errors are logged and recorded in ``status.errors``; any other
    failure (including client setup) sets ``status.state`` to ``"error"``.
    Nothing is raised to the caller for ``Exception`` subclasses.
    """
    with status_lock:
        status.state = "running"
        status.started_at = time.time()
        # Clear the previous run's end time; otherwise _sync_gauges would
        # compute elapsed against a stale (earlier) timestamp and go negative.
        status.finished_at = 0.0
        status.errors = []
        status.properties_buy = 0
        status.properties_rent = 0
        _sync_gauges()

    # Shuffle for geographic diversity. A private Random instance yields the
    # same order as seeding the module RNG, without reseeding global state.
    shuffled = list(outcodes)
    random.Random(SEED).shuffle(shuffled)

    if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT:
        log.warning("All scrapers disabled — nothing to do")
        with status_lock:
            status.state = "done"
            status.finished_at = time.time()
            _sync_gauges()
        return

    # Declared up front so the finally block can always close what was opened,
    # even if setup fails part-way through.
    client = None
    hk_client = None
    hk_failed = False
    or_client = None
    or_failed = False

    try:
        client = make_client() if SCRAPE_RIGHTMOVE else None
        if not SCRAPE_RIGHTMOVE:
            log.info("Rightmove scraping DISABLED (SCRAPE_RIGHTMOVE=false)")

        # home.co.uk: must be enabled via SCRAPE_HOMECOUK + cookies available
        if not SCRAPE_HOMECOUK:
            log.info("home.co.uk scraping DISABLED (SCRAPE_HOMECOUK=false)")
            homecouk_enabled.set(0)
        else:
            hk_result = load_homecouk_cookies()
            hk_client = make_homecouk_client(*hk_result) if hk_result else None
            if hk_client:
                log.info("home.co.uk scraping ENABLED")
                homecouk_enabled.set(1)
            else:
                log.info(
                    "home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)"
                )
                homecouk_enabled.set(0)

        # OpenRent: must be enabled via SCRAPE_OPENRENT + cookies available
        if not SCRAPE_OPENRENT:
            log.info("OpenRent scraping DISABLED (SCRAPE_OPENRENT=false)")
            openrent_enabled.set(0)
        else:
            or_result = load_openrent_cookies()
            or_client = make_openrent_client(*or_result) if or_result else None
            if or_client:
                log.info("OpenRent scraping ENABLED")
                openrent_enabled.set(1)
            else:
                log.info(
                    "OpenRent scraping DISABLED (need FlareSolverr or OPENRENT_WAF_TOKEN)"
                )
                openrent_enabled.set(0)

        # Build postcode coords if OpenRent is active and caller didn't provide them
        if or_client and pc_coords is None:
            pc_coords = build_postcode_coords()

        for channel_cfg in CHANNELS:
            channel_name = channel_cfg["channel"]
            file_suffix = "buy" if channel_name == "BUY" else "rent"
            all_properties: dict[str, dict] = {}  # dedup by id
            # cross-source dedup by (postcode, beds, price)
            seen_dedup_keys: set[tuple] = set()
            rm_count = 0  # Rightmove properties this channel
            hk_count = 0  # home.co.uk properties this channel
            hk_dedup_count = 0  # home.co.uk skipped as cross-source duplicates
            or_count = 0  # OpenRent properties this channel
            or_dedup_count = 0  # OpenRent skipped as cross-source duplicates

            with status_lock:
                status.channel = channel_name
                status.outcodes_done = 0
                status.outcodes_total = len(shuffled)
                status.rm_properties = 0
                status.hk_properties = 0
                status.or_properties = 0

            channel_start = time.time()
            prev_prop_milestone = 0  # last 10k milestone we logged
            log.info(
                "=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled)
            )

            for i, outcode in enumerate(shuffled):
                with status_lock:
                    status.outcode = outcode
                    status.outcodes_done = i

                # Only sleep between outcodes if some source was actually hit.
                made_requests = False

                # --- Rightmove ---
                if SCRAPE_RIGHTMOVE:
                    made_requests = True
                    try:
                        outcode_id = resolve_outcode_id(client, outcode)
                        if not outcode_id:
                            log.debug(
                                "No Rightmove ID for outcode %s, skipping", outcode
                            )
                        else:
                            props = search_outcode(
                                client, outcode_id, outcode, channel_cfg, pc_index
                            )
                            for p in props:
                                pid = p["id"]
                                if pid not in all_properties:
                                    all_properties[pid] = p
                                    seen_dedup_keys.add(_dedup_key(p))
                                    rm_count += 1
                    except Exception as e:
                        msg = f"Error scraping Rightmove {outcode}/{channel_name}: {e}"
                        log.error(msg)
                        scrape_errors_total.labels(source="rightmove").inc()
                        with status_lock:
                            status.errors.append(msg)

                # --- home.co.uk ---
                if hk_client and not hk_failed:
                    made_requests = True
                    try:
                        hk_props = homecouk_search_outcode(
                            hk_client,
                            outcode,
                            channel_name,
                            pc_index,
                        )
                        for p in hk_props:
                            pid = p["id"]
                            key = _dedup_key(p)
                            if pid in all_properties or key in seen_dedup_keys:
                                hk_dedup_count += 1
                                cross_source_dedup_total.labels(
                                    channel=file_suffix,
                                ).inc()
                                continue
                            all_properties[pid] = p
                            seen_dedup_keys.add(key)
                            hk_count += 1
                        if hk_props:
                            log.info(
                                "home.co.uk %s: +%d properties", outcode, len(hk_props)
                            )
                    except CookiesExpiredError:
                        log.warning(
                            "home.co.uk cookies expired — attempting refresh via FlareSolverr"
                        )
                        hk_client.close()
                        # Drop the reference right away so a failing refresh
                        # never leaves a closed client behind.
                        hk_client = None
                        hk_result = load_homecouk_cookies()
                        if hk_result:
                            hk_client = make_homecouk_client(*hk_result)
                            log.info("home.co.uk cookies refreshed, continuing")
                            cookie_refreshes_total.labels(result="success").inc()
                        else:
                            log.warning(
                                "Cookie refresh failed, disabling home.co.uk for rest of scrape"
                            )
                            hk_failed = True
                            homecouk_enabled.set(0)
                            cookie_refreshes_total.labels(result="failure").inc()
                            with status_lock:
                                status.errors.append(
                                    "home.co.uk cookies expired and refresh failed"
                                )
                    except Exception as e:
                        msg = f"Error scraping home.co.uk {outcode}/{channel_name}: {e}"
                        log.error(msg)
                        scrape_errors_total.labels(source="homecouk").inc()
                        with status_lock:
                            status.errors.append(msg)

                # --- OpenRent (RENT channel only) ---
                if or_client and not or_failed and channel_name == "RENT":
                    made_requests = True
                    try:
                        or_props = openrent_search_outcode(
                            or_client,
                            outcode,
                            pc_index,
                            pc_coords,
                        )
                        for p in or_props:
                            pid = p["id"]
                            key = _dedup_key(p)
                            if pid in all_properties or key in seen_dedup_keys:
                                or_dedup_count += 1
                                cross_source_dedup_total.labels(channel="rent").inc()
                                continue
                            all_properties[pid] = p
                            seen_dedup_keys.add(key)
                            or_count += 1
                        if or_props:
                            log.info(
                                "OpenRent %s: +%d properties", outcode, len(or_props)
                            )
                    except WafChallengeError:
                        log.warning(
                            "OpenRent WAF cookies expired — attempting refresh via FlareSolverr"
                        )
                        or_client.close()
                        # Same reasoning as for home.co.uk above.
                        or_client = None
                        or_result = load_openrent_cookies()
                        if or_result:
                            or_client = make_openrent_client(*or_result)
                            log.info("OpenRent cookies refreshed, continuing")
                            cookie_refreshes_total.labels(result="success").inc()
                        else:
                            log.warning(
                                "Cookie refresh failed, disabling OpenRent for rest of scrape"
                            )
                            or_failed = True
                            openrent_enabled.set(0)
                            cookie_refreshes_total.labels(result="failure").inc()
                            with status_lock:
                                status.errors.append(
                                    "OpenRent WAF cookies expired and refresh failed"
                                )
                    except Exception as e:
                        msg = f"Error scraping OpenRent {outcode}/{channel_name}: {e}"
                        log.error(msg)
                        scrape_errors_total.labels(source="openrent").inc()
                        with status_lock:
                            status.errors.append(msg)

                with status_lock:
                    if channel_name == "BUY":
                        status.properties_buy = len(all_properties)
                    else:
                        status.properties_rent = len(all_properties)
                    status.rm_properties = rm_count
                    status.hk_properties = hk_count
                    status.or_properties = or_count
                    _sync_gauges()

                # Log progress every 100 outcodes
                done = i + 1
                elapsed = time.time() - channel_start
                if done % 100 == 0 or done == len(shuffled):
                    pct = done * 100 // len(shuffled)
                    rate = done / elapsed if elapsed > 0 else 0
                    log.info(
                        "%s %d/%d (%d%%) — %d props, %s elapsed, %.1f outcodes/min",
                        channel_name,
                        done,
                        len(shuffled),
                        pct,
                        len(all_properties),
                        _fmt_elapsed(elapsed),
                        rate * 60,
                    )

                # Log when crossing a 10k property milestone
                current_milestone = len(all_properties) // 10_000
                if current_milestone > prev_prop_milestone:
                    prev_prop_milestone = current_milestone
                    log.info(
                        "%s %dk properties (rm: %d, hk: %d, or: %d) at outcode %d/%d [%s]",
                        channel_name,
                        current_milestone * 10,
                        rm_count,
                        hk_count,
                        or_count,
                        done,
                        len(shuffled),
                        _fmt_elapsed(elapsed),
                    )

                # No delay after the final outcode of the channel.
                if made_requests and i < len(shuffled) - 1:
                    time.sleep(DELAY_BETWEEN_OUTCODES)

            # Write parquet
            deduped = list(all_properties.values())
            output_path = DATA_DIR / f"online_listings_{file_suffix}.parquet"
            write_parquet(deduped, output_path, channel=file_suffix)

            with status_lock:
                if channel_name == "BUY":
                    status.properties_buy = len(deduped)
                else:
                    status.properties_rent = len(deduped)
                status.outcodes_done = len(shuffled)
                _sync_gauges()

            log.info(
                "=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, cross-dedup: %d) ===",
                channel_name,
                len(deduped),
                rm_count,
                hk_count,
                or_count,
                hk_dedup_count + or_dedup_count,
            )

        with status_lock:
            status.state = "done"
            status.finished_at = time.time()
            _sync_gauges()
            # Snapshot under the lock so the summary log is self-consistent.
            elapsed = status.finished_at - status.started_at
            total_buy = status.properties_buy
            total_rent = status.properties_rent

        log.info(
            "Scrape complete in %s — buy: %d, rent: %d",
            _fmt_elapsed(elapsed),
            total_buy,
            total_rent,
        )

        # Trigger server data reload (best effort: failures are only logged)
        if RELOAD_URL:
            try:
                log.info("Triggering server reload at %s", RELOAD_URL)
                resp = httpx.post(RELOAD_URL, timeout=300)
                if resp.is_success:
                    body = resp.json()
                    log.info(
                        "Server reload complete: %d rows, %d features, %dms",
                        body.get("rows", 0),
                        body.get("features", 0),
                        body.get("elapsed_ms", 0),
                    )
                else:
                    log.warning(
                        "Server reload failed (%d): %s",
                        resp.status_code,
                        resp.text[:200],
                    )
            except Exception as e:
                log.warning("Server reload request failed: %s", e)

    except Exception as e:
        log.exception("Fatal scrape error")
        with status_lock:
            status.state = "error"
            status.errors.append(f"Fatal: {e}")
            status.finished_at = time.time()
            _sync_gauges()
    finally:
        if client:
            client.close()
        if hk_client:
            hk_client.close()
        if or_client:
            or_client.close()