import logging
import random
import threading
import time
from dataclasses import dataclass, field

import polars as pl

from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SEED
from http_client import make_client
from metrics import (
    scrape_elapsed_seconds,
    scrape_errors_total,
    scrape_outcodes_done,
    scrape_outcodes_total,
    scrape_properties_total,
    scrape_state,
)
from rightmove import resolve_outcode_id, search_outcode
from spatial import PostcodeSpatialIndex
from storage import write_parquet

log = logging.getLogger("rightmove")


@dataclass
class ScrapeStatus:
    """Mutable snapshot of scrape progress, shared via the module-level ``status``."""

    state: str = "idle"  # idle | running | done | error
    channel: str = ""  # channel currently being scraped
    outcode: str = ""  # outcode currently being scraped
    outcodes_done: int = 0  # index of the current outcode within the channel
    outcodes_total: int = 0  # number of outcodes scheduled for the channel
    properties_buy: int = 0  # unique properties collected for the BUY channel
    properties_rent: int = 0  # unique properties collected for any other channel
    errors: list[str] = field(default_factory=list)  # human-readable error messages
    started_at: float = 0.0  # time.time() at run start; 0.0 means "never started"
    finished_at: float = 0.0  # time.time() at run end; 0.0 means "not finished"


status = ScrapeStatus()
status_lock = threading.Lock()


def _sync_gauges() -> None:
    """Push current ScrapeStatus values into Prometheus gauges.

    Must hold status_lock.
    """
    # Exactly one state label is 1 at a time; the others are forced to 0.
    for state in ("idle", "running", "done", "error"):
        scrape_state.labels(state=state).set(1 if status.state == state else 0)
    scrape_outcodes_done.set(status.outcodes_done)
    scrape_outcodes_total.set(status.outcodes_total)
    scrape_properties_total.labels(channel="buy").set(status.properties_buy)
    scrape_properties_total.labels(channel="rent").set(status.properties_rent)
    if status.started_at:
        # While a run is in flight finished_at is 0.0, so measure against "now".
        end = status.finished_at if status.finished_at else time.time()
        scrape_elapsed_seconds.set(end - status.started_at)
    else:
        scrape_elapsed_seconds.set(0)


def _set_property_count(channel_name: str, count: int) -> None:
    """Store the unique-property count for a channel. Must hold status_lock."""
    if channel_name == "BUY":
        status.properties_buy = count
    else:
        status.properties_rent = count


def load_outcodes() -> list[str]:
    """Load England-only outcodes from arcgis parquet.

    Returns:
        Sorted list of unique outcodes extracted from the ``pcd`` column of
        rows whose ``ctry`` equals ``E92000001``.
    """
    log.info("Loading outcodes from %s", ARCGIS_PATH)
    # Only the postcode and country columns are needed here; skipping the
    # coordinate columns avoids reading data that is never used.
    df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry"])
    england = df.filter(pl.col("ctry") == "E92000001")
    log.info("England postcodes: %d", len(england))

    outcodes = (
        england.select(pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode"))
        .drop_nulls()  # postcodes that do not match the pattern yield null
        .get_column("outcode")
        .unique()
        .sort()
        .to_list()
    )
    log.info("Unique England outcodes: %d", len(outcodes))
    return outcodes


def build_postcode_index() -> PostcodeSpatialIndex:
    """Build spatial index from arcgis England postcodes.

    Rows with a missing ``lat`` or ``long`` are dropped before the index is
    built.
    """
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)
    df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
    england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(subset=["lat", "long"])
    return PostcodeSpatialIndex(
        england.get_column("lat").to_list(),
        england.get_column("long").to_list(),
        england.get_column("pcd").to_list(),
    )


def _scrape_channel(client, channel_cfg, outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
    """Scrape every outcode for one channel and write the result to parquet.

    Per-outcode failures are logged, counted and recorded in ``status.errors``
    without aborting the channel. An exception from ``write_parquet``
    propagates to the caller.
    """
    channel_name = channel_cfg["channel"]
    file_suffix = "buy" if channel_name == "BUY" else "rent"
    all_properties: dict[int, dict] = {}  # dedup by id
    total = len(outcodes)

    with status_lock:
        status.channel = channel_name
        status.outcodes_done = 0
        status.outcodes_total = total
        _sync_gauges()

    log.info("=== Starting %s channel (%d outcodes) ===", channel_name, total)

    for i, outcode in enumerate(outcodes):
        with status_lock:
            status.outcode = outcode
            status.outcodes_done = i
            # Sync here so progress gauges advance even when outcodes are
            # skipped or fail.
            _sync_gauges()

        log.debug("Outcode %s (%d/%d) — %d properties so far", outcode, i + 1, total, len(all_properties))

        try:
            outcode_id = resolve_outcode_id(client, outcode)
            if not outcode_id:
                log.debug("No Rightmove ID for outcode %s, skipping", outcode)
                # NOTE(review): this also skips the inter-outcode delay below;
                # confirm that is intended if resolve_outcode_id hits the network.
                continue

            props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
            for p in props:
                # First occurrence of an id wins; later duplicates are ignored.
                all_properties.setdefault(p["id"], p)

            with status_lock:
                _set_property_count(channel_name, len(all_properties))
                _sync_gauges()

            log.info("Outcode %s: got %d properties (total: %d)", outcode, len(props), len(all_properties))

        except Exception as e:
            # Best-effort: one bad outcode must not abort the whole channel.
            msg = f"Error scraping {outcode}/{channel_name}: {e}"
            log.error(msg)
            scrape_errors_total.inc()
            with status_lock:
                status.errors.append(msg)

        if i < total - 1:
            time.sleep(DELAY_BETWEEN_OUTCODES)

    # Write parquet
    deduped = list(all_properties.values())
    output_path = DATA_DIR / f"online_listings_{file_suffix}.parquet"
    write_parquet(deduped, output_path, channel=file_suffix)

    with status_lock:
        _set_property_count(channel_name, len(deduped))
        status.outcodes_done = total
        _sync_gauges()

    log.info("=== %s channel complete: %d unique properties ===", channel_name, len(deduped))


def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
    """Main scrape loop — runs in background thread.

    Scrapes every channel in ``CHANNELS`` over a seeded shuffle of *outcodes*
    and keeps the shared ``status`` and the Prometheus gauges up to date. Any
    exception, including a failure to create the HTTP client, is logged and
    leaves ``status.state`` set to ``"error"``; nothing is re-raised.

    Args:
        outcodes: Outcodes to scrape; the list is copied, not mutated.
        pc_index: Spatial index passed through to ``search_outcode``.
    """
    with status_lock:
        status.state = "running"
        status.started_at = time.time()
        # Clear the previous run's end time, otherwise the elapsed gauge is
        # computed against a stale finished_at and goes negative on a re-run.
        status.finished_at = 0.0
        status.errors = []
        status.properties_buy = 0
        status.properties_rent = 0
        _sync_gauges()

    # Shuffle for geographic diversity. A private RNG gives the same order as
    # random.seed(SEED) + random.shuffle without reseeding the global generator.
    shuffled = list(outcodes)
    random.Random(SEED).shuffle(shuffled)

    client = None
    try:
        # Created inside the try so a construction failure is reported as an
        # error state instead of leaving the status stuck on "running".
        client = make_client()

        for channel_cfg in CHANNELS:
            _scrape_channel(client, channel_cfg, shuffled, pc_index)

        with status_lock:
            status.state = "done"
            status.finished_at = time.time()
            _sync_gauges()
            # Snapshot under the lock so the summary line is self-consistent.
            elapsed = status.finished_at - status.started_at
            buy_count = status.properties_buy
            rent_count = status.properties_rent

        log.info("Scrape complete in %.0fs — buy: %d, rent: %d", elapsed, buy_count, rent_count)

    except Exception as e:
        log.exception("Fatal scrape error")
        with status_lock:
            status.state = "error"
            status.errors.append(f"Fatal: {e}")
            status.finished_at = time.time()
            _sync_gauges()
    finally:
        if client is not None:
            client.close()