"""Scrape orchestrator.

Runs the enabled source scrapers (Rightmove, home.co.uk, OpenRent, Zoopla) in
parallel threads, periodically checkpoints their partial results to disk,
merges the results per channel with cross-source deduplication and writes one
parquet file per channel.
"""

import json
import logging
import random
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path

import polars as pl
import httpx

from constants import (
    ARCGIS_PATH,
    CHANNELS,
    CHECKPOINT_INTERVAL,
    DATA_DIR,
    DELAY_BETWEEN_OUTCODES,
    HOMECOUK_CONCURRENCY,
    RELOAD_URL,
    SCRAPE_HOMECOUK,
    SCRAPE_OPENRENT,
    SCRAPE_RIGHTMOVE,
    SCRAPE_ZOOPLA,
    SEED,
)
from homecouk import CookiesExpiredError
from homecouk import load_cookies as load_homecouk_cookies
from homecouk import make_client as make_homecouk_client
from homecouk import search_outcode as homecouk_search_outcode
from http_client import make_client
from metrics import (
    cookie_refreshes_total,
    cross_source_dedup_total,
    homecouk_enabled,
    openrent_enabled,
    scrape_elapsed_seconds,
    scrape_errors_total,
    scrape_outcodes_done,
    scrape_outcodes_total,
    scrape_properties_total,
    scrape_state,
    zoopla_enabled,
)
from openrent import WafChallengeError
from openrent import load_cookies as load_openrent_cookies
from openrent import make_client as make_openrent_client
from openrent import search_outcode as openrent_search_outcode
from rightmove import resolve_outcode_id, search_outcode
from zoopla import TurnstileError
from zoopla import launch_browser as launch_zoopla_browser
from zoopla import search_outcode as zoopla_search_outcode
from spatial import PostcodeSpatialIndex
from storage import write_parquet

log = logging.getLogger("rightmove")

# Value of the arcgis "ctry" column that selects England postcodes.
_ENGLAND_CTRY = "E92000001"

# Keys of the per-source result dicts; checkpoint files use the lower-cased form.
_CHANNEL_KEYS = ("BUY", "RENT")


@dataclass
class ScrapeStatus:
    """Mutable snapshot of the current scrape; guarded by ``status_lock``."""

    state: str = "idle"  # idle | running | done | error
    channel: str = ""
    outcode: str = ""
    outcodes_done: int = 0
    outcodes_total: int = 0
    properties_buy: int = 0
    properties_rent: int = 0
    # Per-source counts (combined across channels)
    rm_properties: int = 0
    hk_properties: int = 0
    or_properties: int = 0
    zp_properties: int = 0
    errors: list[str] = field(default_factory=list)
    started_at: float = 0.0
    finished_at: float = 0.0


status = ScrapeStatus()
status_lock = threading.Lock()


def _sync_gauges() -> None:
    """Push current ScrapeStatus values into Prometheus gauges.

    Must hold status_lock.
    """
    for state in ("idle", "running", "done", "error"):
        scrape_state.labels(state=state).set(1 if status.state == state else 0)
    scrape_outcodes_done.set(status.outcodes_done)
    scrape_outcodes_total.set(status.outcodes_total)
    scrape_properties_total.labels(channel="buy", source="total").set(
        status.properties_buy
    )
    scrape_properties_total.labels(channel="rent", source="total").set(
        status.properties_rent
    )
    # Per-source totals are tracked across both channels, so the same combined
    # value is published under each channel label.
    per_source = (
        ("rightmove", status.rm_properties),
        ("homecouk", status.hk_properties),
        ("openrent", status.or_properties),
        ("zoopla", status.zp_properties),
    )
    for ch in ("buy", "rent"):
        for source, value in per_source:
            scrape_properties_total.labels(channel=ch, source=source).set(value)
    if status.started_at:
        # While the scrape is still running, measure elapsed time up to "now".
        end = status.finished_at if status.finished_at else time.time()
        scrape_elapsed_seconds.set(end - status.started_at)
    else:
        scrape_elapsed_seconds.set(0)


def _read_england_postcodes(
    columns: list[str], *, require_coords: bool = False
) -> pl.DataFrame:
    """Read *columns* from the arcgis parquet and keep only England rows.

    *columns* must include "ctry" (and "lat"/"long" when *require_coords* is
    set, in which case rows with a null coordinate are dropped).
    """
    df = pl.read_parquet(ARCGIS_PATH, columns=columns)
    england = df.filter(pl.col("ctry") == _ENGLAND_CTRY)
    if require_coords:
        england = england.drop_nulls(subset=["lat", "long"])
    return england


def load_outcodes() -> list[str]:
    """Load England-only outcodes from arcgis parquet.

    Returns the sorted, de-duplicated outcodes extracted from the "pcd" column.
    """
    log.info("Loading outcodes from %s", ARCGIS_PATH)
    # Only the postcode and country columns are needed to derive outcodes.
    england = _read_england_postcodes(["pcd", "ctry"])
    log.info("England postcodes: %d", len(england))
    outcodes = (
        england.select(
            pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode")
        )
        .drop_nulls()
        .get_column("outcode")
        .unique()
        .sort()
        .to_list()
    )
    log.info("Unique England outcodes: %d", len(outcodes))
    return outcodes


def build_postcode_index() -> PostcodeSpatialIndex:
    """Build spatial index from arcgis England postcodes."""
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)
    england = _read_england_postcodes(
        ["pcd", "ctry", "lat", "long"], require_coords=True
    )
    return PostcodeSpatialIndex(
        england.get_column("lat").to_list(),
        england.get_column("long").to_list(),
        england.get_column("pcd").to_list(),
    )


def build_postcode_coords() -> dict[str, tuple[float, float]]:
    """Build postcode → (lat, lng) lookup from arcgis England postcodes.

    Used by OpenRent scraper to resolve coordinates from postcodes.
    """
    log.info("Building postcode coords lookup from %s", ARCGIS_PATH)
    england = _read_england_postcodes(
        ["pcd", "ctry", "lat", "long"], require_coords=True
    )
    coords: dict[str, tuple[float, float]] = dict(
        zip(
            england.get_column("pcd").to_list(),
            zip(
                england.get_column("lat").to_list(),
                england.get_column("long").to_list(),
            ),
        )
    )
    log.info("Postcode coords lookup: %d postcodes", len(coords))
    return coords


def _fmt_elapsed(seconds: float) -> str:
    """Format seconds as e.g. '2h13m' or '5m32s'."""
    h, rem = divmod(int(seconds), 3600)
    m, s = divmod(rem, 60)
    if h:
        return f"{h}h{m:02d}m"
    return f"{m}m{s:02d}s"


def _dedup_key(p: dict) -> tuple:
    """Composite key for cross-source deduplication: (postcode, bedrooms, price).

    Two listings on different portals for the same physical property will
    share these attributes even though their IDs differ.
    """
    return (p.get("Postcode", ""), p.get("Bedrooms", 0), p.get("price", 0))


class _Progress:
    """Thread-safe progress tracker for parallel source workers."""

    def __init__(self) -> None:
        self._counts: dict[str, int] = {}
        self._lock = threading.Lock()

    def update(self, source: str, done: int) -> None:
        """Record that *source* has completed *done* outcodes."""
        with self._lock:
            self._counts[source] = done

    def snapshot(self) -> dict[str, int]:
        """Return a copy of the per-source completed counts."""
        with self._lock:
            return dict(self._counts)


def _merge_channel(
    rm_props: list[dict],
    hk_props: list[dict],
    or_props: list[dict],
    zp_props: list[dict],
) -> tuple[dict[str, dict], dict[str, int], int]:
    """Merge properties from all sources for one channel with cross-source dedup.

    Rightmove has priority; other sources are checked for duplicates, both by
    listing id and by the composite key from ``_dedup_key``.

    The composite key is only used when the listing has a postcode. Without
    one the key degenerates to e.g. ``("", 0, 0)`` and would collapse
    unrelated listings into a single "duplicate".

    Returns (all_properties_by_id, per_source_counts, total_dedup_count).
    """
    all_properties: dict[str, dict] = {}
    seen_keys: set[tuple] = set()
    counts = {"rm": 0, "hk": 0, "or": 0, "zp": 0}
    total_dedup = 0

    # Rightmove first (priority source): de-duplicated by id only.
    for p in rm_props:
        pid = p["id"]
        if pid in all_properties:
            continue
        all_properties[pid] = p
        key = _dedup_key(p)
        if key[0]:
            seen_keys.add(key)
        counts["rm"] += 1

    # Other sources (check for cross-source duplicates)
    for source, props in (("hk", hk_props), ("or", or_props), ("zp", zp_props)):
        for p in props:
            pid = p["id"]
            key = _dedup_key(p)
            key_usable = bool(key[0])
            if pid in all_properties or (key_usable and key in seen_keys):
                total_dedup += 1
                continue
            all_properties[pid] = p
            if key_usable:
                seen_keys.add(key)
            counts[source] += 1

    return all_properties, counts, total_dedup


# ---------------------------------------------------------------------------
# Checkpointing — save/resume partial results across crashes
# ---------------------------------------------------------------------------


def _checkpoint_meta_path() -> Path:
    """Path of the checkpoint metadata file (seed, outcode count, indices)."""
    return DATA_DIR / "checkpoint.json"


def _checkpoint_results_path(source: str, channel: str) -> Path:
    """Path of the partial-results file for one source and (lower-case) channel."""
    return DATA_DIR / f"checkpoint_{source}_{channel}.json"


def _atomic_write_json(path: Path, payload, **dump_kwargs) -> None:
    """Write *payload* as JSON to *path* via a sibling ``.tmp`` file.

    ``Path.replace`` overwrites an existing target on every platform, whereas
    ``Path.rename`` raises on Windows when the target already exists.
    """
    tmp = path.with_suffix(".tmp")
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(payload, f, **dump_kwargs)
    tmp.replace(path)


def _save_checkpoint(
    shuffled: list[str],
    progress: _Progress,
    source_results: dict[str, dict[str, list]],
    active_sources: list[str],
) -> None:
    """Save per-source progress indices and partial results to disk.

    Writes atomically (temp + replace) so a crash mid-write leaves the
    previous checkpoint intact. The metadata file is only updated when every
    result file was written: otherwise the saved indices could run ahead of
    stale result files and outcodes would be silently skipped on resume.
    """
    # Snapshot progress BEFORE copying results, so the saved results always
    # cover at least the outcodes counted as done (workers keep appending).
    snap = progress.snapshot()
    indices = {s: snap.get(s, 0) for s in active_sources}
    meta = {
        "seed": SEED,
        "num_outcodes": len(shuffled),
        "sources": indices,
        "timestamp": time.time(),
    }

    # Write result files per source per channel
    total = 0
    results_ok = True
    for source in active_sources:
        results = source_results.get(source, {})
        for ch_key in _CHANNEL_KEYS:
            # Shallow copy: a stable view while the owning worker keeps appending.
            props = list(results.get(ch_key, []))
            path = _checkpoint_results_path(source, ch_key.lower())
            try:
                _atomic_write_json(path, props, default=str)
            except (OSError, TypeError, ValueError) as e:
                log.warning("Failed to write checkpoint %s: %s", path.name, e)
                results_ok = False
            else:
                total += len(props)

    if not results_ok:
        log.warning(
            "Checkpoint metadata not updated: some result files failed to write"
        )
        return

    # Write metadata atomically
    try:
        _atomic_write_json(_checkpoint_meta_path(), meta)
    except (OSError, TypeError, ValueError) as e:
        log.warning("Failed to write checkpoint metadata: %s", e)
        return

    log.info("Checkpoint saved: %s (%d properties)", indices, total)


def _load_source_results(source: str) -> dict[str, list] | None:
    """Load both channel result files for *source*.

    Returns ``None`` when a file is missing, unreadable or not a JSON list.
    """
    loaded: dict[str, list] = {}
    for ch_key in _CHANNEL_KEYS:
        rpath = _checkpoint_results_path(source, ch_key.lower())
        try:
            with open(rpath, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            return None
        if not isinstance(data, list):
            return None
        loaded[ch_key] = data
    return loaded


def _load_checkpoint(
    shuffled: list[str],
) -> tuple[dict[str, int], dict[str, dict[str, list]]] | None:
    """Load checkpoint if it exists and matches the current outcode list.

    A source whose saved index is invalid, or whose result files are missing
    or corrupt, is restarted from index 0 with empty results.

    Returns (start_indices, loaded_results) or None if no valid checkpoint.
    """
    path = _checkpoint_meta_path()
    if not path.exists():
        return None

    try:
        with open(path, encoding="utf-8") as f:
            meta = json.load(f)
    except (OSError, ValueError):
        meta = None
    # Valid JSON of the wrong shape is as unusable as unparseable JSON.
    if not isinstance(meta, dict) or not isinstance(meta.get("sources", {}), dict):
        log.warning("Checkpoint file corrupt, starting fresh")
        _clear_checkpoint()
        return None

    if meta.get("seed") != SEED or meta.get("num_outcodes") != len(shuffled):
        log.info("Checkpoint from different run configuration, discarding")
        _clear_checkpoint()
        return None

    start_indices: dict[str, int] = {}
    loaded_results: dict[str, dict[str, list]] = {}
    for source, completed in meta.get("sources", {}).items():
        index_ok = (
            isinstance(completed, int)
            and not isinstance(completed, bool)
            and 0 <= completed <= len(shuffled)
        )
        results = _load_source_results(source) if index_ok else None
        if results is None:
            log.warning(
                "Checkpoint data for %s missing or corrupt, restarting %s",
                source,
                source,
            )
            start_indices[source] = 0
            loaded_results[source] = {"BUY": [], "RENT": []}
        else:
            start_indices[source] = completed
            loaded_results[source] = results

    saved_at = meta.get("timestamp", 0)
    if isinstance(saved_at, bool) or not isinstance(saved_at, (int, float)):
        saved_at = 0
    elapsed_since = time.time() - saved_at
    log.info(
        "Resuming from checkpoint (saved %.0fm ago): %s",
        elapsed_since / 60,
        start_indices,
    )
    return start_indices, loaded_results


def _clear_checkpoint() -> None:
    """Remove all checkpoint files (best effort).

    Called after successful completion and when a checkpoint is discarded.
    """
    for path in DATA_DIR.glob("checkpoint*"):
        try:
            path.unlink(missing_ok=True)
        except OSError as e:
            log.debug("Could not remove checkpoint file %s: %s", path, e)


def _close_quietly(resource) -> None:
    """Close *resource*, logging (rather than raising) any failure."""
    try:
        resource.close()
    except Exception as e:  # best-effort cleanup must never kill a worker
        log.debug("Ignoring error while closing %r: %s", resource, e)


def run_scrape(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]] | None = None,
) -> None:
    """Main scrape orchestrator — runs all sources in parallel threads.

    Each source (Rightmove, home.co.uk, OpenRent, Zoopla) gets its own thread
    that iterates all outcodes for both BUY and RENT channels. Results are
    merged with cross-source deduplication after all workers complete.

    Args:
        outcodes: Outcodes to scrape; visited in a SEED-determined order.
        pc_index: Spatial index handed to every source scraper.
        pc_coords: Optional postcode → (lat, lng) lookup; built on demand when
            OpenRent or Zoopla is enabled and none is supplied.

    Progress and failures are reported through the module-level ``status``
    object and the metrics gauges; merge/write failures set state "error".
    """
    with status_lock:
        status.state = "running"
        status.started_at = time.time()
        status.finished_at = 0.0
        status.errors = []
        status.properties_buy = 0
        status.properties_rent = 0
        status.channel = ""
        status.outcode = ""
        # Also clear counters left over from a previous run so the gauges do
        # not briefly report stale values.
        status.outcodes_done = 0
        status.outcodes_total = len(outcodes)
        status.rm_properties = 0
        status.hk_properties = 0
        status.or_properties = 0
        status.zp_properties = 0
        _sync_gauges()

    # A private RNG yields the same order as random.seed(SEED) + random.shuffle
    # without reseeding the process-wide generator.
    shuffled = list(outcodes)
    random.Random(SEED).shuffle(shuffled)

    if not any([SCRAPE_RIGHTMOVE, SCRAPE_HOMECOUK, SCRAPE_OPENRENT, SCRAPE_ZOOPLA]):
        log.warning("All scrapers disabled — nothing to do")
        with status_lock:
            status.state = "done"
            status.finished_at = time.time()
            _sync_gauges()
        return

    if not SCRAPE_RIGHTMOVE:
        log.info("Rightmove scraping DISABLED (SCRAPE_RIGHTMOVE=false)")
    if not SCRAPE_HOMECOUK:
        log.info("home.co.uk scraping DISABLED (SCRAPE_HOMECOUK=false)")
        homecouk_enabled.set(0)
    if not SCRAPE_OPENRENT:
        log.info("OpenRent scraping DISABLED (SCRAPE_OPENRENT=false)")
        openrent_enabled.set(0)
    if not SCRAPE_ZOOPLA:
        log.info("Zoopla scraping DISABLED (SCRAPE_ZOOPLA=false)")
        zoopla_enabled.set(0)

    # Build postcode coords if needed for OpenRent/Zoopla
    if (SCRAPE_OPENRENT or SCRAPE_ZOOPLA) and pc_coords is None:
        pc_coords = build_postcode_coords()

    # Per-source result containers: {channel_name: [properties]}
    # Each list is only written by its owning source thread.
    rm_results: dict[str, list] = {"BUY": [], "RENT": []}
    hk_results: dict[str, list] = {"BUY": [], "RENT": []}
    or_results: dict[str, list] = {"BUY": [], "RENT": []}
    zp_results: dict[str, list] = {"BUY": [], "RENT": []}
    # Map source names to result dicts (used for resume and checkpointing).
    # The dicts are only ever mutated in place, so these references stay valid.
    source_results_map = {
        "rm": rm_results,
        "hk": hk_results,
        "or": or_results,
        "zp": zp_results,
    }

    progress = _Progress()

    # --- Resume from checkpoint if available ---
    start_indices: dict[str, int] = {}
    checkpoint = _load_checkpoint(shuffled)
    if checkpoint is not None:
        start_indices, loaded = checkpoint
        for src, data in loaded.items():
            if src in source_results_map:
                for ch in _CHANNEL_KEYS:
                    source_results_map[src][ch] = data.get(ch, [])
        # Pre-set progress for resumed sources
        for src, idx in start_indices.items():
            if idx > 0:
                progress.update(src, idx)

    # --- Source worker closures ---
    # Each worker owns its client lifecycle and iterates all outcodes for both
    # channels. On auth failure, it refreshes cookies and continues. On fatal
    # failure, it marks itself as done and returns partial results.

    def rm_worker():
        """Scrape Rightmove sequentially for every outcode and channel."""
        rm_start = start_indices.get("rm", 0)
        if rm_start > 0:
            log.info("Rightmove resuming from outcode %d/%d", rm_start, len(shuffled))

        client = make_client()
        try:
            for i, outcode in enumerate(shuffled):
                if i < rm_start:
                    continue

                try:
                    outcode_id = resolve_outcode_id(client, outcode)
                except Exception as e:
                    log.error("Rightmove %s ID lookup: %s", outcode, e)
                    scrape_errors_total.labels(source="rightmove").inc()
                    progress.update("rm", i + 1)
                    time.sleep(DELAY_BETWEEN_OUTCODES)
                    continue

                if not outcode_id:
                    log.debug("No Rightmove ID for %s, skipping", outcode)
                    progress.update("rm", i + 1)
                    time.sleep(DELAY_BETWEEN_OUTCODES)
                    continue

                for ch_cfg in CHANNELS:
                    ch = ch_cfg["channel"]
                    try:
                        props = search_outcode(
                            client, outcode_id, outcode, ch_cfg, pc_index
                        )
                        rm_results[ch].extend(props)
                    except Exception as e:
                        log.error("Rightmove %s/%s: %s", outcode, ch, e)
                        scrape_errors_total.labels(source="rightmove").inc()

                progress.update("rm", i + 1)
                time.sleep(DELAY_BETWEEN_OUTCODES)
        except Exception as e:
            log.exception("Fatal Rightmove error: %s", e)
            with status_lock:
                status.errors.append(f"Fatal Rightmove: {e}")
        finally:
            _close_quietly(client)

    def hk_worker():
        """Scrape home.co.uk with a thread pool, refreshing cookies on expiry."""
        hk_result = load_homecouk_cookies()
        if not hk_result:
            log.info("home.co.uk DISABLED (no cookies available)")
            homecouk_enabled.set(0)
            progress.update("hk", len(shuffled))
            return

        hk_start = start_indices.get("hk", 0)
        if hk_start > 0:
            log.info(
                "home.co.uk resuming from outcode %d/%d", hk_start, len(shuffled)
            )

        log.info(
            "home.co.uk scraping ENABLED (concurrency=%d)", HOMECOUK_CONCURRENCY
        )
        homecouk_enabled.set(1)

        # Shared state across pool threads
        cookie_state = {
            "cookies": hk_result[0],
            "user_agent": hk_result[1],
            "generation": 0,
        }
        cookie_lock = threading.Lock()
        results_lock = threading.Lock()
        completed = hk_start  # outcodes finished so far; guarded by results_lock
        disabled = threading.Event()  # set once a cookie refresh has failed
        _local = threading.local()

        def _get_client():
            """Get or create a thread-local curl_cffi session."""
            with cookie_lock:
                gen = cookie_state["generation"]
                cookies = cookie_state["cookies"]
                ua = cookie_state["user_agent"]
            # Rebuild the session when this thread has none yet, or when its
            # session was created from an older cookie generation.
            if not hasattr(_local, "client") or _local.gen != gen:
                if hasattr(_local, "client"):
                    _close_quietly(_local.client)
                _local.client = make_homecouk_client(cookies, ua)
                _local.gen = gen
            return _local.client

        def _refresh_cookies():
            """Refresh cookies via FlareSolverr. Thread-safe with generation check."""
            with cookie_lock:
                pre_gen = cookie_state["generation"]
            new = load_homecouk_cookies()
            if not new:
                return False
            with cookie_lock:
                # Only install the new cookies if no other thread has already
                # refreshed them while this thread was loading.
                if cookie_state["generation"] == pre_gen:
                    cookie_state["cookies"] = new[0]
                    cookie_state["user_agent"] = new[1]
                    cookie_state["generation"] += 1
                    cookie_refreshes_total.labels(result="success").inc()
                    log.info("home.co.uk cookies refreshed")
            return True

        def _fetch_and_store(client, outcode, ch):
            """Search one outcode/channel and append any hits to hk_results."""
            props = homecouk_search_outcode(client, outcode, ch, pc_index)
            if props:
                with results_lock:
                    hk_results[ch].extend(props)
                log.info("home.co.uk %s: +%d properties", outcode, len(props))

        def _scrape_outcode(outcode):
            """Scrape every channel for one outcode; runs on a pool thread."""
            nonlocal completed
            if disabled.is_set():
                return
            client = _get_client()
            for ch_cfg in CHANNELS:
                ch = ch_cfg["channel"]
                if disabled.is_set():
                    return
                try:
                    _fetch_and_store(client, outcode, ch)
                except CookiesExpiredError:
                    log.warning("home.co.uk cookies expired — attempting refresh")
                    if not _refresh_cookies():
                        log.warning("Cookie refresh failed, disabling home.co.uk")
                        disabled.set()
                        homecouk_enabled.set(0)
                        cookie_refreshes_total.labels(result="failure").inc()
                        with status_lock:
                            status.errors.append(
                                "home.co.uk cookies expired and refresh failed"
                            )
                        return
                    # Retry this outcode/channel once with the fresh cookies.
                    client = _get_client()
                    try:
                        _fetch_and_store(client, outcode, ch)
                    except Exception as e:
                        log.error(
                            "home.co.uk %s/%s (after refresh): %s",
                            outcode,
                            ch,
                            e,
                        )
                        scrape_errors_total.labels(source="homecouk").inc()
                except Exception as e:
                    log.error("home.co.uk %s/%s: %s", outcode, ch, e)
                    scrape_errors_total.labels(source="homecouk").inc()

            with results_lock:
                completed += 1
                progress.update("hk", completed)
            time.sleep(DELAY_BETWEEN_OUTCODES)

        try:
            work = shuffled[hk_start:]
            with ThreadPoolExecutor(max_workers=HOMECOUK_CONCURRENCY) as pool:
                # Drain the iterator so an exception raised in a task surfaces here.
                list(pool.map(_scrape_outcode, work))
        except Exception as e:
            log.exception("Fatal home.co.uk error: %s", e)
            with status_lock:
                status.errors.append(f"Fatal home.co.uk: {e}")

        if disabled.is_set():
            progress.update("hk", len(shuffled))

    def or_worker():
        """Scrape OpenRent (RENT only), refreshing WAF cookies on challenge."""
        or_result = load_openrent_cookies()
        if not or_result:
            log.info("OpenRent DISABLED (no cookies available)")
            openrent_enabled.set(0)
            progress.update("or", len(shuffled))
            return

        or_start = start_indices.get("or", 0)
        if or_start > 0:
            log.info("OpenRent resuming from outcode %d/%d", or_start, len(shuffled))

        client = make_openrent_client(*or_result)
        log.info("OpenRent scraping ENABLED")
        openrent_enabled.set(1)

        try:
            for i, outcode in enumerate(shuffled):
                if i < or_start:
                    continue

                # OpenRent is RENT-only
                try:
                    props = openrent_search_outcode(
                        client, outcode, pc_index, pc_coords
                    )
                    or_results["RENT"].extend(props)
                    if props:
                        log.info("OpenRent %s: +%d properties", outcode, len(props))
                except WafChallengeError:
                    log.warning("OpenRent WAF cookies expired — attempting refresh")
                    client.close()
                    or_new = load_openrent_cookies()
                    if or_new:
                        client = make_openrent_client(*or_new)
                        log.info("OpenRent cookies refreshed, continuing")
                        cookie_refreshes_total.labels(result="success").inc()
                    else:
                        log.warning("Cookie refresh failed, disabling OpenRent")
                        openrent_enabled.set(0)
                        cookie_refreshes_total.labels(result="failure").inc()
                        with status_lock:
                            status.errors.append(
                                "OpenRent WAF cookies expired and refresh failed"
                            )
                        progress.update("or", len(shuffled))
                        return
                except Exception as e:
                    log.error("OpenRent %s: %s", outcode, e)
                    scrape_errors_total.labels(source="openrent").inc()

                progress.update("or", i + 1)
                time.sleep(DELAY_BETWEEN_OUTCODES)
        except Exception as e:
            log.exception("Fatal OpenRent error: %s", e)
            with status_lock:
                status.errors.append(f"Fatal OpenRent: {e}")
        finally:
            _close_quietly(client)

    def zp_worker():
        """Scrape Zoopla through a browser, relaunching it on Turnstile challenges."""
        try:
            browser, page = launch_zoopla_browser()
            log.info("Zoopla scraping ENABLED (Camoufox browser launched)")
            zoopla_enabled.set(1)
        except TurnstileError:
            log.warning("Zoopla Cloudflare Turnstile failed — disabling Zoopla")
            zoopla_enabled.set(0)
            progress.update("zp", len(shuffled))
            return
        except Exception as e:
            log.warning("Zoopla browser launch failed: %s — disabling Zoopla", e)
            zoopla_enabled.set(0)
            progress.update("zp", len(shuffled))
            return

        zp_start = start_indices.get("zp", 0)
        if zp_start > 0:
            log.info("Zoopla resuming from outcode %d/%d", zp_start, len(shuffled))

        try:
            for i, outcode in enumerate(shuffled):
                if i < zp_start:
                    continue

                search_url = None
                for ch_cfg in CHANNELS:
                    ch = ch_cfg["channel"]

                    # Build direct URL for second channel by swapping path
                    direct_url = None
                    if search_url:
                        if ch == "BUY":
                            direct_url = search_url.replace("/to-rent/", "/for-sale/")
                        else:
                            direct_url = search_url.replace("/for-sale/", "/to-rent/")

                    try:
                        props, result_url = zoopla_search_outcode(
                            page,
                            outcode,
                            ch,
                            pc_index,
                            pc_coords,
                            base_search_url=direct_url,
                        )
                        if result_url:
                            search_url = result_url
                        zp_results[ch].extend(props)
                        if props:
                            log.info("Zoopla %s: +%d properties", outcode, len(props))
                    except TurnstileError:
                        log.warning("Zoopla Turnstile challenge — relaunching browser")
                        _close_quietly(browser)
                        try:
                            browser, page = launch_zoopla_browser()
                            log.info("Zoopla browser relaunched, continuing")
                        except Exception:
                            log.warning("Browser relaunch failed, disabling Zoopla")
                            zoopla_enabled.set(0)
                            with status_lock:
                                status.errors.append(
                                    "Zoopla Cloudflare challenge failed and relaunch failed"
                                )
                            progress.update("zp", len(shuffled))
                            return
                    except Exception as e:
                        log.error("Zoopla %s/%s: %s", outcode, ch, e)
                        scrape_errors_total.labels(source="zoopla").inc()

                progress.update("zp", i + 1)
                time.sleep(DELAY_BETWEEN_OUTCODES)
        except Exception as e:
            log.exception("Fatal Zoopla error: %s", e)
            with status_lock:
                status.errors.append(f"Fatal Zoopla: {e}")
        finally:
            _close_quietly(browser)

    # --- Launch worker threads ---
    active_sources: list[str] = []
    threads: list[threading.Thread] = []
    workers = (
        (SCRAPE_RIGHTMOVE, "rm", rm_worker),
        (SCRAPE_HOMECOUK, "hk", hk_worker),
        (SCRAPE_OPENRENT, "or", or_worker),
        (SCRAPE_ZOOPLA, "zp", zp_worker),
    )
    for enabled, key, target in workers:
        if enabled:
            threads.append(
                threading.Thread(target=target, name=f"scrape-{key}", daemon=True)
            )
            active_sources.append(key)

    log.info(
        "=== Starting scrape: %d outcodes, sources: %s ===",
        len(shuffled),
        ", ".join(active_sources),
    )

    for t in threads:
        t.start()

    # --- Monitor progress while workers run ---
    all_results = (rm_results, hk_results, or_results, zp_results)
    scrape_start = time.time()
    last_log = 0.0
    last_checkpoint = time.time()

    try:
        while any(t.is_alive() for t in threads):
            snap = progress.snapshot()
            # Overall progress is that of the slowest active source.
            min_done = min((snap.get(s, 0) for s in active_sources), default=0)

            # Count properties across sources (safe: only one thread writes each list)
            total_buy = sum(len(r["BUY"]) for r in all_results)
            total_rent = sum(len(r["RENT"]) for r in all_results)

            with status_lock:
                status.outcodes_done = min_done
                status.outcodes_total = len(shuffled)
                status.properties_buy = total_buy
                status.properties_rent = total_rent
                status.rm_properties = len(rm_results["BUY"]) + len(rm_results["RENT"])
                status.hk_properties = len(hk_results["BUY"]) + len(hk_results["RENT"])
                status.or_properties = len(or_results["RENT"])
                status.zp_properties = len(zp_results["BUY"]) + len(zp_results["RENT"])
                _sync_gauges()

            now = time.time()

            # Log progress every 30 seconds
            if now - last_log >= 30:
                elapsed = now - scrape_start
                per_source = ", ".join(
                    f"{s}:{snap.get(s, 0)}" for s in active_sources
                )
                log.info(
                    "Progress: %d/%d outcodes (%s), %d buy + %d rent props, %s elapsed",
                    min_done,
                    len(shuffled),
                    per_source,
                    total_buy,
                    total_rent,
                    _fmt_elapsed(elapsed),
                )
                last_log = now

            # Save checkpoint periodically
            if now - last_checkpoint >= CHECKPOINT_INTERVAL:
                try:
                    _save_checkpoint(
                        shuffled, progress, source_results_map, active_sources
                    )
                except Exception as e:
                    log.warning("Checkpoint save failed: %s", e)
                last_checkpoint = now

            time.sleep(5)
    except Exception as e:
        log.exception("Monitor loop error: %s", e)

    # Save final checkpoint before joining (in case merge/write fails)
    try:
        _save_checkpoint(shuffled, progress, source_results_map, active_sources)
    except Exception as e:
        log.warning("Final checkpoint save failed: %s", e)

    for t in threads:
        t.join()

    log.info("All source workers completed")

    # --- Merge results per channel and write parquet ---
    try:
        for ch_cfg in CHANNELS:
            ch = ch_cfg["channel"]
            # Lower-case channel name: file suffix and metric label alike.
            ch_label = "buy" if ch == "BUY" else "rent"

            merged, counts, total_dedup = _merge_channel(
                rm_results[ch],
                hk_results[ch],
                or_results[ch],
                zp_results[ch],
            )

            # Update cross-source dedup counter
            if total_dedup:
                cross_source_dedup_total.labels(channel=ch_label).inc(total_dedup)

            deduped = list(merged.values())
            output_path = DATA_DIR / f"online_listings_{ch_label}.parquet"
            write_parquet(deduped, output_path, channel=ch_label)

            with status_lock:
                if ch == "BUY":
                    status.properties_buy = len(deduped)
                else:
                    status.properties_rent = len(deduped)
                _sync_gauges()

            log.info(
                "=== %s complete: %d unique (rm:%d hk:%d or:%d zp:%d, cross-dedup:%d) ===",
                ch,
                len(deduped),
                counts["rm"],
                counts["hk"],
                counts["or"],
                counts["zp"],
                total_dedup,
            )

        # Scrape completed successfully — clear checkpoint
        _clear_checkpoint()

        with status_lock:
            status.state = "done"
            status.finished_at = time.time()
            status.outcodes_done = len(shuffled)
            _sync_gauges()
            # Read the summary values while still holding the lock.
            elapsed = status.finished_at - status.started_at
            final_buy = status.properties_buy
            final_rent = status.properties_rent

        log.info(
            "Scrape complete in %s — buy: %d, rent: %d",
            _fmt_elapsed(elapsed),
            final_buy,
            final_rent,
        )

        # Trigger server data reload
        if RELOAD_URL:
            try:
                log.info("Triggering server reload at %s", RELOAD_URL)
                resp = httpx.post(RELOAD_URL, timeout=300)
                if resp.is_success:
                    body = resp.json()
                    log.info(
                        "Server reload complete: %d rows, %d features, %dms",
                        body.get("rows", 0),
                        body.get("features", 0),
                        body.get("elapsed_ms", 0),
                    )
                else:
                    log.warning(
                        "Server reload failed (%d): %s",
                        resp.status_code,
                        resp.text[:200],
                    )
            except Exception as e:
                log.warning("Server reload request failed: %s", e)

    except Exception as e:
        log.exception("Fatal scrape error during merge/write")
        with status_lock:
            status.state = "error"
            status.errors.append(f"Fatal: {e}")
            status.finished_at = time.time()
            _sync_gauges()