"""Manual sale-listings scrape across Rightmove, OnTheMarket and Zoopla.

Loads Greater London-ish outcodes from the ARCGIS postcode parquet, scrapes
each selected source outcode by outcode, merges and de-duplicates the
results, and writes them to a single parquet file.
"""

import logging
import os
import re
import signal
import time
from contextlib import contextmanager
from pathlib import Path
from typing import Iterable

import polars as pl

from constants import (
    ARCGIS_PATH,
    CHANNELS,
    DATA_DIR,
    DELAY_BETWEEN_OUTCODES,
    LONDON_OUTCODE_PREFIXES,
    ZOOPLA_DETAIL_BUDGET_FRACTION,
    ZOOPLA_FETCH_DETAILS,
    ZOOPLA_FETCHER,
    ZOOPLA_MAX_DETAILS_PER_OUTCODE,
)
from http_client import make_client
from onthemarket import search_outcode as onthemarket_search_outcode
from rightmove import resolve_outcode_id
from rightmove import search_outcode as rightmove_search_outcode
from spatial import PostcodeSpatialIndex
from storage import write_parquet
from zoopla import TurnstileError
from zoopla import launch_browser as launch_zoopla_browser
from zoopla import search_outcode as zoopla_search_outcode

log = logging.getLogger("rightmove")

# Sources are scraped and merged in this order; earlier sources win on dedup.
SOURCE_ORDER = ("rightmove", "onthemarket", "zoopla")
SALE_CHANNEL = CHANNELS[0]
LONDON_AREAS = sorted({prefix.upper() for prefix in LONDON_OUTCODE_PREFIXES})
OUTCODE_RE = re.compile(r"^([A-Z]{1,2}\d[A-Z0-9]?)")


def _arcgis_columns() -> tuple[str, str]:
    """Return postcode and country column names for supported ARCGIS schemas.

    Raises:
        ValueError: if the parquet has neither ``pcd``/``pcds`` nor
            ``ctry``/``ctry25cd``.
    """
    columns = set(pl.scan_parquet(ARCGIS_PATH).collect_schema().names())

    if "pcd" in columns:
        postcode_col = "pcd"
    elif "pcds" in columns:
        postcode_col = "pcds"
    else:
        raise ValueError(f"{ARCGIS_PATH} has no supported postcode column")

    if "ctry" in columns:
        country_col = "ctry"
    elif "ctry25cd" in columns:
        country_col = "ctry25cd"
    else:
        raise ValueError(f"{ARCGIS_PATH} has no supported country column")

    return postcode_col, country_col


def _normalize_postcode(postcode: str) -> str:
    """Upper-case *postcode*, strip all whitespace, and re-insert one space
    before the final three characters.

    Values shorter than five characters (after compaction) are returned
    compacted but without a space.
    """
    compact = "".join(str(postcode).upper().split())
    if len(compact) < 5:
        return compact
    return compact[:-3] + " " + compact[-3:]


def _londonish_postcode_expr(postcode_col: str) -> pl.Expr:
    """Build a polars predicate: the leading 1-2 letters of *postcode_col*
    are one of ``LONDON_AREAS``."""
    return (
        pl.col(postcode_col)
        .str.to_uppercase()
        .str.extract(r"^([A-Z]{1,2})", 1)
        .is_in(LONDON_AREAS)
    )


def _outcode_area(outcode: str) -> str:
    """Return the leading alphabetic run of *outcode*, upper-cased."""
    chars = []
    for ch in outcode.upper():
        if not ch.isalpha():
            break
        chars.append(ch)
    return "".join(chars)


def is_londonish_outcode(outcode: str) -> bool:
    """Return True if *outcode* itself, or its leading letters, is listed in
    ``LONDON_AREAS`` (case-insensitive)."""
    normalized = outcode.upper()
    return normalized in LONDON_AREAS or _outcode_area(normalized) in LONDON_AREAS


def _property_is_londonish(prop: dict) -> bool:
    """Return True if the property's ``Postcode`` starts with a London-ish
    outcode; a missing or unparseable postcode yields False."""
    postcode = str(prop.get("Postcode") or "").upper().strip()
    match = OUTCODE_RE.match(postcode)
    return bool(match and is_londonish_outcode(match.group(1)))


def filter_londonish_outcodes(outcodes: Iterable[str]) -> list[str]:
    """Return the sorted, de-duplicated, upper-cased London-ish subset of
    *outcodes*."""
    return sorted(
        {outcode.upper() for outcode in outcodes if is_londonish_outcode(outcode)}
    )


def _read_londonish_postcodes(*, with_coords: bool) -> tuple[pl.DataFrame, str]:
    """Read ARCGIS rows for England whose postcode area is London-ish.

    Args:
        with_coords: also load ``lat``/``long`` and drop rows where either
            is null.

    Returns:
        ``(frame, postcode_column_name)``.
    """
    postcode_col, country_col = _arcgis_columns()
    columns = [postcode_col, country_col]
    if with_coords:
        columns += ["lat", "long"]

    df = pl.read_parquet(ARCGIS_PATH, columns=columns)
    england = df.filter(
        (pl.col(country_col) == "E92000001")
        & _londonish_postcode_expr(postcode_col)
    )
    if with_coords:
        england = england.drop_nulls(subset=["lat", "long"])
    return england, postcode_col


def load_outcodes() -> list[str]:
    """Load England outcodes from ARCGIS and keep only Greater London-ish areas."""
    log.info("Loading outcodes from %s", ARCGIS_PATH)
    england, postcode_col = _read_londonish_postcodes(with_coords=False)
    outcodes = (
        england.select(
            # Same pattern as OUTCODE_RE so both stay in sync.
            pl.col(postcode_col).str.extract(OUTCODE_RE.pattern, 1).alias("outcode")
        )
        .drop_nulls()
        .get_column("outcode")
        .unique()
        .sort()
        .to_list()
    )
    londonish = filter_londonish_outcodes(outcodes)
    log.info("Greater London-ish outcodes: %d", len(londonish))
    return londonish


def build_postcode_coords() -> dict[str, tuple[float, float]]:
    """Build postcode -> (lat, lng) lookup from ARCGIS England postcodes."""
    log.info("Building postcode coords lookup from %s", ARCGIS_PATH)
    england, postcode_col = _read_londonish_postcodes(with_coords=True)
    coords: dict[str, tuple[float, float]] = {
        _normalize_postcode(pcd): (lat, lng)
        for pcd, lat, lng in zip(
            england.get_column(postcode_col).to_list(),
            england.get_column("lat").to_list(),
            england.get_column("long").to_list(),
        )
    }
    log.info("Postcode coords lookup: %d postcodes", len(coords))
    return coords


def build_postcode_index() -> PostcodeSpatialIndex:
    """Build spatial index from ARCGIS England postcodes."""
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)
    england, postcode_col = _read_londonish_postcodes(with_coords=True)
    return PostcodeSpatialIndex(
        england.get_column("lat").to_list(),
        england.get_column("long").to_list(),
        [
            _normalize_postcode(pcd)
            for pcd in england.get_column(postcode_col).to_list()
        ],
    )


def _source_names(sources: str | Iterable[str] | None) -> list[str]:
    """Normalise a source selection into a list ordered like ``SOURCE_ORDER``.

    Args:
        sources: ``None`` (all sources), a comma-separated string, or an
            iterable of names. ``"all"`` selects every source. Names are
            stripped and lower-cased; empty entries are ignored.

    Raises:
        ValueError: if any name is neither a known source nor ``"all"``.
    """
    if sources is None:
        return list(SOURCE_ORDER)

    if isinstance(sources, str):
        requested = [part.strip().lower() for part in sources.split(",")]
    else:
        requested = [str(source).strip().lower() for source in sources]
    requested = [source for source in requested if source]

    unknown = sorted(set(requested) - set(SOURCE_ORDER) - {"all"})
    if unknown:
        raise ValueError(f"Unknown source(s): {', '.join(unknown)}")

    if "all" in requested:
        return list(SOURCE_ORDER)
    return [source for source in SOURCE_ORDER if source in requested]


def _dedup_key(prop: dict) -> tuple | None:
    """Return ``(postcode, bedrooms, price)`` for cross-source de-duplication,
    or None when the postcode is empty or the price is not positive."""
    postcode = str(prop.get("Postcode") or "").strip().upper()
    price = int(prop.get("price") or 0)
    if not postcode or price <= 0:
        return None
    return (postcode, int(prop.get("Bedrooms") or 0), price)


def _merge_properties(
    source_results: dict[str, list[dict]],
) -> tuple[list[dict], dict, int]:
    """Merge per-source listings, dropping duplicates.

    Sources are processed in ``SOURCE_ORDER``. A listing with an ``id`` is
    dropped if that id was already seen, or if its dedup key was first seen
    in a *different* source. A listing without an id is dropped if its dedup
    key was seen at all.

    Returns:
        ``(merged_listings, kept_count_per_source, dropped_count)``.
    """
    merged: dict[str, dict] = {}
    seen_keys: dict[tuple, str] = {}
    seen_ids: set[str] = set()
    counts = {source: 0 for source in SOURCE_ORDER}
    deduped = 0

    for source in SOURCE_ORDER:
        for prop in source_results.get(source, []):
            key = _dedup_key(prop)
            prop_id_raw = prop.get("id")
            prop_id = str(prop_id_raw).strip() if prop_id_raw is not None else None

            if prop_id:
                if prop_id in seen_ids:
                    deduped += 1
                    continue
                if key is not None:
                    previous_source = seen_keys.get(key)
                    if previous_source is not None and previous_source != source:
                        deduped += 1
                        continue
                seen_ids.add(prop_id)
                storage_key = prop_id
            else:
                if key is not None and key in seen_keys:
                    deduped += 1
                    continue
                # No id: synthesise a unique key from the running size.
                storage_key = f"{source}:{len(merged)}"

            if key is not None:
                seen_keys.setdefault(key, source)
            merged[storage_key] = prop
            counts[source] += 1

    return list(merged.values()), counts, deduped


def _source_total(
    results: dict[str, list[dict]],
    source: str,
) -> int:
    """Return how many listings are currently stored for *source*."""
    return len(results[source])


def _source_remaining(
    results: dict[str, list[dict]],
    source: str,
    max_properties_per_source: int | None,
) -> int | None:
    """Return how many more listings *source* may store; None means uncapped."""
    if max_properties_per_source is None:
        return None
    return max(max_properties_per_source - _source_total(results, source), 0)


def _store_properties(
    results: dict[str, list[dict]],
    source: str,
    props: list[dict],
    max_properties_per_source: int | None,
) -> int:
    """Append the London-ish subset of *props* to ``results[source]``,
    honouring the per-source cap, and return how many were appended."""
    remaining = _source_remaining(results, source, max_properties_per_source)
    if remaining == 0:
        return 0

    londonish = [prop for prop in props if _property_is_londonish(prop)]
    dropped_outside_area = len(props) - len(londonish)
    if dropped_outside_area:
        log.debug(
            "%s dropped %d properties outside the Greater London-ish postcode filter",
            source,
            dropped_outside_area,
        )

    selected = londonish if remaining is None else londonish[:remaining]
    results[source].extend(selected)
    return len(selected)


def _exception_detail(exc: BaseException) -> str:
    """Format *exc* as ``TypeName: message`` on one line, with whitespace
    collapsed and the message truncated to 300 characters plus ``...``."""
    detail = " ".join(str(exc).split())
    if not detail:
        detail = repr(exc)
    if len(detail) > 300:
        detail = f"{detail[:300]}..."
    return f"{type(exc).__name__}: {detail}"


def _record_error(
    errors: list[str], source: str, outcode: str, exc: BaseException
) -> None:
    """Append a ``"<source> <outcode>: <detail>"`` message to *errors* and
    log it as a warning."""
    detail = _exception_detail(exc)
    message = f"{source} {outcode}: {detail}"
    errors.append(message)
    # Pass the message as an argument so it is never treated as a format string.
    log.warning("%s", message)


def _scrape_rightmove(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    results: dict[str, list[dict]],
    errors: list[str],
    max_properties_per_source: int | None,
) -> None:
    """Scrape Rightmove sale listings for each outcode into ``results``.

    Per-outcode failures are recorded in *errors* and do not abort the run.
    Stops early once the per-source cap is reached.
    """
    client = make_client()
    try:
        for outcode in outcodes:
            # Nothing below mutates results before the search call, so one
            # computation of the remaining budget per outcode is enough.
            remaining = _source_remaining(
                results, "rightmove", max_properties_per_source
            )
            if remaining == 0:
                log.info("Rightmove cap reached")
                return

            try:
                outcode_id = resolve_outcode_id(client, outcode)
            except Exception as exc:
                _record_error(errors, "rightmove", outcode, exc)
                time.sleep(DELAY_BETWEEN_OUTCODES)
                continue

            if not outcode_id:
                log.debug("No Rightmove outcode ID for %s", outcode)
                time.sleep(DELAY_BETWEEN_OUTCODES)
                continue

            try:
                props = rightmove_search_outcode(
                    client,
                    outcode_id,
                    outcode,
                    SALE_CHANNEL,
                    pc_index,
                    max_properties=remaining,
                )
                added = _store_properties(
                    results,
                    "rightmove",
                    props,
                    max_properties_per_source,
                )
                log.info("Rightmove %s: +%d", outcode, added)
            except Exception as exc:
                _record_error(errors, "rightmove", outcode, exc)
            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
        client.close()


class OutcodeTimeout(BaseException):
    """Raised when a single outcode exceeds the wall-clock budget.

    Inherits BaseException (not Exception) so the SIGALRM-triggered raise
    can't be silently swallowed by any of the broad `except Exception:`
    handlers inside zoopla.py — the signal may fire at any bytecode boundary,
    including inside those handlers."""


def _zoopla_outcode_timeout_seconds() -> int:
    """Return the per-outcode Zoopla budget in seconds.

    Read from ``ZOOPLA_OUTCODE_TIMEOUT_SECONDS``; defaults to 300 when unset.

    Raises:
        ValueError: if the variable is not an integer or is less than 1.
    """
    raw = os.environ.get("ZOOPLA_OUTCODE_TIMEOUT_SECONDS")
    if raw is None:
        return 300
    try:
        timeout = int(raw)
    except ValueError as exc:
        raise ValueError("ZOOPLA_OUTCODE_TIMEOUT_SECONDS must be an integer") from exc
    if timeout < 1:
        raise ValueError("ZOOPLA_OUTCODE_TIMEOUT_SECONDS must be greater than zero")
    return timeout


def _zoopla_detail_cap() -> int:
    """Max detail-page fetches per outcode (0 disables detail fetching).

    Zoopla search cards only expose an outcode-level address, so the full
    postcode/coordinates come from each listing's detail page. The cap bounds
    the extra page loads so an outcode stays within
    ZOOPLA_OUTCODE_TIMEOUT_SECONDS (the per-outcode SIGALRM budget covers the
    detail fetches too). Configure via ZOOPLA_FETCH_DETAILS /
    ZOOPLA_MAX_DETAILS_PER_OUTCODE in constants.py."""
    return ZOOPLA_MAX_DETAILS_PER_OUTCODE if ZOOPLA_FETCH_DETAILS else 0


def _open_zoopla_detail_tab(page, detail_cap: int):
    """Open a second tab on the same context for detail-page fetches.

    Sharing the persistent context means the detail tab inherits the search
    tab's Cloudflare clearance cookies. Returns None when detail fetching is
    disabled or the tab cannot be created (the scrape then degrades to
    outcode-level postcodes rather than failing)."""
    if detail_cap <= 0:
        return None
    try:
        return page.context.new_page()
    except Exception as exc:
        log.warning(
            "Zoopla detail tab unavailable (%s); using outcode-level postcodes",
            _exception_detail(exc),
        )
        return None


@contextmanager
def _wall_clock_timeout(seconds: int, label: str):
    """SIGALRM-based wall-clock guard (POSIX). Raises OutcodeTimeout on expiry.

    Interrupts a hung Playwright IPC by delivering SIGALRM to the main thread;
    socket waits return EINTR and the handler raises into the caller. The
    browser is presumed unhealthy afterwards — caller must relaunch it.

    A non-positive *seconds* disables the guard entirely."""
    if seconds <= 0:
        yield
        return

    def _handler(signum, frame):
        raise OutcodeTimeout(f"{label} exceeded {seconds}s budget")

    old_handler = signal.signal(signal.SIGALRM, _handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        # Cancel the pending alarm before restoring the previous handler.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, old_handler)


def _launch_zoopla_with_retries(attempts: int = 3):
    """Launch the Zoopla browser, retrying generic failures.

    ``TurnstileError`` is re-raised immediately without retrying. Other
    exceptions are logged and retried after a 5 second pause; the last one
    is re-raised once *attempts* is exhausted.

    Returns:
        Whatever ``launch_zoopla_browser()`` returns (callers unpack it as
        ``browser, page``).

    Raises:
        ValueError: if *attempts* is less than 1.
    """
    if attempts < 1:
        raise ValueError("attempts must be at least 1")

    for attempt in range(1, attempts + 1):
        try:
            return launch_zoopla_browser()
        except TurnstileError:
            raise
        except Exception as exc:
            log.warning(
                "Zoopla browser launch failed (%d/%d): %s",
                attempt,
                attempts,
                exc,
            )
            if attempt == attempts:
                # Out of retries: propagate now rather than sleeping first.
                raise
            time.sleep(5)


def _close_zoopla_browser(browser, label: str) -> None:
    """Close *browser* within a 15 second budget, falling back to its
    ``force_close`` hook (if it has one). Never raises ``Exception``."""
    try:
        with _wall_clock_timeout(15, f"{label} browser close"):
            browser.close()
        return
    except (OutcodeTimeout, Exception) as exc:
        log.warning(
            "%s browser close failed: %s; force-closing",
            label,
            _exception_detail(exc),
        )

    force_close = getattr(browser, "force_close", None)
    if not callable(force_close):
        log.warning("%s browser has no force-close hook", label)
        return
    try:
        force_close()
    except Exception as exc:
        log.warning("%s browser force-close failed: %s", label, _exception_detail(exc))


def _scrape_zoopla_flaresolverr(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    results: dict[str, list[dict]],
    errors: list[str],
    max_properties_per_source: int | None,
) -> None:
    """Scrape Zoopla via the FlareSolverr sidecar (no browser/VNC).

    If the FlareSolverr session cannot be opened, an error is recorded and
    the source is skipped. Per-outcode failures are recorded in *errors*.
    """
    # Imported lazily so these modules are only needed on this code path.
    from flaresolverr import FlareSolverrError, FlareSolverrSession
    from zoopla_flaresolverr import search_outcode as fs_search_outcode

    try:
        session = FlareSolverrSession(session="zoopla")
        session.__enter__()
    except FlareSolverrError as exc:
        errors.append(f"zoopla: FlareSolverr unavailable: {exc}")
        log.warning("Zoopla skipped: FlareSolverr unavailable: %s", exc)
        return

    # Honour ZOOPLA_FETCH_DETAILS the same way the browser path does.
    # NOTE(review): assumes fs_search_outcode treats detail_cap=0 as "no
    # detail fetches" — confirm against zoopla_flaresolverr.
    detail_cap = _zoopla_detail_cap()
    try:
        for outcode in outcodes:
            remaining = _source_remaining(results, "zoopla", max_properties_per_source)
            if remaining == 0:
                log.info("Zoopla cap reached")
                return
            try:
                props, _ = fs_search_outcode(
                    outcode,
                    pc_index,
                    pc_coords,
                    session,
                    max_properties=remaining,
                    detail_cap=detail_cap,
                )
                added = _store_properties(
                    results, "zoopla", props, max_properties_per_source
                )
                log.info("Zoopla %s: +%d", outcode, added)
            except Exception as exc:  # noqa: BLE001 - one outcode must not kill the run
                _record_error(errors, "zoopla", outcode, exc)
            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
        session.__exit__(None, None, None)


def _scrape_zoopla(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    results: dict[str, list[dict]],
    errors: list[str],
    max_properties_per_source: int | None,
) -> None:
    """Scrape Zoopla sale listings for each outcode into ``results``.

    Delegates to the FlareSolverr path when ``ZOOPLA_FETCHER`` is
    ``"flaresolverr"``; otherwise drives a browser. Each outcode gets up to
    two attempts under a wall-clock budget; after a failed first attempt the
    browser is closed and relaunched. A ``TurnstileError`` on the second
    attempt, or a failed relaunch, ends the Zoopla scrape.

    Raises:
        ValueError: if ``ZOOPLA_OUTCODE_TIMEOUT_SECONDS`` is invalid (the
            browser is still closed before this propagates).
    """
    if ZOOPLA_FETCHER == "flaresolverr":
        _scrape_zoopla_flaresolverr(
            outcodes, pc_index, pc_coords, results, errors, max_properties_per_source
        )
        return

    try:
        browser, page = _launch_zoopla_with_retries()
    except Exception as exc:
        errors.append(f"zoopla: browser launch failed: {exc}")
        log.warning("Zoopla skipped: browser launch failed: %s", exc)
        return

    detail_page = None
    try:
        # Read inside the try so an invalid timeout setting cannot leak the
        # browser launched above.
        outcode_timeout = _zoopla_outcode_timeout_seconds()
        detail_cap = _zoopla_detail_cap()
        detail_page = _open_zoopla_detail_tab(page, detail_cap)
        # Spend at most a fraction of each outcode's budget on detail fetches so the
        # SIGALRM guard never trips mid-outcode and discards already-collected
        # search listings; the rest is left for search pagination and transform.
        detail_budget_seconds = max(
            10.0, outcode_timeout * ZOOPLA_DETAIL_BUDGET_FRACTION
        )

        for outcode in outcodes:
            if _source_remaining(results, "zoopla", max_properties_per_source) == 0:
                log.info("Zoopla cap reached")
                return

            for attempt in range(2):
                try:
                    with _wall_clock_timeout(outcode_timeout, f"zoopla {outcode}"):
                        props, _ = zoopla_search_outcode(
                            page,
                            outcode,
                            pc_index,
                            pc_coords,
                            max_properties=None,
                            detail_page=detail_page,
                            detail_cap=detail_cap,
                            detail_budget_seconds=detail_budget_seconds,
                        )
                    added = _store_properties(
                        results,
                        "zoopla",
                        props,
                        max_properties_per_source,
                    )
                    log.info("Zoopla %s: +%d", outcode, added)
                    break
                except (OutcodeTimeout, Exception) as exc:
                    if attempt == 1:
                        # Second failure: give up on this outcode.
                        _record_error(errors, "zoopla", outcode, exc)
                        if isinstance(exc, TurnstileError):
                            return
                        break

                    log.warning(
                        "Zoopla %s attempt %d/2 failed: %s; relaunching browser "
                        "and retrying",
                        outcode,
                        attempt + 1,
                        _exception_detail(exc),
                    )
                    _close_zoopla_browser(browser, f"zoopla {outcode}")
                    try:
                        browser, page = _launch_zoopla_with_retries()
                        # The old context (and its detail tab) is gone; reopen one.
                        detail_page = _open_zoopla_detail_tab(page, detail_cap)
                        log.info("Zoopla %s retrying with fresh browser", outcode)
                    except Exception as relaunch_exc:
                        _record_error(errors, "zoopla", outcode, relaunch_exc)
                        return
            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
        if detail_page is not None:
            try:
                detail_page.close()
            except Exception as exc:
                # Best effort: the browser close below tears everything down.
                log.debug(
                    "Zoopla detail tab close failed: %s", _exception_detail(exc)
                )
        _close_zoopla_browser(browser, "zoopla final")


def _scrape_onthemarket(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    results: dict[str, list[dict]],
    errors: list[str],
    max_properties_per_source: int | None,
) -> None:
    """Scrape OnTheMarket sale listings for each outcode into ``results``.

    Per-outcode failures are recorded in *errors* and do not abort the run.
    Stops early once the per-source cap is reached.
    """
    client = make_client()
    try:
        for outcode in outcodes:
            remaining = _source_remaining(
                results, "onthemarket", max_properties_per_source
            )
            if remaining == 0:
                log.info("OnTheMarket cap reached")
                return

            try:
                props = onthemarket_search_outcode(
                    client,
                    outcode,
                    pc_index,
                    max_properties=remaining,
                )
                added = _store_properties(
                    results,
                    "onthemarket",
                    props,
                    max_properties_per_source,
                )
                log.info("OnTheMarket %s: +%d", outcode, added)
            except Exception as exc:
                _record_error(errors, "onthemarket", outcode, exc)
            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
        client.close()


def run_scrape(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]] | None = None,
    sources: str | Iterable[str] | None = None,
    output_dir: str | Path | None = None,
    max_properties_per_source: int | None = None,
) -> dict:
    """Run one manual sale-listings scrape and write a parquet output.

    Args:
        outcodes: candidate outcodes; only London-ish ones are scraped.
        pc_index: postcode spatial index passed to every source scraper.
        pc_coords: postcode -> (lat, lng) lookup for Zoopla; built on demand
            when Zoopla is selected and this is None.
        sources: source selection (see ``_source_names``); None means all.
        output_dir: directory for the parquet file; defaults to ``DATA_DIR``.
        max_properties_per_source: per-source cap; None means uncapped.

    Returns:
        Summary dict with keys ``outcodes``, ``sources``, ``source_totals``,
        ``counts``, ``path``, ``errors`` and ``elapsed_seconds``.

    Raises:
        ValueError: if no sources or no London-ish outcodes are selected.
    """
    selected_sources = _source_names(sources)
    selected_outcodes = filter_londonish_outcodes(outcodes)
    if not selected_sources:
        raise ValueError("No sources selected")
    if not selected_outcodes:
        raise ValueError("No Greater London-ish outcodes selected")

    output_base = Path(output_dir) if output_dir is not None else DATA_DIR
    output_base.mkdir(parents=True, exist_ok=True)

    errors: list[str] = []
    results = {source: [] for source in SOURCE_ORDER}
    started_at = time.time()
    log.info(
        "Starting manual sale scrape: %d outcodes, sources=%s, source_cap=%s",
        len(selected_outcodes),
        ",".join(selected_sources),
        max_properties_per_source,
    )

    if "rightmove" in selected_sources:
        _scrape_rightmove(
            selected_outcodes,
            pc_index,
            results,
            errors,
            max_properties_per_source,
        )
    if "onthemarket" in selected_sources:
        _scrape_onthemarket(
            selected_outcodes,
            pc_index,
            results,
            errors,
            max_properties_per_source,
        )
    if "zoopla" in selected_sources:
        if pc_coords is None:
            pc_coords = build_postcode_coords()
        _scrape_zoopla(
            selected_outcodes,
            pc_index,
            pc_coords,
            results,
            errors,
            max_properties_per_source,
        )

    merged, source_counts, deduped = _merge_properties(results)
    output_path = output_base / "online_listings_buy.parquet"
    if merged:
        write_parquet(merged, output_path)
    else:
        # Remove any stale output from a previous run; missing_ok avoids the
        # exists()/unlink() race.
        output_path.unlink(missing_ok=True)
        log.warning("No London-ish properties to write to %s", output_path)

    counts = {
        "total": len(merged),
        "deduped": deduped,
        "sources": source_counts,
    }
    source_summary = " ".join(
        f"{source}:{source_counts[source]}" for source in SOURCE_ORDER
    )
    log.info(
        "Sale scrape complete: %d unique (%s deduped:%d)",
        len(merged),
        source_summary,
        deduped,
    )

    return {
        "outcodes": len(selected_outcodes),
        "sources": selected_sources,
        "source_totals": {
            source: _source_total(results, source) for source in selected_sources
        },
        "counts": counts,
        "path": str(output_path),
        "errors": errors,
        "elapsed_seconds": round(time.time() - started_at, 3),
    }