"""Zoopla (zoopla.co.uk) scraper — sale properties.

Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
blocks non-browser HTTP clients and even Playwright with stealth patches.
Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably.

Zoopla uses Next.js App Router with React Server Components (RSC). Search
result data is server-rendered in an RSC stream, not available via
__NEXT_DATA__ or a JSON API.

Architecture:
    Unlike the other scrapers which use HTTP clients per outcode, Zoopla keeps
    a single Camoufox browser alive for the entire scrape. For each outcode, it:

    1. Navigates directly to the sale search URL
    2. Extracts listing data from the rendered DOM
    3. Handles pagination via ?pn=N parameter

    The browser session replaces the cookie/client pattern used by other
    scrapers.
"""

import logging
import os
import re
import signal
import sys
import time
from pathlib import Path
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse

import httpx

from constants import (
    DATA_DIR,
    DELAY_BETWEEN_PAGES,
    GLUETUN_API_KEY,
    GLUETUN_CONTROL_URL,
    GLUETUN_MAX_ROTATIONS,
    GLUETUN_PROXY,
    MAX_BEDROOMS,
    PROPERTY_TYPE_MAP,
    ZOOPLA_BASE,
    ZOOPLA_DETAIL_GOTO_TIMEOUT_MS,
)
from spatial import PostcodeSpatialIndex
from transform import (
    build_register_address,
    extract_full_postcode,
    extract_outcode,
    fix_coords,
    normalize_sub_type,
    parse_int_value,
    resolve_listing_postcode,
    validate_floor_area,
)

log = logging.getLogger("zoopla")


class TurnstileError(Exception):
    """Raised when Cloudflare Turnstile challenge cannot be passed."""


def _pid_exists(pid: int) -> bool:
    """Return True when a process with ``pid`` exists.

    Signal 0 delivers nothing; it only asks the OS whether the target can be
    signalled. A permission error still proves the process is there.
    """
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # The process exists but we are not allowed to signal it.
        pass
    return True


def _proc_ppid(pid: int) -> int | None:
    """Read the parent pid of ``pid`` from ``/proc/<pid>/status``.

    Returns None when the status file cannot be read, has no ``PPid:`` row,
    or the value is not an integer.
    """
    try:
        status_text = Path(f"/proc/{pid}/status").read_text()
        for row in status_text.splitlines():
            if row.startswith("PPid:"):
                return int(row.split()[1])
    except (OSError, ValueError):
        pass
    return None


def _proc_descendants(root_pid: int) -> set[int]:
    """Return every pid below ``root_pid`` in the process tree (excluding it).

    The tree is rebuilt from a scan of ``/proc``; when ``/proc`` does not
    exist the result is an empty set.
    """
    proc_root = Path("/proc")
    if not proc_root.exists():
        return set()

    # parent pid -> direct child pids, from one pass over /proc.
    kids_of: dict[int, list[int]] = {}
    for entry in proc_root.iterdir():
        if entry.name.isdigit():
            child = int(entry.name)
            parent = _proc_ppid(child)
            if parent is not None:
                kids_of.setdefault(parent, []).append(child)

    found: set[int] = set()
    pending = list(kids_of.get(root_pid, []))
    while pending:
        current = pending.pop()
        if current not in found:
            found.add(current)
            pending.extend(kids_of.get(current, []))
    return found


def _terminate_process_tree(root_pid: int, label: str) -> None:
    """Signal ``root_pid`` and all its descendants: SIGTERM first, then SIGKILL.

    Does nothing for non-positive pids or for our own pid. The set of target
    pids is snapshotted once up front. ``label`` prefixes every log line.
    """
    if root_pid <= 0 or root_pid == os.getpid():
        return

    targets = _proc_descendants(root_pid) | {root_pid}
    escalation = (
        (signal.SIGTERM, "SIGTERM", 1.0),
        (signal.SIGKILL, "SIGKILL", 0.5),
    )
    for signum, signame, grace in escalation:
        survivors = [pid for pid in sorted(targets, reverse=True) if _pid_exists(pid)]
        if not survivors:
            return
        log.warning("%s: sending %s to %d process(es)", label, signame, len(survivors))
        for victim in survivors:
            try:
                os.kill(victim, signum)
            except ProcessLookupError:
                # Exited between the liveness check and the signal.
                continue
            except OSError as exc:
                log.debug("%s: could not signal pid %d: %s", label, victim, exc)
        # Give the processes a moment to exit before checking again.
        time.sleep(grace)

    leftovers = [pid for pid in sorted(targets) if _pid_exists(pid)]
    if leftovers:
        log.warning("%s: process(es) still alive after force close: %s", label, leftovers)


def _process_cmdline(pid: int) -> str:
    """Return the command line of ``pid`` as one space-separated string.

    Returns an empty string when ``/proc/<pid>/cmdline`` cannot be read.
    """
    try:
        raw_bytes = Path(f"/proc/{pid}/cmdline").read_bytes()
    except OSError:
        return ""
    # Arguments are NUL-separated in /proc; undecodable bytes are replaced.
    return raw_bytes.replace(b"\0", b" ").decode(errors="replace")


def _profile_in_live_process(profile_dir: Path) -> bool:
    """Return True when any process command line mentions ``profile_dir``.

    Always False when ``/proc`` does not exist.
    """
    proc_root = Path("/proc")
    if not proc_root.exists():
        return False
    needle = str(profile_dir)
    return any(
        entry.name.isdigit() and needle in _process_cmdline(int(entry.name))
        for entry in proc_root.iterdir()
    )


def _remove_stale_profile_locks(profile_dir: Path) -> None:
    """Delete leftover lock files from ``profile_dir``.

    Skipped entirely while some process still references the profile
    directory on its command line. Removal failures are logged at debug level
    and otherwise ignored.
    """
    if _profile_in_live_process(profile_dir):
        return
    for lock_name in (".parentlock", "parent.lock", "lock"):
        lock_path = profile_dir / lock_name
        try:
            # is_symlink() also catches dangling symlinks, which exists()
            # reports as missing.
            if lock_path.exists() or lock_path.is_symlink():
                lock_path.unlink()
                log.warning("Removed stale Zoopla profile lock: %s", lock_path)
        except OSError as exc:
            log.debug("Could not remove Zoopla profile lock %s: %s", lock_path, exc)


def _exception_detail(exc: BaseException) -> str:
    """Format ``exc`` as ``"TypeName: message"`` on a single line.

    Whitespace runs in the message are collapsed to single spaces; an empty
    message falls back to ``repr(exc)``.
    """
    message = " ".join(str(exc).split()) or repr(exc)
    return f"{type(exc).__name__}: {message}"


class _ManagedCamoufoxBrowser:
    """Pairs a Camoufox browser with the context manager that launched it.

    Attribute access that is not defined here is forwarded to the wrapped
    browser object. ``close()`` shuts down cooperatively; ``force_close()``
    kills the Playwright driver process tree instead.
    """

    def __init__(self, context_manager, browser):
        self._context_manager = context_manager
        self._browser = browser
        self._closed = False

    def close(self) -> None:
        """Close the browser and exit the Camoufox context; no-op once closed."""
        if not self._closed:
            self._closed = True
            self._browser.close()
            # Camoufox.__exit__ calls browser.close() itself. The context is
            # already closed here, so clear it to avoid a second blocking
            # close attempt.
            self._context_manager.browser = None
            self._context_manager.__exit__(None, None, None)

    def force_close(self) -> None:
        """Kill the Playwright driver process tree and clear profile locks.

        Marks the wrapper closed in every case. When the driver pid cannot be
        determined, only a warning is logged.
        """
        self._closed = True
        driver_pid = self._driver_pid()
        if driver_pid is not None:
            _terminate_process_tree(driver_pid, "Zoopla browser force-close")
            _remove_stale_profile_locks(_zoopla_profile_dir())
            return
        log.warning("Zoopla force-close requested but Playwright driver pid is unknown")

    def _driver_pid(self) -> int | None:
        """Return the driver subprocess pid, or None when it is not reachable.

        Walks private attributes of the context manager
        (``_connection._transport._proc.pid``); any missing link yields None.
        """
        node = self._context_manager
        for attr in ("_connection", "_transport", "_proc", "pid"):
            node = getattr(node, attr, None)
        return node if isinstance(node, int) else None

    def __getattr__(self, name):
        # Only reached for names not found on the wrapper itself.
        return getattr(self._browser, name)


# JavaScript to extract listings from the rendered DOM.
# Uses data-testid attributes as primary selectors (stable across deployments),
# then falls back to href-based link matching with parent-walking.
# NOTE: the script relies on real line breaks. It contains `//` line comments,
# which would comment out everything after them if the literal were ever
# collapsed onto one physical line.
_EXTRACT_LISTINGS_JS = r"""() => {
    const DETAIL_LINKS =
        'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]';
    const seen = new Set();
    const results = [];

    // Room counts above 20 are treated as mis-parses and reported as null.
    const smallCount = (m) => (m && parseInt(m[1]) <= 20 ? parseInt(m[1]) : null);

    // Numeric listing id from a details URL, or null when the URL has none.
    const listingId = (href) => {
        const match = href.match(/\/details\/(\d+)\//);
        return match ? match[1] : null;
    };

    // First text line that looks like an address: it contains a full
    // postcode, or it has a comma and is neither a price nor an "N beds" line.
    const addressFromLines = (text) => {
        const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
        for (const line of lines) {
            if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
                (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
                return line;
            }
        }
        return '';
    };

    // Keyword fallback table; order matters (first hit wins).
    const TYPE_KEYWORDS = [
        [/\bstudio\b/, 'Studio'],
        [/\bpenthouse\b/, 'Penthouse'],
        [/\bmaisonette\b/, 'Maisonette'],
        [/\bapartment\b/, 'Apartment'],
        [/\bflat\b/, 'Flat'],
        [/\bsemi[- ]?detached\b/, 'Semi-Detached'],
        [/\bdetached\b/, 'Detached'],
        [/\bterraced?\b/, 'Terraced'],
        [/\bbungalow\b/, 'Bungalow'],
        [/\bcottage\b/, 'Cottage'],
        [/\bhouse\b/, 'House'],
    ];

    // Extract property type (e.g., "2 bed flat for sale" -> "flat").
    const propertyTypeFrom = (text) => {
        let type = '';
        const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+for\s+sale/i);
        if (ptMatch) type = ptMatch[1].trim();
        else if (/\bstudio\s*(?:flat|apartment)?\s+for\s+sale/i.test(text)) type = 'Studio';
        if (type) return type;
        // Keyword fallback when the regex does not match the current DOM format.
        const lower = text.toLowerCase();
        for (const [pattern, label] of TYPE_KEYWORDS) {
            if (pattern.test(lower)) return label;
        }
        return '';
    };

    // Build one result record from a card element and its visible text.
    const buildListing = (id, href, card, text, address) => {
        // Try the data-testid price element first, then the whole card text.
        const priceEl = card.querySelector('[data-testid="listing-price"]');
        const priceText = priceEl ? priceEl.innerText : text;
        const priceMatch = priceText.match(/\u00a3([\d,]+)/);
        const areaSqftMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))/i);
        const areaSqmMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))/i);
        let tenure = '';
        if (/leasehold/i.test(text)) tenure = 'Leasehold';
        else if (/freehold/i.test(text)) tenure = 'Freehold';
        return {
            id,
            url: href.replace(window.location.origin, ''),
            price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
            price_text: priceText.trim(),
            beds: smallCount(text.match(/(\d+)\s*beds?/i)),
            baths: smallCount(text.match(/(\d+)\s*baths?/i)),
            receptions: smallCount(text.match(/(\d+)\s*reception/i)),
            floor_area_sqft: areaSqftMatch ? parseInt(areaSqftMatch[1].replace(/,/g, '')) : null,
            floor_area_sqm: areaSqmMatch ? parseFloat(areaSqmMatch[1].replace(/,/g, '')) : null,
            address,
            tenure,
            property_type: propertyTypeFrom(text),
        };
    };

    // Strategy 1: Use data-testid selectors (post-2025 redesign)
    const listingCards = document.querySelectorAll(
        '[data-testid="regular-listings"] > div, [data-testid="search-content"] li'
    );
    for (const card of listingCards) {
        const link = card.querySelector(DETAIL_LINKS);
        if (!link) continue;
        const href = link.href;
        const id = listingId(href);
        if (id === null || seen.has(id)) continue;
        seen.add(id);

        const text = card.innerText || '';
        // Try the address element first, then scan the text lines.
        const addressEl = card.querySelector('address');
        const address = (addressEl ? addressEl.innerText.trim() : '') || addressFromLines(text);
        results.push(buildListing(id, href, card, text, address));
    }

    // Strategy 2: Fall back to href-based link matching with parent-walking
    if (results.length === 0) {
        const links = Array.from(document.querySelectorAll(DETAIL_LINKS));
        for (const link of links) {
            const href = link.href;
            const id = listingId(href);
            if (id === null || seen.has(id)) continue;
            seen.add(id);

            // Climb at most 15 ancestors until one looks like a listing card
            // (has a price plus a bed/bath/area mention).
            let card = link;
            for (let j = 0; j < 15; j++) {
                card = card.parentElement;
                if (!card) break;
                const t = card.innerText || '';
                if (t.includes('\u00a3') && (t.includes('bed') || t.includes('Bath') || t.includes('sq ft'))) {
                    break;
                }
            }
            if (!card) continue;

            const text = card.innerText || '';
            results.push(buildListing(id, href, card, text, addressFromLines(text)));
        }
    }

    return results;
}"""

# JavaScript to dismiss the Usercentrics cookie consent overlay (shadow DOM).
_DISMISS_COOKIES_JS = """() => { const aside = document.querySelector('#usercentrics-cmp-ui'); if (aside && aside.shadowRoot) { const btns = aside.shadowRoot.querySelectorAll('button'); for (const btn of btns) { if (btn.innerText.includes('Accept')) { btn.click(); return true; } } } if (aside) { aside.remove(); return true; } return false; }""" # --------------------------------------------------------------------------- # Browser lifecycle # --------------------------------------------------------------------------- _FALSE_ENV_VALUES = {"0", "false", "no", "off"} _TRUE_ENV_VALUES = {"1", "true", "yes", "on"} def _env_bool_or_virtual(name: str, default: bool | str) -> bool | str: raw = os.environ.get(name) if raw is None: return default value = raw.strip().lower() if value == "virtual": return "virtual" if value in _TRUE_ENV_VALUES: return True if value in _FALSE_ENV_VALUES: return False raise ValueError( f"{name} must be one of 1/0, true/false, yes/no, on/off, or virtual" ) def _visible_display_available() -> bool: if sys.platform.startswith("linux"): return bool(os.environ.get("DISPLAY") or os.environ.get("WAYLAND_DISPLAY")) return True def _zoopla_headless_mode() -> bool | str: # Prefer a visible browser by default so Cloudflare can be completed by the # person running the scrape. In display-less Linux shells, keep startup # headless and fail fast with an actionable error if a challenge appears. 
default: bool | str = not _visible_display_available() return _env_bool_or_virtual("ZOOPLA_HEADLESS", default) def _zoopla_profile_dir() -> Path: raw = os.environ.get("ZOOPLA_PROFILE_DIR") if raw: return Path(raw).expanduser().resolve() return (DATA_DIR / ".runtime" / "zoopla-profile").expanduser().resolve() def _challenge_timeout_seconds() -> int: raw = os.environ.get("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS") if raw is None: return 300 try: timeout = int(raw) except ValueError as exc: raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be an integer") from exc if timeout < 1: raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be greater than zero") return timeout # --------------------------------------------------------------------------- # Gluetun IP rotation # --------------------------------------------------------------------------- # # When Cloudflare Turnstile fires mid-scrape, the cheapest unblocker is to # swap the egress IP via Gluetun's HTTP control server. We stop and re-start # the VPN, poll until the public IP changes, drop the stale cf_clearance # cookies (bound to the previous IP), then reload and re-check the challenge. def _gluetun_base_url() -> str: return GLUETUN_CONTROL_URL.rstrip("/") def _gluetun_api_key() -> str | None: return GLUETUN_API_KEY def _gluetun_max_rotations() -> int: return max(GLUETUN_MAX_ROTATIONS, 0) def _gluetun_client() -> httpx.Client: # Talks to the control server directly (not through the VPN proxy). 
headers = {} api_key = _gluetun_api_key() if api_key: headers["X-API-Key"] = api_key return httpx.Client(headers=headers) def _gluetun_public_ip(client: httpx.Client) -> str | None: try: resp = client.get(f"{_gluetun_base_url()}/v1/publicip/ip", timeout=5.0) if resp.status_code != 200: return None data = resp.json() except (httpx.HTTPError, ValueError): return None return data.get("public_ip") or data.get("ip") def _gluetun_set_vpn_status(client: httpx.Client, status: str) -> bool: """PUT /v1/vpn/status with {'status': status}. Returns True on 2xx.""" try: resp = client.put( f"{_gluetun_base_url()}/v1/vpn/status", json={"status": status}, timeout=15.0, ) except httpx.HTTPError as exc: log.warning("Gluetun vpn/status %s failed: %s", status, exc) return False if resp.status_code == 401: log.warning( "Gluetun vpn/status %s: 401 Unauthorized — the API key must be " "authorised for 'PUT /v1/vpn/status' in Gluetun's auth config.toml", status, ) return False if resp.status_code >= 400: log.warning( "Gluetun vpn/status %s returned HTTP %d: %s", status, resp.status_code, resp.text[:200], ) return False return True def _rotate_gluetun_ip(wait_seconds: int = 45) -> bool: """Restart Gluetun's VPN and wait for the public IP to change. 
Returns True if a new IP was observed within wait_seconds.""" with _gluetun_client() as client: old_ip = _gluetun_public_ip(client) log.info("Requesting Gluetun IP rotation (current IP: %s)", old_ip or "unknown") stop_attempted = False restart_confirmed = False try: stop_attempted = True if not _gluetun_set_vpn_status(client, "stopped"): return False time.sleep(2) restart_confirmed = _gluetun_set_vpn_status(client, "running") if not restart_confirmed: return False deadline = time.monotonic() + wait_seconds while time.monotonic() < deadline: time.sleep(2) new_ip = _gluetun_public_ip(client) if new_ip and new_ip != old_ip: log.info("Gluetun rotated IP: %s -> %s", old_ip or "?", new_ip) return True finally: if stop_attempted and not restart_confirmed: log.warning( "Gluetun VPN may be stopped after failed rotation; attempting recovery start" ) if not _gluetun_set_vpn_status(client, "running"): log.error( "Gluetun VPN recovery start failed; manual intervention required" ) log.warning("Gluetun IP did not change within %ds", wait_seconds) return False def _clear_cloudflare_cookies(page) -> None: """Drop cf_clearance / __cf_bm which are bound to the previous egress IP.""" try: context = page.context except Exception: return for name in ("cf_clearance", "__cf_bm"): try: context.clear_cookies(name=name) except Exception as exc: log.debug("Could not clear cookie %s: %s", name, exc) def _rotate_and_retry_challenge(page, max_rotations: int) -> bool: """Rotate IP and reload until the challenge clears. 
Returns True on success.""" for attempt in range(1, max_rotations + 1): log.warning( "Cloudflare Turnstile challenge — rotating Gluetun IP (attempt %d/%d)", attempt, max_rotations, ) if not _rotate_gluetun_ip(): continue _clear_cloudflare_cookies(page) try: page.reload(wait_until="domcontentloaded", timeout=30000) except Exception as exc: log.warning("Reload after IP rotation failed: %s", exc) continue time.sleep(2) if not _is_turnstile_challenge(page): log.info("Cloudflare challenge cleared after Gluetun rotation") return True return False def _is_turnstile_challenge(page) -> bool: try: if "just a moment" in page.title().lower(): return True except Exception: pass try: return bool( page.query_selector( 'iframe[src*="challenges.cloudflare.com"], ' 'input[name="cf-turnstile-response"]' ) ) except Exception: return False def _wait_for_turnstile(page, headless_mode: bool | str) -> None: if not _is_turnstile_challenge(page): return # Try Gluetun IP rotation first — works in any mode and is the only option # in headless/unattended runs where no human can click the challenge. max_rotations = _gluetun_max_rotations() if max_rotations > 0 and _rotate_and_retry_challenge(page, max_rotations): return profile_dir = _zoopla_profile_dir() if headless_mode is True or headless_mode == "virtual": raise TurnstileError( "Cloudflare Turnstile persisted after " f"{max_rotations} Gluetun IP rotation(s). " "Run Zoopla from a desktop session with ZOOPLA_HEADLESS=0 " f"to solve manually; the session will be saved in {profile_dir}." ) timeout = _challenge_timeout_seconds() log.warning( "Gluetun rotation insufficient — falling back to interactive solve. " "Complete the Cloudflare challenge in the Zoopla browser window; " "waiting up to %ds. 
Profile: %s", timeout, profile_dir, ) try: page.bring_to_front() except Exception: pass deadline = time.monotonic() + timeout while time.monotonic() < deadline: time.sleep(3) if not _is_turnstile_challenge(page): log.info("Cloudflare challenge resolved") return raise TurnstileError( f"Cloudflare Turnstile was not completed after {timeout}s" ) def launch_browser(): """Launch Camoufox, navigate to Zoopla homepage, pass Cloudflare Turnstile, and dismiss cookie consent. Returns (browser, page) tuple. Raises TurnstileError if Cloudflare cannot be completed. Caller must close browser when done.""" from camoufox.pkgman import camoufox_path # Standalone local runs should not require the old container image to have # pre-fetched Camoufox. camoufox_path(download_if_missing=True) from camoufox.sync_api import Camoufox headless_mode = _zoopla_headless_mode() profile_dir = _zoopla_profile_dir() profile_dir.mkdir(parents=True, exist_ok=True) _remove_stale_profile_locks(profile_dir) # Route the browser through the Gluetun VPN proxy when configured. (geoip # fingerprint alignment is intentionally not enabled: it needs the optional # camoufox[geoip] extra and would spoof to the VPN exit's country, which # fights the en-GB locale unless the exit is in the UK.) 
proxy_options: dict = {} if GLUETUN_PROXY: proxy_options = {"proxy": {"server": GLUETUN_PROXY}} log.info( "Launching Camoufox browser for Zoopla (headless=%s, profile=%s, proxy=%s)...", headless_mode, profile_dir, GLUETUN_PROXY or "direct", ) camoufox = Camoufox( headless=headless_mode, persistent_context=True, user_data_dir=str(profile_dir), locale=["en-GB", "en"], enable_cache=True, **proxy_options, ) raw_browser = camoufox.__enter__() browser = _ManagedCamoufoxBrowser(camoufox, raw_browser) page = raw_browser.pages[0] if raw_browser.pages else raw_browser.new_page() try: log.info("Navigating to Zoopla homepage...") page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000) _wait_for_turnstile(page, headless_mode) log.info("Zoopla browser ready — title: %s", page.title()) time.sleep(2) # Dismiss cookie consent page.evaluate(_DISMISS_COOKIES_JS) time.sleep(1) except Exception: try: page.close() finally: browser.close() raise return browser, page def _ensure_not_challenged(page) -> None: """Check if current page is a Cloudflare challenge and wait/raise.""" _wait_for_turnstile(page, _zoopla_headless_mode()) # --------------------------------------------------------------------------- # Search navigation # --------------------------------------------------------------------------- def _wait_for_listing_content(page) -> None: """Wait for rendered listing cards to contain usable text.""" try: page.wait_for_function( """() => { const cards = document.querySelectorAll( '[data-testid="regular-listings"] > div' ); if (cards.length === 0) return false; for (const card of cards) { const t = card.innerText || ''; if (t.includes('\\u00a3') && t.length > 50) return true; } return false; }""", timeout=8000, ) except Exception: time.sleep(1.5) def _navigate_search(page, outcode: str) -> bool: """Navigate directly to sale search results for an outcode. Returns True if results were found, False if no results or navigation failed. 
Raises TurnstileError if Cloudflare blocks us.""" url = ( f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/" f"?q={outcode}&search_source=home" ) try: page.goto(url, wait_until="domcontentloaded", timeout=30000) except Exception as exc: detail = _exception_detail(exc) log.warning("Zoopla direct navigation failed for %s: %s", outcode, detail) raise RuntimeError( f"Zoopla direct navigation failed for {outcode}: {detail}" ) from exc _ensure_not_challenged(page) # Dismiss cookie consent (may reappear after navigation) try: page.evaluate(_DISMISS_COOKIES_JS) except Exception: pass try: page.wait_for_selector( '[data-testid="regular-listings"], a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]', timeout=10000, ) except Exception: if not page.query_selector('a[href*="/details/"]'): return False _wait_for_listing_content(page) return True def _get_result_count(page) -> int: """Extract the total results count from the page. Tries __ZAD_TARGETING__ JSON first (most reliable), then body text regex matching both "N results" and "N properties" patterns.""" try: # Try the ZAD targeting JSON script tag first count = page.evaluate("""() => { const s = document.querySelector('#__ZAD_TARGETING__'); if (s) { try { const d = JSON.parse(s.textContent); if (d.search_results_count != null) return d.search_results_count; } catch(e) {} } return null; }""") if count is not None and count > 0: return count except Exception: pass try: body = page.inner_text("body") match = re.search(r"([\d,]+)\s+(?:results?|properties)", body) if match: return int(match.group(1).replace(",", "")) except Exception: pass return 0 def _url_with_page(url: str, page_num: int) -> str: parsed = urlparse(url) query = [(key, value) for key, value in parse_qsl(parsed.query) if key != "pn"] query.append(("pn", str(page_num))) return urlunparse(parsed._replace(query=urlencode(query))) def _find_next_page_url(page) -> str | None: """Return the rendered pagination next URL, if Zoopla exposes one.""" try: 
href = page.evaluate( """() => { const links = Array.from(document.querySelectorAll('a[href]')); const next = links.find((link) => { const text = (link.innerText || link.textContent || '') .trim() .toLowerCase(); const label = (link.getAttribute('aria-label') || '').toLowerCase(); const rel = (link.getAttribute('rel') || '').toLowerCase(); return rel.includes('next') || label.includes('next') || text === 'next' || text === 'next page'; }); if (!next) return null; const href = next.href || ''; if (!href.includes('/for-sale/') && !href.includes('/new-homes/')) { return null; } return href; }""" ) except Exception as exc: detail = _exception_detail(exc) log.warning("Zoopla next-page detection failed: %s", detail) raise RuntimeError(f"Zoopla next-page detection failed: {detail}") from exc if not href: return None return urljoin(ZOOPLA_BASE, href) # --------------------------------------------------------------------------- # Extraction and pagination # --------------------------------------------------------------------------- _first_extraction_logged = False def _extract_listings(page) -> list[dict]: """Extract listing data from the current search results page DOM.""" global _first_extraction_logged try: listings = page.evaluate(_EXTRACT_LISTINGS_JS) # Log diagnostic info on the very first extraction attempt if not _first_extraction_logged: _first_extraction_logged = True try: diag = page.evaluate("""() => { const details = document.querySelectorAll('a[href*="/details/"]'); const testids = document.querySelectorAll('[data-testid]'); const testidNames = [...new Set([...testids].map(e => e.dataset.testid))]; return { url: location.href, title: document.title, detailLinks: details.length, testids: testidNames.slice(0, 30), bodySnippet: document.body?.innerText?.slice(0, 500) || '', }; }""") log.info( "Zoopla first-page diagnostic: url=%s title=%s detailLinks=%d " "testids=%s bodySnippet=%.200s", diag.get("url"), diag.get("title"), diag.get("detailLinks", 0), 
diag.get("testids", []), diag.get("bodySnippet", ""), ) except Exception: pass log.info("Zoopla first extraction: %d listings found", len(listings)) return listings except Exception as e: detail = _exception_detail(e) log.warning("Failed to extract listings from DOM: %s", detail) raise RuntimeError(f"Zoopla DOM extraction failed: {detail}") from e def _paginate( page, total_results: int, max_properties: int | None = None, fetch_detail=None, detail_cap: int = 0, detail_state: dict | None = None, detail_deadline: float | None = None, ) -> list[dict]: """Extract listings from all pages of search results. Page 1 is already loaded. For subsequent pages, follow Zoopla's rendered next link when present, otherwise advance via the pn=N URL parameter while the advertised result count says more listings remain. When ``fetch_detail`` is supplied, each listing has its detail page fetched (up to ``detail_cap`` fresh loads per outcode, counted in the shared ``detail_state`` dict, and only until ``detail_deadline``) and the parsed geo stored under ``listing['_detail']`` for ``transform_property``. The detail page is the only source of the listing's UPRN, full street address and precise postcode, so it is fetched even when the search card already pins a full postcode. Cached detail results are always attached but cost neither a cap slot nor a delay.""" def _maybe_fetch(listing: dict) -> None: if fetch_detail is None or detail_state is None: return url = listing.get("url", "") cached = _detail_cache_key(url) in _detail_cache if not cached: # Fresh loads are bounded by the per-outcode cap and the wall-clock # deadline so detail fetching never starves the SIGALRM budget that # also guards the search pagination for this outcode. 
if detail_state["fetched"] >= detail_cap: return if detail_deadline is not None and time.monotonic() >= detail_deadline: return listing["_detail"] = fetch_detail(url) if not cached: detail_state["fetched"] += 1 time.sleep(DELAY_BETWEEN_PAGES) all_listings = _extract_listings(page) for listing in all_listings: _maybe_fetch(listing) if max_properties is not None and len(all_listings) >= max_properties: return all_listings[:max_properties] if not all_listings: return all_listings seen_ids = {listing["id"] for listing in all_listings} page_num = 2 while True: next_url = _find_next_page_url(page) if not next_url: if total_results > 0 and len(all_listings) >= total_results: break next_url = _url_with_page(page.url, page_num) time.sleep(DELAY_BETWEEN_PAGES) try: page.goto(next_url, wait_until="domcontentloaded", timeout=30000) _ensure_not_challenged(page) _wait_for_listing_content(page) except TurnstileError: raise except Exception as e: detail = _exception_detail(e) log.warning( "Zoopla pagination navigation failed at page %d: %s", page_num, detail, ) raise RuntimeError( f"Zoopla pagination navigation failed at page {page_num}: {detail}" ) from e page_listings = _extract_listings(page) if not page_listings: if total_results > len(all_listings): raise RuntimeError( "Zoopla pagination stopped with no listings on page " f"{page_num}; collected {len(all_listings)} of " f"{total_results} advertised results" ) break # Deduplicate within this outcode new_count = 0 for listing in page_listings: if listing["id"] not in seen_ids: seen_ids.add(listing["id"]) all_listings.append(listing) _maybe_fetch(listing) new_count += 1 if max_properties is not None and len(all_listings) >= max_properties: return all_listings[:max_properties] if new_count == 0: if total_results > len(all_listings): raise RuntimeError( "Zoopla pagination repeated results on page " f"{page_num}; collected {len(all_listings)} of " f"{total_results} advertised results" ) break page_num += 1 if total_results > 0 and 
len(all_listings) >= total_results: if not _find_next_page_url(page): break return all_listings # --------------------------------------------------------------------------- # Property transformation # --------------------------------------------------------------------------- # Cached outcode → (postcode, lat, lng) lookups to avoid repeated O(n) scans # over 2.26M postcodes. Populated lazily on first lookup per outcode. _outcode_coords_cache: dict[str, tuple[str, float, float] | None] = {} def _resolve_outcode_coords( outcode: str, pc_coords: dict[str, tuple[float, float]] ) -> tuple[str, float, float] | None: """Find first postcode + coords for an outcode. Result is cached.""" if outcode in _outcode_coords_cache: return _outcode_coords_cache[outcode] prefix = outcode + " " for pcd, (lat, lng) in pc_coords.items(): if pcd.startswith(prefix) or ( len(outcode) >= 4 and pcd.startswith(outcode) and len(pcd) > len(outcode) ): _outcode_coords_cache[outcode] = (pcd, lat, lng) return (pcd, lat, lng) _outcode_coords_cache[outcode] = None return None def _extract_outcode(text: str) -> str | None: """Extract a UK outcode from address text like 'Whitechapel Road, London E1'.""" # Look for outcode at end of string or after last comma match = re.search(r"\b([A-Z]{1,2}\d[A-Z0-9]?)\s*$", text.strip(), re.IGNORECASE) if match: return match.group(1).upper() # Try after comma parts = text.split(",") if len(parts) > 1: last = parts[-1].strip() match = re.match(r"^([A-Z]{1,2}\d[A-Z0-9]?)$", last, re.IGNORECASE) if match: return match.group(1).upper() return None # --------------------------------------------------------------------------- # Detail-page geocoding # --------------------------------------------------------------------------- # # Zoopla search result cards only expose an outcode-level display address (e.g. # "South Street, Bromley BR1"); the full postcode and precise coordinates exist # only on each listing's detail page (/for-sale/details/{id}/). 
The detail page # is a Next.js App Router route whose React Server Components flight stream # embeds the property's own location object, e.g. # "location":{"outcode":"NR29","coordinates":{"latitude":52.716,"longitude":1.614}, # "uprn":"10023461458","postalCode":"NR29 4RG",...} # plus a twin "address":{"fullAddress":...,"latitude":...,"longitude":..., # "outcode":...,"postcode":...,"uprn":...} feeding the map widgets. # Nearby points of interest (stations, schools, EV chargers) and comparable # listings carry their own "coordinates" too, but never inside the property's # own "location" / "address":{"fullAddress" wrapper — so the wrapper, not a # loose coordinates object, is what we anchor on (see parse_detail_geo). # listingId -> parsed detail dict (or None). Failures are cached too, so a # broken listing is not re-fetched within a run (the same listing reappears # across overlapping outcode searches). _detail_cache: dict[str, dict | None] = {} _LISTING_ID_RE = re.compile(r"/details/(\d+)/?") # The property's own location is carried by a `"location":{...}` wrapper and a # twin `"address":{"fullAddress":...}` widget object. We anchor on those # wrappers (and capture their full object body, which contains exactly one # nested object — `coordinates`) rather than scanning for loose coordinate # objects: nearby points of interest (stations/schools/EV chargers) and # comparable/"similar" listings also embed coordinates, but never inside the # property's own `"location"` / `"address":{"fullAddress"` wrapper, so the # wrapper is the discriminator. Field order and an optional `uprn` are tolerated. 
_DETAIL_LOCATION_RE = re.compile(r'"location":\{((?:[^{}]|\{[^{}]*\})*)\}') _DETAIL_ADDRESS_RE = re.compile(r'"address":\{"fullAddress":"([^"]*)"((?:[^{}]|\{[^{}]*\})*)\}') _DETAIL_COORDS_IN_BODY_RE = re.compile( r'"coordinates":\{"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)\}' ) _DETAIL_LATLNG_IN_BODY_RE = re.compile( r'"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)' ) _DETAIL_OUTCODE_IN_BODY_RE = re.compile(r'"outcode":"([A-Z0-9]+)"') # The location object spells it "postalCode"; the address twin uses "postcode". _DETAIL_POSTCODE_IN_BODY_RE = re.compile(r'"(?:postalCode|postcode)":"([A-Z0-9 ]+)"') # The UPRN (Unique Property Reference Number) appears in both the location and # address objects and is the linchpin for an exact listing->EPC join (EPC open # data is ~99% UPRN-keyed). propertyNumberOrName carries the house number/name # (e.g. "12", "Martham Mill") only in the location object. _DETAIL_UPRN_IN_BODY_RE = re.compile(r'"uprn":"(\d+)"') _DETAIL_NUMBER_OR_NAME_IN_BODY_RE = re.compile(r'"propertyNumberOrName":"([^"]*)"') def parse_detail_geo(html: str, search_outcode: str | None = None) -> dict | None: """Extract the property's own coordinates/postcode from a Zoopla detail page. Pure and browser-free: the live browser only produces the HTML string (``page.content()``); this does the parsing so it is unit-testable. Returns ``{"lat", "lng", "postcode", "outcode", "source", "uprn", "number_or_name", "full_address"}`` (every field except the coordinates may be ``None``) or ``None`` when no property location wrapper is found. The ``uprn`` enables an exact listing->EPC join; ``number_or_name`` (house number/name) and ``full_address`` give a register-style address for the Price Paid join. Coordinates are bounds-checked to England and a postcode is kept only when it agrees with its own object's outcode. ``search_outcode``, when given, is used only as a tie-break to pick the right ``location`` object on pages that also embed comparable listings. 
See module docstring for the data model.""" if not html: return None # RSC flight strings are embedded as escaped JS string literals, so quotes # and slashes arrive escaped; normalize them so the regexes match. buf = html.replace('\\"', '"').replace("\\u002F", "/").replace("\\/", "/") def in_england(lat: float, lng: float) -> tuple[float, float] | None: lat, lng = fix_coords(lat, lng) if 49 <= lat <= 56 and -7 <= lng <= 2: return lat, lng return None def build(body: str, coords, source: str, full_address: str | None = None) -> dict: # outcode and postcode are read from the SAME object body as the coords, # so the postcode is self-consistent; drop it only if it somehow isn't. outcode_match = _DETAIL_OUTCODE_IN_BODY_RE.search(body) outcode = outcode_match.group(1) if outcode_match else None postcode_match = _DETAIL_POSTCODE_IN_BODY_RE.search(body) postcode = extract_full_postcode(postcode_match.group(1)) if postcode_match else None if postcode and outcode and extract_outcode(postcode) != outcode.upper(): postcode = None uprn_match = _DETAIL_UPRN_IN_BODY_RE.search(body) number_match = _DETAIL_NUMBER_OR_NAME_IN_BODY_RE.search(body) number_or_name = number_match.group(1).strip() if number_match else None return { "lat": coords[0], "lng": coords[1], "postcode": postcode, "outcode": outcode, "source": source, "uprn": uprn_match.group(1) if uprn_match else None, "number_or_name": number_or_name or None, "full_address": full_address, } def attach_full_address(result: dict | None) -> dict | None: # The house-numbered street address lives in the `address` map-widget # twin, not the `location` wrapper we anchor coordinates on. Pull it from # the twin that shares this property's uprn; when there is no uprn to # disambiguate, fall back to the first twin (document order = primary # listing), but never guess a twin when a uprn exists and none matches — # that would risk grabbing a comparable listing's address. 
if result is None or result.get("full_address"): return result target = result.get("uprn") first = None for match in _DETAIL_ADDRESS_RE.finditer(buf): full_address = match.group(1) or None if full_address is None: continue if first is None: first = full_address uprn_match = _DETAIL_UPRN_IN_BODY_RE.search(match.group(2)) if target and uprn_match and uprn_match.group(1) == target: result["full_address"] = full_address return result if target is None: result["full_address"] = first return result # Strategy 1 — the property's own `location` wrapper (authoritative). Take # the first match (the primary listing precedes any comparables in the # flight stream), but prefer one whose outcode matches the searched outcode. first_location = None for match in _DETAIL_LOCATION_RE.finditer(buf): body = match.group(1) coords_match = _DETAIL_COORDS_IN_BODY_RE.search(body) if not coords_match: continue coords = in_england(float(coords_match.group(1)), float(coords_match.group(2))) if not coords: continue candidate = build(body, coords, "detail_location") if first_location is None: first_location = candidate if ( search_outcode and candidate["outcode"] and candidate["outcode"].upper() == search_outcode.upper() ): return attach_full_address(candidate) if first_location is not None: return attach_full_address(first_location) # Strategy 2 — the `address` map-widget twin (same coordinates, backup). 
for match in _DETAIL_ADDRESS_RE.finditer(buf): full_address = match.group(1) or None body = match.group(2) latlng_match = _DETAIL_LATLNG_IN_BODY_RE.search(body) if not latlng_match: continue coords = in_england(float(latlng_match.group(1)), float(latlng_match.group(2))) if coords: return build(body, coords, "detail_address_obj", full_address=full_address) return None def _detail_cache_key(listing_url: str) -> str: """Cache key for a listing detail page — its numeric id when present.""" id_match = _LISTING_ID_RE.search(listing_url) return id_match.group(1) if id_match else listing_url def _fetch_listing_detail( detail_page, listing_url: str, search_outcode: str | None = None, ) -> dict | None: """Load a listing detail page and return its parsed geo dict (or None). Results (including failures) are cached by listingId. Ordinary navigation and extraction errors are swallowed so the caller can fall back to outcode-level resolution, but TurnstileError is allowed to propagate so the scraper's "Cloudflare ends the run" contract still holds. 
The goto timeout is kept short so one slow detail page can't eat the per-outcode budget.""" cache_key = _detail_cache_key(listing_url) if cache_key in _detail_cache: return _detail_cache[cache_key] url = listing_url if listing_url.startswith("http") else ZOOPLA_BASE + listing_url result: dict | None = None try: detail_page.goto( url, wait_until="domcontentloaded", timeout=ZOOPLA_DETAIL_GOTO_TIMEOUT_MS ) _ensure_not_challenged(detail_page) html = detail_page.content() result = parse_detail_geo(html, search_outcode=search_outcode) except TurnstileError: raise except Exception as exc: log.debug("Zoopla detail fetch failed %s: %s", url, _exception_detail(exc)) result = None _detail_cache[cache_key] = result return result def _map_property_type(raw_type: str | None) -> str: """Map Zoopla property type text to canonical type.""" if not raw_type: return "Other" # Exact match (handles Rightmove-style capitalised values) canonical = PROPERTY_TYPE_MAP.get(raw_type) if canonical: return canonical # Title-case match (handles regex-extracted lowercase like "town house" → "Town House") canonical = PROPERTY_TYPE_MAP.get(raw_type.title()) if canonical: return canonical # Lowercase match (e.g., "Townhouse" → "townhouse") canonical = PROPERTY_TYPE_MAP.get(raw_type.lower()) if canonical: return canonical # Normalize delimiters (underscores/hyphens → spaces) and try again normalized = re.sub(r"[-_]+", " ", raw_type).strip().title() canonical = PROPERTY_TYPE_MAP.get(normalized) if canonical: return canonical # Keyword fallback lower = raw_type.lower() excluded_flat_like = ( "block of apartment", "house of multiple occupation", "private halls", "retirement", "serviced apartment", ) if any(term in lower for term in excluded_flat_like): return "Other" if ( "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower ): return "Flats/Maisonettes" if "semi" in lower and "detach" in lower: return "Semi-Detached" if "detach" in lower: return 
"Detached" if "terrace" in lower or "mews" in lower: return "Terraced" if "house" in lower: return "Detached" return "Other" def transform_property( raw: dict, pc_index: PostcodeSpatialIndex, pc_coords: dict[str, tuple[float, float]], search_outcode: str | None = None, detail: dict | None = None, ) -> dict | None: """Transform a raw Zoopla listing dict into the standard output schema. Zoopla search cards only expose an outcode-level address, so precise location comes from the listing's detail page (see ``parse_detail_geo`` / ``_fetch_listing_detail``), passed in as ``detail``. When detail-page coordinates are available we resolve the nearest postcode via the spatial index — mirroring rightmove/onthemarket — and only fall back to the coarse outcode centroid when no detail location could be obtained.""" price = parse_int_value(raw.get("price")) or 0 address = raw.get("address", "") or "" extracted_postcode = extract_full_postcode(address) detail = detail or {} detail_postcode = extract_full_postcode(detail.get("postcode")) # Detail-page address fields: the UPRN keys an exact EPC join, and the # full street address / house number-or-name beat the outcode-level card # address for the Price-Paid join. All three are absent unless the detail # page was fetched, so every consumer must tolerate None. detail_uprn = detail.get("uprn") or None detail_full_address = detail.get("full_address") or None detail_number_or_name = detail.get("number_or_name") or None postcode = postcode_source = inferred_postcode = None lat = lng = None # (A) Best: detail-page coordinates -> nearest postcode (authoritative). 
detail_lat, detail_lng = detail.get("lat"), detail.get("lng") if detail_lat is not None and detail_lng is not None: fixed_lat, fixed_lng = fix_coords(detail_lat, detail_lng) if 49 <= fixed_lat <= 56 and -7 <= fixed_lng <= 2: nearest = pc_index.nearest(fixed_lat, fixed_lng) if nearest: lat, lng, inferred_postcode = fixed_lat, fixed_lng, nearest candidate = detail_postcode or extracted_postcode postcode, resolved_source = resolve_listing_postcode(candidate, nearest) postcode_source = ( "detail_address" if resolved_source == "address" else "detail_coordinates" ) # (B) Detail-page postcode without usable coordinates -> geocode it. if lat is None and detail_postcode and detail_postcode in pc_coords: lat, lng = pc_coords[detail_postcode] postcode = inferred_postcode = detail_postcode postcode_source = "detail_address" # (C) Full postcode in the search-card address -> geocode it. if lat is None and extracted_postcode and extracted_postcode in pc_coords: lat, lng = pc_coords[extracted_postcode] postcode = extracted_postcode postcode_source = "address" # (D) Last resort: coarse outcode-level centroid (loses per-listing precision). 
if lat is None: addr_outcode = _extract_outcode(address) if addr_outcode: result = _resolve_outcode_coords(addr_outcode, pc_coords) if result: postcode, lat, lng = result postcode_source = "address_outcode" if lat is None and search_outcode: result = _resolve_outcode_coords(search_outcode, pc_coords) if result: postcode, lat, lng = result postcode_source = "search_outcode" if lat is None or lng is None or not postcode: return None # Validate coordinates are in England if not (49 <= lat <= 56 and -7 <= lng <= 2): return None raw_beds = parse_int_value(raw.get("beds")) or 0 raw_baths = parse_int_value(raw.get("baths")) or 0 bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0 bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0 if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS: log.warning( "Zoopla %s: implausible beds=%d baths=%d (capped to 0)", raw.get("id", "?"), raw_beds, raw_baths, ) receptions = raw.get("receptions") or 0 # Floor area: convert sq ft to sq m floor_area_sqm = None raw_sqm = raw.get("floor_area_sqm") if raw_sqm: floor_area_sqm = validate_floor_area(round(float(raw_sqm), 1)) else: sqft = raw.get("floor_area_sqft") if sqft: floor_area_sqm = validate_floor_area(round(float(sqft) * 0.092903, 1)) listing_id = raw.get("id", "") listing_url = raw.get("url", "") if listing_url and not listing_url.startswith("http"): listing_url = ZOOPLA_BASE + listing_url return { "id": f"zp_{listing_id}", "Bedrooms": bedrooms, "Bathrooms": bathrooms, "Number of bedrooms & living rooms": bedrooms + receptions, "lon": lng, "lat": lat, "Postcode": postcode, "Postcode source": postcode_source or "unknown", "Extracted postcode": extracted_postcode, "Inferred postcode": ( inferred_postcode if inferred_postcode is not None else (postcode if postcode_source != "address" else None) ), "Listing raw address": detail_full_address or address, "Address per Property Register": build_register_address( detail_full_address or address, detail_number_or_name ), "UPRN": 
detail_uprn, "Property number or name": detail_number_or_name, "Leasehold/Freehold": raw.get("tenure") or None, "Property type": _map_property_type(raw.get("property_type")), "Property sub-type": normalize_sub_type(raw.get("property_type")), "price": price, "price_frequency": "", "Price qualifier": "", "Total floor area (sqm)": floor_area_sqm, "Listing URL": listing_url, "Listing features": [], "first_visible_date": "", } # --------------------------------------------------------------------------- # Top-level search function (called by scraper.py) # --------------------------------------------------------------------------- def search_outcode( page, outcode: str, pc_index: PostcodeSpatialIndex, pc_coords: dict[str, tuple[float, float]], max_properties: int | None = None, detail_page=None, detail_cap: int = 0, detail_budget_seconds: float | None = None, ) -> tuple[list[dict], str | None]: """Search Zoopla for properties in one outcode. Takes a live Camoufox Page (from launch_browser). Navigates through the search flow, extracts listings from rendered DOM, and transforms to the standard output schema. When ``detail_page`` (a second browser tab) and a positive ``detail_cap`` are supplied, up to ``detail_cap`` listings per outcode have their detail page fetched for a precise postcode (see ``_fetch_listing_detail``). ``detail_budget_seconds`` caps the wall-clock time spent fetching details so the per-outcode timeout that also guards search pagination is never starved. Returns (properties, search_url). Raises TurnstileError if Cloudflare blocks us mid-session. 
""" if not _navigate_search(page, outcode): return [], None total_results = _get_result_count(page) fetch_detail = None detail_deadline = None if detail_page is not None and detail_cap > 0: fetch_detail = lambda url: _fetch_listing_detail( # noqa: E731 detail_page, url, search_outcode=outcode ) if detail_budget_seconds is not None: detail_deadline = time.monotonic() + detail_budget_seconds # Always try extraction even if result count is 0 — the count regex may # not match Zoopla's current text format, but listings may still be in DOM raw_listings = _paginate( page, total_results, max_properties=max_properties, fetch_detail=fetch_detail, detail_cap=detail_cap, detail_state={"fetched": 0}, detail_deadline=detail_deadline, ) if not raw_listings: if total_results > 0: log.debug( "Zoopla %s %s: page claims %d results but extraction found 0 — " "DOM selectors may need updating", outcode, "BUY", total_results, ) return [], None properties = [] dropped = 0 for raw in raw_listings: try: transformed = transform_property( raw, pc_index, pc_coords, search_outcode=outcode, detail=raw.get("_detail"), ) except Exception as exc: log.warning( "Zoopla %s property %s failed to transform: %s", outcode, raw.get("id", "?"), exc, ) transformed = None if transformed: properties.append(transformed) else: dropped += 1 if dropped and not properties: # Log a sample raw listing to diagnose which fields are missing sample = raw_listings[0] if raw_listings else {} log.debug( "Zoopla %s %s: extracted %d raw listings but all %d dropped in transform " "(no postcode/coords). Sample raw: price=%s address=%r", outcode, "BUY", len(raw_listings), dropped, sample.get("price"), sample.get("address", ""), ) elif dropped > len(raw_listings) // 2: log.debug( "Zoopla %s %s: %d/%d listings dropped in transform", outcode, "BUY", dropped, len(raw_listings), ) return properties, page.url