All good
Some checks failed
CI / Check (push) Has been cancelled
Build and publish Docker image / build-and-push (push) Has been cancelled

This commit is contained in:
Andras Schmelczer 2026-05-18 21:20:10 +01:00
parent 6ea544a0f6
commit 6cc7288126
45 changed files with 929 additions and 1043 deletions

View file

@ -1,8 +1,8 @@
"""Zoopla (zoopla.co.uk) scraper — sale properties.
Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
blocks non-browser HTTP clients (curl_cffi, httpx) and even Playwright with
stealth patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes
reliably.
Zoopla uses Next.js App Router with React Server Components (RSC). Search
result data is server-rendered in an RSC stream, not available via
@ -19,11 +19,20 @@ Architecture:
"""
import logging
import os
import re
import sys
import time
from pathlib import Path
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
from constants import (
DATA_DIR,
DELAY_BETWEEN_PAGES,
MAX_BEDROOMS,
PROPERTY_TYPE_MAP,
ZOOPLA_BASE,
)
from spatial import PostcodeSpatialIndex
from transform import normalize_sub_type, parse_int_value, validate_floor_area
@ -255,11 +264,120 @@ _DISMISS_COOKIES_JS = """() => {
# ---------------------------------------------------------------------------
_FALSE_ENV_VALUES = {"0", "false", "no", "off"}
_TRUE_ENV_VALUES = {"1", "true", "yes", "on"}
def _env_bool_or_virtual(name: str, default: bool | str) -> bool | str:
raw = os.environ.get(name)
if raw is None:
return default
value = raw.strip().lower()
if value == "virtual":
return "virtual"
if value in _TRUE_ENV_VALUES:
return True
if value in _FALSE_ENV_VALUES:
return False
raise ValueError(
f"{name} must be one of 1/0, true/false, yes/no, on/off, or virtual"
)
def _visible_display_available() -> bool:
    """Report whether a browser window could be shown on this machine.

    On Linux this requires ``DISPLAY`` or ``WAYLAND_DISPLAY`` to be set to a
    non-empty value. Every other platform is assumed to have a display.
    """
    if not sys.platform.startswith("linux"):
        return True
    display_vars = ("DISPLAY", "WAYLAND_DISPLAY")
    return any(os.environ.get(var) for var in display_vars)
def _zoopla_headless_mode() -> bool | str:
    """Resolve the headless setting used for the Zoopla browser.

    ``ZOOPLA_HEADLESS`` takes precedence when set (parsed by
    ``_env_bool_or_virtual``). Otherwise the browser is headless only when no
    visible display is available.

    Returns:
        ``True``, ``False`` or ``"virtual"``.

    Raises:
        ValueError: If ``ZOOPLA_HEADLESS`` holds an unrecognised value.
    """
    # A visible window is the preferred default: it lets the person running
    # the scrape complete a Cloudflare challenge by hand. Without a display
    # (e.g. a bare Linux shell) start headless instead, so that a challenge
    # produces an immediate, actionable error rather than a hang.
    has_display = _visible_display_available()
    fallback: bool | str = not has_display
    return _env_bool_or_virtual("ZOOPLA_HEADLESS", fallback)
def _zoopla_profile_dir() -> Path:
    """Return the absolute path of the persistent Zoopla browser profile.

    ``ZOOPLA_PROFILE_DIR`` overrides the location when set to a non-empty
    value; otherwise ``DATA_DIR/.runtime/zoopla-profile`` is used. ``~`` is
    expanded and the path is resolved. The directory is not created here.
    """
    override = os.environ.get("ZOOPLA_PROFILE_DIR")
    location = Path(override) if override else DATA_DIR / ".runtime" / "zoopla-profile"
    return location.expanduser().resolve()
def _challenge_timeout_seconds() -> int:
    """Return how long to wait for a manually solved challenge, in seconds.

    Reads ``ZOOPLA_CHALLENGE_TIMEOUT_SECONDS`` and falls back to 300 when the
    variable is not set.

    Raises:
        ValueError: If the variable is set but is not an integer, or is an
            integer smaller than 1.
    """
    configured = os.environ.get("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS")
    if configured is None:
        return 300

    try:
        seconds = int(configured)
    except ValueError as exc:
        raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be an integer") from exc

    if seconds >= 1:
        return seconds
    raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be greater than zero")
def _is_turnstile_challenge(page) -> bool:
    """Tell whether ``page`` is currently showing a Cloudflare challenge.

    Two signals are checked in order: a page title containing
    "just a moment" (case-insensitive), then the presence of a Cloudflare
    challenge iframe or a Turnstile response input. Detection is best-effort:
    an error from either probe is treated as "signal not present".

    Args:
        page: Browser page object exposing ``title()`` and
            ``query_selector()``.
    """
    try:
        on_interstitial = "just a moment" in page.title().lower()
    except Exception:
        # The title may be unreadable (e.g. mid-navigation); fall through to
        # the DOM probe instead of failing.
        on_interstitial = False
    if on_interstitial:
        return True

    widget_selector = (
        'iframe[src*="challenges.cloudflare.com"], '
        'input[name="cf-turnstile-response"]'
    )
    try:
        return bool(page.query_selector(widget_selector))
    except Exception:
        return False
def _wait_for_turnstile(page, headless_mode: bool | str) -> None:
    """Wait for a person to clear a Cloudflare Turnstile challenge on ``page``.

    Returns immediately when no challenge is showing. In a headless or virtual
    session nobody can interact with the page, so the function fails straight
    away. Otherwise it polls every 3 seconds until the challenge disappears or
    the configured timeout elapses.

    Args:
        page: Browser page to inspect.
        headless_mode: ``True``, ``False`` or ``"virtual"``, as passed to the
            browser at launch.

    Raises:
        TurnstileError: If a challenge is showing in a headless/virtual
            session, or it is not completed within the timeout.
        ValueError: If the timeout environment variable is invalid.
    """
    if not _is_turnstile_challenge(page):
        return

    profile_dir = _zoopla_profile_dir()
    cannot_interact = headless_mode is True or headless_mode == "virtual"
    if cannot_interact:
        raise TurnstileError(
            "Cloudflare Turnstile requires a visible browser session. "
            "Run Zoopla from a desktop session with ZOOPLA_HEADLESS=0; "
            f"the solved session will be saved in {profile_dir}."
        )

    timeout = _challenge_timeout_seconds()
    log.warning(
        "Cloudflare Turnstile challenge shown. Complete it in the Zoopla browser "
        "window; waiting up to %ds. Profile: %s",
        timeout,
        profile_dir,
    )

    # Raising the window is a convenience only; ignore failures.
    try:
        page.bring_to_front()
    except Exception:
        pass

    expires_at = time.monotonic() + timeout
    while time.monotonic() < expires_at:
        time.sleep(3)
        if _is_turnstile_challenge(page):
            continue
        log.info("Cloudflare challenge resolved")
        return

    raise TurnstileError(
        f"Cloudflare Turnstile was not completed after {timeout}s"
    )
def launch_browser():
"""Launch Camoufox, navigate to Zoopla homepage, pass Cloudflare Turnstile,
and dismiss cookie consent. Returns (browser, page) tuple.
Raises TurnstileError if Cloudflare cannot be passed within two minutes.
Raises TurnstileError if Cloudflare cannot be completed.
Caller must close browser when done."""
from camoufox.pkgman import camoufox_path
@ -269,61 +387,50 @@ def launch_browser():
from camoufox.sync_api import Camoufox
log.info("Launching Camoufox browser for Zoopla...")
camoufox = Camoufox(headless=True)
headless_mode = _zoopla_headless_mode()
profile_dir = _zoopla_profile_dir()
profile_dir.mkdir(parents=True, exist_ok=True)
log.info(
"Launching Camoufox browser for Zoopla (headless=%s, profile=%s)...",
headless_mode,
profile_dir,
)
camoufox = Camoufox(
headless=headless_mode,
persistent_context=True,
user_data_dir=str(profile_dir),
locale=["en-GB", "en"],
enable_cache=True,
)
raw_browser = camoufox.__enter__()
browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
page = browser.new_page()
page = raw_browser.pages[0] if raw_browser.pages else raw_browser.new_page()
log.info("Navigating to Zoopla homepage...")
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000)
try:
log.info("Navigating to Zoopla homepage...")
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000)
_wait_for_turnstile(page, headless_mode)
# Wait for Cloudflare Turnstile to resolve.
# Try clicking the Turnstile checkbox if present (helps in some cases).
for i in range(40):
if "Just a moment" not in page.title():
break
# Attempt to click the Turnstile checkbox in the challenge iframe
for frame in page.frames:
if "challenges.cloudflare.com" in frame.url:
try:
iframe_el = page.query_selector('iframe[src*="challenges.cloudflare"]')
if iframe_el:
box = iframe_el.bounding_box()
if box:
page.mouse.click(box["x"] + 30, box["y"] + box["height"] / 2)
except Exception:
pass
break
time.sleep(3)
else:
page.close()
browser.close()
raise TurnstileError("Cloudflare Turnstile did not resolve after 120s")
log.info("Zoopla browser ready — title: %s", page.title())
time.sleep(2)
log.info("Cloudflare passed — title: %s", page.title())
time.sleep(2)
# Dismiss cookie consent
page.evaluate(_DISMISS_COOKIES_JS)
time.sleep(1)
# Dismiss cookie consent
page.evaluate(_DISMISS_COOKIES_JS)
time.sleep(1)
except Exception:
try:
page.close()
finally:
browser.close()
raise
return browser, page
def _ensure_not_challenged(page) -> None:
    """Make sure ``page`` is not stuck on a Cloudflare challenge.

    Delegates to ``_wait_for_turnstile`` with the current headless setting so
    that mid-session challenges are handled exactly like the one at launch:
    same detection, same configurable timeout, and an immediate failure when
    the session is headless.

    Args:
        page: Browser page to inspect.

    Raises:
        TurnstileError: If a challenge is showing and cannot be, or is not,
            completed.
        ValueError: If the headless or timeout environment variables are
            invalid.
    """
    # The former fixed 120s title-polling loop always returned or raised, so
    # it left this call unreachable and ignored the headless setting.
    _wait_for_turnstile(page, _zoopla_headless_mode())
# ---------------------------------------------------------------------------
@ -704,9 +811,7 @@ def transform_property(
Zoopla search cards do not include coordinates, so we resolve lat/lng
from postcodes extracted from the address text."""
price = parse_int_value(raw.get("price"))
if not price or price <= 0:
return None
price = parse_int_value(raw.get("price")) or 0
address = raw.get("address", "")
@ -856,7 +961,7 @@ def search_outcode(
sample = raw_listings[0] if raw_listings else {}
log.debug(
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
"(no price/postcode/coords). Sample raw: price=%s address=%r",
"(no postcode/coords). Sample raw: price=%s address=%r",
outcode, "BUY", len(raw_listings), dropped,
sample.get("price"), sample.get("address", ""),
)