All good
This commit is contained in:
parent
6ea544a0f6
commit
6cc7288126
45 changed files with 929 additions and 1043 deletions
209
finder/zoopla.py
209
finder/zoopla.py
|
|
@ -1,8 +1,8 @@
|
|||
"""Zoopla (zoopla.co.uk) scraper — sale properties.
|
||||
|
||||
Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
|
||||
blocks all HTTP clients (curl_cffi, httpx) and even Playwright with stealth
|
||||
patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
|
||||
blocks non-browser HTTP clients and even Playwright with stealth patches. Only
|
||||
Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
|
||||
|
||||
Zoopla uses Next.js App Router with React Server Components (RSC). Search
|
||||
result data is server-rendered in an RSC stream, not available via
|
||||
|
|
@ -19,11 +19,20 @@ Architecture:
|
|||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
|
||||
|
||||
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from constants import (
|
||||
DATA_DIR,
|
||||
DELAY_BETWEEN_PAGES,
|
||||
MAX_BEDROOMS,
|
||||
PROPERTY_TYPE_MAP,
|
||||
ZOOPLA_BASE,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import normalize_sub_type, parse_int_value, validate_floor_area
|
||||
|
||||
|
|
@ -255,11 +264,120 @@ _DISMISS_COOKIES_JS = """() => {
|
|||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
_FALSE_ENV_VALUES = {"0", "false", "no", "off"}
|
||||
_TRUE_ENV_VALUES = {"1", "true", "yes", "on"}
|
||||
|
||||
|
||||
def _env_bool_or_virtual(name: str, default: bool | str) -> bool | str:
|
||||
raw = os.environ.get(name)
|
||||
if raw is None:
|
||||
return default
|
||||
|
||||
value = raw.strip().lower()
|
||||
if value == "virtual":
|
||||
return "virtual"
|
||||
if value in _TRUE_ENV_VALUES:
|
||||
return True
|
||||
if value in _FALSE_ENV_VALUES:
|
||||
return False
|
||||
raise ValueError(
|
||||
f"{name} must be one of 1/0, true/false, yes/no, on/off, or virtual"
|
||||
)
|
||||
|
||||
|
||||
def _visible_display_available() -> bool:
    """Report whether a browser window could be shown on this machine.

    On Linux this is true only when ``DISPLAY`` or ``WAYLAND_DISPLAY`` is set
    to a non-empty value. Every other platform is reported as having a
    display without further checks.
    """
    if not sys.platform.startswith("linux"):
        return True
    environ = os.environ
    return any(environ.get(var) for var in ("DISPLAY", "WAYLAND_DISPLAY"))
|
||||
|
||||
|
||||
def _zoopla_headless_mode() -> bool | str:
    """Resolve the headless setting used when launching the Zoopla browser.

    ``ZOOPLA_HEADLESS`` wins when it is set; otherwise the browser is headless
    only if no visible display is available.

    Returns:
        ``True``, ``False`` or ``"virtual"``, as parsed by
        ``_env_bool_or_virtual``.
    """
    # A visible window is preferred so the person running the scrape can
    # complete a Cloudflare challenge by hand. Without a display (e.g. a bare
    # Linux shell) start headless instead; a challenge then fails fast with an
    # actionable error.
    fallback: bool | str = False if _visible_display_available() else True
    return _env_bool_or_virtual("ZOOPLA_HEADLESS", fallback)
|
||||
|
||||
|
||||
def _zoopla_profile_dir() -> Path:
    """Return the absolute path of the browser profile directory for Zoopla.

    A non-empty ``ZOOPLA_PROFILE_DIR`` overrides the location; otherwise the
    profile lives at ``DATA_DIR/.runtime/zoopla-profile``. In both cases ``~``
    is expanded and the path is resolved. The directory is not created here.
    """
    override = os.environ.get("ZOOPLA_PROFILE_DIR")
    location = Path(override) if override else DATA_DIR / ".runtime" / "zoopla-profile"
    return location.expanduser().resolve()
|
||||
|
||||
|
||||
def _challenge_timeout_seconds() -> int:
    """Return the Turnstile wait timeout, in seconds, from the environment.

    Reads ``ZOOPLA_CHALLENGE_TIMEOUT_SECONDS`` and falls back to 300 when the
    variable is unset or blank.

    Returns:
        The configured timeout as a positive integer.

    Raises:
        ValueError: If the value is not an integer or is less than one.
    """
    raw = os.environ.get("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS")
    # A blank assignment means "not configured"; without this check int("")
    # would report a misleading "must be an integer" error.
    if raw is None or not raw.strip():
        return 300
    try:
        timeout = int(raw)
    except ValueError as exc:
        raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be an integer") from exc
    if timeout < 1:
        raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be greater than zero")
    return timeout
|
||||
|
||||
|
||||
def _is_turnstile_challenge(page) -> bool:
    """Tell whether ``page`` is currently showing a Cloudflare challenge.

    Two signals are checked in order: a page title containing "just a moment"
    (case-insensitive), then the presence of a challenge iframe or a
    ``cf-turnstile-response`` input in the DOM.

    Any exception raised while reading the title is ignored and the DOM check
    still runs; an exception during the DOM check is reported as "no
    challenge" (``False``).
    """
    challenge_selector = (
        'iframe[src*="challenges.cloudflare.com"], '
        'input[name="cf-turnstile-response"]'
    )

    try:
        title_matches = "just a moment" in page.title().lower()
    except Exception:
        # Best effort: an unreadable title just means we rely on the DOM check.
        title_matches = False
    if title_matches:
        return True

    try:
        return bool(page.query_selector(challenge_selector))
    except Exception:
        return False
|
||||
|
||||
|
||||
def _wait_for_turnstile(page, headless_mode: bool | str) -> None:
    """Block until a Cloudflare Turnstile challenge on ``page`` is cleared.

    Returns immediately when no challenge is showing. Otherwise a person must
    complete the challenge in the browser window, so the page is polled every
    three seconds until it clears or the configured timeout elapses.

    Args:
        page: Browser page to inspect.
        headless_mode: Mode the browser was launched with (``True``, ``False``
            or ``"virtual"``).

    Raises:
        TurnstileError: If a challenge is showing while the browser is headless
            or virtual, or if it is still showing when the timeout elapses.
    """
    if not _is_turnstile_challenge(page):
        return

    profile_path = _zoopla_profile_dir()
    window_is_visible = headless_mode is not True and headless_mode != "virtual"
    if not window_is_visible:
        # Nobody can interact with a headless/virtual window; fail fast with
        # instructions instead of waiting out the timeout.
        raise TurnstileError(
            "Cloudflare Turnstile requires a visible browser session. "
            "Run Zoopla from a desktop session with ZOOPLA_HEADLESS=0; "
            f"the solved session will be saved in {profile_path}."
        )

    timeout = _challenge_timeout_seconds()
    log.warning(
        "Cloudflare Turnstile challenge shown. Complete it in the Zoopla browser "
        "window; waiting up to %ds. Profile: %s",
        timeout,
        profile_path,
    )

    # Raising the window is a convenience only; failure to do so is harmless.
    try:
        page.bring_to_front()
    except Exception:
        pass

    give_up_at = time.monotonic() + timeout
    while True:
        if time.monotonic() >= give_up_at:
            break
        time.sleep(3)
        if _is_turnstile_challenge(page):
            continue
        log.info("Cloudflare challenge resolved")
        return

    raise TurnstileError(
        f"Cloudflare Turnstile was not completed after {timeout}s"
    )
|
||||
|
||||
|
||||
def launch_browser():
|
||||
"""Launch Camoufox, navigate to Zoopla homepage, pass Cloudflare Turnstile,
|
||||
and dismiss cookie consent. Returns (browser, page) tuple.
|
||||
|
||||
Raises TurnstileError if Cloudflare cannot be passed within two minutes.
|
||||
Raises TurnstileError if Cloudflare cannot be completed.
|
||||
Caller must close browser when done."""
|
||||
from camoufox.pkgman import camoufox_path
|
||||
|
||||
|
|
@ -269,61 +387,50 @@ def launch_browser():
|
|||
|
||||
from camoufox.sync_api import Camoufox
|
||||
|
||||
log.info("Launching Camoufox browser for Zoopla...")
|
||||
camoufox = Camoufox(headless=True)
|
||||
headless_mode = _zoopla_headless_mode()
|
||||
profile_dir = _zoopla_profile_dir()
|
||||
profile_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
log.info(
|
||||
"Launching Camoufox browser for Zoopla (headless=%s, profile=%s)...",
|
||||
headless_mode,
|
||||
profile_dir,
|
||||
)
|
||||
camoufox = Camoufox(
|
||||
headless=headless_mode,
|
||||
persistent_context=True,
|
||||
user_data_dir=str(profile_dir),
|
||||
locale=["en-GB", "en"],
|
||||
enable_cache=True,
|
||||
)
|
||||
raw_browser = camoufox.__enter__()
|
||||
browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
|
||||
page = browser.new_page()
|
||||
page = raw_browser.pages[0] if raw_browser.pages else raw_browser.new_page()
|
||||
|
||||
log.info("Navigating to Zoopla homepage...")
|
||||
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000)
|
||||
try:
|
||||
log.info("Navigating to Zoopla homepage...")
|
||||
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000)
|
||||
_wait_for_turnstile(page, headless_mode)
|
||||
|
||||
# Wait for Cloudflare Turnstile to resolve.
|
||||
# Try clicking the Turnstile checkbox if present (helps in some cases).
|
||||
for i in range(40):
|
||||
if "Just a moment" not in page.title():
|
||||
break
|
||||
# Attempt to click the Turnstile checkbox in the challenge iframe
|
||||
for frame in page.frames:
|
||||
if "challenges.cloudflare.com" in frame.url:
|
||||
try:
|
||||
iframe_el = page.query_selector('iframe[src*="challenges.cloudflare"]')
|
||||
if iframe_el:
|
||||
box = iframe_el.bounding_box()
|
||||
if box:
|
||||
page.mouse.click(box["x"] + 30, box["y"] + box["height"] / 2)
|
||||
except Exception:
|
||||
pass
|
||||
break
|
||||
time.sleep(3)
|
||||
else:
|
||||
page.close()
|
||||
browser.close()
|
||||
raise TurnstileError("Cloudflare Turnstile did not resolve after 120s")
|
||||
log.info("Zoopla browser ready — title: %s", page.title())
|
||||
time.sleep(2)
|
||||
|
||||
log.info("Cloudflare passed — title: %s", page.title())
|
||||
time.sleep(2)
|
||||
|
||||
# Dismiss cookie consent
|
||||
page.evaluate(_DISMISS_COOKIES_JS)
|
||||
time.sleep(1)
|
||||
# Dismiss cookie consent
|
||||
page.evaluate(_DISMISS_COOKIES_JS)
|
||||
time.sleep(1)
|
||||
except Exception:
|
||||
try:
|
||||
page.close()
|
||||
finally:
|
||||
browser.close()
|
||||
raise
|
||||
|
||||
return browser, page
|
||||
|
||||
|
||||
def _ensure_not_challenged(page) -> None:
    """Check if current page is a Cloudflare challenge and wait/raise.

    Delegates to ``_wait_for_turnstile`` using the headless mode resolved from
    the environment, so a mid-session challenge is detected and handled the
    same way as one shown at launch.

    Raises:
        TurnstileError: Propagated from ``_wait_for_turnstile`` when the
            challenge cannot be completed.
    """
    # The former title-only polling loop always ended in ``raise``, leaving
    # this call unreachable, and duplicated _wait_for_turnstile with a weaker
    # check; the shared helper is now the single code path.
    _wait_for_turnstile(page, _zoopla_headless_mode())
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -704,9 +811,7 @@ def transform_property(
|
|||
|
||||
Zoopla search cards do not include coordinates, so we resolve lat/lng
|
||||
from postcodes extracted from the address text."""
|
||||
price = parse_int_value(raw.get("price"))
|
||||
if not price or price <= 0:
|
||||
return None
|
||||
price = parse_int_value(raw.get("price")) or 0
|
||||
|
||||
address = raw.get("address", "")
|
||||
|
||||
|
|
@ -856,7 +961,7 @@ def search_outcode(
|
|||
sample = raw_listings[0] if raw_listings else {}
|
||||
log.debug(
|
||||
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
|
||||
"(no price/postcode/coords). Sample raw: price=%s address=%r",
|
||||
"(no postcode/coords). Sample raw: price=%s address=%r",
|
||||
outcode, "BUY", len(raw_listings), dropped,
|
||||
sample.get("price"), sample.get("address", ""),
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue