has issues

This commit is contained in:
Andras Schmelczer 2026-05-25 13:20:17 +01:00
parent 2e112d7398
commit c645b0f1d4
96 changed files with 2147083 additions and 5787 deletions

View file

@ -21,11 +21,14 @@ Architecture:
import logging
import os
import re
import signal
import sys
import time
from pathlib import Path
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
import httpx
from constants import (
DATA_DIR,
DELAY_BETWEEN_PAGES,
@ -43,6 +46,119 @@ class TurnstileError(Exception):
"""Raised when Cloudflare Turnstile challenge cannot be passed."""
def _pid_exists(pid: int) -> bool:
    """Return True if a process with id *pid* appears to exist.

    Probes the pid with ``os.kill(pid, 0)``; signal 0 delivers nothing, so
    only the outcome of the call is inspected.
    """
    try:
        os.kill(pid, 0)
    except PermissionError:
        # We are not allowed to signal it, which still means it is there.
        return True
    except ProcessLookupError:
        return False
    else:
        return True
def _proc_ppid(pid: int) -> int | None:
try:
for line in Path(f"/proc/{pid}/status").read_text().splitlines():
if line.startswith("PPid:"):
return int(line.split()[1])
except (OSError, ValueError):
return None
return None
def _proc_descendants(root_pid: int) -> set[int]:
    """Collect the pids of every transitive child of *root_pid*.

    Scans the numeric entries of ``/proc`` once, groups them by the parent
    pid reported by ``_proc_ppid`` and then walks that mapping starting from
    the direct children of *root_pid*. Returns an empty set when ``/proc``
    does not exist.
    """
    proc_dir = Path("/proc")
    if not proc_dir.exists():
        return set()

    # parent pid -> pids of its direct children
    kids_of: dict[int, list[int]] = {}
    for entry in proc_dir.iterdir():
        entry_name = entry.name
        if entry_name.isdigit():
            child = int(entry_name)
            parent = _proc_ppid(child)
            if parent is not None:
                kids_of.setdefault(parent, []).append(child)

    found: set[int] = set()
    pending = list(kids_of.get(root_pid, []))
    while pending:
        current = pending.pop()
        if current not in found:
            found.add(current)
            pending.extend(kids_of.get(current, []))
    return found
def _terminate_process_tree(root_pid: int, label: str) -> None:
    """Signal *root_pid* and its descendants, escalating SIGTERM -> SIGKILL.

    Does nothing for a non-positive pid or for the current process. The set
    of target pids is taken once, up front, from ``_proc_descendants``. Each
    round signals the targets that still exist (highest pid first) and then
    sleeps for that round's delay. Pids still alive after both rounds are
    reported with a warning.

    Args:
        root_pid: Pid at the top of the tree to terminate.
        label: Prefix used in every log message.
    """
    if root_pid <= 0 or root_pid == os.getpid():
        return

    targets = _proc_descendants(root_pid) | {root_pid}

    def _survivors(*, reverse: bool = False) -> list[int]:
        # Targets that still exist, in sorted pid order.
        return [p for p in sorted(targets, reverse=reverse) if _pid_exists(p)]

    escalation = (
        (signal.SIGTERM, "SIGTERM", 1.0),
        (signal.SIGKILL, "SIGKILL", 0.5),
    )
    for signum, signame, grace in escalation:
        remaining = _survivors(reverse=True)
        if not remaining:
            return
        log.warning("%s: sending %s to %d process(es)", label, signame, len(remaining))
        for victim in remaining:
            try:
                os.kill(victim, signum)
            except ProcessLookupError:
                # Exited between the liveness check and the signal.
                continue
            except OSError as exc:
                log.debug("%s: could not signal pid %d: %s", label, victim, exc)
        time.sleep(grace)

    leftover = _survivors()
    if leftover:
        log.warning("%s: process(es) still alive after force close: %s", label, leftover)
def _process_cmdline(pid: int) -> str:
    """Return the command line of *pid* as one space-separated string.

    Reads ``/proc/<pid>/cmdline``, turns every NUL byte into a space and
    decodes the result, substituting undecodable bytes. Returns an empty
    string when the file cannot be read.
    """
    try:
        nul_separated = Path(f"/proc/{pid}/cmdline").read_bytes()
    except OSError:
        return ""
    spaced = b" ".join(nul_separated.split(b"\0"))
    return spaced.decode(errors="replace")
def _profile_in_live_process(profile_dir: Path) -> bool:
    """Return True if any process command line mentions *profile_dir*.

    Checks every numeric entry under ``/proc`` and looks for
    ``str(profile_dir)`` as a substring of that process's command line.
    Returns False when ``/proc`` does not exist.
    """
    proc_dir = Path("/proc")
    if not proc_dir.exists():
        return False
    wanted = str(profile_dir)
    return any(
        entry.name.isdigit() and wanted in _process_cmdline(int(entry.name))
        for entry in proc_dir.iterdir()
    )
def _remove_stale_profile_locks(profile_dir: Path) -> None:
    """Delete leftover lock files from *profile_dir* when nothing uses it.

    Returns without touching anything if ``_profile_in_live_process`` reports
    that a running process references the profile. Otherwise removes
    ``.parentlock``, ``parent.lock`` and ``lock`` where present (including
    dangling symlinks), logging each removal. Failures are logged at debug
    level and do not stop the remaining candidates from being processed.
    """
    if _profile_in_live_process(profile_dir):
        return

    candidates = [profile_dir / name for name in (".parentlock", "parent.lock", "lock")]
    for lock_path in candidates:
        try:
            # is_symlink() catches a symlink whose target no longer exists.
            present = lock_path.exists() or lock_path.is_symlink()
            if not present:
                continue
            lock_path.unlink()
            log.warning("Removed stale Zoopla profile lock: %s", lock_path)
        except OSError as exc:
            log.debug("Could not remove Zoopla profile lock %s: %s", lock_path, exc)
def _exception_detail(exc: BaseException) -> str:
    """Format *exc* as ``"<TypeName>: <message>"`` on a single line.

    Runs of whitespace in the message are collapsed to single spaces. When
    the message is empty after collapsing, ``repr(exc)`` is used instead.
    """
    collapsed = " ".join(str(exc).split())
    return f"{type(exc).__name__}: {collapsed or repr(exc)}"
class _ManagedCamoufoxBrowser:
def __init__(self, context_manager, browser):
self._context_manager = context_manager
@ -53,10 +169,27 @@ class _ManagedCamoufoxBrowser:
if self._closed:
return
self._closed = True
try:
self._browser.close()
finally:
self._context_manager.__exit__(None, None, None)
self._browser.close()
# Camoufox.__exit__ calls browser.close() itself. The context is already
# closed here, so clear it to avoid a second blocking close attempt.
self._context_manager.browser = None
self._context_manager.__exit__(None, None, None)
def force_close(self) -> None:
    """Kill the browser's driver process tree instead of closing it politely.

    Marks the wrapper as closed first. If the driver pid can be determined,
    terminates that process tree and then clears stale lock files from the
    Zoopla profile directory. Otherwise only logs a warning.
    """
    self._closed = True
    driver_pid = self._driver_pid()
    if driver_pid is not None:
        _terminate_process_tree(driver_pid, "Zoopla browser force-close")
        _remove_stale_profile_locks(_zoopla_profile_dir())
        return
    log.warning("Zoopla force-close requested but Playwright driver pid is unknown")
def _driver_pid(self) -> int | None:
connection = getattr(self._context_manager, "_connection", None)
transport = getattr(connection, "_transport", None)
proc = getattr(transport, "_proc", None)
pid = getattr(proc, "pid", None)
return pid if isinstance(pid, int) else None
def __getattr__(self, name):
    """Forward attribute lookups that fail on the wrapper to the browser.

    Python calls ``__getattr__`` only after normal attribute lookup has
    failed, so attributes defined on the wrapper itself are not affected.
    """
    return getattr(self._browser, name)
@ -319,6 +452,161 @@ def _challenge_timeout_seconds() -> int:
return timeout
# ---------------------------------------------------------------------------
# Gluetun IP rotation
# ---------------------------------------------------------------------------
#
# When Cloudflare Turnstile fires mid-scrape, the cheapest unblocker is to
# swap the egress IP via Gluetun's HTTP control server. We stop and re-start
# the VPN, poll until the public IP changes, drop the stale cf_clearance
# cookies (bound to the previous IP), then reload and re-check the challenge.
# NOTE(review): SECURITY — this credential is a literal committed to the
# repository, so anyone with read access to the history can see it. Rotate
# the key and supply it through configuration or secret storage instead.
_GLUETUN_API_KEY = "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K"
def _gluetun_base_url() -> str:
    """Return the Gluetun control-server base URL without trailing slashes.

    Taken from the ``GLUETUN_URL`` environment variable, falling back to
    ``http://gluetun:8000`` when the variable is not set.
    """
    configured = os.environ.get("GLUETUN_URL")
    if configured is None:
        configured = "http://gluetun:8000"
    return configured.rstrip("/")
def _gluetun_api_key() -> str | None:
return _GLUETUN_API_KEY
def _gluetun_max_rotations() -> int:
    """Return the configured maximum number of IP rotations, never negative.

    Reads ``GLUETUN_MAX_ROTATIONS`` (default ``"3"``). Negative values are
    clamped to 0.

    Raises:
        ValueError: If the variable is set to something ``int()`` rejects.
    """
    configured = os.environ.get("GLUETUN_MAX_ROTATIONS", "3")
    try:
        rotations = int(configured)
    except ValueError as exc:
        raise ValueError("GLUETUN_MAX_ROTATIONS must be an integer") from exc
    return rotations if rotations > 0 else 0
def _gluetun_client() -> httpx.Client:
    """Build an ``httpx.Client`` for talking to Gluetun's control server.

    The ``X-API-Key`` header is attached only when ``_gluetun_api_key``
    returns a non-empty value; otherwise the client has no extra headers.
    """
    key = _gluetun_api_key()
    return httpx.Client(headers={"X-API-Key": key} if key else {})
def _gluetun_public_ip(client: httpx.Client) -> str | None:
    """Ask Gluetun for the current public IP of the VPN tunnel.

    Args:
        client: HTTP client used for the request (see ``_gluetun_client``).

    Returns:
        The value of the ``public_ip`` field, or of ``ip`` when the former is
        missing or empty. None when the request fails, the status is not 200,
        the body is not valid JSON, or the JSON is not an object.
    """
    try:
        resp = client.get(f"{_gluetun_base_url()}/v1/publicip/ip", timeout=5.0)
        if resp.status_code != 200:
            return None
        data = resp.json()
    except (httpx.HTTPError, ValueError):
        return None
    if not isinstance(data, dict):
        # Valid JSON that is not an object (null, list, bare string) has no
        # fields to read; report "unknown" instead of raising AttributeError.
        return None
    return data.get("public_ip") or data.get("ip")
def _gluetun_set_vpn_status(client: httpx.Client, status: str) -> bool:
    """PUT ``{'status': status}`` to Gluetun's ``/v1/vpn/status`` endpoint.

    Args:
        client: HTTP client used for the request.
        status: Value sent as the ``status`` field of the JSON body.

    Returns:
        True when a response arrives with a status code below 400. False on
        a transport error or any 4xx/5xx response; each failure is logged,
        with a dedicated hint for 401.
    """
    endpoint = f"{_gluetun_base_url()}/v1/vpn/status"
    try:
        resp = client.put(endpoint, json={"status": status}, timeout=15.0)
    except httpx.HTTPError as exc:
        log.warning("Gluetun vpn/status %s failed: %s", status, exc)
        return False

    code = resp.status_code
    if code < 400:
        return True

    if code == 401:
        log.warning(
            "Gluetun vpn/status %s: 401 Unauthorized — the API key must be "
            "authorised for 'PUT /v1/vpn/status' in Gluetun's auth config.toml",
            status,
        )
    else:
        log.warning(
            "Gluetun vpn/status %s returned HTTP %d: %s",
            status, code, resp.text[:200],
        )
    return False
def _rotate_gluetun_ip(wait_seconds: int = 45) -> bool:
    """Restart Gluetun's VPN and wait for the public IP to change.

    Args:
        wait_seconds: How long to keep polling for a different public IP once
            the VPN has been asked to run again.

    Returns:
        True if a new IP was observed within wait_seconds. False if either
        status change was rejected or the IP did not change in time.
    """
    with _gluetun_client() as client:
        old_ip = _gluetun_public_ip(client)
        log.info("Requesting Gluetun IP rotation (current IP: %s)", old_ip or "unknown")
        # These two flags are read by the ``finally`` block to decide whether
        # the VPN may have been left in the stopped state.
        stop_attempted = False
        restart_confirmed = False
        try:
            stop_attempted = True
            if not _gluetun_set_vpn_status(client, "stopped"):
                return False
            time.sleep(2)
            restart_confirmed = _gluetun_set_vpn_status(client, "running")
            if not restart_confirmed:
                return False
            deadline = time.monotonic() + wait_seconds
            while time.monotonic() < deadline:
                time.sleep(2)
                new_ip = _gluetun_public_ip(client)
                # NOTE(review): when old_ip is None (the lookup before the
                # restart failed) any successfully read IP passes this test,
                # even if it equals the previous address — confirm intended.
                if new_ip and new_ip != old_ip:
                    log.info("Gluetun rotated IP: %s -> %s", old_ip or "?", new_ip)
                    return True
        finally:
            # Runs on every exit path, including the early returns above: if
            # a stop was requested but "running" was never acknowledged, make
            # one more attempt to bring the VPN back up.
            if stop_attempted and not restart_confirmed:
                log.warning(
                    "Gluetun VPN may be stopped after failed rotation; attempting recovery start"
                )
                if not _gluetun_set_vpn_status(client, "running"):
                    log.error(
                        "Gluetun VPN recovery start failed; manual intervention required"
                    )
        # Reached only when the polling loop ran out of time.
        log.warning("Gluetun IP did not change within %ds", wait_seconds)
        return False
def _clear_cloudflare_cookies(page) -> None:
    """Remove the ``cf_clearance`` and ``__cf_bm`` cookies from the page's context.

    Best effort: returns silently if the page's context cannot be obtained,
    and a failure to clear one cookie is logged at debug level without
    preventing the attempt on the other.
    """
    try:
        ctx = page.context
    except Exception:
        return

    cloudflare_cookie_names = ("cf_clearance", "__cf_bm")
    for cookie_name in cloudflare_cookie_names:
        try:
            ctx.clear_cookies(name=cookie_name)
        except Exception as exc:
            log.debug("Could not clear cookie %s: %s", cookie_name, exc)
def _rotate_and_retry_challenge(page, max_rotations: int) -> bool:
    """Rotate the egress IP and reload *page* until the challenge is gone.

    Makes up to *max_rotations* attempts. Each attempt rotates the Gluetun
    IP, clears the Cloudflare cookies, reloads the page, pauses briefly and
    re-checks for the Turnstile challenge. An attempt whose rotation or
    reload fails is abandoned and the next one starts.

    Returns:
        True as soon as the challenge is no longer detected, False when all
        attempts are used up (or *max_rotations* is not positive).
    """
    for attempt in range(1, max_rotations + 1):
        log.warning(
            "Cloudflare Turnstile challenge — rotating Gluetun IP (attempt %d/%d)",
            attempt, max_rotations,
        )
        if _rotate_gluetun_ip():
            _clear_cloudflare_cookies(page)
            try:
                page.reload(wait_until="domcontentloaded", timeout=30000)
            except Exception as exc:
                log.warning("Reload after IP rotation failed: %s", exc)
            else:
                time.sleep(2)
                challenge_gone = not _is_turnstile_challenge(page)
                if challenge_gone:
                    log.info("Cloudflare challenge cleared after Gluetun rotation")
                    return True
    return False
def _is_turnstile_challenge(page) -> bool:
try:
if "just a moment" in page.title().lower():
@ -341,18 +629,26 @@ def _wait_for_turnstile(page, headless_mode: bool | str) -> None:
if not _is_turnstile_challenge(page):
return
# Try Gluetun IP rotation first — works in any mode and is the only option
# in headless/unattended runs where no human can click the challenge.
max_rotations = _gluetun_max_rotations()
if max_rotations > 0 and _rotate_and_retry_challenge(page, max_rotations):
return
profile_dir = _zoopla_profile_dir()
if headless_mode is True or headless_mode == "virtual":
raise TurnstileError(
"Cloudflare Turnstile requires a visible browser session. "
"Run Zoopla from a desktop session with ZOOPLA_HEADLESS=0; "
f"the solved session will be saved in {profile_dir}."
"Cloudflare Turnstile persisted after "
f"{max_rotations} Gluetun IP rotation(s). "
"Run Zoopla from a desktop session with ZOOPLA_HEADLESS=0 "
f"to solve manually; the session will be saved in {profile_dir}."
)
timeout = _challenge_timeout_seconds()
log.warning(
"Cloudflare Turnstile challenge shown. Complete it in the Zoopla browser "
"window; waiting up to %ds. Profile: %s",
"Gluetun rotation insufficient — falling back to interactive solve. "
"Complete the Cloudflare challenge in the Zoopla browser window; "
"waiting up to %ds. Profile: %s",
timeout,
profile_dir,
)
@ -390,6 +686,7 @@ def launch_browser():
headless_mode = _zoopla_headless_mode()
profile_dir = _zoopla_profile_dir()
profile_dir.mkdir(parents=True, exist_ok=True)
_remove_stale_profile_locks(profile_dir)
log.info(
"Launching Camoufox browser for Zoopla (headless=%s, profile=%s)...",
@ -471,8 +768,11 @@ def _navigate_search(page, outcode: str) -> bool:
try:
page.goto(url, wait_until="domcontentloaded", timeout=30000)
except Exception as exc:
log.debug("Zoopla direct navigation failed for %s: %s", outcode, exc)
return False
detail = _exception_detail(exc)
log.warning("Zoopla direct navigation failed for %s: %s", outcode, detail)
raise RuntimeError(
f"Zoopla direct navigation failed for {outcode}: {detail}"
) from exc
_ensure_not_challenged(page)
@ -560,8 +860,10 @@ def _find_next_page_url(page) -> str | None:
return href;
}"""
)
except Exception:
return None
except Exception as exc:
detail = _exception_detail(exc)
log.warning("Zoopla next-page detection failed: %s", detail)
raise RuntimeError(f"Zoopla next-page detection failed: {detail}") from exc
if not href:
return None
return urljoin(ZOOPLA_BASE, href)
@ -609,8 +911,9 @@ def _extract_listings(page) -> list[dict]:
return listings
except Exception as e:
log.warning("Failed to extract listings from DOM: %s", e)
return []
detail = _exception_detail(e)
log.warning("Failed to extract listings from DOM: %s", detail)
raise RuntimeError(f"Zoopla DOM extraction failed: {detail}") from e
def _paginate(
@ -649,8 +952,15 @@ def _paginate(
except TurnstileError:
raise
except Exception as e:
log.debug("Pagination navigation failed at page %d: %s", page_num, e)
break
detail = _exception_detail(e)
log.warning(
"Zoopla pagination navigation failed at page %d: %s",
page_num,
detail,
)
raise RuntimeError(
f"Zoopla pagination navigation failed at page {page_num}: {detail}"
) from e
page_listings = _extract_listings(page)
if not page_listings: