has issues
This commit is contained in:
parent
2e112d7398
commit
c645b0f1d4
96 changed files with 2147083 additions and 5787 deletions
344
finder/zoopla.py
344
finder/zoopla.py
|
|
@ -21,11 +21,14 @@ Architecture:
|
|||
import logging
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
|
||||
|
||||
import httpx
|
||||
|
||||
from constants import (
|
||||
DATA_DIR,
|
||||
DELAY_BETWEEN_PAGES,
|
||||
|
|
@ -43,6 +46,119 @@ class TurnstileError(Exception):
|
|||
"""Raised when Cloudflare Turnstile challenge cannot be passed."""
|
||||
|
||||
|
||||
def _pid_exists(pid: int) -> bool:
    """Return True if a process with the given PID currently exists.

    The probe uses signal 0, which runs the existence and permission checks
    without delivering a signal.

    Args:
        pid: Process ID to probe.

    Returns:
        False for non-positive PIDs and for PIDs with no matching process;
        True otherwise, including when the process exists but may not be
        signalled by this user (``PermissionError``).
    """
    # os.kill() interprets 0 and negative values as process-group / broadcast
    # targets, so they would report "exists" without naming a single process.
    if pid <= 0:
        return False
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # The process is there; we are just not allowed to signal it.
        return True
    return True
|
||||
|
||||
|
||||
def _proc_ppid(pid: int) -> int | None:
|
||||
try:
|
||||
for line in Path(f"/proc/{pid}/status").read_text().splitlines():
|
||||
if line.startswith("PPid:"):
|
||||
return int(line.split()[1])
|
||||
except (OSError, ValueError):
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def _proc_descendants(root_pid: int) -> set[int]:
    """Collect every PID that descends from ``root_pid`` according to /proc.

    Returns:
        The set of descendant PIDs (``root_pid`` itself is not included).
        Empty when /proc does not exist or the root has no children.
    """
    procfs = Path("/proc")
    if not procfs.exists():
        return set()

    # Index parent -> children from every numeric /proc entry in one pass.
    kids_of: dict[int, list[int]] = {}
    for entry in procfs.iterdir():
        if entry.name.isdigit():
            child = int(entry.name)
            parent = _proc_ppid(child)
            if parent is not None:
                kids_of.setdefault(parent, []).append(child)

    # Iterative depth-first walk; the membership test guards against
    # visiting the same PID twice.
    found: set[int] = set()
    pending = list(kids_of.get(root_pid, []))
    while pending:
        current = pending.pop()
        if current not in found:
            found.add(current)
            pending.extend(kids_of.get(current, []))
    return found
|
||||
|
||||
|
||||
def _terminate_process_tree(root_pid: int, label: str) -> None:
    """Signal ``root_pid`` and its /proc descendants until they exit.

    Sends SIGTERM to every live process in the tree and waits 1.0s, then
    sends SIGKILL to the survivors and waits 0.5s. Anything still alive
    afterwards is reported with a warning.

    Args:
        root_pid: Root of the process tree. Non-positive values and the
            current process's own PID are ignored.
        label: Prefix used in log messages.
    """
    if root_pid <= 0 or root_pid == os.getpid():
        return

    # The tree is snapshotted once, before any signal is sent.
    targets = {root_pid} | _proc_descendants(root_pid)
    escalation = (
        (signal.SIGTERM, "SIGTERM", 1.0),
        (signal.SIGKILL, "SIGKILL", 0.5),
    )
    for signum, signame, grace in escalation:
        # Highest PIDs first, matching the original signalling order.
        survivors = [p for p in sorted(targets, reverse=True) if _pid_exists(p)]
        if not survivors:
            return
        log.warning("%s: sending %s to %d process(es)", label, signame, len(survivors))
        for target in survivors:
            try:
                os.kill(target, signum)
            except ProcessLookupError:
                # Exited between the liveness check and the signal.
                continue
            except OSError as err:
                log.debug("%s: could not signal pid %d: %s", label, target, err)
        time.sleep(grace)

    leftover = [p for p in sorted(targets) if _pid_exists(p)]
    if leftover:
        log.warning("%s: process(es) still alive after force close: %s", label, leftover)
|
||||
|
||||
|
||||
def _process_cmdline(pid: int) -> str:
    """Return the command line of ``pid`` as one space-separated string.

    Reads ``/proc/<pid>/cmdline`` and turns its NUL separators into spaces.
    Undecodable bytes are replaced rather than raising. Returns an empty
    string when the file cannot be read.
    """
    cmdline_file = Path(f"/proc/{pid}/cmdline")
    try:
        argv_bytes = cmdline_file.read_bytes()
    except OSError:
        return ""
    return b" ".join(argv_bytes.split(b"\0")).decode(errors="replace")
|
||||
|
||||
|
||||
def _profile_in_live_process(profile_dir: Path) -> bool:
    """Report whether any running process mentions ``profile_dir`` in its command line.

    This is a plain substring match of ``str(profile_dir)`` against each
    process's command line. Returns False when /proc does not exist.
    """
    procfs = Path("/proc")
    if not procfs.exists():
        return False

    wanted = str(profile_dir)
    # any() stops at the first matching process, like the original early return.
    return any(
        wanted in _process_cmdline(int(entry.name))
        for entry in procfs.iterdir()
        if entry.name.isdigit()
    )
|
||||
|
||||
|
||||
def _remove_stale_profile_locks(profile_dir: Path) -> None:
    """Delete leftover lock files from ``profile_dir`` when no process is using it.

    Does nothing if a live process references the profile directory.
    Otherwise removes ``.parentlock``, ``parent.lock`` and ``lock`` where
    present. Removal failures are logged at debug level and not raised.
    """
    if _profile_in_live_process(profile_dir):
        return

    candidates = [profile_dir / name for name in (".parentlock", "parent.lock", "lock")]
    for candidate in candidates:
        try:
            # is_symlink() also catches a dangling symlink, for which
            # exists() returns False.
            present = candidate.exists() or candidate.is_symlink()
            if not present:
                continue
            candidate.unlink()
            log.warning("Removed stale Zoopla profile lock: %s", candidate)
        except OSError as err:
            log.debug("Could not remove Zoopla profile lock %s: %s", candidate, err)
|
||||
|
||||
|
||||
def _exception_detail(exc: BaseException) -> str:
    """Format an exception as a single line: ``"<TypeName>: <message>"``.

    Whitespace runs in the message (including newlines) collapse to single
    spaces. If the message is empty, ``repr(exc)`` is used instead.
    """
    collapsed = " ".join(str(exc).split())
    return f"{type(exc).__name__}: {collapsed or repr(exc)}"
|
||||
|
||||
|
||||
class _ManagedCamoufoxBrowser:
|
||||
def __init__(self, context_manager, browser):
|
||||
self._context_manager = context_manager
|
||||
|
|
@ -53,10 +169,27 @@ class _ManagedCamoufoxBrowser:
|
|||
if self._closed:
|
||||
return
|
||||
self._closed = True
|
||||
try:
|
||||
self._browser.close()
|
||||
finally:
|
||||
self._context_manager.__exit__(None, None, None)
|
||||
self._browser.close()
|
||||
# Camoufox.__exit__ calls browser.close() itself. The context is already
|
||||
# closed here, so clear it to avoid a second blocking close attempt.
|
||||
self._context_manager.browser = None
|
||||
self._context_manager.__exit__(None, None, None)
|
||||
|
||||
def force_close(self) -> None:
|
||||
self._closed = True
|
||||
pid = self._driver_pid()
|
||||
if pid is None:
|
||||
log.warning("Zoopla force-close requested but Playwright driver pid is unknown")
|
||||
return
|
||||
_terminate_process_tree(pid, "Zoopla browser force-close")
|
||||
_remove_stale_profile_locks(_zoopla_profile_dir())
|
||||
|
||||
def _driver_pid(self) -> int | None:
|
||||
connection = getattr(self._context_manager, "_connection", None)
|
||||
transport = getattr(connection, "_transport", None)
|
||||
proc = getattr(transport, "_proc", None)
|
||||
pid = getattr(proc, "pid", None)
|
||||
return pid if isinstance(pid, int) else None
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._browser, name)
|
||||
|
|
@ -319,6 +452,161 @@ def _challenge_timeout_seconds() -> int:
|
|||
return timeout
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Gluetun IP rotation
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# When Cloudflare Turnstile fires mid-scrape, the cheapest unblocker is to
|
||||
# swap the egress IP via Gluetun's HTTP control server. We stop and re-start
|
||||
# the VPN, poll until the public IP changes, drop the stale cf_clearance
|
||||
# cookies (bound to the previous IP), then reload and re-check the challenge.
|
||||
|
||||
|
||||
_GLUETUN_API_KEY = "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K"
|
||||
|
||||
|
||||
def _gluetun_base_url() -> str:
|
||||
return os.environ.get("GLUETUN_URL", "http://gluetun:8000").rstrip("/")
|
||||
|
||||
|
||||
def _gluetun_api_key() -> str | None:
|
||||
return _GLUETUN_API_KEY
|
||||
|
||||
|
||||
def _gluetun_max_rotations() -> int:
    """Return how many Gluetun IP rotations may be attempted per challenge.

    Reads ``GLUETUN_MAX_ROTATIONS`` (default ``"3"``). Negative values are
    clamped to 0.

    Raises:
        ValueError: If the variable is set to something that is not an integer.
    """
    configured = os.environ.get("GLUETUN_MAX_ROTATIONS", "3")
    try:
        return max(int(configured), 0)
    except ValueError as exc:
        raise ValueError("GLUETUN_MAX_ROTATIONS must be an integer") from exc
|
||||
|
||||
|
||||
def _gluetun_client() -> httpx.Client:
    """Create an ``httpx.Client`` for the Gluetun control server.

    The ``X-API-Key`` header is attached only when an API key is configured.
    The caller is responsible for closing the client.
    """
    key = _gluetun_api_key()
    auth_headers = {"X-API-Key": key} if key else {}
    return httpx.Client(headers=auth_headers)
|
||||
|
||||
|
||||
def _gluetun_public_ip(client: httpx.Client) -> str | None:
    """Return the VPN's current public IP as reported by Gluetun, if known.

    Issues ``GET /v1/publicip/ip`` with a 5 second timeout.

    Args:
        client: HTTP client used to reach the control server.

    Returns:
        The value of the ``public_ip`` (or, failing that, ``ip``) field, or
        None on a transport error, a non-200 response, an unparsable body,
        or a body that does not contain a usable string in those fields.
    """
    try:
        resp = client.get(f"{_gluetun_base_url()}/v1/publicip/ip", timeout=5.0)
        if resp.status_code != 200:
            return None
        data = resp.json()
    except (httpx.HTTPError, ValueError):
        return None
    # Valid JSON is not necessarily an object: a list, string or null body
    # has no .get() and would otherwise raise AttributeError here.
    if not isinstance(data, dict):
        return None
    ip = data.get("public_ip") or data.get("ip")
    # Callers compare and log this value as text; reject anything else.
    return ip if isinstance(ip, str) else None
|
||||
|
||||
|
||||
def _gluetun_set_vpn_status(client: httpx.Client, status: str) -> bool:
    """PUT /v1/vpn/status with {'status': status}.

    Returns:
        True when the request completes with an HTTP status below 400.
        False on a transport error or any 4xx/5xx response; each failure is
        logged as a warning, with a dedicated hint for 401.
    """
    endpoint = f"{_gluetun_base_url()}/v1/vpn/status"
    try:
        resp = client.put(endpoint, json={"status": status}, timeout=15.0)
    except httpx.HTTPError as exc:
        log.warning("Gluetun vpn/status %s failed: %s", status, exc)
        return False

    code = resp.status_code
    if code < 400:
        return True

    if code == 401:
        log.warning(
            "Gluetun vpn/status %s: 401 Unauthorized — the API key must be "
            "authorised for 'PUT /v1/vpn/status' in Gluetun's auth config.toml",
            status,
        )
    else:
        log.warning(
            "Gluetun vpn/status %s returned HTTP %d: %s",
            status, code, resp.text[:200],
        )
    return False
|
||||
|
||||
|
||||
def _rotate_gluetun_ip(wait_seconds: int = 45) -> bool:
    """Restart Gluetun's VPN and wait for the public IP to change.

    Stops the VPN, pauses 2s, starts it again, then polls the public IP
    every 2s until it differs from the IP seen before the restart.

    Args:
        wait_seconds: Polling budget after the VPN has been restarted.

    Returns:
        True if a new IP was observed within ``wait_seconds``; False if the
        stop or start request failed or the IP did not change in time.
    """
    with _gluetun_client() as client:
        old_ip = _gluetun_public_ip(client)
        log.info("Requesting Gluetun IP rotation (current IP: %s)", old_ip or "unknown")

        stop_requested = False
        vpn_restarted = False
        try:
            # Set before the request so the recovery path below runs even
            # when the stop call itself reports failure.
            stop_requested = True
            if not _gluetun_set_vpn_status(client, "stopped"):
                return False
            time.sleep(2)

            vpn_restarted = _gluetun_set_vpn_status(client, "running")
            if not vpn_restarted:
                return False

            give_up_at = time.monotonic() + wait_seconds
            while time.monotonic() < give_up_at:
                time.sleep(2)
                candidate_ip = _gluetun_public_ip(client)
                # If old_ip is unknown (None), any reported IP counts as new.
                if candidate_ip and candidate_ip != old_ip:
                    log.info("Gluetun rotated IP: %s -> %s", old_ip or "?", candidate_ip)
                    return True
        finally:
            # Runs on every exit path, including the early returns above:
            # never leave the VPN stopped without trying to bring it back.
            if stop_requested and not vpn_restarted:
                log.warning(
                    "Gluetun VPN may be stopped after failed rotation; attempting recovery start"
                )
                if not _gluetun_set_vpn_status(client, "running"):
                    log.error(
                        "Gluetun VPN recovery start failed; manual intervention required"
                    )

    log.warning("Gluetun IP did not change within %ds", wait_seconds)
    return False
|
||||
|
||||
|
||||
def _clear_cloudflare_cookies(page) -> None:
    """Drop cf_clearance / __cf_bm which are bound to the previous egress IP.

    Best effort: if the page's context cannot be obtained the function
    returns silently, and a failure to clear one cookie is logged at debug
    level without stopping the other.
    """
    try:
        browser_context = getattr(page, "context")
    except Exception:
        return

    for cookie_name in ("cf_clearance", "__cf_bm"):
        try:
            browser_context.clear_cookies(name=cookie_name)
        except Exception as err:
            log.debug("Could not clear cookie %s: %s", cookie_name, err)
|
||||
|
||||
|
||||
def _rotate_and_retry_challenge(page, max_rotations: int) -> bool:
    """Rotate IP and reload until the challenge clears. Returns True on success.

    Makes up to ``max_rotations`` attempts. Each attempt rotates the Gluetun
    IP, clears the Cloudflare cookies, reloads the page, waits 2s and
    re-checks for the challenge. A failed rotation or reload uses up the
    attempt and moves on to the next one. Returns False when every attempt
    is exhausted (or ``max_rotations`` is not positive).
    """
    for attempt_no in range(1, max_rotations + 1):
        log.warning(
            "Cloudflare Turnstile challenge — rotating Gluetun IP (attempt %d/%d)",
            attempt_no, max_rotations,
        )
        rotated = _rotate_gluetun_ip()
        if rotated:
            _clear_cloudflare_cookies(page)
            try:
                page.reload(wait_until="domcontentloaded", timeout=30000)
            except Exception as exc:
                log.warning("Reload after IP rotation failed: %s", exc)
                continue

            time.sleep(2)
            if _is_turnstile_challenge(page):
                continue
            log.info("Cloudflare challenge cleared after Gluetun rotation")
            return True

    return False
|
||||
|
||||
|
||||
def _is_turnstile_challenge(page) -> bool:
|
||||
try:
|
||||
if "just a moment" in page.title().lower():
|
||||
|
|
@ -341,18 +629,26 @@ def _wait_for_turnstile(page, headless_mode: bool | str) -> None:
|
|||
if not _is_turnstile_challenge(page):
|
||||
return
|
||||
|
||||
# Try Gluetun IP rotation first — works in any mode and is the only option
|
||||
# in headless/unattended runs where no human can click the challenge.
|
||||
max_rotations = _gluetun_max_rotations()
|
||||
if max_rotations > 0 and _rotate_and_retry_challenge(page, max_rotations):
|
||||
return
|
||||
|
||||
profile_dir = _zoopla_profile_dir()
|
||||
if headless_mode is True or headless_mode == "virtual":
|
||||
raise TurnstileError(
|
||||
"Cloudflare Turnstile requires a visible browser session. "
|
||||
"Run Zoopla from a desktop session with ZOOPLA_HEADLESS=0; "
|
||||
f"the solved session will be saved in {profile_dir}."
|
||||
"Cloudflare Turnstile persisted after "
|
||||
f"{max_rotations} Gluetun IP rotation(s). "
|
||||
"Run Zoopla from a desktop session with ZOOPLA_HEADLESS=0 "
|
||||
f"to solve manually; the session will be saved in {profile_dir}."
|
||||
)
|
||||
|
||||
timeout = _challenge_timeout_seconds()
|
||||
log.warning(
|
||||
"Cloudflare Turnstile challenge shown. Complete it in the Zoopla browser "
|
||||
"window; waiting up to %ds. Profile: %s",
|
||||
"Gluetun rotation insufficient — falling back to interactive solve. "
|
||||
"Complete the Cloudflare challenge in the Zoopla browser window; "
|
||||
"waiting up to %ds. Profile: %s",
|
||||
timeout,
|
||||
profile_dir,
|
||||
)
|
||||
|
|
@ -390,6 +686,7 @@ def launch_browser():
|
|||
headless_mode = _zoopla_headless_mode()
|
||||
profile_dir = _zoopla_profile_dir()
|
||||
profile_dir.mkdir(parents=True, exist_ok=True)
|
||||
_remove_stale_profile_locks(profile_dir)
|
||||
|
||||
log.info(
|
||||
"Launching Camoufox browser for Zoopla (headless=%s, profile=%s)...",
|
||||
|
|
@ -471,8 +768,11 @@ def _navigate_search(page, outcode: str) -> bool:
|
|||
try:
|
||||
page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||||
except Exception as exc:
|
||||
log.debug("Zoopla direct navigation failed for %s: %s", outcode, exc)
|
||||
return False
|
||||
detail = _exception_detail(exc)
|
||||
log.warning("Zoopla direct navigation failed for %s: %s", outcode, detail)
|
||||
raise RuntimeError(
|
||||
f"Zoopla direct navigation failed for {outcode}: {detail}"
|
||||
) from exc
|
||||
|
||||
_ensure_not_challenged(page)
|
||||
|
||||
|
|
@ -560,8 +860,10 @@ def _find_next_page_url(page) -> str | None:
|
|||
return href;
|
||||
}"""
|
||||
)
|
||||
except Exception:
|
||||
return None
|
||||
except Exception as exc:
|
||||
detail = _exception_detail(exc)
|
||||
log.warning("Zoopla next-page detection failed: %s", detail)
|
||||
raise RuntimeError(f"Zoopla next-page detection failed: {detail}") from exc
|
||||
if not href:
|
||||
return None
|
||||
return urljoin(ZOOPLA_BASE, href)
|
||||
|
|
@ -609,8 +911,9 @@ def _extract_listings(page) -> list[dict]:
|
|||
|
||||
return listings
|
||||
except Exception as e:
|
||||
log.warning("Failed to extract listings from DOM: %s", e)
|
||||
return []
|
||||
detail = _exception_detail(e)
|
||||
log.warning("Failed to extract listings from DOM: %s", detail)
|
||||
raise RuntimeError(f"Zoopla DOM extraction failed: {detail}") from e
|
||||
|
||||
|
||||
def _paginate(
|
||||
|
|
@ -649,8 +952,15 @@ def _paginate(
|
|||
except TurnstileError:
|
||||
raise
|
||||
except Exception as e:
|
||||
log.debug("Pagination navigation failed at page %d: %s", page_num, e)
|
||||
break
|
||||
detail = _exception_detail(e)
|
||||
log.warning(
|
||||
"Zoopla pagination navigation failed at page %d: %s",
|
||||
page_num,
|
||||
detail,
|
||||
)
|
||||
raise RuntimeError(
|
||||
f"Zoopla pagination navigation failed at page {page_num}: {detail}"
|
||||
) from e
|
||||
|
||||
page_listings = _extract_listings(page)
|
||||
if not page_listings:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue