91 lines
3.3 KiB
Python
91 lines
3.3 KiB
Python
"""FlareSolverr client — fetch Cloudflare-protected pages as rendered HTML.
|
|
|
|
FlareSolverr (https://github.com/FlareSolverr/FlareSolverr) drives an
|
|
undetected browser to pass Cloudflare's challenge and returns the fully
|
|
rendered HTML. It runs as a sidecar service (see docker-compose.yml) sharing
|
|
the Gluetun VPN network namespace, so its browser egresses through the VPN.
|
|
|
|
Verified working against Zoopla's managed Turnstile on a datacenter VPN IP,
|
|
provided a reused session and a generous maxTimeout (~120s) — the first
|
|
challenge solve is slow, subsequent requests on the warm session are fast.
|
|
"""
|
|
|
|
import logging
|
|
|
|
import httpx
|
|
|
|
from constants import FLARESOLVERR_MAX_TIMEOUT_MS, FLARESOLVERR_URL
|
|
|
|
log = logging.getLogger("flaresolverr")
|
|
|
|
|
|
class FlareSolverrError(Exception):
    """Raised when FlareSolverr cannot fetch/solve a URL.

    Covers both failures to reach or parse a reply from the FlareSolverr
    service and replies in which FlareSolverr itself reports a failure.
    """
|
|
|
|
|
|
class FlareSolverrSession:
    """A reusable FlareSolverr browser session (context manager).

    Reusing one session keeps the cleared Cloudflare cookies warm across
    requests, so only the first fetch pays the full challenge-solve cost.

    Entering the context destroys any stale session with the same name and
    creates a fresh one; leaving it destroys the session and closes the
    underlying HTTP client.

    Args:
        url: FlareSolverr endpoint that every command is POSTed to.
        session: Session name sent with every command.
        max_timeout_ms: Value sent as ``maxTimeout`` with each fetch, in
            milliseconds; also used to size the HTTP client timeout.
    """

    def __init__(
        self,
        url: str = FLARESOLVERR_URL,
        session: str = "finder",
        max_timeout_ms: int = FLARESOLVERR_MAX_TIMEOUT_MS,
    ) -> None:
        self._url = url
        self._session = session
        self._max_timeout = max_timeout_ms
        self._client = self._new_client()
        self._active = False

    def _new_client(self) -> httpx.Client:
        """Build the HTTP client used to talk to FlareSolverr."""
        # Read timeout must comfortably exceed maxTimeout (FlareSolverr blocks
        # for up to maxTimeout while solving before responding).
        return httpx.Client(timeout=httpx.Timeout(self._max_timeout / 1000 + 30))

    def _post(self, payload: dict) -> dict:
        """POST one command to FlareSolverr and return the decoded JSON reply.

        Raises:
            FlareSolverrError: on a transport or HTTP-status error, a body
                that is not a JSON object, or a reply whose ``status`` is
                not ``"ok"``.
        """
        try:
            resp = self._client.post(self._url, json=payload)
            resp.raise_for_status()
            data = resp.json()
        except (httpx.HTTPError, ValueError) as exc:
            raise FlareSolverrError(
                f"FlareSolverr request to {self._url} failed: {exc}"
            ) from exc
        # Valid JSON is not necessarily an object; without this guard the
        # ``.get`` below would escape as a raw AttributeError.
        if not isinstance(data, dict):
            raise FlareSolverrError(
                f"FlareSolverr {payload.get('cmd')} failed: "
                f"unexpected {type(data).__name__} response body"
            )
        if data.get("status") != "ok":
            raise FlareSolverrError(
                f"FlareSolverr {payload.get('cmd')} failed: {data.get('message')}"
            )
        return data

    def __enter__(self) -> "FlareSolverrSession":
        # A previous exit (or failed enter) closed the client; rebuild it so
        # the same instance can be entered again.
        if self._client.is_closed:
            self._client = self._new_client()
        # Start from a clean session (ignore destroy errors for a fresh name).
        try:
            self._post({"cmd": "sessions.destroy", "session": self._session})
        except FlareSolverrError as exc:
            log.debug("FlareSolverr pre-create session destroy failed: %s", exc)
        try:
            self._post({"cmd": "sessions.create", "session": self._session})
        except BaseException:
            # __exit__ is not invoked when __enter__ raises, so release the
            # HTTP client here rather than leaking it.
            self._client.close()
            raise
        self._active = True
        log.info("FlareSolverr session %r ready at %s", self._session, self._url)
        return self

    def get(self, url: str) -> str:
        """Fetch a URL through FlareSolverr; return the solved HTML.

        Returns an empty string when the reply carries no solution or no
        response body.

        Raises:
            FlareSolverrError: if the request fails or FlareSolverr reports
                a non-``"ok"`` status.
        """
        data = self._post(
            {
                "cmd": "request.get",
                "session": self._session,
                "url": url,
                "maxTimeout": self._max_timeout,
            }
        )
        solution = data.get("solution") or {}
        return solution.get("response", "") or ""

    def __exit__(self, *exc_info) -> None:
        try:
            if self._active:
                # Clear the flag first so the session is never destroyed twice.
                self._active = False
                try:
                    self._post({"cmd": "sessions.destroy", "session": self._session})
                except FlareSolverrError as exc:
                    log.debug("FlareSolverr session destroy failed: %s", exc)
        finally:
            # Always release the HTTP client, even if teardown blew up.
            self._client.close()
|