perfect-postcode/finder/flaresolverr.py

91 lines
3.3 KiB
Python

"""FlareSolverr client — fetch Cloudflare-protected pages as rendered HTML.
FlareSolverr (https://github.com/FlareSolverr/FlareSolverr) drives an
undetected browser to pass Cloudflare's challenge and returns the fully
rendered HTML. It runs as a sidecar service (see docker-compose.yml) sharing
the Gluetun VPN network namespace, so its browser egresses through the VPN.
Verified working against Zoopla's managed Turnstile on a datacenter VPN IP,
provided that one session is reused and maxTimeout is generous (~120s): the
first challenge solve is slow, but subsequent requests on the warm session
are fast.
"""
import logging
import httpx
from constants import FLARESOLVERR_MAX_TIMEOUT_MS, FLARESOLVERR_URL
# Module-level logger, registered under the fixed name "flaresolverr".
log = logging.getLogger("flaresolverr")
class FlareSolverrError(Exception):
    """Raised when FlareSolverr cannot fetch/solve a URL.

    This is the single exception type this module raises for failures while
    talking to the FlareSolverr service, so callers can catch it alone.
    """
class FlareSolverrSession:
    """A reusable FlareSolverr browser session (context manager).

    Reusing one session keeps the cleared Cloudflare cookies warm across
    requests, so only the first fetch pays the full challenge-solve cost.

    Entering the context (re)creates the named session on the FlareSolverr
    service; leaving it destroys that session and closes the HTTP client, so
    an instance is not meant to be re-entered after it has exited.
    """

    def __init__(
        self,
        url: str = FLARESOLVERR_URL,
        session: str = "finder",
        max_timeout_ms: int = FLARESOLVERR_MAX_TIMEOUT_MS,
    ) -> None:
        """Configure the client; no command is sent until the context is entered.

        Args:
            url: FlareSolverr endpoint that every command is POSTed to.
            session: Name of the FlareSolverr session to create and reuse.
            max_timeout_ms: Value sent as ``maxTimeout`` (milliseconds) with
                each ``request.get`` command.
        """
        self._url = url
        self._session = session
        self._max_timeout = max_timeout_ms
        # Read timeout must comfortably exceed maxTimeout (FlareSolverr blocks
        # for up to maxTimeout while solving before responding).
        self._client = httpx.Client(
            timeout=httpx.Timeout(self._max_timeout / 1000 + 30)
        )
        self._active = False

    @staticmethod
    def _error_message(resp) -> str:
        """Return the ``message`` field of a JSON error body, or ``""``."""
        try:
            body = resp.json()
        except ValueError:
            return ""
        if isinstance(body, dict) and body.get("message"):
            return str(body["message"])
        return ""

    def _post(self, payload: dict) -> dict:
        """POST one command to FlareSolverr and return its decoded JSON reply.

        Args:
            payload: Command body; its ``cmd`` entry is used in error messages.

        Returns:
            The reply as a dict whose ``status`` is ``"ok"``.

        Raises:
            FlareSolverrError: On a transport or HTTP failure, a body that is
                not a JSON object, or a reply whose ``status`` is not ``"ok"``.
        """
        cmd = payload.get("cmd")
        try:
            resp = self._client.post(self._url, json=payload)
            resp.raise_for_status()
            data = resp.json()
        except httpx.HTTPStatusError as exc:
            # NOTE(review): FlareSolverr is believed to answer failed commands
            # with an HTTP error status *and* a JSON body carrying `message`
            # (confirm against the deployed version). Surface that message
            # when present instead of only the generic HTTP status text.
            message = self._error_message(exc.response)
            if message:
                raise FlareSolverrError(
                    f"FlareSolverr {cmd} failed: {message}"
                ) from exc
            raise FlareSolverrError(
                f"FlareSolverr request to {self._url} failed: {exc}"
            ) from exc
        except (httpx.HTTPError, ValueError) as exc:
            raise FlareSolverrError(
                f"FlareSolverr request to {self._url} failed: {exc}"
            ) from exc
        # Valid JSON that is not an object (list, string, null, ...) would
        # otherwise escape as AttributeError from the .get() calls below.
        if not isinstance(data, dict):
            raise FlareSolverrError(
                f"FlareSolverr request to {self._url} failed: "
                f"unexpected {type(data).__name__} in response body"
            )
        if data.get("status") != "ok":
            raise FlareSolverrError(
                f"FlareSolverr {cmd} failed: {data.get('message')}"
            )
        return data

    def __enter__(self) -> "FlareSolverrSession":
        """Recreate the named session on the service and return ``self``.

        Raises:
            FlareSolverrError: If the session cannot be created.
        """
        # Start from a clean session (ignore destroy errors for a fresh name).
        try:
            self._post({"cmd": "sessions.destroy", "session": self._session})
        except FlareSolverrError as exc:
            log.debug("FlareSolverr pre-create session destroy failed: %s", exc)
        self._post({"cmd": "sessions.create", "session": self._session})
        self._active = True
        log.info("FlareSolverr session %r ready at %s", self._session, self._url)
        return self

    def get(self, url: str) -> str:
        """Fetch a URL through FlareSolverr; return the solved HTML.

        Returns an empty string when the reply carries no ``solution`` or the
        solution has no ``response``.

        Raises:
            FlareSolverrError: If the request fails or is reported as not ok.
        """
        data = self._post(
            {
                "cmd": "request.get",
                "session": self._session,
                "url": url,
                "maxTimeout": self._max_timeout,
            }
        )
        solution = data.get("solution") or {}
        return solution.get("response", "") or ""

    def __exit__(self, *exc_info) -> None:
        """Destroy the session (best effort) and always close the HTTP client."""
        try:
            if self._active:
                # Clear the flag first so the destroy is attempted only once.
                self._active = False
                try:
                    self._post(
                        {"cmd": "sessions.destroy", "session": self._session}
                    )
                except FlareSolverrError as exc:
                    log.debug("FlareSolverr session destroy failed: %s", exc)
        finally:
            # Runs even if the destroy call raised something unexpected, so
            # the client's connections are not leaked.
            self._client.close()