Working
This commit is contained in:
parent
14a3555cf1
commit
7e92bf112e
34 changed files with 1214437 additions and 224 deletions
|
|
@ -1,10 +1,14 @@
|
|||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import unquote
|
||||
|
||||
import httpx
|
||||
from curl_cffi.requests import Session
|
||||
from curl_cffi.requests.errors import RequestsError
|
||||
|
||||
from constants import (
|
||||
DELAY_BETWEEN_PAGES,
|
||||
|
|
@ -66,19 +70,18 @@ def solve_cloudflare() -> tuple[dict[str, str], str] | None:
|
|||
raw_cookies = solution.get("cookies", [])
|
||||
user_agent = solution.get("userAgent", "")
|
||||
|
||||
# Pass through ALL cookies from FlareSolverr — different Cloudflare
|
||||
# configurations set different cookies (cf_clearance only appears when
|
||||
# a challenge is triggered; it's not needed if no challenge was detected)
|
||||
cookies = {}
|
||||
for c in raw_cookies:
|
||||
name = c.get("name", "")
|
||||
if name in ("cf_clearance", "homecouk_session", "XSRF-TOKEN"):
|
||||
if name:
|
||||
cookies[name] = c["value"]
|
||||
|
||||
if "cf_clearance" not in cookies:
|
||||
log.error("FlareSolverr solved but no cf_clearance cookie returned")
|
||||
flaresolverr_attempts_total.labels(result="no_cf_clearance").inc()
|
||||
return None
|
||||
if "homecouk_session" not in cookies:
|
||||
log.error("FlareSolverr solved but no homecouk_session cookie returned")
|
||||
flaresolverr_attempts_total.labels(result="no_session").inc()
|
||||
if not cookies:
|
||||
log.error("FlareSolverr solved but returned no cookies at all")
|
||||
flaresolverr_attempts_total.labels(result="no_cookies").inc()
|
||||
return None
|
||||
|
||||
log.info(
|
||||
|
|
@ -121,19 +124,25 @@ def load_cookies() -> tuple[dict[str, str], str] | None:
|
|||
return {"cf_clearance": cf_clearance, "homecouk_session": session}, user_agent
|
||||
|
||||
|
||||
def make_client(cookies: dict[str, str], user_agent: str) -> httpx.Client:
|
||||
"""Create an httpx Client configured for home.co.uk API calls.
|
||||
user_agent must match the one used when obtaining cf_clearance."""
|
||||
return httpx.Client(
|
||||
timeout=30,
|
||||
cookies=cookies,
|
||||
headers={
|
||||
"User-Agent": user_agent,
|
||||
"Accept": "application/json, text/plain, */*",
|
||||
"x-requested-with": "XMLHttpRequest",
|
||||
},
|
||||
follow_redirects=True,
|
||||
)
|
||||
def make_client(cookies: dict[str, str], user_agent: str) -> Session:
|
||||
"""Create a curl_cffi Session configured for home.co.uk API calls.
|
||||
Uses Chrome TLS impersonation so cf_clearance cookies (which are bound
|
||||
to Chrome's JA3 fingerprint from FlareSolverr) remain valid."""
|
||||
session = Session(impersonate="chrome")
|
||||
session.headers.update({
|
||||
"User-Agent": user_agent,
|
||||
"Accept": "application/json, text/plain, */*",
|
||||
"x-requested-with": "XMLHttpRequest",
|
||||
})
|
||||
# Laravel CSRF: the XSRF-TOKEN cookie value must also be sent as the
|
||||
# X-XSRF-TOKEN request header (URL-decoded). Without this header, the
|
||||
# server rejects every request with 419/403.
|
||||
xsrf = cookies.get("XSRF-TOKEN")
|
||||
if xsrf:
|
||||
session.headers["X-XSRF-TOKEN"] = unquote(xsrf)
|
||||
for name, value in cookies.items():
|
||||
session.cookies.set(name, value, domain="home.co.uk")
|
||||
return session
|
||||
|
||||
|
||||
def _status_label(code: int) -> str:
|
||||
|
|
@ -143,16 +152,21 @@ def _status_label(code: int) -> str:
|
|||
|
||||
|
||||
def fetch_page(
|
||||
client: httpx.Client, url: str, params: dict, max_retries: int = 3
|
||||
client: Session, url: str, params: dict, max_retries: int = 3
|
||||
) -> dict | None:
|
||||
"""GET JSON with retries on 429/5xx. Returns None on permanent failure.
|
||||
403 means cookies expired — raises CookiesExpiredError immediately."""
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
resp = client.get(url, params=params)
|
||||
resp = client.get(url, params=params, timeout=30)
|
||||
homecouk_requests_total.labels(status=_status_label(resp.status_code)).inc()
|
||||
if resp.status_code == 200:
|
||||
return resp.json()
|
||||
try:
|
||||
return resp.json()
|
||||
except json.JSONDecodeError:
|
||||
homecouk_errors_total.labels(type="json_decode").inc()
|
||||
log.error("Non-JSON response from %s (got %s)", url, resp.headers.get("content-type", "?"))
|
||||
return None
|
||||
if resp.status_code == 403:
|
||||
raise CookiesExpiredError("HTTP 403 — cookies likely expired")
|
||||
if resp.status_code in (429, 500, 502, 503, 504):
|
||||
|
|
@ -167,10 +181,7 @@ def fetch_page(
|
|||
return None
|
||||
except CookiesExpiredError:
|
||||
raise
|
||||
except (
|
||||
httpx.ConnectError, httpx.ReadTimeout,
|
||||
httpx.WriteTimeout, httpx.PoolTimeout,
|
||||
) as e:
|
||||
except RequestsError as e:
|
||||
homecouk_errors_total.labels(type=type(e).__name__).inc()
|
||||
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
|
||||
log.warning(
|
||||
|
|
@ -285,7 +296,7 @@ def transform_property(
|
|||
|
||||
|
||||
def search_outcode(
|
||||
client: httpx.Client,
|
||||
client: Session,
|
||||
outcode: str,
|
||||
channel: str,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue