This commit is contained in:
Andras Schmelczer 2026-03-12 22:11:00 +00:00
parent 14a3555cf1
commit 7e92bf112e
34 changed files with 1214437 additions and 224 deletions

View file

@ -1,10 +1,14 @@
import json
import logging
import os
import random
import re
import time
from urllib.parse import unquote
import httpx
from curl_cffi.requests import Session
from curl_cffi.requests.errors import RequestsError
from constants import (
DELAY_BETWEEN_PAGES,
@ -66,19 +70,18 @@ def solve_cloudflare() -> tuple[dict[str, str], str] | None:
raw_cookies = solution.get("cookies", [])
user_agent = solution.get("userAgent", "")
# Pass through ALL cookies from FlareSolverr — different Cloudflare
# configurations set different cookies (cf_clearance only appears when
# a challenge is triggered; it's not needed if no challenge was detected)
cookies = {}
for c in raw_cookies:
name = c.get("name", "")
if name in ("cf_clearance", "homecouk_session", "XSRF-TOKEN"):
if name:
cookies[name] = c["value"]
if "cf_clearance" not in cookies:
log.error("FlareSolverr solved but no cf_clearance cookie returned")
flaresolverr_attempts_total.labels(result="no_cf_clearance").inc()
return None
if "homecouk_session" not in cookies:
log.error("FlareSolverr solved but no homecouk_session cookie returned")
flaresolverr_attempts_total.labels(result="no_session").inc()
if not cookies:
log.error("FlareSolverr solved but returned no cookies at all")
flaresolverr_attempts_total.labels(result="no_cookies").inc()
return None
log.info(
@ -121,19 +124,25 @@ def load_cookies() -> tuple[dict[str, str], str] | None:
return {"cf_clearance": cf_clearance, "homecouk_session": session}, user_agent
def make_client(cookies: dict[str, str], user_agent: str) -> httpx.Client:
    """Build an httpx Client preconfigured for home.co.uk API requests.

    Args:
        cookies: Cookie name -> value mapping attached to the client.
        user_agent: Sent as the User-Agent header; it has to be the same
            one that was used when the cf_clearance cookie was obtained.

    Returns:
        An httpx.Client with a 30 second timeout that follows redirects.
    """
    request_headers = {
        "User-Agent": user_agent,
        "Accept": "application/json, text/plain, */*",
        "x-requested-with": "XMLHttpRequest",
    }
    return httpx.Client(
        timeout=30,
        cookies=cookies,
        headers=request_headers,
        follow_redirects=True,
    )
def make_client(cookies: dict[str, str], user_agent: str) -> Session:
    """Build a curl_cffi Session preconfigured for home.co.uk API requests.

    The session impersonates Chrome at the TLS level: cf_clearance cookies
    obtained through FlareSolverr are bound to Chrome's JA3 fingerprint, so
    the impersonation keeps them valid.

    Args:
        cookies: Cookie name -> value mapping; every entry is stored on the
            session under the "home.co.uk" domain.
        user_agent: Value sent as the User-Agent header.

    Returns:
        The configured Session.
    """
    client = Session(impersonate="chrome")

    extra_headers = {
        "User-Agent": user_agent,
        "Accept": "application/json, text/plain, */*",
        "x-requested-with": "XMLHttpRequest",
    }

    # Laravel CSRF protection: the XSRF-TOKEN cookie value has to be echoed
    # back, URL-decoded, in the X-XSRF-TOKEN request header. If the header
    # is missing the server answers every request with 419/403.
    xsrf_token = cookies.get("XSRF-TOKEN")
    if xsrf_token:
        extra_headers["X-XSRF-TOKEN"] = unquote(xsrf_token)

    client.headers.update(extra_headers)

    for cookie_name, cookie_value in cookies.items():
        client.cookies.set(cookie_name, cookie_value, domain="home.co.uk")

    return client
def _status_label(code: int) -> str:
@ -143,16 +152,21 @@ def _status_label(code: int) -> str:
def fetch_page(
client: httpx.Client, url: str, params: dict, max_retries: int = 3
client: Session, url: str, params: dict, max_retries: int = 3
) -> dict | None:
"""GET JSON with retries on 429/5xx. Returns None on permanent failure.
403 means cookies expired — raises CookiesExpiredError immediately."""
for attempt in range(max_retries):
try:
resp = client.get(url, params=params)
resp = client.get(url, params=params, timeout=30)
homecouk_requests_total.labels(status=_status_label(resp.status_code)).inc()
if resp.status_code == 200:
return resp.json()
try:
return resp.json()
except json.JSONDecodeError:
homecouk_errors_total.labels(type="json_decode").inc()
log.error("Non-JSON response from %s (got %s)", url, resp.headers.get("content-type", "?"))
return None
if resp.status_code == 403:
raise CookiesExpiredError("HTTP 403 — cookies likely expired")
if resp.status_code in (429, 500, 502, 503, 504):
@ -167,10 +181,7 @@ def fetch_page(
return None
except CookiesExpiredError:
raise
except (
httpx.ConnectError, httpx.ReadTimeout,
httpx.WriteTimeout, httpx.PoolTimeout,
) as e:
except RequestsError as e:
homecouk_errors_total.labels(type=type(e).__name__).inc()
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
log.warning(
@ -285,7 +296,7 @@ def transform_property(
def search_outcode(
client: httpx.Client,
client: Session,
outcode: str,
channel: str,
pc_index: PostcodeSpatialIndex,