This commit is contained in:
Andras Schmelczer 2026-05-17 10:16:30 +01:00
parent 47d89f6fad
commit 017902b8e6
82 changed files with 331466 additions and 54841 deletions

View file

@ -6,7 +6,6 @@ import re
import time
from urllib.parse import unquote
import httpx
from curl_cffi.requests import Session
from curl_cffi.requests.errors import RequestsError
@ -19,12 +18,6 @@ from constants import (
PROPERTY_TYPE_MAP,
RETRY_BASE_DELAY,
)
from metrics import (
flaresolverr_attempts_total,
homecouk_errors_total,
homecouk_properties_scraped,
homecouk_requests_total,
)
from spatial import PostcodeSpatialIndex
from transform import normalize_postcode, normalize_sub_type, validate_floor_area
@ -36,101 +29,73 @@ class CookiesExpiredError(Exception):
# Channel mapping: internal name → URL path segment
HOMECOUK_CHANNELS = {
"BUY": "for-sale",
"RENT": "to-rent",
}
FLARESOLVERR_URL = os.environ.get("FLARESOLVERR_URL", "http://flaresolverr:8191")
def solve_cloudflare() -> tuple[dict[str, str], str] | None:
    """Use FlareSolverr to solve the Cloudflare challenge.

    Posts a ``request.get`` command for a home.co.uk search page to the
    FlareSolverr instance at ``FLARESOLVERR_URL`` and extracts the cookies and
    user-agent from the returned solution.

    Every failure path increments ``flaresolverr_attempts_total`` with a
    ``result`` label, so the counter reflects all attempts.

    Returns:
        ``(cookies_dict, user_agent)`` on success, or ``None`` on failure.
    """
    log.info("Solving Cloudflare challenge via FlareSolverr at %s", FLARESOLVERR_URL)
    try:
        with httpx.Client(timeout=120) as client:
            resp = client.post(
                f"{FLARESOLVERR_URL}/v1",
                json={
                    "cmd": "request.get",
                    "url": f"{HOMECOUK_BASE}/for-sale/e1/",
                    "maxTimeout": 60000,
                },
            )

        if resp.status_code != 200:
            log.error("FlareSolverr returned HTTP %d", resp.status_code)
            # Previously this path returned without recording the attempt.
            flaresolverr_attempts_total.labels(result="error").inc()
            return None

        data = resp.json()
        if data.get("status") != "ok":
            log.error("FlareSolverr error: %s", data.get("message", "unknown"))
            # Previously this path returned without recording the attempt.
            flaresolverr_attempts_total.labels(result="error").inc()
            return None

        solution = data["solution"]
        raw_cookies = solution.get("cookies", [])
        user_agent = solution.get("userAgent", "")

        # Pass through ALL cookies from FlareSolverr — different Cloudflare
        # configurations set different cookies (cf_clearance only appears when
        # a challenge is triggered; it's not needed if no challenge was detected).
        # Entries without a name or a value are skipped so that one malformed
        # cookie does not discard an otherwise usable solution.
        cookies = {
            c["name"]: c["value"]
            for c in raw_cookies
            if c.get("name") and "value" in c
        }

        if not cookies:
            log.error("FlareSolverr solved but returned no cookies at all")
            flaresolverr_attempts_total.labels(result="no_cookies").inc()
            return None

        log.info(
            "Cloudflare solved — got %d cookies, UA: %s",
            len(cookies),
            user_agent[:60],
        )
        flaresolverr_attempts_total.labels(result="success").inc()
        return cookies, user_agent

    except (httpx.ConnectError, httpx.ReadTimeout) as e:
        # FlareSolverr is unreachable or timed out.
        log.warning("FlareSolverr not available: %s", e)
        flaresolverr_attempts_total.labels(result="unavailable").inc()
        return None
    except Exception as e:
        # Deliberate best-effort boundary: any other failure (bad JSON,
        # missing "solution" key, ...) is logged and reported as None.
        log.error("FlareSolverr error: %s", e)
        flaresolverr_attempts_total.labels(result="error").inc()
        return None
HOMECOUK_URL_SEGMENT = "for-sale"
def load_cookies() -> tuple[dict[str, str], str] | None:
"""Get home.co.uk cookies + user-agent.
Tries FlareSolverr first, then falls back to environment variables.
Returns (cookies_dict, user_agent) or None if not configured."""
# Try FlareSolverr first
result = solve_cloudflare()
if result:
return result
# Fall back to env vars
cf_clearance = os.environ.get("HOMECOUK_CF_CLEARANCE", "")
session = os.environ.get("HOMECOUK_SESSION", "")
if not cf_clearance or not session:
return None
Environment cookies are optional. When they are not present, bootstrap a
regular local session by visiting home.co.uk with curl_cffi's Chrome
impersonation and reusing the cookies set by the site.
"""
user_agent = os.environ.get(
"HOMECOUK_USER_AGENT",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/145.0.0.0 Safari/537.36",
)
return {"cf_clearance": cf_clearance, "homecouk_session": session}, user_agent
env_cookies = {
name: value
for name, value in {
"cf_clearance": os.environ.get("HOMECOUK_CF_CLEARANCE", ""),
"homecouk_session": os.environ.get("HOMECOUK_SESSION", ""),
"XSRF-TOKEN": os.environ.get("HOMECOUK_XSRF_TOKEN", ""),
}.items()
if value
}
if env_cookies.get("homecouk_session"):
return env_cookies, user_agent
session = Session(impersonate="chrome")
session.headers.update(
{
"User-Agent": user_agent,
"Accept": (
"text/html,application/xhtml+xml,application/xml;q=0.9,"
"*/*;q=0.8"
),
}
)
for url in (HOMECOUK_BASE, f"{HOMECOUK_BASE}/for-sale/br1/"):
try:
response = session.get(url, timeout=30)
except RequestsError as exc:
log.warning("home.co.uk cookie bootstrap failed for %s: %s", url, exc)
continue
if response.status_code == 403:
raise CookiesExpiredError("home.co.uk returned HTTP 403 during bootstrap")
if response.status_code >= 400:
log.warning(
"home.co.uk cookie bootstrap got HTTP %d from %s",
response.status_code,
url,
)
cookies = session.cookies.get_dict()
if cookies.get("homecouk_session") and cookies.get("XSRF-TOKEN"):
log.info("home.co.uk local session bootstrapped")
return cookies, user_agent
log.warning("home.co.uk did not provide session cookies during bootstrap")
return None
def make_client(cookies: dict[str, str], user_agent: str) -> Session:
"""Create a curl_cffi Session configured for home.co.uk API calls.
Uses Chrome TLS impersonation so cf_clearance cookies (which are bound
to Chrome's JA3 fingerprint from FlareSolverr) remain valid."""
Uses Chrome TLS impersonation so browser-derived cookies remain valid."""
session = Session(impersonate="chrome")
session.headers.update(
{
@ -150,12 +115,6 @@ def make_client(cookies: dict[str, str], user_agent: str) -> Session:
return session
def _status_label(code: int) -> str:
    """Return the label for an HTTP status code.

    Codes of 500 and above collapse into the single label ``"5xx"``; every
    other code is returned as its decimal string.
    """
    return "5xx" if code >= 500 else str(code)
def fetch_page(
client: Session, url: str, params: dict, max_retries: int = 3
) -> dict | None:
@ -164,12 +123,10 @@ def fetch_page(
for attempt in range(max_retries):
try:
resp = client.get(url, params=params, timeout=30)
homecouk_requests_total.labels(status=_status_label(resp.status_code)).inc()
if resp.status_code == 200:
try:
return resp.json()
except json.JSONDecodeError:
homecouk_errors_total.labels(type="json_decode").inc()
log.error(
"Non-JSON response from %s (got %s)",
url,
@ -195,7 +152,6 @@ def fetch_page(
except CookiesExpiredError:
raise
except RequestsError as e:
homecouk_errors_total.labels(type=type(e).__name__).inc()
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
log.warning(
"%s from %s, retry %d/%d in %.1fs",
@ -206,7 +162,6 @@ def fetch_page(
delay,
)
time.sleep(delay)
homecouk_errors_total.labels(type="retry_exhausted").inc()
log.error("All %d retries exhausted for %s", max_retries, url)
return None
@ -301,7 +256,6 @@ def map_property_type(raw_type: str | None) -> str:
def transform_property(
prop: dict,
channel: str,
pc_index: PostcodeSpatialIndex,
) -> dict | None:
"""Transform a raw home.co.uk property dict into our output schema."""
@ -365,7 +319,7 @@ def transform_property(
"Property type": map_property_type(listing_type),
"Property sub-type": normalize_sub_type(listing_type),
"price": int(price),
"price_frequency": "" if channel == "BUY" else "monthly",
"price_frequency": "",
"Price qualifier": price_qualifier,
"Total floor area (sqm)": parse_floor_area(prop.get("description")),
"Listing URL": f"{HOMECOUK_BASE}/property/{listing_id}",
@ -377,13 +331,11 @@ def transform_property(
def search_outcode(
client: Session,
outcode: str,
channel: str,
pc_index: PostcodeSpatialIndex,
max_properties: int | None = None,
) -> list[dict]:
"""Paginate through search results for one outcode+channel.
channel: "BUY" or "RENT".
Returns transformed properties."""
url_segment = HOMECOUK_CHANNELS[channel]
"""Paginate through sale search results for one outcode."""
url_segment = HOMECOUK_URL_SEGMENT
url = f"{HOMECOUK_API_BASE}/{url_segment}/{outcode.lower()}/"
properties = []
page = 1
@ -410,12 +362,11 @@ def search_outcode(
break
for prop in raw_props:
transformed = transform_property(prop, channel, pc_index)
transformed = transform_property(prop, pc_index)
if transformed:
properties.append(transformed)
homecouk_properties_scraped.labels(
channel="buy" if channel == "BUY" else "rent",
).inc()
if max_properties is not None and len(properties) >= max_properties:
return properties
# Check pagination
pagination = data.get("pagination", {})