This commit is contained in:
Andras Schmelczer 2026-03-15 21:22:28 +00:00
parent 479ef92236
commit c38d654ac7
44 changed files with 2526 additions and 701 deletions

View file

@@ -86,7 +86,8 @@ def solve_cloudflare() -> tuple[dict[str, str], str] | None:
log.info(
"Cloudflare solved — got %d cookies, UA: %s",
len(cookies), user_agent[:60],
len(cookies),
user_agent[:60],
)
flaresolverr_attempts_total.labels(result="success").inc()
return cookies, user_agent
@@ -129,11 +130,13 @@ def make_client(cookies: dict[str, str], user_agent: str) -> Session:
Uses Chrome TLS impersonation so cf_clearance cookies (which are bound
to Chrome's JA3 fingerprint from FlareSolverr) remain valid."""
session = Session(impersonate="chrome")
session.headers.update({
"User-Agent": user_agent,
"Accept": "application/json, text/plain, */*",
"x-requested-with": "XMLHttpRequest",
})
session.headers.update(
{
"User-Agent": user_agent,
"Accept": "application/json, text/plain, */*",
"x-requested-with": "XMLHttpRequest",
}
)
# Laravel CSRF: the XSRF-TOKEN cookie value must also be sent as the
# X-XSRF-TOKEN request header (URL-decoded). Without this header, the
# server rejects every request with 419/403.
@@ -165,7 +168,11 @@ def fetch_page(
return resp.json()
except json.JSONDecodeError:
homecouk_errors_total.labels(type="json_decode").inc()
log.error("Non-JSON response from %s (got %s)", url, resp.headers.get("content-type", "?"))
log.error(
"Non-JSON response from %s (got %s)",
url,
resp.headers.get("content-type", "?"),
)
return None
if resp.status_code == 403:
raise CookiesExpiredError("HTTP 403 — cookies likely expired")
@@ -173,7 +180,11 @@ def fetch_page(
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
log.warning(
"HTTP %d from %s, retry %d/%d in %.1fs",
resp.status_code, url, attempt + 1, max_retries, delay,
resp.status_code,
url,
attempt + 1,
max_retries,
delay,
)
time.sleep(delay)
continue
@@ -186,7 +197,11 @@ def fetch_page(
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
log.warning(
"%s from %s, retry %d/%d in %.1fs",
type(e).__name__, url, attempt + 1, max_retries, delay,
type(e).__name__,
url,
attempt + 1,
max_retries,
delay,
)
time.sleep(delay)
homecouk_errors_total.labels(type="retry_exhausted").inc()
@@ -218,7 +233,12 @@ def map_property_type(raw_type: str | None) -> str:
# Home.co.uk uses types like "House", "Flat", "Apartment", "Detached", etc.
# Try common patterns
lower = raw_type.lower()
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower:
if (
"flat" in lower
or "apartment" in lower
or "maisonette" in lower
or "studio" in lower
):
return "Flats/Maisonettes"
if "detached" in lower and "semi" not in lower:
return "Detached"
@@ -231,7 +251,9 @@ def map_property_type(raw_type: str | None) -> str:
def transform_property(
prop: dict, channel: str, pc_index: PostcodeSpatialIndex,
prop: dict,
channel: str,
pc_index: PostcodeSpatialIndex,
) -> dict | None:
"""Transform a raw home.co.uk property dict into our output schema."""
lat = prop.get("latitude")