This commit is contained in:
Andras Schmelczer 2026-03-12 22:11:00 +00:00
parent 14a3555cf1
commit 7e92bf112e
34 changed files with 1214437 additions and 224 deletions

View file

@ -1,10 +1,14 @@
import json
import logging
import os
import random
import re
import time
from urllib.parse import unquote
import httpx
from curl_cffi.requests import Session
from curl_cffi.requests.errors import RequestsError
from constants import (
DELAY_BETWEEN_PAGES,
@ -66,19 +70,18 @@ def solve_cloudflare() -> tuple[dict[str, str], str] | None:
raw_cookies = solution.get("cookies", [])
user_agent = solution.get("userAgent", "")
# Pass through ALL cookies from FlareSolverr — different Cloudflare
# configurations set different cookies (cf_clearance only appears when
# a challenge is triggered; it's not needed if no challenge was detected)
cookies = {}
for c in raw_cookies:
name = c.get("name", "")
if name in ("cf_clearance", "homecouk_session", "XSRF-TOKEN"):
if name:
cookies[name] = c["value"]
if "cf_clearance" not in cookies:
log.error("FlareSolverr solved but no cf_clearance cookie returned")
flaresolverr_attempts_total.labels(result="no_cf_clearance").inc()
return None
if "homecouk_session" not in cookies:
log.error("FlareSolverr solved but no homecouk_session cookie returned")
flaresolverr_attempts_total.labels(result="no_session").inc()
if not cookies:
log.error("FlareSolverr solved but returned no cookies at all")
flaresolverr_attempts_total.labels(result="no_cookies").inc()
return None
log.info(
@ -121,19 +124,25 @@ def load_cookies() -> tuple[dict[str, str], str] | None:
return {"cf_clearance": cf_clearance, "homecouk_session": session}, user_agent
def make_client(cookies: dict[str, str], user_agent: str) -> httpx.Client:
"""Create an httpx Client configured for home.co.uk API calls.
user_agent must match the one used when obtaining cf_clearance."""
return httpx.Client(
timeout=30,
cookies=cookies,
headers={
"User-Agent": user_agent,
"Accept": "application/json, text/plain, */*",
"x-requested-with": "XMLHttpRequest",
},
follow_redirects=True,
)
def make_client(cookies: dict[str, str], user_agent: str) -> Session:
    """Create a curl_cffi Session configured for home.co.uk API calls.

    Uses Chrome TLS impersonation so cf_clearance cookies (which are bound
    to Chrome's JA3 fingerprint from FlareSolverr) remain valid.

    Args:
        cookies: Cookie name -> value mapping to install on the session,
            scoped to the ``home.co.uk`` domain. If it contains
            ``XSRF-TOKEN``, the URL-decoded value is also sent as the
            ``X-XSRF-TOKEN`` header.
        user_agent: User-Agent that was used when the cookies were obtained.
            When empty, the header is not overridden, so whatever
            User-Agent the Chrome impersonation profile provides is kept
            instead of being replaced by a blank value.

    Returns:
        The configured Session. The caller is responsible for closing it.
    """
    session = Session(impersonate="chrome")

    headers: dict[str, str] = {}
    # An empty User-Agent combined with a Chrome TLS fingerprint is an
    # obvious mismatch; only override the header when we actually have one.
    if user_agent:
        headers["User-Agent"] = user_agent
    headers["Accept"] = "application/json, text/plain, */*"
    headers["x-requested-with"] = "XMLHttpRequest"

    # Laravel CSRF: the XSRF-TOKEN cookie value must also be sent as the
    # X-XSRF-TOKEN request header (URL-decoded). Without this header, the
    # server rejects every request with 419/403.
    xsrf = cookies.get("XSRF-TOKEN")
    if xsrf:
        headers["X-XSRF-TOKEN"] = unquote(xsrf)

    session.headers.update(headers)

    for name, value in cookies.items():
        session.cookies.set(name, value, domain="home.co.uk")

    return session
def _status_label(code: int) -> str:
@ -143,16 +152,21 @@ def _status_label(code: int) -> str:
def fetch_page(
client: httpx.Client, url: str, params: dict, max_retries: int = 3
client: Session, url: str, params: dict, max_retries: int = 3
) -> dict | None:
"""GET JSON with retries on 429/5xx. Returns None on permanent failure.
403 means cookies expired — raises CookiesExpiredError immediately."""
for attempt in range(max_retries):
try:
resp = client.get(url, params=params)
resp = client.get(url, params=params, timeout=30)
homecouk_requests_total.labels(status=_status_label(resp.status_code)).inc()
if resp.status_code == 200:
return resp.json()
try:
return resp.json()
except json.JSONDecodeError:
homecouk_errors_total.labels(type="json_decode").inc()
log.error("Non-JSON response from %s (got %s)", url, resp.headers.get("content-type", "?"))
return None
if resp.status_code == 403:
raise CookiesExpiredError("HTTP 403 — cookies likely expired")
if resp.status_code in (429, 500, 502, 503, 504):
@ -167,10 +181,7 @@ def fetch_page(
return None
except CookiesExpiredError:
raise
except (
httpx.ConnectError, httpx.ReadTimeout,
httpx.WriteTimeout, httpx.PoolTimeout,
) as e:
except RequestsError as e:
homecouk_errors_total.labels(type=type(e).__name__).inc()
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
log.warning(
@ -285,7 +296,7 @@ def transform_property(
def search_outcode(
client: httpx.Client,
client: Session,
outcode: str,
channel: str,
pc_index: PostcodeSpatialIndex,

View file

@ -7,7 +7,7 @@ from pathlib import Path
from flask import Flask, Response, jsonify, send_from_directory
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
from constants import DATA_DIR, RUN_ON_STARTUP, SCHEDULE_HOUR
from constants import DATA_DIR, RUN_ON_STARTUP, SCHEDULE_HOUR, SCRAPE_HOMECOUK, SCRAPE_RIGHTMOVE
from homecouk import load_cookies as load_homecouk_cookies
from rightmove import outcode_cache
from scraper import (
@ -46,7 +46,8 @@ logging.getLogger("httpcore").setLevel(logging.WARNING)
log.info("Loading arcgis data...")
OUTCODES = load_outcodes()
PC_INDEX = build_postcode_index()
log.info("Ready — %d outcodes, postcode index built", len(OUTCODES))
log.info("Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s)",
len(OUTCODES), SCRAPE_RIGHTMOVE, SCRAPE_HOMECOUK)
# ---------------------------------------------------------------------------
# Scheduler
@ -137,11 +138,13 @@ def get_status():
@app.route("/debug")
def get_debug():
hk_cookies = load_homecouk_cookies()
hk_cookies = load_homecouk_cookies() if SCRAPE_HOMECOUK else None
return jsonify({
"outcode_cache_size": len(outcode_cache),
"outcode_cache_sample": dict(list(outcode_cache.items())[:20]),
"homecouk_enabled": hk_cookies is not None,
"scrape_rightmove": SCRAPE_RIGHTMOVE,
"scrape_homecouk": SCRAPE_HOMECOUK,
"homecouk_cookies_available": hk_cookies is not None,
})

View file

@ -5,6 +5,7 @@ requires-python = ">=3.12"
dependencies = [
"flask",
"httpx",
"curl_cffi",
"polars",
"fake-useragent>=2.2.0",
"prometheus-client",

View file

@ -6,7 +6,7 @@ from dataclasses import dataclass, field
import polars as pl
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SEED
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SCRAPE_HOMECOUK, SCRAPE_RIGHTMOVE, SEED
from homecouk import CookiesExpiredError
from homecouk import load_cookies as load_homecouk_cookies
from homecouk import make_client as make_homecouk_client
@ -126,18 +126,33 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
random.seed(SEED)
random.shuffle(shuffled)
client = make_client()
if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK:
log.warning("Both SCRAPE_RIGHTMOVE and SCRAPE_HOMECOUK are disabled — nothing to do")
with status_lock:
status.state = "done"
status.finished_at = time.time()
_sync_gauges()
return
# home.co.uk: optional, enabled when cookies are available (via FlareSolverr or env vars)
hk_result = load_homecouk_cookies()
hk_client = make_homecouk_client(*hk_result) if hk_result else None
if hk_client:
log.info("home.co.uk scraping ENABLED")
homecouk_enabled.set(1)
else:
log.info("home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)")
client = make_client() if SCRAPE_RIGHTMOVE else None
if not SCRAPE_RIGHTMOVE:
log.info("Rightmove scraping DISABLED (SCRAPE_RIGHTMOVE=false)")
# home.co.uk: must be enabled via SCRAPE_HOMECOUK + cookies available
hk_client = None
hk_failed = False
if not SCRAPE_HOMECOUK:
log.info("home.co.uk scraping DISABLED (SCRAPE_HOMECOUK=false)")
homecouk_enabled.set(0)
hk_failed = False # set to True on 403 to skip remaining outcodes
else:
hk_result = load_homecouk_cookies()
hk_client = make_homecouk_client(*hk_result) if hk_result else None
if hk_client:
log.info("home.co.uk scraping ENABLED")
homecouk_enabled.set(1)
else:
log.info("home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)")
homecouk_enabled.set(0)
try:
for channel_cfg in CHANNELS:
@ -167,24 +182,25 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
outcode, i + 1, len(shuffled), len(all_properties))
# --- Rightmove ---
try:
outcode_id = resolve_outcode_id(client, outcode)
if not outcode_id:
log.debug("No Rightmove ID for outcode %s, skipping", outcode)
else:
props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
for p in props:
pid = p["id"]
if pid not in all_properties:
all_properties[pid] = p
seen_dedup_keys.add(_dedup_key(p))
rm_count += 1
except Exception as e:
msg = f"Error scraping Rightmove {outcode}/{channel_name}: {e}"
log.error(msg)
scrape_errors_total.labels(source="rightmove").inc()
with status_lock:
status.errors.append(msg)
if SCRAPE_RIGHTMOVE:
try:
outcode_id = resolve_outcode_id(client, outcode)
if not outcode_id:
log.debug("No Rightmove ID for outcode %s, skipping", outcode)
else:
props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
for p in props:
pid = p["id"]
if pid not in all_properties:
all_properties[pid] = p
seen_dedup_keys.add(_dedup_key(p))
rm_count += 1
except Exception as e:
msg = f"Error scraping Rightmove {outcode}/{channel_name}: {e}"
log.error(msg)
scrape_errors_total.labels(source="rightmove").inc()
with status_lock:
status.errors.append(msg)
# --- home.co.uk ---
if hk_client and not hk_failed:
@ -276,6 +292,7 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
status.finished_at = time.time()
_sync_gauges()
finally:
client.close()
if client:
client.close()
if hk_client:
hk_client.close()

View file

@ -58,7 +58,7 @@ def fix_coords(lat: float, lng: float) -> tuple[float, float]:
def normalize_price(amount: int, frequency: str) -> int:
"""Normalize price to monthly for rentals (weekly × 52/12, yearly ÷ 12)."""
"""Normalise price to monthly for rentals (weekly × 52/12, yearly ÷ 12)."""
if frequency == "weekly":
return round(amount * 52 / 12)
if frequency == "yearly":
@ -111,7 +111,7 @@ def transform_property(prop: dict, outcode: str, pc_index: PostcodeSpatialIndex)
"lat": lat,
"Postcode": postcode,
"Address per Property Register": prop.get("displayAddress", ""),
"Leashold/Freehold": extract_tenure(prop.get("tenure")),
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
"Property type": map_property_type(sub_type),
"Property sub-type": sub_type or "Unknown",
"price": price,