Working
This commit is contained in:
parent
14a3555cf1
commit
7e92bf112e
34 changed files with 1214437 additions and 224 deletions
|
|
@ -1,10 +1,14 @@
|
|||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import unquote
|
||||
|
||||
import httpx
|
||||
from curl_cffi.requests import Session
|
||||
from curl_cffi.requests.errors import RequestsError
|
||||
|
||||
from constants import (
|
||||
DELAY_BETWEEN_PAGES,
|
||||
|
|
@ -66,19 +70,18 @@ def solve_cloudflare() -> tuple[dict[str, str], str] | None:
|
|||
raw_cookies = solution.get("cookies", [])
|
||||
user_agent = solution.get("userAgent", "")
|
||||
|
||||
# Pass through ALL cookies from FlareSolverr — different Cloudflare
|
||||
# configurations set different cookies (cf_clearance only appears when
|
||||
# a challenge is triggered; it's not needed if no challenge was detected)
|
||||
cookies = {}
|
||||
for c in raw_cookies:
|
||||
name = c.get("name", "")
|
||||
if name in ("cf_clearance", "homecouk_session", "XSRF-TOKEN"):
|
||||
if name:
|
||||
cookies[name] = c["value"]
|
||||
|
||||
if "cf_clearance" not in cookies:
|
||||
log.error("FlareSolverr solved but no cf_clearance cookie returned")
|
||||
flaresolverr_attempts_total.labels(result="no_cf_clearance").inc()
|
||||
return None
|
||||
if "homecouk_session" not in cookies:
|
||||
log.error("FlareSolverr solved but no homecouk_session cookie returned")
|
||||
flaresolverr_attempts_total.labels(result="no_session").inc()
|
||||
if not cookies:
|
||||
log.error("FlareSolverr solved but returned no cookies at all")
|
||||
flaresolverr_attempts_total.labels(result="no_cookies").inc()
|
||||
return None
|
||||
|
||||
log.info(
|
||||
|
|
@ -121,19 +124,25 @@ def load_cookies() -> tuple[dict[str, str], str] | None:
|
|||
return {"cf_clearance": cf_clearance, "homecouk_session": session}, user_agent
|
||||
|
||||
|
||||
def make_client(cookies: dict[str, str], user_agent: str) -> httpx.Client:
|
||||
"""Create an httpx Client configured for home.co.uk API calls.
|
||||
user_agent must match the one used when obtaining cf_clearance."""
|
||||
return httpx.Client(
|
||||
timeout=30,
|
||||
cookies=cookies,
|
||||
headers={
|
||||
"User-Agent": user_agent,
|
||||
"Accept": "application/json, text/plain, */*",
|
||||
"x-requested-with": "XMLHttpRequest",
|
||||
},
|
||||
follow_redirects=True,
|
||||
)
|
||||
def make_client(cookies: dict[str, str], user_agent: str) -> Session:
    """Build a curl_cffi Session set up for home.co.uk API requests.

    The session impersonates Chrome at the TLS level so that cf_clearance
    cookies (bound to Chrome's JA3 fingerprint from FlareSolverr) stay valid.

    Args:
        cookies: Cookie name -> value mapping; every entry is installed on
            the session for the ``home.co.uk`` domain.
        user_agent: Value sent as the ``User-Agent`` header. Per the previous
            httpx-based implementation's docstring, it must match the one
            used when cf_clearance was obtained.

    Returns:
        The configured Session. The caller is responsible for closing it.
    """
    client = Session(impersonate="chrome")

    request_headers = {
        "User-Agent": user_agent,
        "Accept": "application/json, text/plain, */*",
        "x-requested-with": "XMLHttpRequest",
    }

    # Laravel CSRF: the XSRF-TOKEN cookie value must also be sent as the
    # X-XSRF-TOKEN request header (URL-decoded). Without this header, the
    # server rejects every request with 419/403.
    xsrf_token = cookies.get("XSRF-TOKEN")
    if xsrf_token:
        request_headers["X-XSRF-TOKEN"] = unquote(xsrf_token)

    client.headers.update(request_headers)

    for cookie_name, cookie_value in cookies.items():
        client.cookies.set(cookie_name, cookie_value, domain="home.co.uk")

    return client
|
||||
|
||||
|
||||
def _status_label(code: int) -> str:
|
||||
|
|
@ -143,16 +152,21 @@ def _status_label(code: int) -> str:
|
|||
|
||||
|
||||
def fetch_page(
|
||||
client: httpx.Client, url: str, params: dict, max_retries: int = 3
|
||||
client: Session, url: str, params: dict, max_retries: int = 3
|
||||
) -> dict | None:
|
||||
"""GET JSON with retries on 429/5xx. Returns None on permanent failure.
|
||||
403 means cookies expired — raises CookiesExpiredError immediately."""
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
resp = client.get(url, params=params)
|
||||
resp = client.get(url, params=params, timeout=30)
|
||||
homecouk_requests_total.labels(status=_status_label(resp.status_code)).inc()
|
||||
if resp.status_code == 200:
|
||||
return resp.json()
|
||||
try:
|
||||
return resp.json()
|
||||
except json.JSONDecodeError:
|
||||
homecouk_errors_total.labels(type="json_decode").inc()
|
||||
log.error("Non-JSON response from %s (got %s)", url, resp.headers.get("content-type", "?"))
|
||||
return None
|
||||
if resp.status_code == 403:
|
||||
raise CookiesExpiredError("HTTP 403 — cookies likely expired")
|
||||
if resp.status_code in (429, 500, 502, 503, 504):
|
||||
|
|
@ -167,10 +181,7 @@ def fetch_page(
|
|||
return None
|
||||
except CookiesExpiredError:
|
||||
raise
|
||||
except (
|
||||
httpx.ConnectError, httpx.ReadTimeout,
|
||||
httpx.WriteTimeout, httpx.PoolTimeout,
|
||||
) as e:
|
||||
except RequestsError as e:
|
||||
homecouk_errors_total.labels(type=type(e).__name__).inc()
|
||||
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
|
||||
log.warning(
|
||||
|
|
@ -285,7 +296,7 @@ def transform_property(
|
|||
|
||||
|
||||
def search_outcode(
|
||||
client: httpx.Client,
|
||||
client: Session,
|
||||
outcode: str,
|
||||
channel: str,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ from pathlib import Path
|
|||
from flask import Flask, Response, jsonify, send_from_directory
|
||||
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
|
||||
|
||||
from constants import DATA_DIR, RUN_ON_STARTUP, SCHEDULE_HOUR
|
||||
from constants import DATA_DIR, RUN_ON_STARTUP, SCHEDULE_HOUR, SCRAPE_HOMECOUK, SCRAPE_RIGHTMOVE
|
||||
from homecouk import load_cookies as load_homecouk_cookies
|
||||
from rightmove import outcode_cache
|
||||
from scraper import (
|
||||
|
|
@ -46,7 +46,8 @@ logging.getLogger("httpcore").setLevel(logging.WARNING)
|
|||
log.info("Loading arcgis data...")
|
||||
OUTCODES = load_outcodes()
|
||||
PC_INDEX = build_postcode_index()
|
||||
log.info("Ready — %d outcodes, postcode index built", len(OUTCODES))
|
||||
log.info("Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s)",
|
||||
len(OUTCODES), SCRAPE_RIGHTMOVE, SCRAPE_HOMECOUK)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scheduler
|
||||
|
|
@ -137,11 +138,13 @@ def get_status():
|
|||
|
||||
@app.route("/debug")
|
||||
def get_debug():
|
||||
hk_cookies = load_homecouk_cookies()
|
||||
hk_cookies = load_homecouk_cookies() if SCRAPE_HOMECOUK else None
|
||||
return jsonify({
|
||||
"outcode_cache_size": len(outcode_cache),
|
||||
"outcode_cache_sample": dict(list(outcode_cache.items())[:20]),
|
||||
"homecouk_enabled": hk_cookies is not None,
|
||||
"scrape_rightmove": SCRAPE_RIGHTMOVE,
|
||||
"scrape_homecouk": SCRAPE_HOMECOUK,
|
||||
"homecouk_cookies_available": hk_cookies is not None,
|
||||
})
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ requires-python = ">=3.12"
|
|||
dependencies = [
|
||||
"flask",
|
||||
"httpx",
|
||||
"curl_cffi",
|
||||
"polars",
|
||||
"fake-useragent>=2.2.0",
|
||||
"prometheus-client",
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ from dataclasses import dataclass, field
|
|||
|
||||
import polars as pl
|
||||
|
||||
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SEED
|
||||
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SCRAPE_HOMECOUK, SCRAPE_RIGHTMOVE, SEED
|
||||
from homecouk import CookiesExpiredError
|
||||
from homecouk import load_cookies as load_homecouk_cookies
|
||||
from homecouk import make_client as make_homecouk_client
|
||||
|
|
@ -126,18 +126,33 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
|||
random.seed(SEED)
|
||||
random.shuffle(shuffled)
|
||||
|
||||
client = make_client()
|
||||
if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK:
|
||||
log.warning("Both SCRAPE_RIGHTMOVE and SCRAPE_HOMECOUK are disabled — nothing to do")
|
||||
with status_lock:
|
||||
status.state = "done"
|
||||
status.finished_at = time.time()
|
||||
_sync_gauges()
|
||||
return
|
||||
|
||||
# home.co.uk: optional, enabled when cookies are available (via FlareSolverr or env vars)
|
||||
hk_result = load_homecouk_cookies()
|
||||
hk_client = make_homecouk_client(*hk_result) if hk_result else None
|
||||
if hk_client:
|
||||
log.info("home.co.uk scraping ENABLED")
|
||||
homecouk_enabled.set(1)
|
||||
else:
|
||||
log.info("home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)")
|
||||
client = make_client() if SCRAPE_RIGHTMOVE else None
|
||||
if not SCRAPE_RIGHTMOVE:
|
||||
log.info("Rightmove scraping DISABLED (SCRAPE_RIGHTMOVE=false)")
|
||||
|
||||
# home.co.uk: must be enabled via SCRAPE_HOMECOUK + cookies available
|
||||
hk_client = None
|
||||
hk_failed = False
|
||||
if not SCRAPE_HOMECOUK:
|
||||
log.info("home.co.uk scraping DISABLED (SCRAPE_HOMECOUK=false)")
|
||||
homecouk_enabled.set(0)
|
||||
hk_failed = False # set to True on 403 to skip remaining outcodes
|
||||
else:
|
||||
hk_result = load_homecouk_cookies()
|
||||
hk_client = make_homecouk_client(*hk_result) if hk_result else None
|
||||
if hk_client:
|
||||
log.info("home.co.uk scraping ENABLED")
|
||||
homecouk_enabled.set(1)
|
||||
else:
|
||||
log.info("home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)")
|
||||
homecouk_enabled.set(0)
|
||||
|
||||
try:
|
||||
for channel_cfg in CHANNELS:
|
||||
|
|
@ -167,24 +182,25 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
|||
outcode, i + 1, len(shuffled), len(all_properties))
|
||||
|
||||
# --- Rightmove ---
|
||||
try:
|
||||
outcode_id = resolve_outcode_id(client, outcode)
|
||||
if not outcode_id:
|
||||
log.debug("No Rightmove ID for outcode %s, skipping", outcode)
|
||||
else:
|
||||
props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
|
||||
for p in props:
|
||||
pid = p["id"]
|
||||
if pid not in all_properties:
|
||||
all_properties[pid] = p
|
||||
seen_dedup_keys.add(_dedup_key(p))
|
||||
rm_count += 1
|
||||
except Exception as e:
|
||||
msg = f"Error scraping Rightmove {outcode}/{channel_name}: {e}"
|
||||
log.error(msg)
|
||||
scrape_errors_total.labels(source="rightmove").inc()
|
||||
with status_lock:
|
||||
status.errors.append(msg)
|
||||
if SCRAPE_RIGHTMOVE:
|
||||
try:
|
||||
outcode_id = resolve_outcode_id(client, outcode)
|
||||
if not outcode_id:
|
||||
log.debug("No Rightmove ID for outcode %s, skipping", outcode)
|
||||
else:
|
||||
props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
|
||||
for p in props:
|
||||
pid = p["id"]
|
||||
if pid not in all_properties:
|
||||
all_properties[pid] = p
|
||||
seen_dedup_keys.add(_dedup_key(p))
|
||||
rm_count += 1
|
||||
except Exception as e:
|
||||
msg = f"Error scraping Rightmove {outcode}/{channel_name}: {e}"
|
||||
log.error(msg)
|
||||
scrape_errors_total.labels(source="rightmove").inc()
|
||||
with status_lock:
|
||||
status.errors.append(msg)
|
||||
|
||||
# --- home.co.uk ---
|
||||
if hk_client and not hk_failed:
|
||||
|
|
@ -276,6 +292,7 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
|||
status.finished_at = time.time()
|
||||
_sync_gauges()
|
||||
finally:
|
||||
client.close()
|
||||
if client:
|
||||
client.close()
|
||||
if hk_client:
|
||||
hk_client.close()
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ def fix_coords(lat: float, lng: float) -> tuple[float, float]:
|
|||
|
||||
|
||||
def normalize_price(amount: int, frequency: str) -> int:
|
||||
"""Normalize price to monthly for rentals (weekly × 52/12, yearly ÷ 12)."""
|
||||
"""Normalise price to monthly for rentals (weekly × 52/12, yearly ÷ 12)."""
|
||||
if frequency == "weekly":
|
||||
return round(amount * 52 / 12)
|
||||
if frequency == "yearly":
|
||||
|
|
@ -111,7 +111,7 @@ def transform_property(prop: dict, outcode: str, pc_index: PostcodeSpatialIndex)
|
|||
"lat": lat,
|
||||
"Postcode": postcode,
|
||||
"Address per Property Register": prop.get("displayAddress", ""),
|
||||
"Leashold/Freehold": extract_tenure(prop.get("tenure")),
|
||||
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
|
||||
"Property type": map_property_type(sub_type),
|
||||
"Property sub-type": sub_type or "Unknown",
|
||||
"price": price,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue