diff --git a/docker-compose.yml b/docker-compose.yml index 9c58ae6..22fb204 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -136,6 +136,17 @@ services: # devices: # - /dev/net/tun:/dev/net/tun + # flaresolverr: + # image: ghcr.io/flaresolverr/flaresolverr:latest + # environment: + # LOG_LEVEL: info + # TZ: Europe/London + # ports: + # - "8191:8191" + # networks: + # - dev-network + # restart: unless-stopped + # finder: # build: # context: . @@ -144,9 +155,13 @@ services: # network_mode: service:gluetun # volumes: # - ./finder:/app + # environment: + # FLARESOLVERR_URL: http://flaresolverr:8191 # depends_on: # gluetun: # condition: service_healthy + # flaresolverr: + # condition: service_started # restart: unless-stopped diff --git a/finder/constants.py b/finder/constants.py index 17d45a4..a1ebdcc 100644 --- a/finder/constants.py +++ b/finder/constants.py @@ -20,6 +20,11 @@ TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead" SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search" RIGHTMOVE_BASE = "https://www.rightmove.co.uk" +# home.co.uk +HOMECOUK_BASE = "https://home.co.uk" +HOMECOUK_API_BASE = f"{HOMECOUK_BASE}/api" +HOMECOUK_PER_PAGE = 30 # max supported by the API + PROPERTY_TYPE_MAP = { "Detached": "Detached", "Semi-Detached": "Semi-Detached", diff --git a/finder/homecouk.py b/finder/homecouk.py new file mode 100644 index 0000000..c46e79d --- /dev/null +++ b/finder/homecouk.py @@ -0,0 +1,339 @@ +import logging +import os +import random +import re +import time + +import httpx + +from constants import ( + DELAY_BETWEEN_PAGES, + HOMECOUK_API_BASE, + HOMECOUK_BASE, + HOMECOUK_PER_PAGE, + PROPERTY_TYPE_MAP, + RETRY_BASE_DELAY, +) +from metrics import ( + flaresolverr_attempts_total, + homecouk_errors_total, + homecouk_properties_scraped, + homecouk_requests_total, +) +from spatial import PostcodeSpatialIndex + +log = logging.getLogger("homecouk") + + +class CookiesExpiredError(Exception): + """Raised when home.co.uk returns 
def solve_cloudflare() -> tuple[dict[str, str], str] | None:
    """Solve the home.co.uk Cloudflare challenge through FlareSolverr.

    Sends a ``request.get`` command for a home.co.uk search page to the
    FlareSolverr service at ``FLARESOLVERR_URL`` and extracts the cookies
    needed for later API calls from the solved session.

    Every outcome increments ``flaresolverr_attempts_total`` exactly once,
    labelled ``success``, ``no_cf_clearance``, ``no_session``,
    ``unavailable`` or ``error``.

    Returns:
        ``(cookies, user_agent)`` where ``cookies`` contains ``cf_clearance``,
        ``homecouk_session`` and, when the solver returned it, ``XSRF-TOKEN``.
        ``None`` on any failure; failures are logged rather than raised.
    """
    log.info("Solving Cloudflare challenge via FlareSolverr at %s", FLARESOLVERR_URL)
    wanted = ("cf_clearance", "homecouk_session", "XSRF-TOKEN")
    try:
        # The HTTP timeout (120 s) is deliberately longer than the solver's
        # own maxTimeout (60000 ms) so FlareSolverr can report its own
        # timeout instead of us cutting the connection first.
        with httpx.Client(timeout=120) as client:
            resp = client.post(
                f"{FLARESOLVERR_URL}/v1",
                json={
                    "cmd": "request.get",
                    "url": f"{HOMECOUK_BASE}/for-sale/e1/",
                    "maxTimeout": 60000,
                },
            )

        if resp.status_code != 200:
            log.error("FlareSolverr returned HTTP %d", resp.status_code)
            # Count this attempt too, otherwise the metric under-reports failures.
            flaresolverr_attempts_total.labels(result="error").inc()
            return None

        data = resp.json()
        if data.get("status") != "ok":
            log.error("FlareSolverr error: %s", data.get("message", "unknown"))
            flaresolverr_attempts_total.labels(result="error").inc()
            return None

        solution = data.get("solution") or {}
        user_agent = solution.get("userAgent") or ""

        # Keep only the cookies the API client needs; ignore entries that
        # carry no value so the checks below report them as missing.
        cookies: dict[str, str] = {}
        for cookie in solution.get("cookies") or []:
            name = cookie.get("name", "")
            value = cookie.get("value")
            if name in wanted and value:
                cookies[name] = value

        if "cf_clearance" not in cookies:
            log.error("FlareSolverr solved but no cf_clearance cookie returned")
            flaresolverr_attempts_total.labels(result="no_cf_clearance").inc()
            return None
        if "homecouk_session" not in cookies:
            log.error("FlareSolverr solved but no homecouk_session cookie returned")
            flaresolverr_attempts_total.labels(result="no_session").inc()
            return None

        log.info(
            "Cloudflare solved — got %d cookies, UA: %s",
            len(cookies), user_agent[:60],
        )
        flaresolverr_attempts_total.labels(result="success").inc()
        return cookies, user_agent

    except (httpx.ConnectError, httpx.TimeoutException) as e:
        # TimeoutException covers connect/read/write/pool timeouts: in all of
        # these cases the solver service is effectively not reachable.
        log.warning("FlareSolverr not available: %s", e)
        flaresolverr_attempts_total.labels(result="unavailable").inc()
        return None
    except Exception as e:
        # Best-effort boundary: callers fall back to env-var cookies on None.
        log.error("FlareSolverr error: %s", e)
        flaresolverr_attempts_total.labels(result="error").inc()
        return None
+ 403 means cookies expired — raises CookiesExpiredError immediately.""" + for attempt in range(max_retries): + try: + resp = client.get(url, params=params) + homecouk_requests_total.labels(status=_status_label(resp.status_code)).inc() + if resp.status_code == 200: + return resp.json() + if resp.status_code == 403: + raise CookiesExpiredError("HTTP 403 — cookies likely expired") + if resp.status_code in (429, 500, 502, 503, 504): + delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) + log.warning( + "HTTP %d from %s, retry %d/%d in %.1fs", + resp.status_code, url, attempt + 1, max_retries, delay, + ) + time.sleep(delay) + continue + log.error("HTTP %d from %s (non-retryable)", resp.status_code, url) + return None + except CookiesExpiredError: + raise + except ( + httpx.ConnectError, httpx.ReadTimeout, + httpx.WriteTimeout, httpx.PoolTimeout, + ) as e: + homecouk_errors_total.labels(type=type(e).__name__).inc() + delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) + log.warning( + "%s from %s, retry %d/%d in %.1fs", + type(e).__name__, url, attempt + 1, max_retries, delay, + ) + time.sleep(delay) + homecouk_errors_total.labels(type="retry_exhausted").inc() + log.error("All %d retries exhausted for %s", max_retries, url) + return None + + +def parse_floor_area(description: str | None) -> float | None: + """Try to extract floor area from description text like '789 sq.ft.' 
or '73 sq.m.'.""" + if not description: + return None + m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE) + if m: + sqft = float(m.group(1).replace(",", "")) + return round(sqft * 0.092903, 1) + m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE) + if m: + return round(float(m.group(1).replace(",", "")), 1) + return None + + +def map_property_type(raw_type: str | None) -> str: + """Map home.co.uk property type to canonical type.""" + if not raw_type: + return "Other" + canonical = PROPERTY_TYPE_MAP.get(raw_type) + if canonical: + return canonical + # Home.co.uk uses types like "House", "Flat", "Apartment", "Detached", etc. + # Try common patterns + lower = raw_type.lower() + if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower: + return "Flats/Maisonettes" + if "detached" in lower and "semi" not in lower: + return "Detached" + if "semi" in lower: + return "Semi-Detached" + if "terrace" in lower or "mews" in lower: + return "Terraced" + log.debug("Unknown property type: %r — mapping to Other", raw_type) + return "Other" + + +def transform_property( + prop: dict, channel: str, pc_index: PostcodeSpatialIndex, +) -> dict | None: + """Transform a raw home.co.uk property dict into our output schema.""" + lat = prop.get("latitude") + lng = prop.get("longitude") + if lat is None or lng is None: + return None + + # Validate coordinates are in England + if not (49 <= lat <= 56 and -7 <= lng <= 2): + log.debug("Coords outside England: lat=%.4f lng=%.4f — skipping", lat, lng) + return None + + price = prop.get("price") or prop.get("latest_price") + if not price: + return None + + # Home.co.uk provides postcodes directly, but fall back to spatial index + postcode = prop.get("postcode") + if not postcode: + postcode = pc_index.nearest(lat, lng) + if not postcode: + log.debug("No postcode for property at %.4f, %.4f — skipping", lat, lng) + return None + + bedrooms = prop.get("bedrooms", 0) or 
def search_outcode(
    client: httpx.Client,
    outcode: str,
    channel: str,
    pc_index: PostcodeSpatialIndex,
) -> list[dict]:
    """Page through home.co.uk search results for one outcode and channel.

    Args:
        client: Client carrying the home.co.uk cookies; its ``referer``
            header is overwritten before every page request.
        outcode: Postcode outcode to search; lower-cased for the URL.
        channel: ``"BUY"`` or ``"RENT"`` (a key of ``HOMECOUK_CHANNELS``).
        pc_index: Spatial index passed on to ``transform_property``.

    Returns:
        The transformed properties collected from all pages. If a page fetch
        fails, the properties gathered so far are returned.

    Raises:
        KeyError: If ``channel`` is not a key of ``HOMECOUK_CHANNELS``.
        CookiesExpiredError: Propagated from ``fetch_page`` on HTTP 403.
    """
    url_segment = HOMECOUK_CHANNELS[channel]
    outcode_slug = outcode.lower()
    url = f"{HOMECOUK_API_BASE}/{url_segment}/{outcode_slug}/"
    metric_channel = "buy" if channel == "BUY" else "rent"
    properties: list[dict] = []
    page = 1

    while True:
        params = {
            "page": str(page),
            "sort": "date_desc",
            "per_page": str(HOMECOUK_PER_PAGE),
        }

        # Set referer to match the page URL a browser would be on; built from
        # HOMECOUK_BASE so it cannot drift from the configured site URL.
        client.headers["referer"] = (
            f"{HOMECOUK_BASE}/{url_segment}/{outcode_slug}/"
            f"?page={page}&sort=date_desc&per_page={HOMECOUK_PER_PAGE}"
        )

        data = fetch_page(client, url, params)
        if not data:
            break

        # `or []` also covers an explicit JSON null.
        raw_props = data.get("properties") or []
        if not raw_props:
            break

        for prop in raw_props:
            transformed = transform_property(prop, channel, pc_index)
            if transformed:
                properties.append(transformed)
                homecouk_properties_scraped.labels(channel=metric_channel).inc()

        # Check pagination. A missing, null or non-numeric last_page is
        # treated as "this was the only page" instead of raising.
        pagination = data.get("pagination") or {}
        try:
            last_page = int(pagination.get("last_page") or 1)
        except (TypeError, ValueError):
            last_page = 1
        if page >= last_page:
            break

        page += 1
        time.sleep(DELAY_BETWEEN_PAGES)

    return properties
load_homecouk_cookies() return jsonify({ "outcode_cache_size": len(outcode_cache), "outcode_cache_sample": dict(list(outcode_cache.items())[:20]), + "homecouk_enabled": hk_cookies is not None, }) diff --git a/finder/metrics.py b/finder/metrics.py index c43d943..dae9425 100644 --- a/finder/metrics.py +++ b/finder/metrics.py @@ -23,7 +23,7 @@ scrape_outcodes_total = Gauge( scrape_properties_total = Gauge( "scrape_properties_total", "Properties found so far", - ["channel"], + ["channel", "source"], ) scrape_elapsed_seconds = Gauge( @@ -32,18 +32,18 @@ scrape_elapsed_seconds = Gauge( ) # --------------------------------------------------------------------------- -# Counters — monotonically increasing +# Counters — Rightmove (monotonically increasing) # --------------------------------------------------------------------------- http_requests_total = Counter( "http_requests_total", - "HTTP requests made by the scraper", + "HTTP requests made to Rightmove", ["status", "endpoint"], ) http_errors_total = Counter( "http_errors_total", - "HTTP connection/timeout errors", + "Rightmove HTTP connection/timeout errors", ["type"], ) @@ -56,4 +56,58 @@ ip_rotations_total = Counter( scrape_errors_total = Counter( "scrape_errors_total", "Per-outcode scrape errors", + ["source"], +) + +# --------------------------------------------------------------------------- +# Counters — home.co.uk +# --------------------------------------------------------------------------- + +homecouk_requests_total = Counter( + "homecouk_requests_total", + "HTTP requests made to home.co.uk API", + ["status"], +) + +homecouk_errors_total = Counter( + "homecouk_errors_total", + "home.co.uk HTTP connection/timeout errors", + ["type"], +) + +homecouk_properties_scraped = Counter( + "homecouk_properties_scraped", + "Properties scraped from home.co.uk (before dedup)", + ["channel"], +) + +cross_source_dedup_total = Counter( + "cross_source_dedup_total", + "home.co.uk properties skipped because same property already 
found on Rightmove", + ["channel"], +) + +# --------------------------------------------------------------------------- +# Counters — FlareSolverr / cookie management +# --------------------------------------------------------------------------- + +flaresolverr_attempts_total = Counter( + "flaresolverr_attempts_total", + "FlareSolverr Cloudflare challenge-solving attempts", + ["result"], +) + +cookie_refreshes_total = Counter( + "cookie_refreshes_total", + "home.co.uk cookie refresh attempts (triggered by 403)", + ["result"], +) + +# --------------------------------------------------------------------------- +# Gauges — home.co.uk state +# --------------------------------------------------------------------------- + +homecouk_enabled = Gauge( + "homecouk_enabled", + "Whether home.co.uk scraping is currently active (1=yes, 0=no)", ) diff --git a/finder/scraper.py b/finder/scraper.py index 1e3f00d..f4bd151 100644 --- a/finder/scraper.py +++ b/finder/scraper.py @@ -7,8 +7,15 @@ from dataclasses import dataclass, field import polars as pl from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SEED +from homecouk import CookiesExpiredError +from homecouk import load_cookies as load_homecouk_cookies +from homecouk import make_client as make_homecouk_client +from homecouk import search_outcode as homecouk_search_outcode from http_client import make_client from metrics import ( + cookie_refreshes_total, + cross_source_dedup_total, + homecouk_enabled, scrape_elapsed_seconds, scrape_errors_total, scrape_outcodes_done, @@ -32,6 +39,9 @@ class ScrapeStatus: outcodes_total: int = 0 properties_buy: int = 0 properties_rent: int = 0 + # Per-source counts for current channel + rm_properties: int = 0 + hk_properties: int = 0 errors: list[str] = field(default_factory=list) started_at: float = 0.0 finished_at: float = 0.0 @@ -47,8 +57,13 @@ def _sync_gauges() -> None: scrape_state.labels(state=state).set(1 if status.state == state else 0) 
scrape_outcodes_done.set(status.outcodes_done) scrape_outcodes_total.set(status.outcodes_total) - scrape_properties_total.labels(channel="buy").set(status.properties_buy) - scrape_properties_total.labels(channel="rent").set(status.properties_rent) + # Total properties (both sources combined) + scrape_properties_total.labels(channel="buy", source="total").set(status.properties_buy) + scrape_properties_total.labels(channel="rent", source="total").set(status.properties_rent) + # Per-source breakdown for current channel + ch = "buy" if status.channel == "BUY" else "rent" + scrape_properties_total.labels(channel=ch, source="rightmove").set(status.rm_properties) + scrape_properties_total.labels(channel=ch, source="homecouk").set(status.hk_properties) if status.started_at: end = status.finished_at if status.finished_at else time.time() scrape_elapsed_seconds.set(end - status.started_at) @@ -87,8 +102,16 @@ def build_postcode_index() -> PostcodeSpatialIndex: ) +def _dedup_key(p: dict) -> tuple: + """Composite key for cross-source deduplication: (postcode, bedrooms, price). + Two listings on different portals for the same physical property will share + these attributes even though their IDs differ.""" + return (p.get("Postcode", ""), p.get("Bedrooms", 0), p.get("price", 0)) + + def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None: - """Main scrape loop — runs in background thread.""" + """Main scrape loop — runs in background thread. 
+ Scrapes Rightmove and (if configured) home.co.uk, merging into one dataset.""" global status with status_lock: status.state = "running" @@ -105,16 +128,33 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None: client = make_client() + # home.co.uk: optional, enabled when cookies are available (via FlareSolverr or env vars) + hk_result = load_homecouk_cookies() + hk_client = make_homecouk_client(*hk_result) if hk_result else None + if hk_client: + log.info("home.co.uk scraping ENABLED") + homecouk_enabled.set(1) + else: + log.info("home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)") + homecouk_enabled.set(0) + hk_failed = False # set to True on 403 to skip remaining outcodes + try: for channel_cfg in CHANNELS: channel_name = channel_cfg["channel"] file_suffix = "buy" if channel_name == "BUY" else "rent" - all_properties: dict[int, dict] = {} # dedup by id + all_properties: dict[str, dict] = {} # dedup by id + seen_dedup_keys: set[tuple] = set() # cross-source dedup by (postcode, beds, price) + rm_count = 0 # Rightmove properties this channel + hk_count = 0 # home.co.uk properties this channel + hk_dedup_count = 0 # home.co.uk skipped as cross-source duplicates with status_lock: status.channel = channel_name status.outcodes_done = 0 status.outcodes_total = len(shuffled) + status.rm_properties = 0 + status.hk_properties = 0 log.info("=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled)) @@ -126,34 +166,81 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None: log.debug("Outcode %s (%d/%d) — %d properties so far", outcode, i + 1, len(shuffled), len(all_properties)) + # --- Rightmove --- try: outcode_id = resolve_outcode_id(client, outcode) if not outcode_id: log.debug("No Rightmove ID for outcode %s, skipping", outcode) - continue - - props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index) - for p in props: - pid = p["id"] - if pid not in 
all_properties: - all_properties[pid] = p - - with status_lock: - if channel_name == "BUY": - status.properties_buy = len(all_properties) - else: - status.properties_rent = len(all_properties) - _sync_gauges() - - log.info("Outcode %s: got %d properties (total: %d)", outcode, len(props), len(all_properties)) - + else: + props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index) + for p in props: + pid = p["id"] + if pid not in all_properties: + all_properties[pid] = p + seen_dedup_keys.add(_dedup_key(p)) + rm_count += 1 except Exception as e: - msg = f"Error scraping {outcode}/{channel_name}: {e}" + msg = f"Error scraping Rightmove {outcode}/{channel_name}: {e}" log.error(msg) - scrape_errors_total.inc() + scrape_errors_total.labels(source="rightmove").inc() with status_lock: status.errors.append(msg) + # --- home.co.uk --- + if hk_client and not hk_failed: + try: + hk_props = homecouk_search_outcode( + hk_client, outcode, channel_name, pc_index, + ) + for p in hk_props: + pid = p["id"] + key = _dedup_key(p) + if pid in all_properties or key in seen_dedup_keys: + hk_dedup_count += 1 + cross_source_dedup_total.labels( + channel="buy" if channel_name == "BUY" else "rent", + ).inc() + continue + all_properties[pid] = p + seen_dedup_keys.add(key) + hk_count += 1 + if hk_props: + log.info("home.co.uk %s: +%d properties", outcode, len(hk_props)) + except CookiesExpiredError: + log.warning("home.co.uk cookies expired — attempting refresh via FlareSolverr") + hk_client.close() + hk_result = load_homecouk_cookies() + if hk_result: + hk_client = make_homecouk_client(*hk_result) + log.info("home.co.uk cookies refreshed, continuing") + cookie_refreshes_total.labels(result="success").inc() + else: + log.warning("Cookie refresh failed, disabling home.co.uk for rest of scrape") + hk_client = None + hk_failed = True + homecouk_enabled.set(0) + cookie_refreshes_total.labels(result="failure").inc() + with status_lock: + status.errors.append("home.co.uk cookies 
expired and refresh failed") + except Exception as e: + msg = f"Error scraping home.co.uk {outcode}/{channel_name}: {e}" + log.error(msg) + scrape_errors_total.labels(source="homecouk").inc() + with status_lock: + status.errors.append(msg) + + with status_lock: + if channel_name == "BUY": + status.properties_buy = len(all_properties) + else: + status.properties_rent = len(all_properties) + status.rm_properties = rm_count + status.hk_properties = hk_count + _sync_gauges() + + log.info("Outcode %s: total %d (rm: %d, hk: %d)", + outcode, len(all_properties), rm_count, hk_count) + if i < len(shuffled) - 1: time.sleep(DELAY_BETWEEN_OUTCODES) @@ -170,7 +257,8 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None: status.outcodes_done = len(shuffled) _sync_gauges() - log.info("=== %s channel complete: %d unique properties ===", channel_name, len(deduped)) + log.info("=== %s channel complete: %d unique (rm: %d, hk: %d, cross-dedup: %d) ===", + channel_name, len(deduped), rm_count, hk_count, hk_dedup_count) with status_lock: status.state = "done" @@ -189,3 +277,5 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None: _sync_gauges() finally: client.close() + if hk_client: + hk_client.close()