home.co.uk scraping
This commit is contained in:
parent
74d6dd7bf8
commit
f3e3c1ee49
6 changed files with 538 additions and 28 deletions
|
|
@ -136,6 +136,17 @@ services:
|
||||||
# devices:
|
# devices:
|
||||||
# - /dev/net/tun:/dev/net/tun
|
# - /dev/net/tun:/dev/net/tun
|
||||||
|
|
||||||
|
# flaresolverr:
|
||||||
|
# image: ghcr.io/flaresolverr/flaresolverr:latest
|
||||||
|
# environment:
|
||||||
|
# LOG_LEVEL: info
|
||||||
|
# TZ: Europe/London
|
||||||
|
# ports:
|
||||||
|
# - "8191:8191"
|
||||||
|
# networks:
|
||||||
|
# - dev-network
|
||||||
|
# restart: unless-stopped
|
||||||
|
|
||||||
# finder:
|
# finder:
|
||||||
# build:
|
# build:
|
||||||
# context: .
|
# context: .
|
||||||
|
|
@ -144,9 +155,13 @@ services:
|
||||||
# network_mode: service:gluetun
|
# network_mode: service:gluetun
|
||||||
# volumes:
|
# volumes:
|
||||||
# - ./finder:/app
|
# - ./finder:/app
|
||||||
|
# environment:
|
||||||
|
# FLARESOLVERR_URL: http://flaresolverr:8191
|
||||||
# depends_on:
|
# depends_on:
|
||||||
# gluetun:
|
# gluetun:
|
||||||
# condition: service_healthy
|
# condition: service_healthy
|
||||||
|
# flaresolverr:
|
||||||
|
# condition: service_started
|
||||||
# restart: unless-stopped
|
# restart: unless-stopped
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,11 @@ TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
|
||||||
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
|
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
|
||||||
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
|
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
|
||||||
|
|
||||||
|
# home.co.uk
|
||||||
|
HOMECOUK_BASE = "https://home.co.uk"
|
||||||
|
HOMECOUK_API_BASE = f"{HOMECOUK_BASE}/api"
|
||||||
|
HOMECOUK_PER_PAGE = 30 # max supported by the API
|
||||||
|
|
||||||
PROPERTY_TYPE_MAP = {
|
PROPERTY_TYPE_MAP = {
|
||||||
"Detached": "Detached",
|
"Detached": "Detached",
|
||||||
"Semi-Detached": "Semi-Detached",
|
"Semi-Detached": "Semi-Detached",
|
||||||
|
|
|
||||||
339
finder/homecouk.py
Normal file
339
finder/homecouk.py
Normal file
|
|
@ -0,0 +1,339 @@
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from constants import (
|
||||||
|
DELAY_BETWEEN_PAGES,
|
||||||
|
HOMECOUK_API_BASE,
|
||||||
|
HOMECOUK_BASE,
|
||||||
|
HOMECOUK_PER_PAGE,
|
||||||
|
PROPERTY_TYPE_MAP,
|
||||||
|
RETRY_BASE_DELAY,
|
||||||
|
)
|
||||||
|
from metrics import (
|
||||||
|
flaresolverr_attempts_total,
|
||||||
|
homecouk_errors_total,
|
||||||
|
homecouk_properties_scraped,
|
||||||
|
homecouk_requests_total,
|
||||||
|
)
|
||||||
|
from spatial import PostcodeSpatialIndex
|
||||||
|
|
||||||
|
log = logging.getLogger("homecouk")
|
||||||
|
|
||||||
|
|
||||||
|
class CookiesExpiredError(Exception):
    """Signal that home.co.uk no longer accepts the current cookies.

    Raised when the site answers with HTTP 403, which is treated as a sign
    that the cookies have to be refreshed before scraping can continue.
    """
|
||||||
|
|
||||||
|
|
||||||
|
# Translates our internal channel name into the path segment used in
# home.co.uk URLs.
HOMECOUK_CHANNELS = dict(BUY="for-sale", RENT="to-rent")


# Base URL of the FlareSolverr service; can be overridden via the environment.
FLARESOLVERR_URL = os.getenv("FLARESOLVERR_URL", "http://flaresolverr:8191")
|
||||||
|
|
||||||
|
|
||||||
|
def solve_cloudflare() -> tuple[dict[str, str], str] | None:
    """Obtain home.co.uk clearance cookies by asking FlareSolverr to load a page.

    Posts a ``request.get`` command to ``{FLARESOLVERR_URL}/v1`` for a
    home.co.uk search page and keeps the cookies named ``cf_clearance``,
    ``homecouk_session`` and ``XSRF-TOKEN`` from the returned solution.
    Every exit path records its outcome in ``flaresolverr_attempts_total``
    so the metric reflects all attempts, not just some failure modes.

    Returns:
        ``(cookies, user_agent)`` when both ``cf_clearance`` and
        ``homecouk_session`` were returned, otherwise ``None``.
    """
    wanted = ("cf_clearance", "homecouk_session", "XSRF-TOKEN")
    # Cookies that must be present, paired with the metric label recorded
    # when one is missing. Checked in this order.
    required = (
        ("cf_clearance", "no_cf_clearance"),
        ("homecouk_session", "no_session"),
    )

    log.info("Solving Cloudflare challenge via FlareSolverr at %s", FLARESOLVERR_URL)
    try:
        # The HTTP timeout (120 s) is deliberately larger than the solver
        # budget passed as maxTimeout (presumably milliseconds — see the
        # FlareSolverr API docs), so the solver can report its own timeout.
        with httpx.Client(timeout=120) as client:
            resp = client.post(
                f"{FLARESOLVERR_URL}/v1",
                json={
                    "cmd": "request.get",
                    "url": f"{HOMECOUK_BASE}/for-sale/e1/",
                    "maxTimeout": 60000,
                },
            )
            if resp.status_code != 200:
                log.error("FlareSolverr returned HTTP %d", resp.status_code)
                flaresolverr_attempts_total.labels(result="http_error").inc()
                return None

            data = resp.json()
            if data.get("status") != "ok":
                log.error("FlareSolverr error: %s", data.get("message", "unknown"))
                flaresolverr_attempts_total.labels(result="solver_error").inc()
                return None

            solution = data["solution"]
            user_agent = solution.get("userAgent", "")
            cookies = {
                c.get("name", ""): c["value"]
                for c in solution.get("cookies", [])
                if c.get("name", "") in wanted
            }

            for cookie_name, failure_label in required:
                if cookie_name not in cookies:
                    log.error(
                        "FlareSolverr solved but no %s cookie returned", cookie_name,
                    )
                    flaresolverr_attempts_total.labels(result=failure_label).inc()
                    return None

            log.info(
                "Cloudflare solved — got %d cookies, UA: %s",
                len(cookies), user_agent[:60],
            )
            flaresolverr_attempts_total.labels(result="success").inc()
            return cookies, user_agent

    except (httpx.ConnectError, httpx.ReadTimeout) as e:
        # The service being down is an expected condition (it is optional),
        # hence a warning rather than an error.
        log.warning("FlareSolverr not available: %s", e)
        flaresolverr_attempts_total.labels(result="unavailable").inc()
        return None
    except Exception as e:
        # Best-effort boundary: malformed payloads (missing keys, bad JSON)
        # must not crash the scraper; callers fall back to other cookie sources.
        log.error("FlareSolverr error: %s", e)
        flaresolverr_attempts_total.labels(result="error").inc()
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def load_cookies() -> tuple[dict[str, str], str] | None:
    """Return the cookies and user-agent to use for home.co.uk.

    ``solve_cloudflare()`` is tried first. If it yields nothing, the
    ``HOMECOUK_CF_CLEARANCE`` and ``HOMECOUK_SESSION`` environment variables
    are used instead, together with ``HOMECOUK_USER_AGENT`` (or a built-in
    Chrome user-agent string when that variable is unset).

    Returns:
        ``(cookies, user_agent)``, or ``None`` when neither source provides
        both required cookies.
    """
    solved = solve_cloudflare()
    if solved:
        return solved

    # Environment fallback: both cookies are needed, otherwise give up.
    env = os.environ
    clearance = env.get("HOMECOUK_CF_CLEARANCE", "")
    session_cookie = env.get("HOMECOUK_SESSION", "")
    if not (clearance and session_cookie):
        return None

    default_agent = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/145.0.0.0 Safari/537.36"
    )
    agent = env.get("HOMECOUK_USER_AGENT", default_agent)
    jar = {"cf_clearance": clearance, "homecouk_session": session_cookie}
    return jar, agent
|
||||||
|
|
||||||
|
|
||||||
|
def make_client(cookies: dict[str, str], user_agent: str) -> httpx.Client:
    """Build the httpx client used for home.co.uk API requests.

    Args:
        cookies: Cookies to send with every request.
        user_agent: User-Agent header value; it has to be the same one that
            was used when the ``cf_clearance`` cookie was obtained.

    Returns:
        A client with a 30 s timeout that follows redirects and sends
        XHR-style request headers.
    """
    request_headers = {
        "User-Agent": user_agent,
        "Accept": "application/json, text/plain, */*",
        "x-requested-with": "XMLHttpRequest",
    }
    return httpx.Client(
        headers=request_headers,
        cookies=cookies,
        follow_redirects=True,
        timeout=30,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _status_label(code: int) -> str:
|
||||||
|
if code >= 500:
|
||||||
|
return "5xx"
|
||||||
|
return str(code)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_page(
    client: httpx.Client, url: str, params: dict, max_retries: int = 3
) -> dict | None:
    """GET a JSON document, retrying transient failures with backoff.

    HTTP 429/500/502/503/504 and connect/read/write/pool timeouts are retried
    up to ``max_retries`` attempts with exponential backoff plus jitter. No
    delay is spent after the final attempt, since nothing follows it.

    Args:
        client: Configured httpx client.
        url: Endpoint to request.
        params: Query-string parameters.
        max_retries: Total number of attempts.

    Returns:
        The decoded JSON body, or ``None`` on permanent failure (non-retryable
        status, a 200 response whose body is not valid JSON, or retries
        exhausted).

    Raises:
        CookiesExpiredError: Immediately on HTTP 403.
    """
    retryable_statuses = (429, 500, 502, 503, 504)

    for attempt in range(max_retries):
        try:
            resp = client.get(url, params=params)
        except (
            httpx.ConnectError, httpx.ReadTimeout,
            httpx.WriteTimeout, httpx.PoolTimeout,
        ) as e:
            failure = type(e).__name__
            homecouk_errors_total.labels(type=failure).inc()
        else:
            homecouk_requests_total.labels(status=_status_label(resp.status_code)).inc()
            if resp.status_code == 200:
                try:
                    return resp.json()
                except ValueError as e:
                    # A 200 with a non-JSON body (e.g. an HTML interstitial)
                    # is a permanent failure for this page, not a crash.
                    homecouk_errors_total.labels(type="invalid_json").inc()
                    log.error("Invalid JSON from %s: %s", url, e)
                    return None
            if resp.status_code == 403:
                raise CookiesExpiredError("HTTP 403 — cookies likely expired")
            if resp.status_code not in retryable_statuses:
                log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
                return None
            failure = f"HTTP {resp.status_code}"

        # Sleeping after the last attempt would only delay the failure report.
        if attempt + 1 >= max_retries:
            break
        delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
        log.warning(
            "%s from %s, retry %d/%d in %.1fs",
            failure, url, attempt + 1, max_retries, delay,
        )
        time.sleep(delay)

    homecouk_errors_total.labels(type="retry_exhausted").inc()
    log.error("All %d retries exhausted for %s", max_retries, url)
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_floor_area(description: str | None) -> float | None:
|
||||||
|
"""Try to extract floor area from description text like '789 sq.ft.' or '73 sq.m.'."""
|
||||||
|
if not description:
|
||||||
|
return None
|
||||||
|
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE)
|
||||||
|
if m:
|
||||||
|
sqft = float(m.group(1).replace(",", ""))
|
||||||
|
return round(sqft * 0.092903, 1)
|
||||||
|
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE)
|
||||||
|
if m:
|
||||||
|
return round(float(m.group(1).replace(",", "")), 1)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def map_property_type(raw_type: str | None) -> str:
    """Translate a home.co.uk property type into our canonical type.

    An exact hit in ``PROPERTY_TYPE_MAP`` wins. Otherwise the lower-cased
    text is matched against keywords, and anything unrecognised (including
    an empty or missing type) becomes ``"Other"``.
    """
    if not raw_type:
        return "Other"

    mapped = PROPERTY_TYPE_MAP.get(raw_type)
    if mapped:
        return mapped

    # home.co.uk reports types such as "House", "Flat", "Apartment",
    # "Detached", ... so fall back to keyword matching.
    text = raw_type.lower()
    if any(word in text for word in ("flat", "apartment", "maisonette", "studio")):
        return "Flats/Maisonettes"
    # "semi" is tested before "detached" so that "semi-detached" is never
    # classified as plain "Detached".
    if "semi" in text:
        return "Semi-Detached"
    if "detached" in text:
        return "Detached"
    if any(word in text for word in ("terrace", "mews")):
        return "Terraced"

    log.debug("Unknown property type: %r — mapping to Other", raw_type)
    return "Other"
|
||||||
|
|
||||||
|
|
||||||
|
def transform_property(
    prop: dict, channel: str, pc_index: PostcodeSpatialIndex,
) -> dict | None:
    """Transform a raw home.co.uk property dict into our output schema.

    Args:
        prop: One property object from the home.co.uk API response.
        channel: ``"BUY"`` or ``"RENT"``; anything other than ``"BUY"`` is
            given a monthly price frequency.
        pc_index: Spatial index used to look up the nearest postcode when the
            listing does not carry one.

    Returns:
        The output record, or ``None`` when the listing lacks coordinates,
        lies outside the accepted bounding box, or has no price, no listing
        id, or no resolvable postcode.
    """
    lat = prop.get("latitude")
    lng = prop.get("longitude")
    if lat is None or lng is None:
        return None

    # Coarse bounding-box sanity check (lat 49..56, lng -7..2). It rejects
    # obviously wrong coordinates; it is not a precise England boundary.
    if not (49 <= lat <= 56 and -7 <= lng <= 2):
        log.debug("Coords outside England: lat=%.4f lng=%.4f — skipping", lat, lng)
        return None

    price = prop.get("price") or prop.get("latest_price")
    if not price:
        return None

    # Without an id every such listing would share the id "hk_" (and so be
    # treated as a duplicate of the others) and get a dead listing URL.
    listing_id = prop.get("listing_id") or prop.get("property_id")
    if not listing_id:
        log.debug("No listing id for property at %.4f, %.4f — skipping", lat, lng)
        return None

    # home.co.uk usually provides the postcode; fall back to the spatial index.
    postcode = prop.get("postcode") or pc_index.nearest(lat, lng)
    if not postcode:
        log.debug("No postcode for property at %.4f, %.4f — skipping", lat, lng)
        return None

    # `or 0` also covers an explicit null in the payload.
    bedrooms = prop.get("bedrooms", 0) or 0
    bathrooms = prop.get("bathrooms", 0) or 0

    listing_type = prop.get("listing_property_type") or prop.get("property_type") or ""
    address = prop.get("display_address") or prop.get("address") or ""

    # Derive the price qualifier from the reduction info, if any.
    price_qualifier = ""
    if prop.get("is_reduced"):
        pct = prop.get("reduction_percent", 0)
        price_qualifier = f"Reduced by {pct}%" if pct else "Reduced"

    return {
        "id": f"hk_{listing_id}",  # prefix to avoid collision with Rightmove IDs
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        # NOTE(review): this sums bedrooms and bathrooms although the field
        # is named "bedrooms & living rooms" — confirm this is intended.
        "Number of bedrooms & living rooms": bedrooms + bathrooms,
        "lon": lng,
        "lat": lat,
        "Postcode": postcode,
        "Address per Property Register": address,
        "Leashold/Freehold": None,  # not available from home.co.uk
        "Property type": map_property_type(listing_type),
        "Property sub-type": listing_type or "Unknown",
        "price": int(price),
        "price_frequency": "" if channel == "BUY" else "monthly",
        "Price qualifier": price_qualifier,
        "Total floor area (sqm)": parse_floor_area(prop.get("description")),
        "Listing URL": f"{HOMECOUK_BASE}/property/{listing_id}",
        "Listing features": [],  # not available from home.co.uk
        "first_visible_date": prop.get("added_date") or "",
    }
|
||||||
|
|
||||||
|
|
||||||
|
def search_outcode(
    client: httpx.Client,
    outcode: str,
    channel: str,
    pc_index: PostcodeSpatialIndex,
) -> list[dict]:
    """Paginate through the search results for one outcode and channel.

    Pages are requested newest-first until a fetch fails, a page comes back
    empty, or the page number reported as ``pagination.last_page`` is reached.
    A pause of ``DELAY_BETWEEN_PAGES`` seconds is taken between pages.

    Args:
        client: Client from ``make_client``; its ``referer`` header is
            updated for every page.
        outcode: Postcode outcode, e.g. ``"E1"`` (lower-cased for the URL).
        channel: ``"BUY"`` or ``"RENT"`` (a key of ``HOMECOUK_CHANNELS``).
        pc_index: Spatial index passed through to ``transform_property``.

    Returns:
        The successfully transformed properties (possibly empty).

    Raises:
        KeyError: If ``channel`` is not a known channel.
        CookiesExpiredError: Propagated from ``fetch_page`` on HTTP 403.
    """
    url_segment = HOMECOUK_CHANNELS[channel]
    path = f"{url_segment}/{outcode.lower()}/"
    url = f"{HOMECOUK_API_BASE}/{path}"
    metric_channel = "buy" if channel == "BUY" else "rent"
    properties: list[dict] = []
    page = 1

    while True:
        params = {
            "page": str(page),
            "sort": "date_desc",
            "per_page": str(HOMECOUK_PER_PAGE),
        }

        # Mirror the referer a browser would send from the matching page;
        # built from HOMECOUK_BASE so it cannot drift from the API host.
        client.headers["referer"] = (
            f"{HOMECOUK_BASE}/{path}"
            f"?page={page}&sort=date_desc&per_page={HOMECOUK_PER_PAGE}"
        )

        data = fetch_page(client, url, params)
        if not data:
            break

        # `or` guards against explicit nulls in the payload.
        raw_props = data.get("properties") or []
        if not raw_props:
            break

        for prop in raw_props:
            transformed = transform_property(prop, channel, pc_index)
            if transformed:
                properties.append(transformed)
                homecouk_properties_scraped.labels(channel=metric_channel).inc()

        pagination = data.get("pagination") or {}
        last_page = pagination.get("last_page") or 1
        if page >= last_page:
            break

        page += 1
        time.sleep(DELAY_BETWEEN_PAGES)

    return properties
|
||||||
|
|
@ -8,6 +8,7 @@ from flask import Flask, Response, jsonify, send_from_directory
|
||||||
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
|
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
|
||||||
|
|
||||||
from constants import DATA_DIR, RUN_ON_STARTUP, SCHEDULE_HOUR
|
from constants import DATA_DIR, RUN_ON_STARTUP, SCHEDULE_HOUR
|
||||||
|
from homecouk import load_cookies as load_homecouk_cookies
|
||||||
from rightmove import outcode_cache
|
from rightmove import outcode_cache
|
||||||
from scraper import (
|
from scraper import (
|
||||||
_sync_gauges,
|
_sync_gauges,
|
||||||
|
|
@ -122,6 +123,10 @@ def get_status():
|
||||||
"outcodes_total": status.outcodes_total,
|
"outcodes_total": status.outcodes_total,
|
||||||
"properties_buy": status.properties_buy,
|
"properties_buy": status.properties_buy,
|
||||||
"properties_rent": status.properties_rent,
|
"properties_rent": status.properties_rent,
|
||||||
|
"properties_by_source": {
|
||||||
|
"rightmove": status.rm_properties,
|
||||||
|
"homecouk": status.hk_properties,
|
||||||
|
},
|
||||||
"errors": status.errors[-20:], # last 20 errors
|
"errors": status.errors[-20:], # last 20 errors
|
||||||
"elapsed_seconds": round(elapsed, 1),
|
"elapsed_seconds": round(elapsed, 1),
|
||||||
}
|
}
|
||||||
|
|
@ -132,9 +137,11 @@ def get_status():
|
||||||
|
|
||||||
@app.route("/debug")
|
@app.route("/debug")
|
||||||
def get_debug():
|
def get_debug():
|
||||||
|
hk_cookies = load_homecouk_cookies()
|
||||||
return jsonify({
|
return jsonify({
|
||||||
"outcode_cache_size": len(outcode_cache),
|
"outcode_cache_size": len(outcode_cache),
|
||||||
"outcode_cache_sample": dict(list(outcode_cache.items())[:20]),
|
"outcode_cache_sample": dict(list(outcode_cache.items())[:20]),
|
||||||
|
"homecouk_enabled": hk_cookies is not None,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -23,7 +23,7 @@ scrape_outcodes_total = Gauge(
|
||||||
scrape_properties_total = Gauge(
|
scrape_properties_total = Gauge(
|
||||||
"scrape_properties_total",
|
"scrape_properties_total",
|
||||||
"Properties found so far",
|
"Properties found so far",
|
||||||
["channel"],
|
["channel", "source"],
|
||||||
)
|
)
|
||||||
|
|
||||||
scrape_elapsed_seconds = Gauge(
|
scrape_elapsed_seconds = Gauge(
|
||||||
|
|
@ -32,18 +32,18 @@ scrape_elapsed_seconds = Gauge(
|
||||||
)
|
)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Counters — monotonically increasing
|
# Counters — Rightmove (monotonically increasing)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
http_requests_total = Counter(
|
http_requests_total = Counter(
|
||||||
"http_requests_total",
|
"http_requests_total",
|
||||||
"HTTP requests made by the scraper",
|
"HTTP requests made to Rightmove",
|
||||||
["status", "endpoint"],
|
["status", "endpoint"],
|
||||||
)
|
)
|
||||||
|
|
||||||
http_errors_total = Counter(
|
http_errors_total = Counter(
|
||||||
"http_errors_total",
|
"http_errors_total",
|
||||||
"HTTP connection/timeout errors",
|
"Rightmove HTTP connection/timeout errors",
|
||||||
["type"],
|
["type"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -56,4 +56,58 @@ ip_rotations_total = Counter(
|
||||||
scrape_errors_total = Counter(
|
scrape_errors_total = Counter(
|
||||||
"scrape_errors_total",
|
"scrape_errors_total",
|
||||||
"Per-outcode scrape errors",
|
"Per-outcode scrape errors",
|
||||||
|
["source"],
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Counters — home.co.uk
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
homecouk_requests_total = Counter(
|
||||||
|
"homecouk_requests_total",
|
||||||
|
"HTTP requests made to home.co.uk API",
|
||||||
|
["status"],
|
||||||
|
)
|
||||||
|
|
||||||
|
homecouk_errors_total = Counter(
|
||||||
|
"homecouk_errors_total",
|
||||||
|
"home.co.uk HTTP connection/timeout errors",
|
||||||
|
["type"],
|
||||||
|
)
|
||||||
|
|
||||||
|
homecouk_properties_scraped = Counter(
|
||||||
|
"homecouk_properties_scraped",
|
||||||
|
"Properties scraped from home.co.uk (before dedup)",
|
||||||
|
["channel"],
|
||||||
|
)
|
||||||
|
|
||||||
|
cross_source_dedup_total = Counter(
|
||||||
|
"cross_source_dedup_total",
|
||||||
|
"home.co.uk properties skipped because same property already found on Rightmove",
|
||||||
|
["channel"],
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Counters — FlareSolverr / cookie management
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
flaresolverr_attempts_total = Counter(
|
||||||
|
"flaresolverr_attempts_total",
|
||||||
|
"FlareSolverr Cloudflare challenge-solving attempts",
|
||||||
|
["result"],
|
||||||
|
)
|
||||||
|
|
||||||
|
cookie_refreshes_total = Counter(
|
||||||
|
"cookie_refreshes_total",
|
||||||
|
"home.co.uk cookie refresh attempts (triggered by 403)",
|
||||||
|
["result"],
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Gauges — home.co.uk state
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
homecouk_enabled = Gauge(
|
||||||
|
"homecouk_enabled",
|
||||||
|
"Whether home.co.uk scraping is currently active (1=yes, 0=no)",
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,15 @@ from dataclasses import dataclass, field
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SEED
|
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SEED
|
||||||
|
from homecouk import CookiesExpiredError
|
||||||
|
from homecouk import load_cookies as load_homecouk_cookies
|
||||||
|
from homecouk import make_client as make_homecouk_client
|
||||||
|
from homecouk import search_outcode as homecouk_search_outcode
|
||||||
from http_client import make_client
|
from http_client import make_client
|
||||||
from metrics import (
|
from metrics import (
|
||||||
|
cookie_refreshes_total,
|
||||||
|
cross_source_dedup_total,
|
||||||
|
homecouk_enabled,
|
||||||
scrape_elapsed_seconds,
|
scrape_elapsed_seconds,
|
||||||
scrape_errors_total,
|
scrape_errors_total,
|
||||||
scrape_outcodes_done,
|
scrape_outcodes_done,
|
||||||
|
|
@ -32,6 +39,9 @@ class ScrapeStatus:
|
||||||
outcodes_total: int = 0
|
outcodes_total: int = 0
|
||||||
properties_buy: int = 0
|
properties_buy: int = 0
|
||||||
properties_rent: int = 0
|
properties_rent: int = 0
|
||||||
|
# Per-source counts for current channel
|
||||||
|
rm_properties: int = 0
|
||||||
|
hk_properties: int = 0
|
||||||
errors: list[str] = field(default_factory=list)
|
errors: list[str] = field(default_factory=list)
|
||||||
started_at: float = 0.0
|
started_at: float = 0.0
|
||||||
finished_at: float = 0.0
|
finished_at: float = 0.0
|
||||||
|
|
@ -47,8 +57,13 @@ def _sync_gauges() -> None:
|
||||||
scrape_state.labels(state=state).set(1 if status.state == state else 0)
|
scrape_state.labels(state=state).set(1 if status.state == state else 0)
|
||||||
scrape_outcodes_done.set(status.outcodes_done)
|
scrape_outcodes_done.set(status.outcodes_done)
|
||||||
scrape_outcodes_total.set(status.outcodes_total)
|
scrape_outcodes_total.set(status.outcodes_total)
|
||||||
scrape_properties_total.labels(channel="buy").set(status.properties_buy)
|
# Total properties (both sources combined)
|
||||||
scrape_properties_total.labels(channel="rent").set(status.properties_rent)
|
scrape_properties_total.labels(channel="buy", source="total").set(status.properties_buy)
|
||||||
|
scrape_properties_total.labels(channel="rent", source="total").set(status.properties_rent)
|
||||||
|
# Per-source breakdown for current channel
|
||||||
|
ch = "buy" if status.channel == "BUY" else "rent"
|
||||||
|
scrape_properties_total.labels(channel=ch, source="rightmove").set(status.rm_properties)
|
||||||
|
scrape_properties_total.labels(channel=ch, source="homecouk").set(status.hk_properties)
|
||||||
if status.started_at:
|
if status.started_at:
|
||||||
end = status.finished_at if status.finished_at else time.time()
|
end = status.finished_at if status.finished_at else time.time()
|
||||||
scrape_elapsed_seconds.set(end - status.started_at)
|
scrape_elapsed_seconds.set(end - status.started_at)
|
||||||
|
|
@ -87,8 +102,16 @@ def build_postcode_index() -> PostcodeSpatialIndex:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _dedup_key(p: dict) -> tuple:
|
||||||
|
"""Composite key for cross-source deduplication: (postcode, bedrooms, price).
|
||||||
|
Two listings on different portals for the same physical property will share
|
||||||
|
these attributes even though their IDs differ."""
|
||||||
|
return (p.get("Postcode", ""), p.get("Bedrooms", 0), p.get("price", 0))
|
||||||
|
|
||||||
|
|
||||||
def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
||||||
"""Main scrape loop — runs in background thread."""
|
"""Main scrape loop — runs in background thread.
|
||||||
|
Scrapes Rightmove and (if configured) home.co.uk, merging into one dataset."""
|
||||||
global status
|
global status
|
||||||
with status_lock:
|
with status_lock:
|
||||||
status.state = "running"
|
status.state = "running"
|
||||||
|
|
@ -105,16 +128,33 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
||||||
|
|
||||||
client = make_client()
|
client = make_client()
|
||||||
|
|
||||||
|
# home.co.uk: optional, enabled when cookies are available (via FlareSolverr or env vars)
|
||||||
|
hk_result = load_homecouk_cookies()
|
||||||
|
hk_client = make_homecouk_client(*hk_result) if hk_result else None
|
||||||
|
if hk_client:
|
||||||
|
log.info("home.co.uk scraping ENABLED")
|
||||||
|
homecouk_enabled.set(1)
|
||||||
|
else:
|
||||||
|
log.info("home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)")
|
||||||
|
homecouk_enabled.set(0)
|
||||||
|
hk_failed = False # set to True on 403 to skip remaining outcodes
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for channel_cfg in CHANNELS:
|
for channel_cfg in CHANNELS:
|
||||||
channel_name = channel_cfg["channel"]
|
channel_name = channel_cfg["channel"]
|
||||||
file_suffix = "buy" if channel_name == "BUY" else "rent"
|
file_suffix = "buy" if channel_name == "BUY" else "rent"
|
||||||
all_properties: dict[int, dict] = {} # dedup by id
|
all_properties: dict[str, dict] = {} # dedup by id
|
||||||
|
seen_dedup_keys: set[tuple] = set() # cross-source dedup by (postcode, beds, price)
|
||||||
|
rm_count = 0 # Rightmove properties this channel
|
||||||
|
hk_count = 0 # home.co.uk properties this channel
|
||||||
|
hk_dedup_count = 0 # home.co.uk skipped as cross-source duplicates
|
||||||
|
|
||||||
with status_lock:
|
with status_lock:
|
||||||
status.channel = channel_name
|
status.channel = channel_name
|
||||||
status.outcodes_done = 0
|
status.outcodes_done = 0
|
||||||
status.outcodes_total = len(shuffled)
|
status.outcodes_total = len(shuffled)
|
||||||
|
status.rm_properties = 0
|
||||||
|
status.hk_properties = 0
|
||||||
|
|
||||||
log.info("=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled))
|
log.info("=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled))
|
||||||
|
|
||||||
|
|
@ -126,34 +166,81 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
||||||
log.debug("Outcode %s (%d/%d) — %d properties so far",
|
log.debug("Outcode %s (%d/%d) — %d properties so far",
|
||||||
outcode, i + 1, len(shuffled), len(all_properties))
|
outcode, i + 1, len(shuffled), len(all_properties))
|
||||||
|
|
||||||
|
# --- Rightmove ---
|
||||||
try:
|
try:
|
||||||
outcode_id = resolve_outcode_id(client, outcode)
|
outcode_id = resolve_outcode_id(client, outcode)
|
||||||
if not outcode_id:
|
if not outcode_id:
|
||||||
log.debug("No Rightmove ID for outcode %s, skipping", outcode)
|
log.debug("No Rightmove ID for outcode %s, skipping", outcode)
|
||||||
continue
|
else:
|
||||||
|
props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
|
||||||
props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
|
for p in props:
|
||||||
for p in props:
|
pid = p["id"]
|
||||||
pid = p["id"]
|
if pid not in all_properties:
|
||||||
if pid not in all_properties:
|
all_properties[pid] = p
|
||||||
all_properties[pid] = p
|
seen_dedup_keys.add(_dedup_key(p))
|
||||||
|
rm_count += 1
|
||||||
with status_lock:
|
|
||||||
if channel_name == "BUY":
|
|
||||||
status.properties_buy = len(all_properties)
|
|
||||||
else:
|
|
||||||
status.properties_rent = len(all_properties)
|
|
||||||
_sync_gauges()
|
|
||||||
|
|
||||||
log.info("Outcode %s: got %d properties (total: %d)", outcode, len(props), len(all_properties))
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
msg = f"Error scraping {outcode}/{channel_name}: {e}"
|
msg = f"Error scraping Rightmove {outcode}/{channel_name}: {e}"
|
||||||
log.error(msg)
|
log.error(msg)
|
||||||
scrape_errors_total.inc()
|
scrape_errors_total.labels(source="rightmove").inc()
|
||||||
with status_lock:
|
with status_lock:
|
||||||
status.errors.append(msg)
|
status.errors.append(msg)
|
||||||
|
|
||||||
|
# --- home.co.uk ---
|
||||||
|
if hk_client and not hk_failed:
|
||||||
|
try:
|
||||||
|
hk_props = homecouk_search_outcode(
|
||||||
|
hk_client, outcode, channel_name, pc_index,
|
||||||
|
)
|
||||||
|
for p in hk_props:
|
||||||
|
pid = p["id"]
|
||||||
|
key = _dedup_key(p)
|
||||||
|
if pid in all_properties or key in seen_dedup_keys:
|
||||||
|
hk_dedup_count += 1
|
||||||
|
cross_source_dedup_total.labels(
|
||||||
|
channel="buy" if channel_name == "BUY" else "rent",
|
||||||
|
).inc()
|
||||||
|
continue
|
||||||
|
all_properties[pid] = p
|
||||||
|
seen_dedup_keys.add(key)
|
||||||
|
hk_count += 1
|
||||||
|
if hk_props:
|
||||||
|
log.info("home.co.uk %s: +%d properties", outcode, len(hk_props))
|
||||||
|
except CookiesExpiredError:
|
||||||
|
log.warning("home.co.uk cookies expired — attempting refresh via FlareSolverr")
|
||||||
|
hk_client.close()
|
||||||
|
hk_result = load_homecouk_cookies()
|
||||||
|
if hk_result:
|
||||||
|
hk_client = make_homecouk_client(*hk_result)
|
||||||
|
log.info("home.co.uk cookies refreshed, continuing")
|
||||||
|
cookie_refreshes_total.labels(result="success").inc()
|
||||||
|
else:
|
||||||
|
log.warning("Cookie refresh failed, disabling home.co.uk for rest of scrape")
|
||||||
|
hk_client = None
|
||||||
|
hk_failed = True
|
||||||
|
homecouk_enabled.set(0)
|
||||||
|
cookie_refreshes_total.labels(result="failure").inc()
|
||||||
|
with status_lock:
|
||||||
|
status.errors.append("home.co.uk cookies expired and refresh failed")
|
||||||
|
except Exception as e:
|
||||||
|
msg = f"Error scraping home.co.uk {outcode}/{channel_name}: {e}"
|
||||||
|
log.error(msg)
|
||||||
|
scrape_errors_total.labels(source="homecouk").inc()
|
||||||
|
with status_lock:
|
||||||
|
status.errors.append(msg)
|
||||||
|
|
||||||
|
with status_lock:
|
||||||
|
if channel_name == "BUY":
|
||||||
|
status.properties_buy = len(all_properties)
|
||||||
|
else:
|
||||||
|
status.properties_rent = len(all_properties)
|
||||||
|
status.rm_properties = rm_count
|
||||||
|
status.hk_properties = hk_count
|
||||||
|
_sync_gauges()
|
||||||
|
|
||||||
|
log.info("Outcode %s: total %d (rm: %d, hk: %d)",
|
||||||
|
outcode, len(all_properties), rm_count, hk_count)
|
||||||
|
|
||||||
if i < len(shuffled) - 1:
|
if i < len(shuffled) - 1:
|
||||||
time.sleep(DELAY_BETWEEN_OUTCODES)
|
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||||
|
|
||||||
|
|
@ -170,7 +257,8 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
||||||
status.outcodes_done = len(shuffled)
|
status.outcodes_done = len(shuffled)
|
||||||
_sync_gauges()
|
_sync_gauges()
|
||||||
|
|
||||||
log.info("=== %s channel complete: %d unique properties ===", channel_name, len(deduped))
|
log.info("=== %s channel complete: %d unique (rm: %d, hk: %d, cross-dedup: %d) ===",
|
||||||
|
channel_name, len(deduped), rm_count, hk_count, hk_dedup_count)
|
||||||
|
|
||||||
with status_lock:
|
with status_lock:
|
||||||
status.state = "done"
|
status.state = "done"
|
||||||
|
|
@ -189,3 +277,5 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
||||||
_sync_gauges()
|
_sync_gauges()
|
||||||
finally:
|
finally:
|
||||||
client.close()
|
client.close()
|
||||||
|
if hk_client:
|
||||||
|
hk_client.close()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue