Updates
Some checks failed
CI / Frontend (lint + typecheck) (push) Failing after 3m45s
CI / Rust (lint + test) (push) Failing after 5m15s
CI / Python (lint + test) (push) Failing after 5m17s
Build and publish Docker image / build-and-push (push) Failing after 7m15s

This commit is contained in:
Andras Schmelczer 2026-03-28 12:00:15 +00:00
parent 7591e5fc05
commit 89a85e9a0c
22 changed files with 1006 additions and 899 deletions

View file

@ -55,6 +55,7 @@ RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
HOMECOUK_BASE = "https://home.co.uk"
HOMECOUK_API_BASE = f"{HOMECOUK_BASE}/api"
HOMECOUK_PER_PAGE = 30 # max supported by the API
# Number of worker threads used for home.co.uk scraping; overridable through the
# HOMECOUK_CONCURRENCY environment variable. Clamped to at least 1 because
# ThreadPoolExecutor raises ValueError when max_workers <= 0.
HOMECOUK_CONCURRENCY = max(1, int(os.environ.get("HOMECOUK_CONCURRENCY", "4")))
# OpenRent
OPENRENT_BASE = "https://www.openrent.co.uk"

View file

@ -3,6 +3,7 @@ import logging
import random
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
import polars as pl
@ -15,6 +16,7 @@ from constants import (
CHECKPOINT_INTERVAL,
DATA_DIR,
DELAY_BETWEEN_OUTCODES,
HOMECOUK_CONCURRENCY,
RELOAD_URL,
SCRAPE_HOMECOUK,
SCRAPE_OPENRENT,
@ -503,59 +505,133 @@ def run_scrape(
hk_start = start_indices.get("hk", 0)
if hk_start > 0:
log.info("home.co.uk resuming from outcode %d/%d", hk_start, len(shuffled))
client = make_homecouk_client(*hk_result)
log.info("home.co.uk scraping ENABLED")
log.info(
"home.co.uk scraping ENABLED (concurrency=%d)", HOMECOUK_CONCURRENCY
)
homecouk_enabled.set(1)
try:
for i, outcode in enumerate(shuffled):
if i < hk_start:
continue
for ch_cfg in CHANNELS:
ch = ch_cfg["channel"]
try:
props = homecouk_search_outcode(
client, outcode, ch, pc_index
)
hk_results[ch].extend(props)
if props:
log.info("home.co.uk %s: +%d properties", outcode, len(props))
except CookiesExpiredError:
log.warning(
"home.co.uk cookies expired — attempting refresh"
)
client.close()
hk_new = load_homecouk_cookies()
if hk_new:
client = make_homecouk_client(*hk_new)
log.info("home.co.uk cookies refreshed, continuing")
cookie_refreshes_total.labels(result="success").inc()
else:
log.warning(
"Cookie refresh failed, disabling home.co.uk"
)
homecouk_enabled.set(0)
cookie_refreshes_total.labels(result="failure").inc()
with status_lock:
status.errors.append(
"home.co.uk cookies expired and refresh failed"
)
progress.update("hk", len(shuffled))
return
except Exception as e:
log.error("home.co.uk %s/%s: %s", outcode, ch, e)
scrape_errors_total.labels(source="homecouk").inc()
progress.update("hk", i + 1)
time.sleep(DELAY_BETWEEN_OUTCODES)
# Shared state across pool threads
cookie_state = {
"cookies": hk_result[0],
"user_agent": hk_result[1],
"generation": 0,
}
cookie_lock = threading.Lock()
results_lock = threading.Lock()
completed_count = [hk_start]
disabled = [False]
_local = threading.local()
def _get_client():
    """Get or create a thread-local curl_cffi session.

    The session is cached on ``_local`` together with the cookie generation
    it was built from. When ``cookie_state["generation"]`` has moved on, the
    cached session is closed (best effort) and a new one is created from the
    current cookies and user agent.

    Returns:
        The calling thread's session for the current cookie generation.
    """
    # Take a consistent snapshot of the shared cookie state; the session
    # itself is built outside the lock.
    with cookie_lock:
        gen = cookie_state["generation"]
        cookies = cookie_state["cookies"]
        ua = cookie_state["user_agent"]

    has_client = hasattr(_local, "client")
    if has_client and _local.gen == gen:
        return _local.client

    if has_client:
        try:
            _local.client.close()
        except Exception:
            # Closing a stale session is best effort, but leave a trace
            # instead of discarding the failure silently.
            log.debug(
                "home.co.uk: failed to close stale client", exc_info=True
            )

    _local.client = make_homecouk_client(cookies, ua)
    _local.gen = gen
    return _local.client
def _refresh_cookies():
    """Refresh cookies via FlareSolverr. Thread-safe with generation check.

    Returns:
        True when usable cookies newer than the ones this thread was using
        are available in ``cookie_state`` (published by this call or by
        another thread); False when the refresh failed and nobody else
        published new cookies in the meantime.
    """
    with cookie_lock:
        pre_gen = cookie_state["generation"]

    # This thread's session was built from an older generation, so another
    # thread has already published newer cookies: use those instead of
    # triggering one more refresh.
    if getattr(_local, "gen", pre_gen) != pre_gen:
        return True

    new = load_homecouk_cookies()

    with cookie_lock:
        if not new:
            # Our attempt failed, but a peer may have succeeded while we were
            # waiting; only report failure if the generation did not move.
            return cookie_state["generation"] != pre_gen
        # Publish only if nobody beat us to it, so the generation advances
        # once per refresh round.
        if cookie_state["generation"] == pre_gen:
            cookie_state["cookies"] = new[0]
            cookie_state["user_agent"] = new[1]
            cookie_state["generation"] += 1

    cookie_refreshes_total.labels(result="success").inc()
    log.info("home.co.uk cookies refreshed")
    return True
def _search_and_record(client, outcode, ch):
    """Search one outcode/channel pair and merge any hits into hk_results."""
    props = homecouk_search_outcode(client, outcode, ch, pc_index)
    if props:
        with results_lock:
            hk_results[ch].extend(props)
        log.info("home.co.uk %s: +%d properties", outcode, len(props))


def _scrape_outcode(outcode):
    """Scrape every channel of one outcode; runs on a pool thread.

    On ``CookiesExpiredError`` the cookies are refreshed and the same channel
    is retried once. If the refresh fails, home.co.uk scraping is disabled
    for all threads and the function returns without counting the outcode as
    completed. Any other error is logged and counted, and the next channel is
    tried.

    Args:
        outcode: The outcode to search, passed through to
            ``homecouk_search_outcode``.
    """
    if disabled[0]:
        return
    client = _get_client()
    for ch_cfg in CHANNELS:
        ch = ch_cfg["channel"]
        # Another thread may have disabled scraping since the last channel.
        if disabled[0]:
            return
        try:
            _search_and_record(client, outcode, ch)
        except CookiesExpiredError:
            log.warning(
                "home.co.uk cookies expired — attempting refresh"
            )
            if not _refresh_cookies():
                log.warning(
                    "Cookie refresh failed, disabling home.co.uk"
                )
                disabled[0] = True
                homecouk_enabled.set(0)
                cookie_refreshes_total.labels(result="failure").inc()
                with status_lock:
                    status.errors.append(
                        "home.co.uk cookies expired and refresh failed"
                    )
                return
            # Pick up a session built from the refreshed cookies and retry
            # this channel once.
            client = _get_client()
            try:
                _search_and_record(client, outcode, ch)
            except Exception as e:
                log.error(
                    "home.co.uk %s/%s (after refresh): %s",
                    outcode,
                    ch,
                    e,
                )
                scrape_errors_total.labels(source="homecouk").inc()
        except Exception as e:
            log.error("home.co.uk %s/%s: %s", outcode, ch, e)
            scrape_errors_total.labels(source="homecouk").inc()
    # results_lock also guards the shared completion counter.
    with results_lock:
        completed_count[0] += 1
    progress.update("hk", completed_count[0])
    time.sleep(DELAY_BETWEEN_OUTCODES)
try:
work = [oc for i, oc in enumerate(shuffled) if i >= hk_start]
with ThreadPoolExecutor(
max_workers=HOMECOUK_CONCURRENCY
) as pool:
list(pool.map(_scrape_outcode, work))
except Exception as e:
log.exception("Fatal home.co.uk error: %s", e)
with status_lock:
status.errors.append(f"Fatal home.co.uk: {e}")
finally:
try:
client.close()
except Exception:
pass
if disabled[0]:
progress.update("hk", len(shuffled))
def or_worker():
or_result = load_openrent_cookies()