Updates
This commit is contained in:
parent
7591e5fc05
commit
89a85e9a0c
22 changed files with 1006 additions and 899 deletions
|
|
@ -55,6 +55,7 @@ RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
|
|||
HOMECOUK_BASE = "https://home.co.uk"
|
||||
HOMECOUK_API_BASE = f"{HOMECOUK_BASE}/api"
|
||||
HOMECOUK_PER_PAGE = 30 # max supported by the API
|
||||
# Worker-thread count for the home.co.uk scraper (used as the thread pool's
# max_workers). Read from the HOMECOUK_CONCURRENCY environment variable and
# falls back to 4 when unset; a non-integer value raises ValueError at import.
HOMECOUK_CONCURRENCY = int(os.environ.get("HOMECOUK_CONCURRENCY", "4"))
|
||||
|
||||
# OpenRent
|
||||
OPENRENT_BASE = "https://www.openrent.co.uk"
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ import logging
|
|||
import random
|
||||
import threading
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import polars as pl
|
||||
|
|
@ -15,6 +16,7 @@ from constants import (
|
|||
CHECKPOINT_INTERVAL,
|
||||
DATA_DIR,
|
||||
DELAY_BETWEEN_OUTCODES,
|
||||
HOMECOUK_CONCURRENCY,
|
||||
RELOAD_URL,
|
||||
SCRAPE_HOMECOUK,
|
||||
SCRAPE_OPENRENT,
|
||||
|
|
@ -503,59 +505,133 @@ def run_scrape(
|
|||
hk_start = start_indices.get("hk", 0)
|
||||
if hk_start > 0:
|
||||
log.info("home.co.uk resuming from outcode %d/%d", hk_start, len(shuffled))
|
||||
client = make_homecouk_client(*hk_result)
|
||||
log.info("home.co.uk scraping ENABLED")
|
||||
log.info(
|
||||
"home.co.uk scraping ENABLED (concurrency=%d)", HOMECOUK_CONCURRENCY
|
||||
)
|
||||
homecouk_enabled.set(1)
|
||||
try:
|
||||
for i, outcode in enumerate(shuffled):
|
||||
if i < hk_start:
|
||||
continue
|
||||
for ch_cfg in CHANNELS:
|
||||
ch = ch_cfg["channel"]
|
||||
try:
|
||||
props = homecouk_search_outcode(
|
||||
client, outcode, ch, pc_index
|
||||
)
|
||||
hk_results[ch].extend(props)
|
||||
if props:
|
||||
log.info("home.co.uk %s: +%d properties", outcode, len(props))
|
||||
except CookiesExpiredError:
|
||||
log.warning(
|
||||
"home.co.uk cookies expired — attempting refresh"
|
||||
)
|
||||
client.close()
|
||||
hk_new = load_homecouk_cookies()
|
||||
if hk_new:
|
||||
client = make_homecouk_client(*hk_new)
|
||||
log.info("home.co.uk cookies refreshed, continuing")
|
||||
cookie_refreshes_total.labels(result="success").inc()
|
||||
else:
|
||||
log.warning(
|
||||
"Cookie refresh failed, disabling home.co.uk"
|
||||
)
|
||||
homecouk_enabled.set(0)
|
||||
cookie_refreshes_total.labels(result="failure").inc()
|
||||
with status_lock:
|
||||
status.errors.append(
|
||||
"home.co.uk cookies expired and refresh failed"
|
||||
)
|
||||
progress.update("hk", len(shuffled))
|
||||
return
|
||||
except Exception as e:
|
||||
log.error("home.co.uk %s/%s: %s", outcode, ch, e)
|
||||
scrape_errors_total.labels(source="homecouk").inc()
|
||||
|
||||
progress.update("hk", i + 1)
|
||||
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||
# Shared state across pool threads
|
||||
cookie_state = {
|
||||
"cookies": hk_result[0],
|
||||
"user_agent": hk_result[1],
|
||||
"generation": 0,
|
||||
}
|
||||
cookie_lock = threading.Lock()
|
||||
results_lock = threading.Lock()
|
||||
completed_count = [hk_start]
|
||||
disabled = [False]
|
||||
_local = threading.local()
|
||||
|
||||
def _get_client():
    """Return the calling thread's curl_cffi session.

    Each pool thread keeps its own client in ``_local``. The client is
    (re)built whenever the thread has none yet or the shared cookie
    generation has moved on since the client was created.
    """
    # Snapshot the shared cookie state atomically so cookies, user agent and
    # generation all belong to the same refresh.
    with cookie_lock:
        current_gen = cookie_state["generation"]
        jar = cookie_state["cookies"]
        agent = cookie_state["user_agent"]

    has_client = hasattr(_local, "client")
    if has_client and _local.gen == current_gen:
        return _local.client

    if has_client:
        # Best-effort close of the stale session; a failure here must not
        # stop the thread from getting a fresh client.
        try:
            _local.client.close()
        except Exception:
            pass

    _local.client = make_homecouk_client(jar, agent)
    _local.gen = current_gen
    return _local.client
|
||||
|
||||
def _refresh_cookies():
    """Refresh the shared home.co.uk cookies via FlareSolverr.

    Several pool threads can hit an expired session at about the same time
    and all end up here. The generation counter arbitrates: the first thread
    to finish installs its cookies and bumps the generation; any thread that
    finishes later sees the generation has moved and leaves the installed
    cookies alone.

    Returns:
        True when refreshed cookies are in place, whether fetched by this
        thread or installed by another thread while this one was fetching.
        False only when this thread's fetch failed and no other thread has
        refreshed in the meantime.
    """
    with cookie_lock:
        pre_gen = cookie_state["generation"]

    # The fetch runs without the lock held so other threads can keep reading
    # the current cookie state while it is in flight.
    new = load_homecouk_cookies()

    with cookie_lock:
        if cookie_state["generation"] != pre_gen:
            # Another thread already installed fresh cookies. Report success
            # even if our own fetch failed, so the caller retries with the
            # new generation instead of disabling the scraper.
            return True
        if not new:
            return False
        cookie_state["cookies"] = new[0]
        cookie_state["user_agent"] = new[1]
        cookie_state["generation"] += 1
        cookie_refreshes_total.labels(result="success").inc()
        log.info("home.co.uk cookies refreshed")
    return True
|
||||
|
||||
def _fetch_and_store(session, outcode, channel):
    """Search one outcode/channel and append any hits to the shared results."""
    found = homecouk_search_outcode(session, outcode, channel, pc_index)
    if not found:
        return
    with results_lock:
        hk_results[channel].extend(found)
    log.info("home.co.uk %s: +%d properties", outcode, len(found))


def _scrape_outcode(outcode):
    """Scrape every configured channel for one outcode (runs in a pool thread).

    On ``CookiesExpiredError`` the shared cookies are refreshed and the same
    channel is retried once. If the refresh fails, the ``disabled`` flag is
    set so this and all other workers stop, and the function returns without
    counting the outcode as completed. Any other error is logged and counted,
    and the loop moves on to the next channel.
    """
    if disabled[0]:
        return

    session = _get_client()
    for cfg in CHANNELS:
        channel = cfg["channel"]
        # Re-check per channel: another thread may have disabled scraping
        # while this one was busy.
        if disabled[0]:
            return
        try:
            _fetch_and_store(session, outcode, channel)
        except CookiesExpiredError:
            log.warning("home.co.uk cookies expired — attempting refresh")
            if not _refresh_cookies():
                log.warning("Cookie refresh failed, disabling home.co.uk")
                disabled[0] = True
                homecouk_enabled.set(0)
                cookie_refreshes_total.labels(result="failure").inc()
                with status_lock:
                    status.errors.append(
                        "home.co.uk cookies expired and refresh failed"
                    )
                return

            # Pick up a client built from the refreshed cookies and retry
            # this channel once.
            session = _get_client()
            try:
                _fetch_and_store(session, outcode, channel)
            except Exception as err:
                log.error(
                    "home.co.uk %s/%s (after refresh): %s",
                    outcode,
                    channel,
                    err,
                )
                scrape_errors_total.labels(source="homecouk").inc()
        except Exception as err:
            log.error("home.co.uk %s/%s: %s", outcode, channel, err)
            scrape_errors_total.labels(source="homecouk").inc()

    # The counter is shared by all pool threads, so bump it under the lock.
    with results_lock:
        completed_count[0] += 1
    progress.update("hk", completed_count[0])
    time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||
|
||||
try:
|
||||
work = [oc for i, oc in enumerate(shuffled) if i >= hk_start]
|
||||
with ThreadPoolExecutor(
|
||||
max_workers=HOMECOUK_CONCURRENCY
|
||||
) as pool:
|
||||
list(pool.map(_scrape_outcode, work))
|
||||
except Exception as e:
|
||||
log.exception("Fatal home.co.uk error: %s", e)
|
||||
with status_lock:
|
||||
status.errors.append(f"Fatal home.co.uk: {e}")
|
||||
finally:
|
||||
try:
|
||||
client.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if disabled[0]:
|
||||
progress.update("hk", len(shuffled))
|
||||
|
||||
def or_worker():
|
||||
or_result = load_openrent_cookies()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue