Faster scraping
This commit is contained in:
parent
05b8ee06c1
commit
852bb3f3a7
4 changed files with 437 additions and 324 deletions
|
|
@ -146,6 +146,12 @@ services:
|
||||||
# networks:
|
# networks:
|
||||||
# - dev-network
|
# - dev-network
|
||||||
# restart: unless-stopped
|
# restart: unless-stopped
|
||||||
|
# healthcheck:
|
||||||
|
# test: ["CMD", "curl", "-f", "http://localhost:8191/health"]
|
||||||
|
# interval: 30s
|
||||||
|
# timeout: 5s
|
||||||
|
# retries: 3
|
||||||
|
# start_period: 30s
|
||||||
|
|
||||||
# finder:
|
# finder:
|
||||||
# build:
|
# build:
|
||||||
|
|
@ -161,8 +167,14 @@ services:
|
||||||
# gluetun:
|
# gluetun:
|
||||||
# condition: service_healthy
|
# condition: service_healthy
|
||||||
# flaresolverr:
|
# flaresolverr:
|
||||||
# condition: service_started
|
# condition: service_healthy
|
||||||
# restart: unless-stopped
|
# restart: unless-stopped
|
||||||
|
# healthcheck:
|
||||||
|
# test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:1234/health')"]
|
||||||
|
# interval: 30s
|
||||||
|
# timeout: 5s
|
||||||
|
# retries: 3
|
||||||
|
# start_period: 60s
|
||||||
|
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
|
|
|
||||||
|
|
@ -4,8 +4,8 @@ from pathlib import Path
|
||||||
ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
|
ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
|
||||||
DATA_DIR = Path("/app/data")
|
DATA_DIR = Path("/app/data")
|
||||||
PAGE_SIZE = 24
|
PAGE_SIZE = 24
|
||||||
DELAY_BETWEEN_PAGES = 1.0
|
DELAY_BETWEEN_PAGES = 0.5
|
||||||
DELAY_BETWEEN_OUTCODES = 2.0
|
DELAY_BETWEEN_OUTCODES = 1.0
|
||||||
MAX_RETRIES = 3
|
MAX_RETRIES = 3
|
||||||
RETRY_BASE_DELAY = 2.0
|
RETRY_BASE_DELAY = 2.0
|
||||||
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
|
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
|
||||||
|
|
@ -67,6 +67,7 @@ PROPERTY_TYPE_MAP = {
|
||||||
"Apartment": "Flats/Maisonettes",
|
"Apartment": "Flats/Maisonettes",
|
||||||
"Penthouse": "Flats/Maisonettes",
|
"Penthouse": "Flats/Maisonettes",
|
||||||
"Ground Flat": "Flats/Maisonettes",
|
"Ground Flat": "Flats/Maisonettes",
|
||||||
|
"Duplex": "Flats/Maisonettes",
|
||||||
"Detached Bungalow": "Detached",
|
"Detached Bungalow": "Detached",
|
||||||
"Semi-Detached Bungalow": "Semi-Detached",
|
"Semi-Detached Bungalow": "Semi-Detached",
|
||||||
"Town House": "Terraced",
|
"Town House": "Terraced",
|
||||||
|
|
@ -75,9 +76,15 @@ PROPERTY_TYPE_MAP = {
|
||||||
"Bungalow": "Other",
|
"Bungalow": "Other",
|
||||||
"Cottage": "Other",
|
"Cottage": "Other",
|
||||||
"Park Home": "Other",
|
"Park Home": "Other",
|
||||||
|
"Mobile Home": "Other",
|
||||||
|
"Caravan": "Other",
|
||||||
|
"Lodge": "Other",
|
||||||
"Land": "Other",
|
"Land": "Other",
|
||||||
"Farm / Barn": "Other",
|
"Farm / Barn": "Other",
|
||||||
|
"Farm House": "Other",
|
||||||
"House": "Detached",
|
"House": "Detached",
|
||||||
|
"House of Multiple Occupation": "Flats/Maisonettes",
|
||||||
|
"House Share": "Other",
|
||||||
"Not Specified": "Other",
|
"Not Specified": "Other",
|
||||||
"Chalet": "Other",
|
"Chalet": "Other",
|
||||||
"Barn Conversion": "Other",
|
"Barn Conversion": "Other",
|
||||||
|
|
@ -85,9 +92,20 @@ PROPERTY_TYPE_MAP = {
|
||||||
"Character Property": "Other",
|
"Character Property": "Other",
|
||||||
"Cluster House": "Other",
|
"Cluster House": "Other",
|
||||||
"Retirement Property": "Flats/Maisonettes",
|
"Retirement Property": "Flats/Maisonettes",
|
||||||
|
"Parking": "Other",
|
||||||
"Plot": "Other",
|
"Plot": "Other",
|
||||||
"Garages": "Other",
|
"Garages": "Other",
|
||||||
"Mews": "Terraced",
|
"Mews": "Terraced",
|
||||||
|
"Property": "Other",
|
||||||
|
# Lowercase variants (from home.co.uk / Rightmove APIs)
|
||||||
|
"house": "Detached",
|
||||||
|
"bungalow": "Other",
|
||||||
|
"townhouse": "Terraced",
|
||||||
|
"land": "Other",
|
||||||
|
"other": "Other",
|
||||||
|
"not-specified": "Other",
|
||||||
|
"retirement-property": "Flats/Maisonettes",
|
||||||
|
"equestrian-facility": "Other",
|
||||||
}
|
}
|
||||||
|
|
||||||
CHANNELS = [
|
CHANNELS = [
|
||||||
|
|
|
||||||
|
|
@ -788,7 +788,24 @@ def search_outcode(
|
||||||
for search_data in search_results:
|
for search_data in search_results:
|
||||||
detail_data = None
|
detail_data = None
|
||||||
|
|
||||||
if fetch_details and search_data.get("url"):
|
# Skip detail page if we already have coordinates or a resolvable postcode
|
||||||
|
has_coords = (
|
||||||
|
search_data.get("lat") is not None
|
||||||
|
and search_data.get("lng") is not None
|
||||||
|
)
|
||||||
|
has_resolvable_pc = (
|
||||||
|
search_data.get("postcode")
|
||||||
|
and pc_coords
|
||||||
|
and search_data["postcode"] in pc_coords
|
||||||
|
)
|
||||||
|
needs_detail = (
|
||||||
|
fetch_details
|
||||||
|
and search_data.get("url")
|
||||||
|
and not has_coords
|
||||||
|
and not has_resolvable_pc
|
||||||
|
)
|
||||||
|
|
||||||
|
if needs_detail:
|
||||||
detail_html = fetch_page(client, search_data["url"])
|
detail_html = fetch_page(client, search_data["url"])
|
||||||
if detail_html:
|
if detail_html:
|
||||||
detail_data = parse_property_detail(detail_html)
|
detail_data = parse_property_detail(detail_html)
|
||||||
|
|
|
||||||
|
|
@ -61,7 +61,7 @@ class ScrapeStatus:
|
||||||
outcodes_total: int = 0
|
outcodes_total: int = 0
|
||||||
properties_buy: int = 0
|
properties_buy: int = 0
|
||||||
properties_rent: int = 0
|
properties_rent: int = 0
|
||||||
# Per-source counts for current channel
|
# Per-source counts (combined across channels)
|
||||||
rm_properties: int = 0
|
rm_properties: int = 0
|
||||||
hk_properties: int = 0
|
hk_properties: int = 0
|
||||||
or_properties: int = 0
|
or_properties: int = 0
|
||||||
|
|
@ -81,27 +81,26 @@ def _sync_gauges() -> None:
|
||||||
scrape_state.labels(state=state).set(1 if status.state == state else 0)
|
scrape_state.labels(state=state).set(1 if status.state == state else 0)
|
||||||
scrape_outcodes_done.set(status.outcodes_done)
|
scrape_outcodes_done.set(status.outcodes_done)
|
||||||
scrape_outcodes_total.set(status.outcodes_total)
|
scrape_outcodes_total.set(status.outcodes_total)
|
||||||
# Total properties (both sources combined)
|
|
||||||
scrape_properties_total.labels(channel="buy", source="total").set(
|
scrape_properties_total.labels(channel="buy", source="total").set(
|
||||||
status.properties_buy
|
status.properties_buy
|
||||||
)
|
)
|
||||||
scrape_properties_total.labels(channel="rent", source="total").set(
|
scrape_properties_total.labels(channel="rent", source="total").set(
|
||||||
status.properties_rent
|
status.properties_rent
|
||||||
)
|
)
|
||||||
# Per-source breakdown for current channel
|
# Per-source totals (across both channels)
|
||||||
ch = "buy" if status.channel == "BUY" else "rent"
|
for ch in ("buy", "rent"):
|
||||||
scrape_properties_total.labels(channel=ch, source="rightmove").set(
|
scrape_properties_total.labels(channel=ch, source="rightmove").set(
|
||||||
status.rm_properties
|
status.rm_properties
|
||||||
)
|
)
|
||||||
scrape_properties_total.labels(channel=ch, source="homecouk").set(
|
scrape_properties_total.labels(channel=ch, source="homecouk").set(
|
||||||
status.hk_properties
|
status.hk_properties
|
||||||
)
|
)
|
||||||
scrape_properties_total.labels(channel=ch, source="openrent").set(
|
scrape_properties_total.labels(channel=ch, source="openrent").set(
|
||||||
status.or_properties
|
status.or_properties
|
||||||
)
|
)
|
||||||
scrape_properties_total.labels(channel=ch, source="zoopla").set(
|
scrape_properties_total.labels(channel=ch, source="zoopla").set(
|
||||||
status.zp_properties
|
status.zp_properties
|
||||||
)
|
)
|
||||||
if status.started_at:
|
if status.started_at:
|
||||||
end = status.finished_at if status.finished_at else time.time()
|
end = status.finished_at if status.finished_at else time.time()
|
||||||
scrape_elapsed_seconds.set(end - status.started_at)
|
scrape_elapsed_seconds.set(end - status.started_at)
|
||||||
|
|
@ -179,28 +178,89 @@ def _dedup_key(p: dict) -> tuple:
|
||||||
return (p.get("Postcode", ""), p.get("Bedrooms", 0), p.get("price", 0))
|
return (p.get("Postcode", ""), p.get("Bedrooms", 0), p.get("price", 0))
|
||||||
|
|
||||||
|
|
||||||
|
class _Progress:
    """Thread-safe progress tracker for parallel source workers.

    Each worker reports how many outcodes it has finished under its own
    source key (for example ``"rm"`` or ``"hk"``). Readers obtain a
    consistent copy of all counts through :meth:`snapshot`.
    """

    def __init__(self) -> None:
        # Latest "done" value reported for each source key.
        self._counts: dict[str, int] = {}
        # Guards every read and write of ``_counts``.
        self._lock = threading.Lock()

    def update(self, source: str, done: int) -> None:
        """Store ``done`` as the current count for ``source``.

        The previous value for that source is overwritten, not accumulated.
        """
        with self._lock:
            self._counts[source] = done

    def snapshot(self) -> dict[str, int]:
        """Return a copy of the per-source counts, taken under the lock.

        The returned dict is independent of the tracker, so callers may
        mutate it freely.
        """
        with self._lock:
            return dict(self._counts)
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_channel(
    rm_props: list[dict],
    hk_props: list[dict],
    or_props: list[dict],
    zp_props: list[dict],
) -> tuple[dict[str, dict], dict[str, int], int]:
    """Merge properties from all sources for one channel with cross-source dedup.

    Rightmove has priority: its records are inserted first and are only
    de-duplicated by ``id``. Records from the remaining sources are then
    processed in the order home.co.uk, OpenRent, Zoopla, and a record is
    dropped when its ``id`` is already present or when its ``_dedup_key``
    matches a record that was kept earlier.

    Args:
        rm_props: Rightmove records.
        hk_props: home.co.uk records.
        or_props: OpenRent records.
        zp_props: Zoopla records.

    Returns:
        A 3-tuple ``(all_properties_by_id, per_source_counts, total_dedup_count)``:
        the kept records keyed by ``id``; the number of kept records per
        source under the keys ``"rm"``, ``"hk"``, ``"or"`` and ``"zp"``; and
        the number of non-Rightmove records that were dropped. Repeated ids
        inside ``rm_props`` are skipped without being counted as dedups.

    Raises:
        KeyError: If any record has no ``"id"`` entry.
    """
    all_properties: dict[str, dict] = {}
    seen_keys: set[tuple] = set()
    counts = {"rm": 0, "hk": 0, "or": 0, "zp": 0}
    total_dedup = 0

    # Rightmove first (priority source): only the id is checked here, but each
    # kept record still registers its dedup key so later sources can match it.
    for p in rm_props:
        pid = p["id"]
        if pid not in all_properties:
            all_properties[pid] = p
            seen_keys.add(_dedup_key(p))
            counts["rm"] += 1

    # Other sources: drop anything that matches an already-kept record by id
    # or by dedup key.
    # NOTE(review): records with no postcode presumably share the same
    # placeholder in their dedup key, so unrelated listings with equal
    # bedrooms and price could be merged here - confirm against _dedup_key.
    for source, props in [("hk", hk_props), ("or", or_props), ("zp", zp_props)]:
        for p in props:
            pid = p["id"]
            key = _dedup_key(p)
            if pid in all_properties or key in seen_keys:
                total_dedup += 1
                continue
            all_properties[pid] = p
            seen_keys.add(key)
            counts[source] += 1

    return all_properties, counts, total_dedup
|
||||||
|
|
||||||
|
|
||||||
def run_scrape(
|
def run_scrape(
|
||||||
outcodes: list[str],
|
outcodes: list[str],
|
||||||
pc_index: PostcodeSpatialIndex,
|
pc_index: PostcodeSpatialIndex,
|
||||||
pc_coords: dict[str, tuple[float, float]] | None = None,
|
pc_coords: dict[str, tuple[float, float]] | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Main scrape loop — runs in background thread.
|
"""Main scrape orchestrator — runs all sources in parallel threads.
|
||||||
Scrapes Rightmove, home.co.uk, and OpenRent, merging into one dataset."""
|
|
||||||
|
Each source (Rightmove, home.co.uk, OpenRent, Zoopla) gets its own thread
|
||||||
|
that iterates all outcodes for both BUY and RENT channels. Results are
|
||||||
|
merged with cross-source deduplication after all workers complete.
|
||||||
|
"""
|
||||||
global status
|
global status
|
||||||
with status_lock:
|
with status_lock:
|
||||||
status.state = "running"
|
status.state = "running"
|
||||||
status.started_at = time.time()
|
status.started_at = time.time()
|
||||||
|
status.finished_at = 0.0
|
||||||
status.errors = []
|
status.errors = []
|
||||||
status.properties_buy = 0
|
status.properties_buy = 0
|
||||||
status.properties_rent = 0
|
status.properties_rent = 0
|
||||||
|
status.channel = ""
|
||||||
|
status.outcode = ""
|
||||||
_sync_gauges()
|
_sync_gauges()
|
||||||
|
|
||||||
# Shuffle for geographic diversity
|
|
||||||
shuffled = list(outcodes)
|
shuffled = list(outcodes)
|
||||||
random.seed(SEED)
|
random.seed(SEED)
|
||||||
random.shuffle(shuffled)
|
random.shuffle(shuffled)
|
||||||
|
|
||||||
if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT and not SCRAPE_ZOOPLA:
|
if not any([SCRAPE_RIGHTMOVE, SCRAPE_HOMECOUK, SCRAPE_OPENRENT, SCRAPE_ZOOPLA]):
|
||||||
log.warning("All scrapers disabled — nothing to do")
|
log.warning("All scrapers disabled — nothing to do")
|
||||||
with status_lock:
|
with status_lock:
|
||||||
status.state = "done"
|
status.state = "done"
|
||||||
|
|
@ -208,373 +268,387 @@ def run_scrape(
|
||||||
_sync_gauges()
|
_sync_gauges()
|
||||||
return
|
return
|
||||||
|
|
||||||
client = make_client() if SCRAPE_RIGHTMOVE else None
|
|
||||||
if not SCRAPE_RIGHTMOVE:
|
if not SCRAPE_RIGHTMOVE:
|
||||||
log.info("Rightmove scraping DISABLED (SCRAPE_RIGHTMOVE=false)")
|
log.info("Rightmove scraping DISABLED (SCRAPE_RIGHTMOVE=false)")
|
||||||
|
|
||||||
# home.co.uk: must be enabled via SCRAPE_HOMECOUK + cookies available
|
|
||||||
hk_client = None
|
|
||||||
hk_failed = False
|
|
||||||
if not SCRAPE_HOMECOUK:
|
if not SCRAPE_HOMECOUK:
|
||||||
log.info("home.co.uk scraping DISABLED (SCRAPE_HOMECOUK=false)")
|
log.info("home.co.uk scraping DISABLED (SCRAPE_HOMECOUK=false)")
|
||||||
homecouk_enabled.set(0)
|
homecouk_enabled.set(0)
|
||||||
else:
|
|
||||||
hk_result = load_homecouk_cookies()
|
|
||||||
hk_client = make_homecouk_client(*hk_result) if hk_result else None
|
|
||||||
if hk_client:
|
|
||||||
log.info("home.co.uk scraping ENABLED")
|
|
||||||
homecouk_enabled.set(1)
|
|
||||||
else:
|
|
||||||
log.info(
|
|
||||||
"home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)"
|
|
||||||
)
|
|
||||||
homecouk_enabled.set(0)
|
|
||||||
|
|
||||||
# OpenRent: must be enabled via SCRAPE_OPENRENT + cookies available
|
|
||||||
or_client = None
|
|
||||||
or_failed = False
|
|
||||||
if not SCRAPE_OPENRENT:
|
if not SCRAPE_OPENRENT:
|
||||||
log.info("OpenRent scraping DISABLED (SCRAPE_OPENRENT=false)")
|
log.info("OpenRent scraping DISABLED (SCRAPE_OPENRENT=false)")
|
||||||
openrent_enabled.set(0)
|
openrent_enabled.set(0)
|
||||||
else:
|
|
||||||
or_result = load_openrent_cookies()
|
|
||||||
or_client = make_openrent_client(*or_result) if or_result else None
|
|
||||||
if or_client:
|
|
||||||
log.info("OpenRent scraping ENABLED")
|
|
||||||
openrent_enabled.set(1)
|
|
||||||
else:
|
|
||||||
log.info(
|
|
||||||
"OpenRent scraping DISABLED (need FlareSolverr or OPENRENT_WAF_TOKEN)"
|
|
||||||
)
|
|
||||||
openrent_enabled.set(0)
|
|
||||||
|
|
||||||
# Zoopla: uses Camoufox browser (no cookies/client pattern)
|
|
||||||
zp_browser = None
|
|
||||||
zp_page = None
|
|
||||||
zp_failed = False
|
|
||||||
if not SCRAPE_ZOOPLA:
|
if not SCRAPE_ZOOPLA:
|
||||||
log.info("Zoopla scraping DISABLED (SCRAPE_ZOOPLA=false)")
|
log.info("Zoopla scraping DISABLED (SCRAPE_ZOOPLA=false)")
|
||||||
zoopla_enabled.set(0)
|
zoopla_enabled.set(0)
|
||||||
else:
|
|
||||||
try:
|
|
||||||
zp_browser, zp_page = launch_zoopla_browser()
|
|
||||||
log.info("Zoopla scraping ENABLED (Camoufox browser launched)")
|
|
||||||
zoopla_enabled.set(1)
|
|
||||||
except TurnstileError:
|
|
||||||
log.warning("Zoopla Cloudflare Turnstile failed — disabling Zoopla")
|
|
||||||
zoopla_enabled.set(0)
|
|
||||||
except Exception as e:
|
|
||||||
log.warning("Zoopla browser launch failed: %s — disabling Zoopla", e)
|
|
||||||
zoopla_enabled.set(0)
|
|
||||||
|
|
||||||
# Build postcode coords if OpenRent/Zoopla is active and caller didn't provide them
|
# Build postcode coords if needed for OpenRent/Zoopla
|
||||||
if (or_client or zp_page) and pc_coords is None:
|
if (SCRAPE_OPENRENT or SCRAPE_ZOOPLA) and pc_coords is None:
|
||||||
pc_coords = build_postcode_coords()
|
pc_coords = build_postcode_coords()
|
||||||
|
|
||||||
try:
|
# Per-source result containers: {channel_name: [properties]}
|
||||||
for channel_cfg in CHANNELS:
|
# Each list is only written by its owning source thread.
|
||||||
channel_name = channel_cfg["channel"]
|
rm_results: dict[str, list] = {"BUY": [], "RENT": []}
|
||||||
file_suffix = "buy" if channel_name == "BUY" else "rent"
|
hk_results: dict[str, list] = {"BUY": [], "RENT": []}
|
||||||
all_properties: dict[str, dict] = {} # dedup by id
|
or_results: dict[str, list] = {"BUY": [], "RENT": []}
|
||||||
seen_dedup_keys: set[tuple] = (
|
zp_results: dict[str, list] = {"BUY": [], "RENT": []}
|
||||||
set()
|
|
||||||
) # cross-source dedup by (postcode, beds, price)
|
|
||||||
rm_count = 0 # Rightmove properties this channel
|
|
||||||
hk_count = 0 # home.co.uk properties this channel
|
|
||||||
hk_dedup_count = 0 # home.co.uk skipped as cross-source duplicates
|
|
||||||
or_count = 0 # OpenRent properties this channel
|
|
||||||
or_dedup_count = 0 # OpenRent skipped as cross-source duplicates
|
|
||||||
zp_count = 0 # Zoopla properties this channel
|
|
||||||
zp_dedup_count = 0 # Zoopla skipped as cross-source duplicates
|
|
||||||
|
|
||||||
with status_lock:
|
progress = _Progress()
|
||||||
status.channel = channel_name
|
|
||||||
status.outcodes_done = 0
|
|
||||||
status.outcodes_total = len(shuffled)
|
|
||||||
status.rm_properties = 0
|
|
||||||
status.hk_properties = 0
|
|
||||||
status.or_properties = 0
|
|
||||||
status.zp_properties = 0
|
|
||||||
|
|
||||||
channel_start = time.time()
|
# --- Source worker closures ---
|
||||||
prev_prop_milestone = 0 # last 10k milestone we logged
|
# Each worker owns its client lifecycle and iterates all outcodes for both
|
||||||
log.info(
|
# channels. On auth failure, it refreshes cookies and continues. On fatal
|
||||||
"=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled)
|
# failure, it marks itself as done and returns partial results.
|
||||||
)
|
|
||||||
|
|
||||||
|
def rm_worker():
|
||||||
|
client = make_client()
|
||||||
|
try:
|
||||||
for i, outcode in enumerate(shuffled):
|
for i, outcode in enumerate(shuffled):
|
||||||
with status_lock:
|
try:
|
||||||
status.outcode = outcode
|
outcode_id = resolve_outcode_id(client, outcode)
|
||||||
status.outcodes_done = i
|
except Exception as e:
|
||||||
|
log.error("Rightmove %s ID lookup: %s", outcode, e)
|
||||||
|
scrape_errors_total.labels(source="rightmove").inc()
|
||||||
|
progress.update("rm", i + 1)
|
||||||
|
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||||
|
continue
|
||||||
|
|
||||||
made_requests = False
|
if not outcode_id:
|
||||||
|
log.debug("No Rightmove ID for %s, skipping", outcode)
|
||||||
|
progress.update("rm", i + 1)
|
||||||
|
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||||
|
continue
|
||||||
|
|
||||||
# --- Rightmove ---
|
for ch_cfg in CHANNELS:
|
||||||
if SCRAPE_RIGHTMOVE:
|
ch = ch_cfg["channel"]
|
||||||
made_requests = True
|
|
||||||
try:
|
try:
|
||||||
outcode_id = resolve_outcode_id(client, outcode)
|
props = search_outcode(
|
||||||
if not outcode_id:
|
client, outcode_id, outcode, ch_cfg, pc_index
|
||||||
log.debug(
|
|
||||||
"No Rightmove ID for outcode %s, skipping", outcode
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
props = search_outcode(
|
|
||||||
client, outcode_id, outcode, channel_cfg, pc_index
|
|
||||||
)
|
|
||||||
for p in props:
|
|
||||||
pid = p["id"]
|
|
||||||
if pid not in all_properties:
|
|
||||||
all_properties[pid] = p
|
|
||||||
seen_dedup_keys.add(_dedup_key(p))
|
|
||||||
rm_count += 1
|
|
||||||
except Exception as e:
|
|
||||||
msg = f"Error scraping Rightmove {outcode}/{channel_name}: {e}"
|
|
||||||
log.error(msg)
|
|
||||||
scrape_errors_total.labels(source="rightmove").inc()
|
|
||||||
with status_lock:
|
|
||||||
status.errors.append(msg)
|
|
||||||
|
|
||||||
# --- home.co.uk ---
|
|
||||||
if hk_client and not hk_failed:
|
|
||||||
made_requests = True
|
|
||||||
try:
|
|
||||||
hk_props = homecouk_search_outcode(
|
|
||||||
hk_client,
|
|
||||||
outcode,
|
|
||||||
channel_name,
|
|
||||||
pc_index,
|
|
||||||
)
|
)
|
||||||
for p in hk_props:
|
rm_results[ch].extend(props)
|
||||||
pid = p["id"]
|
except Exception as e:
|
||||||
key = _dedup_key(p)
|
log.error("Rightmove %s/%s: %s", outcode, ch, e)
|
||||||
if pid in all_properties or key in seen_dedup_keys:
|
scrape_errors_total.labels(source="rightmove").inc()
|
||||||
hk_dedup_count += 1
|
|
||||||
cross_source_dedup_total.labels(
|
progress.update("rm", i + 1)
|
||||||
channel="buy" if channel_name == "BUY" else "rent",
|
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||||
).inc()
|
except Exception as e:
|
||||||
continue
|
log.exception("Fatal Rightmove error: %s", e)
|
||||||
all_properties[pid] = p
|
with status_lock:
|
||||||
seen_dedup_keys.add(key)
|
status.errors.append(f"Fatal Rightmove: {e}")
|
||||||
hk_count += 1
|
finally:
|
||||||
if hk_props:
|
client.close()
|
||||||
log.info(
|
|
||||||
"home.co.uk %s: +%d properties", outcode, len(hk_props)
|
def hk_worker():
|
||||||
)
|
hk_result = load_homecouk_cookies()
|
||||||
|
if not hk_result:
|
||||||
|
log.info("home.co.uk DISABLED (no cookies available)")
|
||||||
|
homecouk_enabled.set(0)
|
||||||
|
progress.update("hk", len(shuffled))
|
||||||
|
return
|
||||||
|
client = make_homecouk_client(*hk_result)
|
||||||
|
log.info("home.co.uk scraping ENABLED")
|
||||||
|
homecouk_enabled.set(1)
|
||||||
|
try:
|
||||||
|
for i, outcode in enumerate(shuffled):
|
||||||
|
for ch_cfg in CHANNELS:
|
||||||
|
ch = ch_cfg["channel"]
|
||||||
|
try:
|
||||||
|
props = homecouk_search_outcode(
|
||||||
|
client, outcode, ch, pc_index
|
||||||
|
)
|
||||||
|
hk_results[ch].extend(props)
|
||||||
|
if props:
|
||||||
|
log.info("home.co.uk %s: +%d properties", outcode, len(props))
|
||||||
except CookiesExpiredError:
|
except CookiesExpiredError:
|
||||||
log.warning(
|
log.warning(
|
||||||
"home.co.uk cookies expired — attempting refresh via FlareSolverr"
|
"home.co.uk cookies expired — attempting refresh"
|
||||||
)
|
)
|
||||||
hk_client.close()
|
client.close()
|
||||||
hk_result = load_homecouk_cookies()
|
hk_new = load_homecouk_cookies()
|
||||||
if hk_result:
|
if hk_new:
|
||||||
hk_client = make_homecouk_client(*hk_result)
|
client = make_homecouk_client(*hk_new)
|
||||||
log.info("home.co.uk cookies refreshed, continuing")
|
log.info("home.co.uk cookies refreshed, continuing")
|
||||||
cookie_refreshes_total.labels(result="success").inc()
|
cookie_refreshes_total.labels(result="success").inc()
|
||||||
else:
|
else:
|
||||||
log.warning(
|
log.warning(
|
||||||
"Cookie refresh failed, disabling home.co.uk for rest of scrape"
|
"Cookie refresh failed, disabling home.co.uk"
|
||||||
)
|
)
|
||||||
hk_client = None
|
|
||||||
hk_failed = True
|
|
||||||
homecouk_enabled.set(0)
|
homecouk_enabled.set(0)
|
||||||
cookie_refreshes_total.labels(result="failure").inc()
|
cookie_refreshes_total.labels(result="failure").inc()
|
||||||
with status_lock:
|
with status_lock:
|
||||||
status.errors.append(
|
status.errors.append(
|
||||||
"home.co.uk cookies expired and refresh failed"
|
"home.co.uk cookies expired and refresh failed"
|
||||||
)
|
)
|
||||||
|
progress.update("hk", len(shuffled))
|
||||||
|
return
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
msg = f"Error scraping home.co.uk {outcode}/{channel_name}: {e}"
|
log.error("home.co.uk %s/%s: %s", outcode, ch, e)
|
||||||
log.error(msg)
|
|
||||||
scrape_errors_total.labels(source="homecouk").inc()
|
scrape_errors_total.labels(source="homecouk").inc()
|
||||||
with status_lock:
|
|
||||||
status.errors.append(msg)
|
|
||||||
|
|
||||||
# --- OpenRent (RENT channel only) ---
|
progress.update("hk", i + 1)
|
||||||
if or_client and not or_failed and channel_name == "RENT":
|
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||||
made_requests = True
|
except Exception as e:
|
||||||
try:
|
log.exception("Fatal home.co.uk error: %s", e)
|
||||||
or_props = openrent_search_outcode(
|
with status_lock:
|
||||||
or_client,
|
status.errors.append(f"Fatal home.co.uk: {e}")
|
||||||
outcode,
|
finally:
|
||||||
pc_index,
|
try:
|
||||||
pc_coords,
|
client.close()
|
||||||
)
|
except Exception:
|
||||||
for p in or_props:
|
pass
|
||||||
pid = p["id"]
|
|
||||||
key = _dedup_key(p)
|
def or_worker():
|
||||||
if pid in all_properties or key in seen_dedup_keys:
|
or_result = load_openrent_cookies()
|
||||||
or_dedup_count += 1
|
if not or_result:
|
||||||
cross_source_dedup_total.labels(channel="rent").inc()
|
log.info("OpenRent DISABLED (no cookies available)")
|
||||||
continue
|
openrent_enabled.set(0)
|
||||||
all_properties[pid] = p
|
progress.update("or", len(shuffled))
|
||||||
seen_dedup_keys.add(key)
|
return
|
||||||
or_count += 1
|
client = make_openrent_client(*or_result)
|
||||||
if or_props:
|
log.info("OpenRent scraping ENABLED")
|
||||||
log.info(
|
openrent_enabled.set(1)
|
||||||
"OpenRent %s: +%d properties", outcode, len(or_props)
|
try:
|
||||||
)
|
for i, outcode in enumerate(shuffled):
|
||||||
except WafChallengeError:
|
# OpenRent is RENT-only
|
||||||
|
try:
|
||||||
|
props = openrent_search_outcode(
|
||||||
|
client, outcode, pc_index, pc_coords
|
||||||
|
)
|
||||||
|
or_results["RENT"].extend(props)
|
||||||
|
if props:
|
||||||
|
log.info("OpenRent %s: +%d properties", outcode, len(props))
|
||||||
|
except WafChallengeError:
|
||||||
|
log.warning(
|
||||||
|
"OpenRent WAF cookies expired — attempting refresh"
|
||||||
|
)
|
||||||
|
client.close()
|
||||||
|
or_new = load_openrent_cookies()
|
||||||
|
if or_new:
|
||||||
|
client = make_openrent_client(*or_new)
|
||||||
|
log.info("OpenRent cookies refreshed, continuing")
|
||||||
|
cookie_refreshes_total.labels(result="success").inc()
|
||||||
|
else:
|
||||||
log.warning(
|
log.warning(
|
||||||
"OpenRent WAF cookies expired — attempting refresh via FlareSolverr"
|
"Cookie refresh failed, disabling OpenRent"
|
||||||
)
|
)
|
||||||
or_client.close()
|
openrent_enabled.set(0)
|
||||||
or_result = load_openrent_cookies()
|
cookie_refreshes_total.labels(result="failure").inc()
|
||||||
if or_result:
|
|
||||||
or_client = make_openrent_client(*or_result)
|
|
||||||
log.info("OpenRent cookies refreshed, continuing")
|
|
||||||
cookie_refreshes_total.labels(result="success").inc()
|
|
||||||
else:
|
|
||||||
log.warning(
|
|
||||||
"Cookie refresh failed, disabling OpenRent for rest of scrape"
|
|
||||||
)
|
|
||||||
or_client = None
|
|
||||||
or_failed = True
|
|
||||||
openrent_enabled.set(0)
|
|
||||||
cookie_refreshes_total.labels(result="failure").inc()
|
|
||||||
with status_lock:
|
|
||||||
status.errors.append(
|
|
||||||
"OpenRent WAF cookies expired and refresh failed"
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
msg = f"Error scraping OpenRent {outcode}/{channel_name}: {e}"
|
|
||||||
log.error(msg)
|
|
||||||
scrape_errors_total.labels(source="openrent").inc()
|
|
||||||
with status_lock:
|
with status_lock:
|
||||||
status.errors.append(msg)
|
status.errors.append(
|
||||||
|
"OpenRent WAF cookies expired and refresh failed"
|
||||||
# --- Zoopla ---
|
|
||||||
if zp_page and not zp_failed:
|
|
||||||
made_requests = True
|
|
||||||
try:
|
|
||||||
zp_props = zoopla_search_outcode(
|
|
||||||
zp_page,
|
|
||||||
outcode,
|
|
||||||
channel_name,
|
|
||||||
pc_index,
|
|
||||||
pc_coords,
|
|
||||||
)
|
|
||||||
for p in zp_props:
|
|
||||||
pid = p["id"]
|
|
||||||
key = _dedup_key(p)
|
|
||||||
if pid in all_properties or key in seen_dedup_keys:
|
|
||||||
zp_dedup_count += 1
|
|
||||||
cross_source_dedup_total.labels(
|
|
||||||
channel="buy" if channel_name == "BUY" else "rent",
|
|
||||||
).inc()
|
|
||||||
continue
|
|
||||||
all_properties[pid] = p
|
|
||||||
seen_dedup_keys.add(key)
|
|
||||||
zp_count += 1
|
|
||||||
if zp_props:
|
|
||||||
log.info(
|
|
||||||
"Zoopla %s: +%d properties", outcode, len(zp_props)
|
|
||||||
)
|
)
|
||||||
|
progress.update("or", len(shuffled))
|
||||||
|
return
|
||||||
|
except Exception as e:
|
||||||
|
log.error("OpenRent %s: %s", outcode, e)
|
||||||
|
scrape_errors_total.labels(source="openrent").inc()
|
||||||
|
|
||||||
|
progress.update("or", i + 1)
|
||||||
|
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||||
|
except Exception as e:
|
||||||
|
log.exception("Fatal OpenRent error: %s", e)
|
||||||
|
with status_lock:
|
||||||
|
status.errors.append(f"Fatal OpenRent: {e}")
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
client.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def zp_worker():
|
||||||
|
try:
|
||||||
|
browser, page = launch_zoopla_browser()
|
||||||
|
log.info("Zoopla scraping ENABLED (Camoufox browser launched)")
|
||||||
|
zoopla_enabled.set(1)
|
||||||
|
except TurnstileError:
|
||||||
|
log.warning("Zoopla Cloudflare Turnstile failed — disabling Zoopla")
|
||||||
|
zoopla_enabled.set(0)
|
||||||
|
progress.update("zp", len(shuffled))
|
||||||
|
return
|
||||||
|
except Exception as e:
|
||||||
|
log.warning("Zoopla browser launch failed: %s — disabling Zoopla", e)
|
||||||
|
zoopla_enabled.set(0)
|
||||||
|
progress.update("zp", len(shuffled))
|
||||||
|
return
|
||||||
|
|
||||||
|
try:
|
||||||
|
for i, outcode in enumerate(shuffled):
|
||||||
|
for ch_cfg in CHANNELS:
|
||||||
|
ch = ch_cfg["channel"]
|
||||||
|
try:
|
||||||
|
props = zoopla_search_outcode(
|
||||||
|
page, outcode, ch, pc_index, pc_coords
|
||||||
|
)
|
||||||
|
zp_results[ch].extend(props)
|
||||||
|
if props:
|
||||||
|
log.info("Zoopla %s: +%d properties", outcode, len(props))
|
||||||
except TurnstileError:
|
except TurnstileError:
|
||||||
log.warning(
|
log.warning(
|
||||||
"Zoopla Cloudflare challenge failed — attempting browser relaunch"
|
"Zoopla Turnstile challenge — relaunching browser"
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
zp_browser.close()
|
browser.close()
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
zp_browser, zp_page = launch_zoopla_browser()
|
browser, page = launch_zoopla_browser()
|
||||||
log.info("Zoopla browser relaunched, continuing")
|
log.info("Zoopla browser relaunched, continuing")
|
||||||
except Exception:
|
except Exception:
|
||||||
log.warning(
|
log.warning(
|
||||||
"Browser relaunch failed, disabling Zoopla for rest of scrape"
|
"Browser relaunch failed, disabling Zoopla"
|
||||||
)
|
)
|
||||||
zp_page = None
|
|
||||||
zp_browser = None
|
|
||||||
zp_failed = True
|
|
||||||
zoopla_enabled.set(0)
|
zoopla_enabled.set(0)
|
||||||
with status_lock:
|
with status_lock:
|
||||||
status.errors.append(
|
status.errors.append(
|
||||||
"Zoopla Cloudflare challenge failed and browser relaunch failed"
|
"Zoopla Cloudflare challenge failed and relaunch failed"
|
||||||
)
|
)
|
||||||
|
progress.update("zp", len(shuffled))
|
||||||
|
return
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
msg = f"Error scraping Zoopla {outcode}/{channel_name}: {e}"
|
log.error("Zoopla %s/%s: %s", outcode, ch, e)
|
||||||
log.error(msg)
|
|
||||||
scrape_errors_total.labels(source="zoopla").inc()
|
scrape_errors_total.labels(source="zoopla").inc()
|
||||||
with status_lock:
|
|
||||||
status.errors.append(msg)
|
|
||||||
|
|
||||||
with status_lock:
|
progress.update("zp", i + 1)
|
||||||
if channel_name == "BUY":
|
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||||
status.properties_buy = len(all_properties)
|
except Exception as e:
|
||||||
else:
|
log.exception("Fatal Zoopla error: %s", e)
|
||||||
status.properties_rent = len(all_properties)
|
with status_lock:
|
||||||
status.rm_properties = rm_count
|
status.errors.append(f"Fatal Zoopla: {e}")
|
||||||
status.hk_properties = hk_count
|
finally:
|
||||||
status.or_properties = or_count
|
try:
|
||||||
status.zp_properties = zp_count
|
browser.close()
|
||||||
_sync_gauges()
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Log progress every 100 outcodes
|
# --- Launch worker threads ---
|
||||||
done = i + 1
|
|
||||||
elapsed = time.time() - channel_start
|
|
||||||
if done % 100 == 0 or done == len(shuffled):
|
|
||||||
pct = done * 100 // len(shuffled)
|
|
||||||
rate = done / elapsed if elapsed > 0 else 0
|
|
||||||
log.info(
|
|
||||||
"%s %d/%d (%d%%) — %d props, %s elapsed, %.1f outcodes/min",
|
|
||||||
channel_name,
|
|
||||||
done,
|
|
||||||
len(shuffled),
|
|
||||||
pct,
|
|
||||||
len(all_properties),
|
|
||||||
_fmt_elapsed(elapsed),
|
|
||||||
rate * 60,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Log when crossing a 10k property milestone
|
active_sources: list[str] = []
|
||||||
current_milestone = len(all_properties) // 10_000
|
threads: list[threading.Thread] = []
|
||||||
if current_milestone > prev_prop_milestone:
|
|
||||||
prev_prop_milestone = current_milestone
|
|
||||||
log.info(
|
|
||||||
"%s %dk properties (rm: %d, hk: %d, or: %d, zp: %d) at outcode %d/%d [%s]",
|
|
||||||
channel_name,
|
|
||||||
current_milestone * 10,
|
|
||||||
rm_count,
|
|
||||||
hk_count,
|
|
||||||
or_count,
|
|
||||||
zp_count,
|
|
||||||
done,
|
|
||||||
len(shuffled),
|
|
||||||
_fmt_elapsed(elapsed),
|
|
||||||
)
|
|
||||||
|
|
||||||
if made_requests and i < len(shuffled) - 1:
|
if SCRAPE_RIGHTMOVE:
|
||||||
time.sleep(DELAY_BETWEEN_OUTCODES)
|
threads.append(threading.Thread(target=rm_worker, name="scrape-rm", daemon=True))
|
||||||
|
active_sources.append("rm")
|
||||||
|
if SCRAPE_HOMECOUK:
|
||||||
|
threads.append(threading.Thread(target=hk_worker, name="scrape-hk", daemon=True))
|
||||||
|
active_sources.append("hk")
|
||||||
|
if SCRAPE_OPENRENT:
|
||||||
|
threads.append(threading.Thread(target=or_worker, name="scrape-or", daemon=True))
|
||||||
|
active_sources.append("or")
|
||||||
|
if SCRAPE_ZOOPLA:
|
||||||
|
threads.append(threading.Thread(target=zp_worker, name="scrape-zp", daemon=True))
|
||||||
|
active_sources.append("zp")
|
||||||
|
|
||||||
# Write parquet
|
log.info(
|
||||||
deduped = list(all_properties.values())
|
"=== Starting scrape: %d outcodes, sources: %s ===",
|
||||||
|
len(shuffled),
|
||||||
|
", ".join(active_sources),
|
||||||
|
)
|
||||||
|
|
||||||
|
for t in threads:
|
||||||
|
t.start()
|
||||||
|
|
||||||
|
# --- Monitor progress while workers run ---
|
||||||
|
|
||||||
|
scrape_start = time.time()
|
||||||
|
last_log = 0.0
|
||||||
|
|
||||||
|
try:
|
||||||
|
while any(t.is_alive() for t in threads):
|
||||||
|
snap = progress.snapshot()
|
||||||
|
min_done = min(
|
||||||
|
(snap.get(s, 0) for s in active_sources), default=0
|
||||||
|
)
|
||||||
|
|
||||||
|
# Count properties across sources (safe: only one thread writes each list)
|
||||||
|
total_buy = sum(
|
||||||
|
len(r["BUY"]) for r in [rm_results, hk_results, or_results, zp_results]
|
||||||
|
)
|
||||||
|
total_rent = sum(
|
||||||
|
len(r["RENT"]) for r in [rm_results, hk_results, or_results, zp_results]
|
||||||
|
)
|
||||||
|
|
||||||
|
with status_lock:
|
||||||
|
status.outcodes_done = min_done
|
||||||
|
status.outcodes_total = len(shuffled)
|
||||||
|
status.properties_buy = total_buy
|
||||||
|
status.properties_rent = total_rent
|
||||||
|
status.rm_properties = len(rm_results["BUY"]) + len(rm_results["RENT"])
|
||||||
|
status.hk_properties = len(hk_results["BUY"]) + len(hk_results["RENT"])
|
||||||
|
status.or_properties = len(or_results["RENT"])
|
||||||
|
status.zp_properties = len(zp_results["BUY"]) + len(zp_results["RENT"])
|
||||||
|
_sync_gauges()
|
||||||
|
|
||||||
|
# Log progress every 30 seconds
|
||||||
|
now = time.time()
|
||||||
|
if now - last_log >= 30:
|
||||||
|
elapsed = now - scrape_start
|
||||||
|
per_source = ", ".join(
|
||||||
|
f"{s}:{snap.get(s, 0)}" for s in active_sources
|
||||||
|
)
|
||||||
|
log.info(
|
||||||
|
"Progress: %d/%d outcodes (%s), %d buy + %d rent props, %s elapsed",
|
||||||
|
min_done,
|
||||||
|
len(shuffled),
|
||||||
|
per_source,
|
||||||
|
total_buy,
|
||||||
|
total_rent,
|
||||||
|
_fmt_elapsed(elapsed),
|
||||||
|
)
|
||||||
|
last_log = now
|
||||||
|
|
||||||
|
time.sleep(5)
|
||||||
|
except Exception as e:
|
||||||
|
log.exception("Monitor loop error: %s", e)
|
||||||
|
|
||||||
|
for t in threads:
|
||||||
|
t.join()
|
||||||
|
|
||||||
|
log.info("All source workers completed")
|
||||||
|
|
||||||
|
# --- Merge results per channel and write parquet ---
|
||||||
|
|
||||||
|
try:
|
||||||
|
for ch_cfg in CHANNELS:
|
||||||
|
ch = ch_cfg["channel"]
|
||||||
|
file_suffix = "buy" if ch == "BUY" else "rent"
|
||||||
|
|
||||||
|
merged, counts, total_dedup = _merge_channel(
|
||||||
|
rm_results[ch],
|
||||||
|
hk_results[ch],
|
||||||
|
or_results[ch],
|
||||||
|
zp_results[ch],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Update cross-source dedup counter
|
||||||
|
ch_label = "buy" if ch == "BUY" else "rent"
|
||||||
|
if total_dedup:
|
||||||
|
cross_source_dedup_total.labels(channel=ch_label).inc(total_dedup)
|
||||||
|
|
||||||
|
deduped = list(merged.values())
|
||||||
output_path = DATA_DIR / f"online_listings_{file_suffix}.parquet"
|
output_path = DATA_DIR / f"online_listings_{file_suffix}.parquet"
|
||||||
write_parquet(deduped, output_path, channel=file_suffix)
|
write_parquet(deduped, output_path, channel=file_suffix)
|
||||||
|
|
||||||
with status_lock:
|
with status_lock:
|
||||||
if channel_name == "BUY":
|
if ch == "BUY":
|
||||||
status.properties_buy = len(deduped)
|
status.properties_buy = len(deduped)
|
||||||
else:
|
else:
|
||||||
status.properties_rent = len(deduped)
|
status.properties_rent = len(deduped)
|
||||||
status.outcodes_done = len(shuffled)
|
|
||||||
_sync_gauges()
|
_sync_gauges()
|
||||||
|
|
||||||
log.info(
|
log.info(
|
||||||
"=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, zp: %d, cross-dedup: %d) ===",
|
"=== %s complete: %d unique (rm:%d hk:%d or:%d zp:%d, cross-dedup:%d) ===",
|
||||||
channel_name,
|
ch,
|
||||||
len(deduped),
|
len(deduped),
|
||||||
rm_count,
|
counts["rm"],
|
||||||
hk_count,
|
counts["hk"],
|
||||||
or_count,
|
counts["or"],
|
||||||
zp_count,
|
counts["zp"],
|
||||||
hk_dedup_count + or_dedup_count + zp_dedup_count,
|
total_dedup,
|
||||||
)
|
)
|
||||||
|
|
||||||
with status_lock:
|
with status_lock:
|
||||||
status.state = "done"
|
status.state = "done"
|
||||||
status.finished_at = time.time()
|
status.finished_at = time.time()
|
||||||
|
status.outcodes_done = len(shuffled)
|
||||||
_sync_gauges()
|
_sync_gauges()
|
||||||
elapsed = status.finished_at - status.started_at
|
elapsed = status.finished_at - status.started_at
|
||||||
log.info(
|
log.info(
|
||||||
|
|
@ -598,26 +672,18 @@ def run_scrape(
|
||||||
body.get("elapsed_ms", 0),
|
body.get("elapsed_ms", 0),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
log.warning("Server reload failed (%d): %s", resp.status_code, resp.text[:200])
|
log.warning(
|
||||||
|
"Server reload failed (%d): %s",
|
||||||
|
resp.status_code,
|
||||||
|
resp.text[:200],
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning("Server reload request failed: %s", e)
|
log.warning("Server reload request failed: %s", e)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.exception("Fatal scrape error")
|
log.exception("Fatal scrape error during merge/write")
|
||||||
with status_lock:
|
with status_lock:
|
||||||
status.state = "error"
|
status.state = "error"
|
||||||
status.errors.append(f"Fatal: {e}")
|
status.errors.append(f"Fatal: {e}")
|
||||||
status.finished_at = time.time()
|
status.finished_at = time.time()
|
||||||
_sync_gauges()
|
_sync_gauges()
|
||||||
finally:
|
|
||||||
if client:
|
|
||||||
client.close()
|
|
||||||
if hk_client:
|
|
||||||
hk_client.close()
|
|
||||||
if or_client:
|
|
||||||
or_client.close()
|
|
||||||
if zp_browser:
|
|
||||||
try:
|
|
||||||
zp_browser.close()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue