More fixes

This commit is contained in:
Andras Schmelczer 2026-03-18 22:46:08 +00:00
parent 15fa09430b
commit 6b12e21d50
54 changed files with 1665 additions and 630 deletions

View file

@ -17,6 +17,7 @@ from constants import (
SCRAPE_HOMECOUK,
SCRAPE_OPENRENT,
SCRAPE_RIGHTMOVE,
SCRAPE_ZOOPLA,
SEED,
)
from homecouk import CookiesExpiredError
@ -35,12 +36,16 @@ from metrics import (
scrape_outcodes_total,
scrape_properties_total,
scrape_state,
zoopla_enabled,
)
from openrent import WafChallengeError
from openrent import load_cookies as load_openrent_cookies
from openrent import make_client as make_openrent_client
from openrent import search_outcode as openrent_search_outcode
from rightmove import resolve_outcode_id, search_outcode
from zoopla import TurnstileError
from zoopla import launch_browser as launch_zoopla_browser
from zoopla import search_outcode as zoopla_search_outcode
from spatial import PostcodeSpatialIndex
from storage import write_parquet
@ -60,6 +65,7 @@ class ScrapeStatus:
rm_properties: int = 0
hk_properties: int = 0
or_properties: int = 0
zp_properties: int = 0
errors: list[str] = field(default_factory=list)
started_at: float = 0.0
finished_at: float = 0.0
@ -93,6 +99,9 @@ def _sync_gauges() -> None:
scrape_properties_total.labels(channel=ch, source="openrent").set(
status.or_properties
)
scrape_properties_total.labels(channel=ch, source="zoopla").set(
status.zp_properties
)
if status.started_at:
end = status.finished_at if status.finished_at else time.time()
scrape_elapsed_seconds.set(end - status.started_at)
@ -191,7 +200,7 @@ def run_scrape(
random.seed(SEED)
random.shuffle(shuffled)
if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT:
if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT and not SCRAPE_ZOOPLA:
log.warning("All scrapers disabled — nothing to do")
with status_lock:
status.state = "done"
@ -239,8 +248,27 @@ def run_scrape(
)
openrent_enabled.set(0)
# Build postcode coords if OpenRent is active and caller didn't provide them
if or_client and pc_coords is None:
# Zoopla: uses Camoufox browser (no cookies/client pattern)
zp_browser = None
zp_page = None
zp_failed = False
if not SCRAPE_ZOOPLA:
log.info("Zoopla scraping DISABLED (SCRAPE_ZOOPLA=false)")
zoopla_enabled.set(0)
else:
try:
zp_browser, zp_page = launch_zoopla_browser()
log.info("Zoopla scraping ENABLED (Camoufox browser launched)")
zoopla_enabled.set(1)
except TurnstileError:
log.warning("Zoopla Cloudflare Turnstile failed — disabling Zoopla")
zoopla_enabled.set(0)
except Exception as e:
log.warning("Zoopla browser launch failed: %s — disabling Zoopla", e)
zoopla_enabled.set(0)
# Build postcode coords if OpenRent/Zoopla is active and caller didn't provide them
if (or_client or zp_page) and pc_coords is None:
pc_coords = build_postcode_coords()
try:
@ -256,6 +284,8 @@ def run_scrape(
hk_dedup_count = 0 # home.co.uk skipped as cross-source duplicates
or_count = 0 # OpenRent properties this channel
or_dedup_count = 0 # OpenRent skipped as cross-source duplicates
zp_count = 0 # Zoopla properties this channel
zp_dedup_count = 0 # Zoopla skipped as cross-source duplicates
with status_lock:
status.channel = channel_name
@ -264,6 +294,7 @@ def run_scrape(
status.rm_properties = 0
status.hk_properties = 0
status.or_properties = 0
status.zp_properties = 0
channel_start = time.time()
prev_prop_milestone = 0 # last 10k milestone we logged
@ -412,6 +443,63 @@ def run_scrape(
with status_lock:
status.errors.append(msg)
# --- Zoopla ---
if zp_page and not zp_failed:
made_requests = True
try:
zp_props = zoopla_search_outcode(
zp_page,
outcode,
channel_name,
pc_index,
pc_coords,
)
for p in zp_props:
pid = p["id"]
key = _dedup_key(p)
if pid in all_properties or key in seen_dedup_keys:
zp_dedup_count += 1
cross_source_dedup_total.labels(
channel="buy" if channel_name == "BUY" else "rent",
).inc()
continue
all_properties[pid] = p
seen_dedup_keys.add(key)
zp_count += 1
if zp_props:
log.info(
"Zoopla %s: +%d properties", outcode, len(zp_props)
)
except TurnstileError:
log.warning(
"Zoopla Cloudflare challenge failed — attempting browser relaunch"
)
try:
zp_browser.close()
except Exception:
pass
try:
zp_browser, zp_page = launch_zoopla_browser()
log.info("Zoopla browser relaunched, continuing")
except Exception:
log.warning(
"Browser relaunch failed, disabling Zoopla for rest of scrape"
)
zp_page = None
zp_browser = None
zp_failed = True
zoopla_enabled.set(0)
with status_lock:
status.errors.append(
"Zoopla Cloudflare challenge failed and browser relaunch failed"
)
except Exception as e:
msg = f"Error scraping Zoopla {outcode}/{channel_name}: {e}"
log.error(msg)
scrape_errors_total.labels(source="zoopla").inc()
with status_lock:
status.errors.append(msg)
with status_lock:
if channel_name == "BUY":
status.properties_buy = len(all_properties)
@ -420,6 +508,7 @@ def run_scrape(
status.rm_properties = rm_count
status.hk_properties = hk_count
status.or_properties = or_count
status.zp_properties = zp_count
_sync_gauges()
# Log progress every 100 outcodes
@ -444,12 +533,13 @@ def run_scrape(
if current_milestone > prev_prop_milestone:
prev_prop_milestone = current_milestone
log.info(
"%s %dk properties (rm: %d, hk: %d, or: %d) at outcode %d/%d [%s]",
"%s %dk properties (rm: %d, hk: %d, or: %d, zp: %d) at outcode %d/%d [%s]",
channel_name,
current_milestone * 10,
rm_count,
hk_count,
or_count,
zp_count,
done,
len(shuffled),
_fmt_elapsed(elapsed),
@ -472,13 +562,14 @@ def run_scrape(
_sync_gauges()
log.info(
"=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, cross-dedup: %d) ===",
"=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, zp: %d, cross-dedup: %d) ===",
channel_name,
len(deduped),
rm_count,
hk_count,
or_count,
hk_dedup_count + or_dedup_count,
zp_count,
hk_dedup_count + or_dedup_count + zp_dedup_count,
)
with status_lock:
@ -525,3 +616,8 @@ def run_scrape(
hk_client.close()
if or_client:
or_client.close()
if zp_browser:
try:
zp_browser.close()
except Exception:
pass