More fixes
This commit is contained in:
parent
15fa09430b
commit
6b12e21d50
54 changed files with 1665 additions and 630 deletions
|
|
@@ -17,6 +17,7 @@ from constants import (
|
|||
SCRAPE_HOMECOUK,
|
||||
SCRAPE_OPENRENT,
|
||||
SCRAPE_RIGHTMOVE,
|
||||
SCRAPE_ZOOPLA,
|
||||
SEED,
|
||||
)
|
||||
from homecouk import CookiesExpiredError
|
||||
|
|
@@ -35,12 +36,16 @@ from metrics import (
|
|||
scrape_outcodes_total,
|
||||
scrape_properties_total,
|
||||
scrape_state,
|
||||
zoopla_enabled,
|
||||
)
|
||||
from openrent import WafChallengeError
|
||||
from openrent import load_cookies as load_openrent_cookies
|
||||
from openrent import make_client as make_openrent_client
|
||||
from openrent import search_outcode as openrent_search_outcode
|
||||
from rightmove import resolve_outcode_id, search_outcode
|
||||
from zoopla import TurnstileError
|
||||
from zoopla import launch_browser as launch_zoopla_browser
|
||||
from zoopla import search_outcode as zoopla_search_outcode
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from storage import write_parquet
|
||||
|
||||
|
|
@@ -60,6 +65,7 @@ class ScrapeStatus:
|
|||
rm_properties: int = 0
|
||||
hk_properties: int = 0
|
||||
or_properties: int = 0
|
||||
zp_properties: int = 0
|
||||
errors: list[str] = field(default_factory=list)
|
||||
started_at: float = 0.0
|
||||
finished_at: float = 0.0
|
||||
|
|
@@ -93,6 +99,9 @@ def _sync_gauges() -> None:
|
|||
scrape_properties_total.labels(channel=ch, source="openrent").set(
|
||||
status.or_properties
|
||||
)
|
||||
scrape_properties_total.labels(channel=ch, source="zoopla").set(
|
||||
status.zp_properties
|
||||
)
|
||||
if status.started_at:
|
||||
end = status.finished_at if status.finished_at else time.time()
|
||||
scrape_elapsed_seconds.set(end - status.started_at)
|
||||
|
|
@@ -191,7 +200,7 @@ def run_scrape(
|
|||
random.seed(SEED)
|
||||
random.shuffle(shuffled)
|
||||
|
||||
if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT:
|
||||
if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT and not SCRAPE_ZOOPLA:
|
||||
log.warning("All scrapers disabled — nothing to do")
|
||||
with status_lock:
|
||||
status.state = "done"
|
||||
|
|
@@ -239,8 +248,27 @@ def run_scrape(
|
|||
)
|
||||
openrent_enabled.set(0)
|
||||
|
||||
# Build postcode coords if OpenRent is active and caller didn't provide them
|
||||
if or_client and pc_coords is None:
|
||||
# Zoopla: uses Camoufox browser (no cookies/client pattern)
|
||||
zp_browser = None
|
||||
zp_page = None
|
||||
zp_failed = False
|
||||
if not SCRAPE_ZOOPLA:
|
||||
log.info("Zoopla scraping DISABLED (SCRAPE_ZOOPLA=false)")
|
||||
zoopla_enabled.set(0)
|
||||
else:
|
||||
try:
|
||||
zp_browser, zp_page = launch_zoopla_browser()
|
||||
log.info("Zoopla scraping ENABLED (Camoufox browser launched)")
|
||||
zoopla_enabled.set(1)
|
||||
except TurnstileError:
|
||||
log.warning("Zoopla Cloudflare Turnstile failed — disabling Zoopla")
|
||||
zoopla_enabled.set(0)
|
||||
except Exception as e:
|
||||
log.warning("Zoopla browser launch failed: %s — disabling Zoopla", e)
|
||||
zoopla_enabled.set(0)
|
||||
|
||||
# Build postcode coords if OpenRent/Zoopla is active and caller didn't provide them
|
||||
if (or_client or zp_page) and pc_coords is None:
|
||||
pc_coords = build_postcode_coords()
|
||||
|
||||
try:
|
||||
|
|
@@ -256,6 +284,8 @@ def run_scrape(
|
|||
hk_dedup_count = 0 # home.co.uk skipped as cross-source duplicates
|
||||
or_count = 0 # OpenRent properties this channel
|
||||
or_dedup_count = 0 # OpenRent skipped as cross-source duplicates
|
||||
zp_count = 0 # Zoopla properties this channel
|
||||
zp_dedup_count = 0 # Zoopla skipped as cross-source duplicates
|
||||
|
||||
with status_lock:
|
||||
status.channel = channel_name
|
||||
|
|
@@ -264,6 +294,7 @@ def run_scrape(
|
|||
status.rm_properties = 0
|
||||
status.hk_properties = 0
|
||||
status.or_properties = 0
|
||||
status.zp_properties = 0
|
||||
|
||||
channel_start = time.time()
|
||||
prev_prop_milestone = 0 # last 10k milestone we logged
|
||||
|
|
@@ -412,6 +443,63 @@ def run_scrape(
|
|||
with status_lock:
|
||||
status.errors.append(msg)
|
||||
|
||||
# --- Zoopla ---
|
||||
if zp_page and not zp_failed:
|
||||
made_requests = True
|
||||
try:
|
||||
zp_props = zoopla_search_outcode(
|
||||
zp_page,
|
||||
outcode,
|
||||
channel_name,
|
||||
pc_index,
|
||||
pc_coords,
|
||||
)
|
||||
for p in zp_props:
|
||||
pid = p["id"]
|
||||
key = _dedup_key(p)
|
||||
if pid in all_properties or key in seen_dedup_keys:
|
||||
zp_dedup_count += 1
|
||||
cross_source_dedup_total.labels(
|
||||
channel="buy" if channel_name == "BUY" else "rent",
|
||||
).inc()
|
||||
continue
|
||||
all_properties[pid] = p
|
||||
seen_dedup_keys.add(key)
|
||||
zp_count += 1
|
||||
if zp_props:
|
||||
log.info(
|
||||
"Zoopla %s: +%d properties", outcode, len(zp_props)
|
||||
)
|
||||
except TurnstileError:
|
||||
log.warning(
|
||||
"Zoopla Cloudflare challenge failed — attempting browser relaunch"
|
||||
)
|
||||
try:
|
||||
zp_browser.close()
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
zp_browser, zp_page = launch_zoopla_browser()
|
||||
log.info("Zoopla browser relaunched, continuing")
|
||||
except Exception:
|
||||
log.warning(
|
||||
"Browser relaunch failed, disabling Zoopla for rest of scrape"
|
||||
)
|
||||
zp_page = None
|
||||
zp_browser = None
|
||||
zp_failed = True
|
||||
zoopla_enabled.set(0)
|
||||
with status_lock:
|
||||
status.errors.append(
|
||||
"Zoopla Cloudflare challenge failed and browser relaunch failed"
|
||||
)
|
||||
except Exception as e:
|
||||
msg = f"Error scraping Zoopla {outcode}/{channel_name}: {e}"
|
||||
log.error(msg)
|
||||
scrape_errors_total.labels(source="zoopla").inc()
|
||||
with status_lock:
|
||||
status.errors.append(msg)
|
||||
|
||||
with status_lock:
|
||||
if channel_name == "BUY":
|
||||
status.properties_buy = len(all_properties)
|
||||
|
|
@@ -420,6 +508,7 @@ def run_scrape(
|
|||
status.rm_properties = rm_count
|
||||
status.hk_properties = hk_count
|
||||
status.or_properties = or_count
|
||||
status.zp_properties = zp_count
|
||||
_sync_gauges()
|
||||
|
||||
# Log progress every 100 outcodes
|
||||
|
|
@@ -444,12 +533,13 @@ def run_scrape(
|
|||
if current_milestone > prev_prop_milestone:
|
||||
prev_prop_milestone = current_milestone
|
||||
log.info(
|
||||
"%s %dk properties (rm: %d, hk: %d, or: %d) at outcode %d/%d [%s]",
|
||||
"%s %dk properties (rm: %d, hk: %d, or: %d, zp: %d) at outcode %d/%d [%s]",
|
||||
channel_name,
|
||||
current_milestone * 10,
|
||||
rm_count,
|
||||
hk_count,
|
||||
or_count,
|
||||
zp_count,
|
||||
done,
|
||||
len(shuffled),
|
||||
_fmt_elapsed(elapsed),
|
||||
|
|
@@ -472,13 +562,14 @@ def run_scrape(
|
|||
_sync_gauges()
|
||||
|
||||
log.info(
|
||||
"=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, cross-dedup: %d) ===",
|
||||
"=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, zp: %d, cross-dedup: %d) ===",
|
||||
channel_name,
|
||||
len(deduped),
|
||||
rm_count,
|
||||
hk_count,
|
||||
or_count,
|
||||
hk_dedup_count + or_dedup_count,
|
||||
zp_count,
|
||||
hk_dedup_count + or_dedup_count + zp_dedup_count,
|
||||
)
|
||||
|
||||
with status_lock:
|
||||
|
|
@@ -525,3 +616,8 @@ def run_scrape(
|
|||
hk_client.close()
|
||||
if or_client:
|
||||
or_client.close()
|
||||
if zp_browser:
|
||||
try:
|
||||
zp_browser.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue