scraping and data

This commit is contained in:
Andras Schmelczer 2026-05-31 15:36:33 +01:00
parent d98819b569
commit 8688b7475e
43 changed files with 4920 additions and 531 deletions

View file

@ -15,6 +15,10 @@ from constants import (
DATA_DIR,
DELAY_BETWEEN_OUTCODES,
LONDON_OUTCODE_PREFIXES,
ZOOPLA_DETAIL_BUDGET_FRACTION,
ZOOPLA_FETCH_DETAILS,
ZOOPLA_FETCHER,
ZOOPLA_MAX_DETAILS_PER_OUTCODE,
)
from http_client import make_client
@ -371,6 +375,36 @@ def _zoopla_outcode_timeout_seconds() -> int:
return timeout
def _zoopla_detail_cap() -> int:
    """Return the per-outcode ceiling on Zoopla detail-page fetches.

    A result of 0 means detail fetching is switched off.

    Search result cards on Zoopla carry only an outcode-level address; the
    full postcode and coordinates live on each listing's detail page. Capping
    the number of extra page loads keeps an outcode inside
    ZOOPLA_OUTCODE_TIMEOUT_SECONDS, because the per-outcode SIGALRM budget
    also has to cover the detail fetches.

    Both knobs live in constants.py: ZOOPLA_FETCH_DETAILS is the on/off
    switch and ZOOPLA_MAX_DETAILS_PER_OUTCODE is the cap itself.
    """
    if not ZOOPLA_FETCH_DETAILS:
        return 0
    return ZOOPLA_MAX_DETAILS_PER_OUTCODE
def _open_zoopla_detail_tab(page, detail_cap: int):
    """Create a second tab, on the search tab's context, for detail pages.

    The tab is opened on ``page.context`` so that it shares the persistent
    context and therefore inherits the Cloudflare clearance cookies already
    held by the search tab.

    Returns the new tab, or None when ``detail_cap`` is zero or negative
    (detail fetching disabled) or when the tab could not be created. In the
    failure case a warning is logged and the scrape degrades to outcode-level
    postcodes instead of failing.
    """
    if detail_cap <= 0:
        return None

    detail_tab = None
    try:
        detail_tab = page.context.new_page()
    except Exception as exc:
        # Best effort: a missing detail tab only lowers address precision.
        log.warning(
            "Zoopla detail tab unavailable (%s); using outcode-level postcodes",
            _exception_detail(exc),
        )
    return detail_tab
@contextmanager
def _wall_clock_timeout(seconds: int, label: str):
"""SIGALRM-based wall-clock guard (POSIX). Raises OutcodeTimeout on expiry.
@ -438,6 +472,50 @@ def _close_zoopla_browser(browser, label: str) -> None:
log.warning("%s browser force-close failed: %s", label, _exception_detail(exc))
def _scrape_zoopla_flaresolverr(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    results: dict[str, list[dict]],
    errors: list[str],
    max_properties_per_source: int | None,
) -> None:
    """Scrape Zoopla via the FlareSolverr sidecar (no browser/VNC).

    Opens one FlareSolverr session named "zoopla" and searches each outcode
    in turn, storing the returned properties under the "zoopla" source in
    ``results``. The loop stops early once ``_source_remaining`` reports 0
    for the source.

    Args:
        outcodes: Outcodes to search, in order.
        pc_index: Postcode spatial index, passed through to the search helper.
        pc_coords: Postcode -> coordinate pair mapping, passed through to the
            search helper.
        results: Per-source property lists; updated in place.
        errors: Error messages; appended to in place.
        max_properties_per_source: Per-source cap, or None.

    A FlareSolverrError while opening the session is recorded in ``errors``
    and the function returns without scraping. Any exception raised for a
    single outcode is recorded and the loop moves on to the next outcode.
    """
    from flaresolverr import FlareSolverrError, FlareSolverrSession
    from zoopla_flaresolverr import search_outcode as fs_search_outcode

    # Resolve the cap through the shared helper so ZOOPLA_FETCH_DETAILS is
    # honoured here exactly as on the browser path (0 when the flag is off).
    # NOTE(review): assumes fs_search_outcode treats detail_cap=0 as "no
    # detail fetches" - confirm against zoopla_flaresolverr.search_outcode.
    detail_cap = _zoopla_detail_cap()

    try:
        session = FlareSolverrSession(session="zoopla")
        session.__enter__()
    except FlareSolverrError as exc:
        errors.append(f"zoopla: FlareSolverr unavailable: {exc}")
        log.warning("Zoopla skipped: FlareSolverr unavailable: %s", exc)
        return

    try:
        for outcode in outcodes:
            remaining = _source_remaining(results, "zoopla", max_properties_per_source)
            if remaining == 0:
                log.info("Zoopla cap reached")
                return
            try:
                # The second element of the returned pair is not needed here.
                props, _ = fs_search_outcode(
                    outcode,
                    pc_index,
                    pc_coords,
                    session,
                    max_properties=remaining,
                    detail_cap=detail_cap,
                )
                added = _store_properties(results, "zoopla", props, max_properties_per_source)
                log.info("Zoopla %s: +%d", outcode, added)
            except Exception as exc:  # noqa: BLE001 - one outcode must not kill the run
                _record_error(errors, "zoopla", outcode, exc)
            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
        # Always release the session, including on the early return above.
        session.__exit__(None, None, None)
def _scrape_zoopla(
outcodes: list[str],
pc_index: PostcodeSpatialIndex,
@ -446,6 +524,12 @@ def _scrape_zoopla(
errors: list[str],
max_properties_per_source: int | None,
) -> None:
if ZOOPLA_FETCHER == "flaresolverr":
_scrape_zoopla_flaresolverr(
outcodes, pc_index, pc_coords, results, errors, max_properties_per_source
)
return
try:
browser, page = _launch_zoopla_with_retries()
except Exception as exc:
@ -454,6 +538,12 @@ def _scrape_zoopla(
return
outcode_timeout = _zoopla_outcode_timeout_seconds()
detail_cap = _zoopla_detail_cap()
detail_page = _open_zoopla_detail_tab(page, detail_cap)
# Spend at most a fraction of each outcode's budget on detail fetches so the
# SIGALRM guard never trips mid-outcode and discards already-collected
# search listings; the rest is left for search pagination and transform.
detail_budget_seconds = max(10.0, outcode_timeout * ZOOPLA_DETAIL_BUDGET_FRACTION)
try:
for outcode in outcodes:
@ -470,6 +560,9 @@ def _scrape_zoopla(
pc_index,
pc_coords,
max_properties=None,
detail_page=detail_page,
detail_cap=detail_cap,
detail_budget_seconds=detail_budget_seconds,
)
added = _store_properties(
results,
@ -496,6 +589,8 @@ def _scrape_zoopla(
_close_zoopla_browser(browser, f"zoopla {outcode}")
try:
browser, page = _launch_zoopla_with_retries()
# The old context (and its detail tab) is gone; reopen one.
detail_page = _open_zoopla_detail_tab(page, detail_cap)
log.info("Zoopla %s retrying with fresh browser", outcode)
except Exception as relaunch_exc:
_record_error(errors, "zoopla", outcode, relaunch_exc)
@ -503,6 +598,11 @@ def _scrape_zoopla(
time.sleep(DELAY_BETWEEN_OUTCODES)
finally:
if detail_page is not None:
try:
detail_page.close()
except Exception:
pass
_close_zoopla_browser(browser, "zoopla final")