scraping and data
This commit is contained in:
parent
d98819b569
commit
8688b7475e
43 changed files with 4920 additions and 531 deletions
|
|
@ -15,6 +15,10 @@ from constants import (
|
|||
DATA_DIR,
|
||||
DELAY_BETWEEN_OUTCODES,
|
||||
LONDON_OUTCODE_PREFIXES,
|
||||
ZOOPLA_DETAIL_BUDGET_FRACTION,
|
||||
ZOOPLA_FETCH_DETAILS,
|
||||
ZOOPLA_FETCHER,
|
||||
ZOOPLA_MAX_DETAILS_PER_OUTCODE,
|
||||
)
|
||||
|
||||
from http_client import make_client
|
||||
|
|
@ -371,6 +375,36 @@ def _zoopla_outcode_timeout_seconds() -> int:
|
|||
return timeout
|
||||
|
||||
|
||||
def _zoopla_detail_cap() -> int:
    """Max detail-page fetches per outcode (0 disables detail fetching).

    Zoopla search cards only expose an outcode-level address, so the full
    postcode/coordinates come from each listing's detail page. The cap bounds
    the extra page loads so an outcode stays within ZOOPLA_OUTCODE_TIMEOUT_SECONDS
    (the per-outcode SIGALRM budget covers the detail fetches too). Configure via
    ZOOPLA_FETCH_DETAILS / ZOOPLA_MAX_DETAILS_PER_OUTCODE in constants.py.

    Returns:
        ZOOPLA_MAX_DETAILS_PER_OUTCODE when ZOOPLA_FETCH_DETAILS is truthy,
        otherwise 0. A negative configured cap is reported as 0 as well, so
        callers only ever see "disabled" (0) or a positive limit.
    """
    if not ZOOPLA_FETCH_DETAILS:
        return 0
    # A negative cap has no meaning as a fetch limit; normalise it to "disabled"
    # instead of leaking it into code that may slice or count with it.
    return max(0, ZOOPLA_MAX_DETAILS_PER_OUTCODE)
|
||||
|
||||
|
||||
def _open_zoopla_detail_tab(page, detail_cap: int):
    """Create the extra tab used for listing detail-page fetches.

    The tab is opened on ``page.context``, i.e. the same context as the search
    tab, so it shares that context's state (the original author notes this is
    what carries the Cloudflare clearance cookies across).

    Args:
        page: The search tab; only ``page.context.new_page()`` is used.
        detail_cap: Per-outcode detail fetch limit. A value <= 0 means detail
            fetching is off and no tab is opened.

    Returns:
        The new tab, or None when detail fetching is disabled or the tab could
        not be created. Failure is logged as a warning and never raised, so the
        scrape degrades to outcode-level postcodes rather than failing.
    """
    detail_fetching_disabled = detail_cap <= 0
    if detail_fetching_disabled:
        return None

    try:
        detail_tab = page.context.new_page()
    except Exception as exc:
        # Best effort by design: losing the detail tab only costs precision.
        reason = _exception_detail(exc)
        log.warning(
            "Zoopla detail tab unavailable (%s); using outcode-level postcodes",
            reason,
        )
        detail_tab = None
    return detail_tab
|
||||
|
||||
|
||||
@contextmanager
|
||||
def _wall_clock_timeout(seconds: int, label: str):
|
||||
"""SIGALRM-based wall-clock guard (POSIX). Raises OutcodeTimeout on expiry.
|
||||
|
|
@ -438,6 +472,50 @@ def _close_zoopla_browser(browser, label: str) -> None:
|
|||
log.warning("%s browser force-close failed: %s", label, _exception_detail(exc))
|
||||
|
||||
|
||||
def _scrape_zoopla_flaresolverr(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    results: dict[str, list[dict]],
    errors: list[str],
    max_properties_per_source: int | None,
) -> None:
    """Scrape Zoopla via the FlareSolverr sidecar (no browser/VNC).

    Args:
        outcodes: Outcodes to search, visited in order.
        pc_index: Postcode spatial index, passed through to the search helper.
        pc_coords: Postcode -> coordinate lookup, passed through to the search
            helper.
        results: Per-source accumulator; found listings are stored under
            "zoopla" via ``_store_properties``.
        errors: Accumulator for error messages. Gets an entry when FlareSolverr
            is unavailable and one (via ``_record_error``) per failed outcode.
        max_properties_per_source: Cap on stored Zoopla listings; forwarded to
            ``_source_remaining`` / ``_store_properties``.

    Returns:
        None. Output is delivered through ``results`` and ``errors``.
    """
    from contextlib import ExitStack

    from flaresolverr import FlareSolverrError, FlareSolverrSession
    from zoopla_flaresolverr import search_outcode as fs_search_outcode

    # ExitStack keeps the "skip Zoopla if the sidecar is down" early return
    # while still letting the session's __exit__ see any in-flight exception,
    # which a hand-written __exit__(None, None, None) would hide from it.
    with ExitStack() as stack:
        try:
            session = FlareSolverrSession(session="zoopla")
            stack.enter_context(session)
        except FlareSolverrError as exc:
            errors.append(f"zoopla: FlareSolverr unavailable: {exc}")
            log.warning("Zoopla skipped: FlareSolverr unavailable: %s", exc)
            return

        # Resolve the cap through the shared helper so ZOOPLA_FETCH_DETAILS
        # disables detail fetching here too, matching the browser-based path.
        detail_cap = _zoopla_detail_cap()

        for outcode in outcodes:
            remaining = _source_remaining(results, "zoopla", max_properties_per_source)
            if remaining == 0:
                log.info("Zoopla cap reached")
                return
            try:
                props, _ = fs_search_outcode(
                    outcode,
                    pc_index,
                    pc_coords,
                    session,
                    max_properties=remaining,
                    detail_cap=detail_cap,
                )
                added = _store_properties(results, "zoopla", props, max_properties_per_source)
                log.info("Zoopla %s: +%d", outcode, added)
            except Exception as exc:  # noqa: BLE001 - one outcode must not kill the run
                _record_error(errors, "zoopla", outcode, exc)
            # Pause between outcodes whether the search succeeded or failed.
            time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||
|
||||
|
||||
def _scrape_zoopla(
|
||||
outcodes: list[str],
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
|
|
@ -446,6 +524,12 @@ def _scrape_zoopla(
|
|||
errors: list[str],
|
||||
max_properties_per_source: int | None,
|
||||
) -> None:
|
||||
if ZOOPLA_FETCHER == "flaresolverr":
|
||||
_scrape_zoopla_flaresolverr(
|
||||
outcodes, pc_index, pc_coords, results, errors, max_properties_per_source
|
||||
)
|
||||
return
|
||||
|
||||
try:
|
||||
browser, page = _launch_zoopla_with_retries()
|
||||
except Exception as exc:
|
||||
|
|
@ -454,6 +538,12 @@ def _scrape_zoopla(
|
|||
return
|
||||
|
||||
outcode_timeout = _zoopla_outcode_timeout_seconds()
|
||||
detail_cap = _zoopla_detail_cap()
|
||||
detail_page = _open_zoopla_detail_tab(page, detail_cap)
|
||||
# Spend at most a fraction of each outcode's budget on detail fetches so the
|
||||
# SIGALRM guard never trips mid-outcode and discards already-collected
|
||||
# search listings; the rest is left for search pagination and transform.
|
||||
detail_budget_seconds = max(10.0, outcode_timeout * ZOOPLA_DETAIL_BUDGET_FRACTION)
|
||||
|
||||
try:
|
||||
for outcode in outcodes:
|
||||
|
|
@ -470,6 +560,9 @@ def _scrape_zoopla(
|
|||
pc_index,
|
||||
pc_coords,
|
||||
max_properties=None,
|
||||
detail_page=detail_page,
|
||||
detail_cap=detail_cap,
|
||||
detail_budget_seconds=detail_budget_seconds,
|
||||
)
|
||||
added = _store_properties(
|
||||
results,
|
||||
|
|
@ -496,6 +589,8 @@ def _scrape_zoopla(
|
|||
_close_zoopla_browser(browser, f"zoopla {outcode}")
|
||||
try:
|
||||
browser, page = _launch_zoopla_with_retries()
|
||||
# The old context (and its detail tab) is gone; reopen one.
|
||||
detail_page = _open_zoopla_detail_tab(page, detail_cap)
|
||||
log.info("Zoopla %s retrying with fresh browser", outcode)
|
||||
except Exception as relaunch_exc:
|
||||
_record_error(errors, "zoopla", outcode, relaunch_exc)
|
||||
|
|
@ -503,6 +598,11 @@ def _scrape_zoopla(
|
|||
|
||||
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||
finally:
|
||||
if detail_page is not None:
|
||||
try:
|
||||
detail_page.close()
|
||||
except Exception:
|
||||
pass
|
||||
_close_zoopla_browser(browser, "zoopla final")
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue