"""Zoopla scraping via FlareSolverr (no browser/VNC needed). FlareSolverr solves Zoopla's Cloudflare and returns the rendered HTML, which still contains the React Server Components flight stream — so the existing pure parsers work unchanged: - the search page yields the outcode's listing detail URLs, and - each detail page's flight stream carries the property's location object (postcode + coordinates) that ``parse_detail_geo`` extracts, plus the listing fields (price/beds/baths/tenure/floor area) parsed here. Verified live (2026-05-30) against Zoopla through the Gluetun VPN: a warm FlareSolverr session solves the SW9 search + detail pages and the flight data is present (e.g. detail 73326946 -> SW9 0HD @ 51.477238,-0.116819). This is selected by constants.ZOOPLA_FETCHER == "flaresolverr"; the Camoufox path in zoopla.py remains for ZOOPLA_FETCHER == "camoufox". """ import logging import re import time from constants import DELAY_BETWEEN_PAGES, ZOOPLA_BASE from flaresolverr import FlareSolverrError, FlareSolverrSession from spatial import PostcodeSpatialIndex from zoopla import _url_with_page, parse_detail_geo, transform_property log = logging.getLogger("zoopla") # Safety bound on how many search-result pages to walk per outcode. _MAX_SERP_PAGES = 60 _DETAIL_PATH_RE = re.compile(r"/(?:for-sale|new-homes)/details/\d+/") _LISTING_ID_RE = re.compile(r"/details/(\d+)/") def _int(pattern: str, buf: str) -> int | None: match = re.search(pattern, buf) return int(match.group(1)) if match else None def parse_detail_listing(html: str) -> dict: """Extract the non-location listing fields from a Zoopla detail page. Mirrors the fields the Camoufox SERP-card extractor produced, read from the detail page's flight stream (validated against real Zoopla detail HTML). All fields are best-effort; missing ones default to None so a listing with a known location is still emitted.""" buf = html.replace('\\"', '"').replace("\\/", "/") price = _int(r'"internalValue":(\d+)', buf) if price is None: price = _int(r'"priceUnformatted":(\d+)', buf) tenure_match = re.search(r'"tenure":"([a-zA-Z]+)"', buf) tenure = tenure_match.group(1).title() if tenure_match else None # Address + property type come from the page , e.g. # "Caldwell Street, Stockwell SW9, 4 bed property for sale, £995,000 - Zoopla" address = None property_type = None title_match = re.search(r'"children":"([^"]*? for sale[^"]*?)"', buf) if title_match: title = title_match.group(1) addr_match = re.match(r"(.+?),\s*\d+\s*bed", title) if addr_match: address = addr_match.group(1).strip() type_match = re.search(r"\d+\s*bed\s+([\w\s-]+?)\s+for sale", title) if type_match: property_type = type_match.group(1).strip() explicit_type = re.search(r'"propertyType":"([^"]+)"', buf) if explicit_type: property_type = explicit_type.group(1) return { "price": price, "beds": _int(r'"numBedrooms":(\d+)', buf), "baths": _int(r'"numBaths":(\d+)', buf), "receptions": _int(r'"numLivingRooms":(\d+)', buf), "floor_area_sqft": _int(r'"sizeSqft":(\d+)', buf), "tenure": tenure, "property_type": property_type, "address": address, } def _enumerate_detail_paths(fs: FlareSolverrSession, outcode: str, limit: int | None) -> list[str]: """Walk the outcode's search-result pages and collect listing detail paths.""" base = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home" seen: list[str] = [] seen_ids: set[str] = set() for page_num in range(1, _MAX_SERP_PAGES + 1): url = base if page_num == 1 else _url_with_page(base, page_num) html = fs.get(url) new = 0 for path in _DETAIL_PATH_RE.findall(html): id_match = _LISTING_ID_RE.search(path) listing_id = id_match.group(1) if id_match else path if listing_id in seen_ids: continue seen_ids.add(listing_id) seen.append(path) new += 1 if limit is not None and len(seen) >= limit: return seen if new == 0: break time.sleep(DELAY_BETWEEN_PAGES) return seen def search_outcode( outcode: str, pc_index: PostcodeSpatialIndex, pc_coords: dict[str, tuple[float, float]], fs: FlareSolverrSession, max_properties: int | None = None, detail_cap: int = 0, detail_budget_seconds: float | None = None, ) -> tuple[list[dict], str | None]: """Scrape one outcode via FlareSolverr. Returns (properties, search_url). Every listing's detail page is fetched (that is where the postcode lives), so the effective listing count is bounded by both ``max_properties`` and ``detail_cap``; ``detail_budget_seconds`` caps wall-clock time on details.""" limit = detail_cap if detail_cap and detail_cap > 0 else None if max_properties is not None: limit = max_properties if limit is None else min(limit, max_properties) base = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home" paths = _enumerate_detail_paths(fs, outcode, limit) if not paths: return [], base deadline = (time.monotonic() + detail_budget_seconds) if detail_budget_seconds else None properties: list[dict] = [] dropped = 0 for path in paths: if deadline is not None and time.monotonic() >= deadline: log.info("Zoopla %s: detail-fetch budget reached after %d", outcode, len(properties)) break id_match = _LISTING_ID_RE.search(path) listing_id = id_match.group(1) if id_match else path try: html = fs.get(ZOOPLA_BASE + path) geo = parse_detail_geo(html, search_outcode=outcode) raw = {"id": listing_id, "url": path, **parse_detail_listing(html)} prop = transform_property( raw, pc_index, pc_coords, search_outcode=outcode, detail=geo ) except FlareSolverrError as exc: log.warning("Zoopla %s detail %s fetch failed: %s", outcode, listing_id, exc) prop = None except Exception as exc: # noqa: BLE001 - never let one listing kill the outcode log.warning("Zoopla %s detail %s transform failed: %s", outcode, listing_id, exc) prop = None if prop: properties.append(prop) else: dropped += 1 time.sleep(DELAY_BETWEEN_PAGES) log.info("Zoopla %s: %d listings (%d dropped)", outcode, len(properties), dropped) return properties, base