"""Zoopla (zoopla.co.uk) scraper — buy and rental properties. Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which blocks all HTTP clients (curl_cffi, httpx) and even Playwright with stealth patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably. Zoopla uses Next.js App Router with React Server Components (RSC). Search result data is server-rendered in an RSC stream, not available via __NEXT_DATA__ or a JSON API. URL-based location slugs return 0 results — the working flow requires typing into the autocomplete input, selecting a suggestion, and clicking Search. Architecture: Unlike the other scrapers which use HTTP clients per outcode, Zoopla keeps a single Camoufox browser alive for the entire scrape. For each outcode, it: 1. Clears and types the outcode into the search input 2. Selects the first autocomplete suggestion 3. Clicks Search 4. Extracts listing data from the rendered DOM 5. Handles pagination via ?pn=N parameter The browser session replaces the cookie/client pattern used by other scrapers. """ import logging import re import time from constants import DELAY_BETWEEN_PAGES, PROPERTY_TYPE_MAP, ZOOPLA_BASE from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped from spatial import PostcodeSpatialIndex log = logging.getLogger("zoopla") class TurnstileError(Exception): """Raised when Cloudflare Turnstile challenge cannot be passed.""" # Maximum search result pages to scrape per outcode (25 listings/page) MAX_PAGES_PER_OUTCODE = 10 # JavaScript to extract listings from the rendered DOM. # Finds all detail links, walks up to the card container, and parses # price, beds, baths, floor area, address, and tenure from the card text. 
# The script must keep its line breaks: it contains a `//` line comment, which
# would otherwise swallow every statement that follows it.
# It is a raw string so that `\n`, `\d` and `\u00a3` reach the browser as
# JavaScript escapes instead of being interpreted by Python.
# Each returned object has the keys: id, url, price, beds, baths, receptions,
# floor_area_sqft, address, tenure. Numeric fields are null when the card text
# has no match; address and tenure default to the empty string.
_EXTRACT_LISTINGS_JS = r"""() => {
    const links = Array.from(document.querySelectorAll(
        'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
    ));
    const seen = new Set();
    const results = [];
    for (const link of links) {
        const href = link.href;
        const match = href.match(/\/details\/(\d+)\//);
        if (!match) continue;
        const id = match[1];
        if (seen.has(id)) continue;
        seen.add(id);

        // Walk up to the listing card container
        let card = link;
        for (let j = 0; j < 10; j++) {
            card = card.parentElement;
            if (!card) break;
            const text = card.innerText || '';
            if (text.includes('\u00a3') && (text.includes('bed') || text.includes('sq ft'))) {
                break;
            }
        }
        if (!card) continue;

        const text = card.innerText || '';
        const lines = text.split('\n').map(l => l.trim()).filter(Boolean);

        const priceMatch = text.match(/\u00a3([\d,]+)/);
        const bedsMatch = text.match(/(\d+)\s*beds?/i);
        const bathsMatch = text.match(/(\d+)\s*baths?/i);
        const recMatch = text.match(/(\d+)\s*reception/i);
        const areaMatch = text.match(/([\d,]+)\s*sq\s*ft/i);

        let address = '';
        for (const line of lines) {
            if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
                (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
                address = line;
                break;
            }
        }

        let tenure = '';
        if (/freehold/i.test(text)) tenure = 'Freehold';
        else if (/leasehold/i.test(text)) tenure = 'Leasehold';

        results.push({
            id: id,
            url: href.replace(window.location.origin, ''),
            price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
            beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
            baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
            receptions: recMatch ? parseInt(recMatch[1]) : null,
            floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
            address: address,
            tenure: tenure,
        });
    }
    return results;
}"""

# JavaScript to dismiss the Usercentrics cookie consent overlay (shadow DOM).
_DISMISS_COOKIES_JS = """() => { const aside = document.querySelector('#usercentrics-cmp-ui'); if (aside && aside.shadowRoot) { const btns = aside.shadowRoot.querySelectorAll('button'); for (const btn of btns) { if (btn.innerText.includes('Accept')) { btn.click(); return true; } } } if (aside) { aside.remove(); return true; } return false; }""" # --------------------------------------------------------------------------- # Browser lifecycle # --------------------------------------------------------------------------- def launch_browser(): """Launch Camoufox, navigate to Zoopla homepage, pass Cloudflare Turnstile, and dismiss cookie consent. Returns (browser, page) tuple. Raises TurnstileError if Cloudflare cannot be passed within 60 seconds. Caller must close browser when done.""" from camoufox.pkgman import camoufox_path # Verify camoufox is pre-installed — never download at runtime camoufox_path(download_if_missing=False) from camoufox.sync_api import Camoufox log.info("Launching Camoufox browser for Zoopla...") browser = Camoufox(headless=True).__enter__() page = browser.new_page() log.info("Navigating to Zoopla homepage...") page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000) # Wait for Cloudflare Turnstile to resolve. # Try clicking the Turnstile checkbox if present (helps in some cases). 
for i in range(20): if "Just a moment" not in page.title(): break # Attempt to click the Turnstile checkbox in the challenge iframe for frame in page.frames: if "challenges.cloudflare.com" in frame.url: try: iframe_el = page.query_selector('iframe[src*="challenges.cloudflare"]') if iframe_el: box = iframe_el.bounding_box() if box: page.mouse.click(box["x"] + 30, box["y"] + box["height"] / 2) except Exception: pass break time.sleep(3) else: page.close() browser.close() raise TurnstileError("Cloudflare Turnstile did not resolve after 60s") log.info("Cloudflare passed — title: %s", page.title()) time.sleep(2) # Dismiss cookie consent page.evaluate(_DISMISS_COOKIES_JS) time.sleep(1) return browser, page def _ensure_not_challenged(page) -> None: """Check if current page is a Cloudflare challenge and wait/raise.""" if "Just a moment" not in page.title(): return log.warning("Cloudflare challenge detected mid-session, waiting...") for i in range(20): time.sleep(3) if "Just a moment" not in page.title(): log.info("Cloudflare challenge resolved") return raise TurnstileError("Cloudflare re-challenge did not resolve") # --------------------------------------------------------------------------- # Search navigation # --------------------------------------------------------------------------- def _navigate_search(page, outcode: str, channel: str) -> bool: """Navigate to search results for an outcode via the homepage search flow. Returns True if results were found, False if no results or navigation failed. 
Raises TurnstileError if Cloudflare blocks us.""" # Navigate to homepage to reset search state page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=30000) time.sleep(2) _ensure_not_challenged(page) # Dismiss cookie consent (may reappear after navigation) page.evaluate(_DISMISS_COOKIES_JS) time.sleep(1) # Select Buy/Rent tab if channel == "RENT": rent_tab = page.query_selector( 'button:has-text("Rent"), [role="tab"]:has-text("Rent")' ) if rent_tab: rent_tab.click() time.sleep(0.5) # Find and fill search input search_input = page.query_selector( 'input[name="autosuggest-input"]' ) or page.query_selector('input[type="text"]') if not search_input: log.warning("Could not find search input on homepage") return False search_input.click() time.sleep(0.3) search_input.fill("") search_input.type(outcode, delay=60) time.sleep(2) # Select first autocomplete suggestion first_option = page.query_selector('[role="option"]') if not first_option: log.debug("No autocomplete suggestions for outcode %s", outcode) return False first_option.click() time.sleep(0.5) # Click search button search_btn = page.query_selector('button:has-text("Search")') if search_btn: search_btn.click() else: search_input.press("Enter") # Wait for results to load time.sleep(6) _ensure_not_challenged(page) return True def _get_result_count(page) -> int: """Extract the total results count from the page body text.""" try: body = page.inner_text("body") match = re.search(r"([\d,]+)\s+results?", body) if match: return int(match.group(1).replace(",", "")) except Exception: pass return 0 # --------------------------------------------------------------------------- # Extraction and pagination # --------------------------------------------------------------------------- def _extract_listings(page) -> list[dict]: """Extract listing data from the current search results page DOM.""" try: return page.evaluate(_EXTRACT_LISTINGS_JS) except Exception as e: log.warning("Failed to extract listings from DOM: %s", 
e) zoopla_errors_total.labels(type="extract_failed").inc() return [] def _paginate(page, total_results: int, channel: str) -> list[dict]: """Extract listings from all pages of search results. Page 1 is already loaded. For subsequent pages, clicks the Next button or navigates via URL parameter ?pn=N.""" all_listings = _extract_listings(page) channel_label = "buy" if channel == "BUY" else "rent" zoopla_pages_scraped.labels(channel=channel_label).inc() if not all_listings or total_results <= len(all_listings): return all_listings seen_ids = {listing["id"] for listing in all_listings} current_url = page.url page_num = 2 while len(all_listings) < total_results and page_num <= MAX_PAGES_PER_OUTCODE: time.sleep(DELAY_BETWEEN_PAGES) # Try navigating via URL parameter if "?" in current_url: next_url = re.sub(r"[?&]pn=\d+", "", current_url) separator = "&" if "?" in next_url else "?" next_url = f"{next_url}{separator}pn={page_num}" else: next_url = f"{current_url}?pn={page_num}" try: page.goto(next_url, wait_until="domcontentloaded", timeout=30000) time.sleep(4) _ensure_not_challenged(page) except TurnstileError: raise except Exception as e: log.debug("Pagination navigation failed at page %d: %s", page_num, e) break page_listings = _extract_listings(page) if not page_listings: break # Deduplicate within this outcode new_count = 0 for listing in page_listings: if listing["id"] not in seen_ids: seen_ids.add(listing["id"]) all_listings.append(listing) new_count += 1 zoopla_pages_scraped.labels(channel=channel_label).inc() if new_count == 0: break # No new listings on this page page_num += 1 return all_listings # --------------------------------------------------------------------------- # Property transformation # --------------------------------------------------------------------------- def _extract_postcode(text: str) -> str | None: """Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'.""" match = 
re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE) if match: return match.group(1).upper().strip() return None def _extract_outcode(text: str) -> str | None: """Extract a UK outcode from address text like 'Whitechapel Road, London E1'.""" # Look for outcode at end of string or after last comma match = re.search(r"\b([A-Z]{1,2}\d[A-Z0-9]?)\s*$", text.strip(), re.IGNORECASE) if match: return match.group(1).upper() # Try after comma parts = text.split(",") if len(parts) > 1: last = parts[-1].strip() match = re.match(r"^([A-Z]{1,2}\d[A-Z0-9]?)$", last, re.IGNORECASE) if match: return match.group(1).upper() return None def _map_property_type(raw_type: str | None) -> str: """Map Zoopla property type text to canonical type.""" if not raw_type: return "Other" canonical = PROPERTY_TYPE_MAP.get(raw_type) if canonical: return canonical lower = raw_type.lower() if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower: return "Flats/Maisonettes" if "detached" in lower and "semi" not in lower: return "Detached" if "semi" in lower: return "Semi-Detached" if "terrace" in lower or "mews" in lower: return "Terraced" if "house" in lower: return "Detached" return "Other" def transform_property( raw: dict, channel: str, pc_index: PostcodeSpatialIndex, pc_coords: dict[str, tuple[float, float]], ) -> dict | None: """Transform a raw Zoopla listing dict into the standard output schema. 
Zoopla search cards do not include coordinates, so we resolve lat/lng from postcodes extracted from the address text.""" price = raw.get("price") if not price: return None address = raw.get("address", "") # Resolve postcode and coordinates from address postcode = _extract_postcode(address) lat = lng = None if postcode: coords = pc_coords.get(postcode) if coords: lat, lng = coords if lat is None: # Try outcode-level fallback outcode = _extract_outcode(address) if outcode: prefix = outcode + " " for pcd, coords in pc_coords.items(): if pcd.startswith(prefix): postcode = pcd lat, lng = coords break if lat is None or lng is None or not postcode: return None # Validate coordinates are in England if not (49 <= lat <= 56 and -7 <= lng <= 2): return None bedrooms = raw.get("beds") or 0 bathrooms = raw.get("baths") or 0 receptions = raw.get("receptions") or 0 # Floor area: convert sq ft to sq m floor_area_sqm = None sqft = raw.get("floor_area_sqft") if sqft: floor_area_sqm = round(sqft * 0.092903, 1) listing_id = raw.get("id", "") listing_url = raw.get("url", "") if listing_url and not listing_url.startswith("http"): listing_url = ZOOPLA_BASE + listing_url return { "id": f"zp_{listing_id}", "Bedrooms": bedrooms, "Bathrooms": bathrooms, "Number of bedrooms & living rooms": bedrooms + receptions, "lon": lng, "lat": lat, "Postcode": postcode, "Address per Property Register": address, "Leasehold/Freehold": raw.get("tenure") or None, "Property type": "Other", # Not reliably extractable from Zoopla search cards "Property sub-type": "", "price": int(price), "price_frequency": "" if channel == "BUY" else "monthly", "Price qualifier": "", "Total floor area (sqm)": floor_area_sqm, "Listing URL": listing_url, "Listing features": [], "first_visible_date": "", } # --------------------------------------------------------------------------- # Top-level search function (called by scraper.py) # --------------------------------------------------------------------------- def search_outcode( 
page, outcode: str, channel: str, pc_index: PostcodeSpatialIndex, pc_coords: dict[str, tuple[float, float]], ) -> list[dict]: """Search Zoopla for properties in one outcode. Takes a live Camoufox Page (from launch_browser). Navigates through the search flow, extracts listings from rendered DOM, and transforms to the standard output schema. Raises TurnstileError if Cloudflare blocks us mid-session. """ if not _navigate_search(page, outcode, channel): return [] total_results = _get_result_count(page) if total_results == 0: return [] raw_listings = _paginate(page, total_results, channel) if not raw_listings: return [] channel_label = "buy" if channel == "BUY" else "rent" properties = [] for raw in raw_listings: transformed = transform_property(raw, channel, pc_index, pc_coords) if transformed: properties.append(transformed) zoopla_properties_scraped.labels(channel=channel_label).inc() return properties