#!/usr/bin/env -S uv run --project ../finder
"""Zoopla scraping experiment — Playwright with stealth + network interception.

Zoopla uses Next.js App Router with React Server Components. The listing data
is NOT in __NEXT_DATA__ or the initial HTML — it's fetched client-side after
hydration.

This means we need a real browser that:
1. Passes Cloudflare's bot detection
2. Executes JavaScript to trigger the client-side data fetch
3. Intercepts the network response OR scrapes the rendered DOM

Usage:
    uv run --project finder scripts/zoopla_experiment.py [OUTCODE] [CHANNEL]

CHANNEL is optional and must be one of the keys of ``CHANNELS`` (default BUY).
"""

import json
import logging
import re
import sys
import time

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("zoopla-exp")

ZOOPLA_BASE = "https://www.zoopla.co.uk"

# Maps the logical channel name to the URL path segment used in the search URL.
CHANNELS = {
    "BUY": "for-sale",
    "RENT": "to-rent",
}

# Substrings that mark a response URL as a candidate API/data endpoint.
_DATA_URL_KEYWORDS = ("/api/", "graphql", "search", "listing", "property")

# Cloudflare interstitial polling: number of checks and pause between them.
_CF_MAX_ATTEMPTS = 20
_CF_POLL_MS = 3000

# CSS selectors tried (as one group) when waiting for listing cards to render.
_LISTING_CARD_SELECTOR = (
    '[data-testid="search-result"], [data-testid="regular-listings"], '
    '.listing-results, .css-kdnlof, [class*="ListingCard"], '
    '[class*="listing"], [class*="PropertyCard"]'
)

# Evaluated in the page. Receives the channel-specific detail-link fragment
# (e.g. "/for-sale/details/") and returns a summary of the first selector that
# matches more than two elements, falling back to generic detail links and
# finally to a dump of the page text.
_LISTINGS_JS = """(detailHref) => {
    // Try to find listing cards via various selectors
    const selectors = [
        '[data-testid="search-result"]',
        '[data-testid="regular-listings"] > div',
        'a[href*="' + detailHref + '"]',
        '[class*="ListingCard"]',
        '[class*="listing-result"]',
    ];

    for (const sel of selectors) {
        const elements = document.querySelectorAll(sel);
        if (elements.length > 2) {
            return {
                selector: sel,
                count: elements.length,
                // Get text and href from first 3
                samples: Array.from(elements).slice(0, 3).map(el => ({
                    text: el.innerText?.substring(0, 300),
                    href: el.href || el.querySelector('a')?.href || '',
                    html: el.outerHTML?.substring(0, 500),
                }))
            };
        }
    }

    // Fallback: find all links to listing detail pages
    const links = Array.from(document.querySelectorAll('a[href*="/details/"]'));
    if (links.length > 0) {
        return {
            selector: 'a[href*="/details/"]',
            count: links.length,
            samples: links.slice(0, 5).map(el => ({
                text: el.innerText?.substring(0, 300),
                href: el.href,
                parentText: el.closest('div, li, article')?.innerText?.substring(0, 500) || '',
            }))
        };
    }

    // Last resort: get page structure
    return {
        selector: 'none',
        count: 0,
        bodyText: document.body?.innerText?.substring(0, 2000),
        title: document.title,
    };
}"""

# Evaluated on a listing detail page; returns a dict of best-effort fields.
_DETAIL_JS = """() => {
    const result = {};

    // Price
    const priceEl = document.querySelector('[data-testid="price"]')
        || document.querySelector('[class*="price"]');
    result.price = priceEl?.innerText || '';

    // Address
    const addrEl = document.querySelector('[data-testid="address-label"]')
        || document.querySelector('h1')
        || document.querySelector('address');
    result.address = addrEl?.innerText || '';

    // Key features
    const features = Array.from(document.querySelectorAll('[data-testid="listing_feature"] li, [class*="feature"] li'));
    result.features = features.map(f => f.innerText).slice(0, 15);

    // Bedrooms/bathrooms from icons or text
    const specs = document.querySelectorAll('[data-testid="beds-label"], [data-testid="baths-label"], [class*="bed"], [class*="bath"]');
    result.specs = Array.from(specs).map(s => s.innerText).slice(0, 5);

    // Description
    const desc = document.querySelector('[data-testid="listing_description"], [class*="description"]');
    result.description = desc?.innerText?.substring(0, 500) || '';

    // Agent
    const agent = document.querySelector('[data-testid="agent-details"], [class*="agent"]');
    result.agent = agent?.innerText?.substring(0, 200) || '';

    // Full page text summary
    result.pageTitle = document.title;
    result.bodyPreview = document.body?.innerText?.substring(0, 1000);

    return result;
}"""


def _make_response_handler(captured: list):
    """Return a ``response`` event callback that appends JSON bodies to *captured*."""

    def handle_response(response):
        url = response.url
        # Only bother with URLs that look like API/data endpoints.
        if not any(kw in url for kw in _DATA_URL_KEYWORDS):
            return
        try:
            if "application/json" in response.headers.get("content-type", ""):
                body = response.json()
                captured.append({"url": url, "data": body})
                log.info("  [intercepted] %s (%s)", url[:100], type(body).__name__)
        except Exception as exc:
            # Best-effort capture: a body that is gone or unparsable must not
            # abort the run, but leave a trace for debugging.
            log.debug("  [skipped] %s: %s", url[:100], exc)

    return handle_response


def _wait_for_cloudflare(page) -> bool:
    """Poll until the Cloudflare interstitial disappears; return False on give-up."""
    log.info("Waiting for Cloudflare challenge to resolve ...")
    for attempt in range(_CF_MAX_ATTEMPTS):
        content = page.content()
        title = page.title()
        if "Just a moment" in content and "challenge" in content.lower():
            log.info(
                "  Cloudflare challenge still active (%d/%d) title=%s",
                attempt + 1,
                _CF_MAX_ATTEMPTS,
                title,
            )
            # wait_for_timeout keeps Playwright's event dispatch running,
            # unlike time.sleep which stalls it in the sync API.
            page.wait_for_timeout(_CF_POLL_MS)
        else:
            log.info("  Challenge resolved! title=%s", title)
            return True

    log.error("Cloudflare challenge did not resolve")
    # Dump page content for debugging
    print("\n=== Cloudflare challenge page ===")
    print(page.content()[:3000])
    return False


def _wait_for_listings(page, detail_href: str) -> None:
    """Wait (best-effort) for listing cards, then for detail links, to render."""
    log.info("Waiting for listing content to render ...")
    try:
        page.wait_for_selector(_LISTING_CARD_SELECTOR, timeout=15000)
        log.info("Listing elements found in DOM!")
        return
    except Exception:
        log.warning("No listing elements found by selector. Trying to wait for prices...")

    try:
        page.wait_for_function(
            "(sel) => document.querySelectorAll(sel).length > 0",
            arg=f'a[href*="{detail_href}"]',
            timeout=15000,
        )
        log.info("Listing links found in DOM!")
    except Exception:
        log.warning("No listing links either. Page may still be loading or we're blocked.")


def _report_intercepted(intercepted_data: list) -> None:
    """Print a structural summary and a truncated dump of each captured response."""
    if not intercepted_data:
        return
    print(f"\n=== Intercepted {len(intercepted_data)} API responses ===")
    for item in intercepted_data:
        print(f"\nURL: {item['url'][:150]}")
        data = item["data"]
        if isinstance(data, dict):
            print(f"Keys: {list(data.keys())[:15]}")
            # Look for listings inside
            for k, v in data.items():
                if isinstance(v, list) and len(v) > 2 and isinstance(v[0], dict):
                    print(f"  {k}: list of {len(v)} items, [0] keys={list(v[0].keys())[:10]}")
        elif isinstance(data, list) and data:
            print(f"Array of {len(data)} items")
            if isinstance(data[0], dict):
                print(f"  [0] keys: {list(data[0].keys())[:15]}")
        print(json.dumps(data, indent=2, default=str, ensure_ascii=False)[:2000])


def _extract_from_dom(page, url_segment: str, detail_href: str) -> list:
    """Scrape the rendered DOM and return the listing detail paths found."""
    log.info("Extracting from rendered DOM ...")

    # Get full page content after hydration
    content = page.content()

    # Detail links live under the same channel segment as the search page
    # (assumed from the BUY URL shape; RENT uses "to-rent" — confirm on site).
    listing_urls = re.findall(
        rf'href="(/{re.escape(url_segment)}/details/\d+/[^"]*)"', content
    )
    log.info("Found %d listing detail links", len(listing_urls))

    prices = re.findall(r"£([\d,]+)", content)
    log.info("Found %d price strings", len(prices))
    if prices:
        log.info("Prices: %s", prices[:10])

    listings = page.evaluate(_LISTINGS_JS, detail_href)
    print("\n=== DOM Extraction Results ===")
    print(json.dumps(listings, indent=2, ensure_ascii=False)[:5000])
    return listing_urls


def _report_session(context, page) -> None:
    """Print Zoopla cookies and the effective user agent for potential reuse."""
    cookies = context.cookies()
    zoopla_cookies = {
        c["name"]: c["value"] for c in cookies if ".zoopla.co.uk" in c.get("domain", "")
    }
    ua = page.evaluate("navigator.userAgent")

    print("\n=== Session Info ===")
    print(f"Cookies ({len(zoopla_cookies)}): {list(zoopla_cookies.keys())}")
    print(f"User-Agent: {ua}")

    if zoopla_cookies:
        print("\n=== Reusable cookie env vars ===")
        for name, value in zoopla_cookies.items():
            print(f"  {name}={value[:50]}...")


def _scrape_detail_page(page, detail_url: str) -> None:
    """Open one listing detail page and print the fields scraped from it."""
    log.info("--- Fetching detail page: %s ---", detail_url)
    page.wait_for_timeout(2000)
    try:
        page.goto(detail_url, wait_until="domcontentloaded", timeout=30000)
    except Exception as e:
        log.error("Detail navigation failed: %s", e)
        return
    page.wait_for_timeout(5000)  # Let it hydrate

    detail = page.evaluate(_DETAIL_JS)
    print("\n=== Detail Page Data ===")
    print(json.dumps(detail, indent=2, ensure_ascii=False)[:3000])


def run_playwright_stealth(outcode: str, channel: str = "BUY") -> None:
    """Use Playwright with stealth patches to scrape Zoopla.

    Strategy:
    1. Launch stealth browser to bypass Cloudflare
    2. Navigate to search page
    3. Wait for listings to render (client-side hydration)
    4. Try two extraction methods:
       a. Intercept network requests for API data (cleanest)
       b. Parse the rendered DOM (fallback)
    5. If any detail links were found, open the first one and scrape it

    Args:
        outcode: Postcode outcode to search (lower-cased into the URL).
        channel: Key of ``CHANNELS`` selecting the sale or rental search.

    Raises:
        KeyError: If *channel* is not a key of ``CHANNELS``. Checked before
            the browser libraries are imported so bad input fails fast.

    All results are printed/logged; nothing is returned.
    """
    try:
        url_segment = CHANNELS[channel]
    except KeyError:
        raise KeyError(
            f"unknown channel {channel!r}; expected one of {sorted(CHANNELS)}"
        ) from None

    from playwright.sync_api import sync_playwright
    from playwright_stealth import Stealth

    search_url = f"{ZOOPLA_BASE}/{url_segment}/properties/{outcode.lower()}/"
    detail_href = f"/{url_segment}/details/"
    log.info("Target: %s", search_url)

    intercepted_data = []

    with sync_playwright() as p:
        # Launch with stealth-friendly args
        browser = p.chromium.launch(
            headless=True,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--no-sandbox",
                "--disable-dev-shm-usage",
                "--disable-web-security",
                "--lang=en-GB",
            ],
        )
        try:
            context = browser.new_context(
                locale="en-GB",
                timezone_id="Europe/London",
                viewport={"width": 1920, "height": 1080},
                user_agent=(
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
                ),
            )
            page = context.new_page()

            # Apply stealth patches (Linux platform, Chrome UA)
            stealth = Stealth(
                navigator_platform_override="Linux x86_64",
                navigator_languages_override=("en-GB", "en"),
            )
            stealth.apply_stealth_sync(page)

            # Listen for responses to intercept API data
            page.on("response", _make_response_handler(intercepted_data))

            log.info("Navigating to %s ...", search_url)
            try:
                page.goto(search_url, wait_until="domcontentloaded", timeout=60000)
            except Exception as e:
                log.error("Navigation failed: %s", e)
                return

            if not _wait_for_cloudflare(page):
                return

            _wait_for_listings(page, detail_href)

            # Give hydration a moment. This must pump Playwright events so
            # late responses reach the handler before we inspect the capture.
            page.wait_for_timeout(3000)

            # --- Extraction Method A: Check intercepted network data ---
            _report_intercepted(intercepted_data)

            # --- Extraction Method B: Parse rendered DOM ---
            listing_urls = _extract_from_dom(page, url_segment, detail_href)

            # Also extract cookies for potential reuse
            _report_session(context, page)

            # --- Try a detail page if we found any listing URLs ---
            if listing_urls:
                _scrape_detail_page(page, f"{ZOOPLA_BASE}{listing_urls[0]}")
        finally:
            # Always release the browser, including on unexpected errors.
            browser.close()


def main() -> None:
    """CLI entry point: ``[OUTCODE] [CHANNEL]``, defaulting to ``E1`` and ``BUY``."""
    outcode = sys.argv[1] if len(sys.argv) > 1 else "E1"
    channel = sys.argv[2].upper() if len(sys.argv) > 2 else "BUY"
    if channel not in CHANNELS:
        log.error("Unknown channel %r; expected one of %s", channel, sorted(CHANNELS))
        sys.exit(2)

    log.info("=== Zoopla Scraping Experiment (Playwright Stealth) ===")
    log.info("Outcode: %s, Channel: %s", outcode, channel)

    started = time.monotonic()
    run_playwright_stealth(outcode, channel)
    log.info("Elapsed: %.1fs", time.monotonic() - started)

    log.info("=== Done ===")


if __name__ == "__main__":
    main()