Morning improvements

2026-03-17 13:29:03 +00:00 · 2026-03-17 13:29:03 +00:00 · 53fff3efaa
commit 53fff3efaa
parent 3e9fba5303
41 changed files with 2438 additions and 637 deletions
--- a/scripts/zoopla_experiment.py
+++ b/scripts/zoopla_experiment.py
@ -0,0 +1,319 @@
+#!/usr/bin/env -S uv run --project ../finder
+"""Zoopla scraping experiment — Playwright with stealth + network interception.
+
+Zoopla uses Next.js App Router with React Server Components. The listing data
+is NOT in __NEXT_DATA__ or the initial HTML — it's fetched client-side after
+hydration. This means we need a real browser that:
+  1. Passes Cloudflare's bot detection
+  2. Executes JavaScript to trigger the client-side data fetch
+  3. Intercepts the network response OR scrapes the rendered DOM
+
+Usage:
+    uv run --project finder scripts/zoopla_experiment.py [OUTCODE]
+"""
+
+import json
+import logging
+import re
+import sys
+import time
+
+# Console logging for the whole run: short wall-clock timestamp, padded level.
+logging.basicConfig(
+    format="%(asctime)s %(levelname)-8s %(message)s",
+    datefmt="%H:%M:%S",
+    level=logging.INFO,
+)
+log = logging.getLogger("zoopla-exp")
+
+# Scheme + host that every URL built by this script is rooted at.
+ZOOPLA_BASE = "https://www.zoopla.co.uk"
+
+# Search channel name -> first path segment of the matching search URL.
+CHANNELS = dict(BUY="for-sale", RENT="to-rent")
+
+
+def _wait_for_cloudflare(page, attempts: int = 20, delay_ms: int = 3000) -> bool:
+    """Poll *page* until it no longer looks like a Cloudflare challenge page.
+
+    Returns True as soon as the page content stops matching the challenge
+    markers, or False if it still matches after ``attempts`` polls.
+    """
+    log.info("Waiting for Cloudflare challenge to resolve ...")
+    for attempt in range(attempts):
+        content = page.content()
+        title = page.title()
+        if "Just a moment" in content and "challenge" in content.lower():
+            log.info(
+                "  Cloudflare challenge still active (%d/%d) title=%s",
+                attempt + 1, attempts, title,
+            )
+            # wait_for_timeout (not time.sleep) so Playwright keeps
+            # processing events while we wait.
+            page.wait_for_timeout(delay_ms)
+        else:
+            log.info("  Challenge resolved! title=%s", title)
+            return True
+    return False
+
+
+def _wait_for_listings(page, url_segment: str) -> None:
+    """Best-effort wait for listing cards, then for listing detail links.
+
+    Never raises on a failed wait: each failure is logged as a warning and
+    the caller carries on with whatever the page contains.
+    """
+    log.info("Waiting for listing content to render ...")
+    try:
+        # Try waiting for property cards to appear
+        page.wait_for_selector(
+            '[data-testid="search-result"], [data-testid="regular-listings"], '
+            '.listing-results, .css-kdnlof, [class*="ListingCard"], '
+            '[class*="listing"], [class*="PropertyCard"]',
+            timeout=15000,
+        )
+        log.info("Listing elements found in DOM!")
+        return
+    except Exception:
+        log.warning("No listing elements found by selector. Trying to wait for prices...")
+
+    try:
+        # The channel's path segment is passed in as an argument so the check
+        # matches the search channel instead of always looking for for-sale.
+        page.wait_for_function(
+            "segment => document.querySelectorAll("
+            "`a[href*=\"/${segment}/details/\"]`).length > 0",
+            arg=url_segment,
+            timeout=15000,
+        )
+        log.info("Listing links found in DOM!")
+    except Exception:
+        log.warning("No listing links either. Page may still be loading or we're blocked.")
+
+
+def _report_intercepted(intercepted_data: list) -> None:
+    """Print a summary and a truncated JSON dump of each captured response."""
+    if not intercepted_data:
+        return
+
+    print(f"\n=== Intercepted {len(intercepted_data)} API responses ===")
+    for item in intercepted_data:
+        print(f"\nURL: {item['url'][:150]}")
+        data = item["data"]
+        if isinstance(data, dict):
+            print(f"Keys: {list(data.keys())[:15]}")
+            # Highlight values that look like a list of records.
+            for k, v in data.items():
+                if isinstance(v, list) and len(v) > 2 and isinstance(v[0], dict):
+                    print(f"  {k}: list of {len(v)} items, [0] keys={list(v[0].keys())[:10]}")
+        elif isinstance(data, list) and data:
+            print(f"Array of {len(data)} items")
+            if isinstance(data[0], dict):
+                print(f"  [0] keys: {list(data[0].keys())[:15]}")
+        print(json.dumps(data, indent=2, default=str, ensure_ascii=False)[:2000])
+
+
+def _extract_search_results(page, url_segment: str) -> list:
+    """Scrape the rendered search page and return the detail-page paths found.
+
+    Logs link and price counts, prints a sample of the listing elements found
+    in the DOM, and returns the relative detail URLs matched in the HTML.
+    """
+    log.info("Extracting from rendered DOM ...")
+
+    # Get full page content after hydration
+    content = page.content()
+
+    # Detail links are matched under the channel's own path segment.
+    # NOTE(review): assumes to-rent detail links use the same
+    # /<segment>/details/<id>/ shape as for-sale ones - confirm.
+    listing_urls = re.findall(
+        rf'href="(/{re.escape(url_segment)}/details/\d+/[^"]*)"', content
+    )
+    log.info("Found %d listing detail links", len(listing_urls))
+
+    prices = re.findall(r'£([\d,]+)', content)
+    log.info("Found %d price strings", len(prices))
+    if prices:
+        log.info("Prices: %s", prices[:10])
+
+    # Try to extract structured listing data from the page
+    listings = page.evaluate("""(segment) => {
+            // Try to find listing cards via various selectors
+            const selectors = [
+                '[data-testid="search-result"]',
+                '[data-testid="regular-listings"] > div',
+                `a[href*="/${segment}/details/"]`,
+                '[class*="ListingCard"]',
+                '[class*="listing-result"]',
+            ];
+
+            for (const sel of selectors) {
+                const elements = document.querySelectorAll(sel);
+                if (elements.length > 2) {
+                    return {
+                        selector: sel,
+                        count: elements.length,
+                        // Get text and href from first 3
+                        samples: Array.from(elements).slice(0, 3).map(el => ({
+                            text: el.innerText?.substring(0, 300),
+                            href: el.href || el.querySelector('a')?.href || '',
+                            html: el.outerHTML?.substring(0, 500),
+                        }))
+                    };
+                }
+            }
+
+            // Fallback: find all links to listing detail pages
+            const links = Array.from(document.querySelectorAll('a[href*="/details/"]'));
+            if (links.length > 0) {
+                return {
+                    selector: 'a[href*="/details/"]',
+                    count: links.length,
+                    samples: links.slice(0, 5).map(el => ({
+                        text: el.innerText?.substring(0, 300),
+                        href: el.href,
+                        parentText: el.closest('div, li, article')?.innerText?.substring(0, 500) || '',
+                    }))
+                };
+            }
+
+            // Last resort: get page structure
+            return {
+                selector: 'none',
+                count: 0,
+                bodyText: document.body?.innerText?.substring(0, 2000),
+                title: document.title,
+            };
+        }""", url_segment)
+
+    print("\n=== DOM Extraction Results ===")
+    print(json.dumps(listings, indent=2, ensure_ascii=False)[:5000])
+    return listing_urls
+
+
+def _report_session(context, page) -> None:
+    """Print the Zoopla cookies and user agent of the current browser session."""
+    zoopla_cookies = {
+        c["name"]: c["value"]
+        for c in context.cookies()
+        if ".zoopla.co.uk" in c.get("domain", "")
+    }
+    ua = page.evaluate("navigator.userAgent")
+
+    print("\n=== Session Info ===")
+    print(f"Cookies ({len(zoopla_cookies)}): {list(zoopla_cookies.keys())}")
+    print(f"User-Agent: {ua}")
+
+    if zoopla_cookies:
+        # Values are truncated to 50 characters in the printout.
+        print("\n=== Reusable cookie env vars ===")
+        for name, value in zoopla_cookies.items():
+            print(f"  {name}={value[:50]}...")
+
+
+def _scrape_detail_page(page, detail_url: str) -> None:
+    """Open one listing detail page and print the fields scraped from its DOM."""
+    log.info("--- Fetching detail page: %s ---", detail_url)
+    page.wait_for_timeout(2000)
+
+    # Same handling as the search navigation: log the failure and give up.
+    try:
+        page.goto(detail_url, wait_until="domcontentloaded", timeout=30000)
+    except Exception as e:
+        log.error("Navigation failed: %s", e)
+        return
+    page.wait_for_timeout(5000)  # Let it hydrate
+
+    detail = page.evaluate("""() => {
+                const result = {};
+
+                // Price
+                const priceEl = document.querySelector('[data-testid="price"]')
+                    || document.querySelector('[class*="price"]');
+                result.price = priceEl?.innerText || '';
+
+                // Address
+                const addrEl = document.querySelector('[data-testid="address-label"]')
+                    || document.querySelector('h1') || document.querySelector('address');
+                result.address = addrEl?.innerText || '';
+
+                // Key features
+                const features = Array.from(document.querySelectorAll('[data-testid="listing_feature"] li, [class*="feature"] li'));
+                result.features = features.map(f => f.innerText).slice(0, 15);
+
+                // Bedrooms/bathrooms from icons or text
+                const specs = document.querySelectorAll('[data-testid="beds-label"], [data-testid="baths-label"], [class*="bed"], [class*="bath"]');
+                result.specs = Array.from(specs).map(s => s.innerText).slice(0, 5);
+
+                // Description
+                const desc = document.querySelector('[data-testid="listing_description"], [class*="description"]');
+                result.description = desc?.innerText?.substring(0, 500) || '';
+
+                // Agent
+                const agent = document.querySelector('[data-testid="agent-details"], [class*="agent"]');
+                result.agent = agent?.innerText?.substring(0, 200) || '';
+
+                // Full page text summary
+                result.pageTitle = document.title;
+                result.bodyPreview = document.body?.innerText?.substring(0, 1000);
+
+                return result;
+            }""")
+
+    print("\n=== Detail Page Data ===")
+    print(json.dumps(detail, indent=2, ensure_ascii=False)[:3000])
+
+
+def run_playwright_stealth(outcode: str, channel: str = "BUY") -> None:
+    """Use Playwright with stealth patches to scrape Zoopla.
+
+    Strategy:
+    1. Launch stealth browser to bypass Cloudflare
+    2. Navigate to search page
+    3. Wait for listings to render (client-side hydration)
+    4. Try two extraction methods:
+       a. Intercept network requests for API data (cleanest)
+       b. Parse the rendered DOM (fallback)
+    5. If any detail links were found, open the first one and scrape it
+
+    Args:
+        outcode: Postcode outcode to search; lower-cased into the search URL.
+        channel: Key of ``CHANNELS`` ("BUY" or "RENT") selecting the URL
+            path segment used for both the search page and detail links.
+
+    Returns:
+        None. All findings are logged or printed to stdout.
+
+    Raises:
+        KeyError: If ``channel`` is not a key of ``CHANNELS``.
+    """
+    from playwright.sync_api import sync_playwright
+    from playwright_stealth import Stealth
+
+    url_segment = CHANNELS[channel]
+    search_url = f"{ZOOPLA_BASE}/{url_segment}/properties/{outcode.lower()}/"
+    log.info("Target: %s", search_url)
+
+    intercepted_data = []
+
+    def handle_response(response):
+        """Capture any JSON responses whose URL looks like listing data."""
+        url = response.url
+        if not any(kw in url for kw in ["/api/", "graphql", "search", "listing", "property"]):
+            return
+        try:
+            if "application/json" in (response.headers.get("content-type", "")):
+                body = response.json()
+                intercepted_data.append({"url": url, "data": body})
+                log.info("  [intercepted] %s (%s)", url[:100], type(body).__name__)
+        except Exception as exc:
+            # Interception is best-effort: keep going, but leave a trace
+            # instead of discarding the error silently.
+            log.debug("  [skipped] %s: %s", url[:100], exc)
+
+    with sync_playwright() as p:
+        # Launch with stealth-friendly args.
+        # NOTE(review): --no-sandbox and --disable-web-security weaken
+        # browser isolation; tolerable for a throwaway experiment, revisit
+        # before reusing this launch config elsewhere.
+        browser = p.chromium.launch(
+            headless=True,
+            args=[
+                "--disable-blink-features=AutomationControlled",
+                "--no-sandbox",
+                "--disable-dev-shm-usage",
+                "--disable-web-security",
+                "--lang=en-GB",
+            ],
+        )
+        # try/finally so the browser is closed on every exit path,
+        # including unexpected exceptions during scraping.
+        try:
+            context = browser.new_context(
+                locale="en-GB",
+                timezone_id="Europe/London",
+                viewport={"width": 1920, "height": 1080},
+                user_agent=(
+                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+                    "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+                ),
+            )
+            page = context.new_page()
+
+            # Apply stealth patches (Linux platform, Chrome UA)
+            stealth = Stealth(
+                navigator_platform_override="Linux x86_64",
+                navigator_languages_override=("en-GB", "en"),
+            )
+            stealth.apply_stealth_sync(page)
+
+            # Listen for responses to intercept API data
+            page.on("response", handle_response)
+
+            log.info("Navigating to %s ...", search_url)
+            try:
+                page.goto(search_url, wait_until="domcontentloaded", timeout=60000)
+            except Exception as e:
+                log.error("Navigation failed: %s", e)
+                return
+
+            if not _wait_for_cloudflare(page):
+                log.error("Cloudflare challenge did not resolve")
+                # Dump page content for debugging
+                print("\n=== Cloudflare challenge page ===")
+                print(page.content()[:3000])
+                return
+
+            _wait_for_listings(page, url_segment)
+
+            # Give hydration a moment. wait_for_timeout rather than
+            # time.sleep so the response listener keeps receiving events.
+            page.wait_for_timeout(3000)
+
+            # --- Extraction Method A: Check intercepted network data ---
+            _report_intercepted(intercepted_data)
+
+            # --- Extraction Method B: Parse rendered DOM ---
+            listing_urls = _extract_search_results(page, url_segment)
+
+            # Also extract cookies for potential reuse
+            _report_session(context, page)
+
+            # --- Try a detail page if we found any listing URLs ---
+            if listing_urls:
+                _scrape_detail_page(page, f"{ZOOPLA_BASE}{listing_urls[0]}")
+        finally:
+            browser.close()
+
+
+def main():
+    """Run the experiment for the outcode given on the command line.
+
+    The first CLI argument is the outcode; "E1" is used when none is given.
+    The channel is fixed to "BUY".
+    """
+    cli_args = sys.argv[1:]
+    outcode = cli_args[0] if cli_args else "E1"
+    channel = "BUY"
+
+    log.info("=== Zoopla Scraping Experiment (Playwright Stealth) ===")
+    log.info("Outcode: %s, Channel: %s", outcode, channel)
+    run_playwright_stealth(outcode, channel)
+    log.info("=== Done ===")
+
+
+# Only run the experiment when executed as a script, not on import.
+if __name__ == "__main__":
+    main()