all good

2026-05-17 10:16:30 +01:00 · 2026-05-17 10:16:30 +01:00 · 017902b8e6
commit 017902b8e6
parent 47d89f6fad
82 changed files with 331466 additions and 54841 deletions
--- a/finder/zoopla.py
+++ b/finder/zoopla.py
@ -1,4 +1,4 @@
-"""Zoopla (zoopla.co.uk) scraper — buy and rental properties.
+"""Zoopla (zoopla.co.uk) scraper — sale properties.

 Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
 blocks all HTTP clients (curl_cffi, httpx) and even Playwright with stealth
@ -6,18 +6,14 @@ patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably.

 Zoopla uses Next.js App Router with React Server Components (RSC). Search
 result data is server-rendered in an RSC stream, not available via
-__NEXT_DATA__ or a JSON API. URL-based location slugs return 0 results —
-the working flow requires typing into the autocomplete input, selecting a
-suggestion, and clicking Search.
+__NEXT_DATA__ or a JSON API.

 Architecture:
  Unlike the other scrapers which use HTTP clients per outcode, Zoopla keeps
  a single Camoufox browser alive for the entire scrape. For each outcode, it:
-    1. Clears and types the outcode into the search input
-    2. Selects the first autocomplete suggestion
-    3. Clicks Search
-    4. Extracts listing data from the rendered DOM
-    5. Handles pagination via ?pn=N parameter
+    1. Navigates directly to the sale search URL
+    2. Extracts listing data from the rendered DOM
+    3. Handles pagination via ?pn=N parameter

  The browser session replaces the cookie/client pattern used by other scrapers.
 """
@ -27,7 +23,6 @@ import re
 import time

 from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
-from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
 from spatial import PostcodeSpatialIndex
 from transform import normalize_sub_type, validate_floor_area

@ -38,6 +33,25 @@ class TurnstileError(Exception):
    """Raised when Cloudflare Turnstile challenge cannot be passed."""


+class _ManagedCamoufoxBrowser:
+    """Proxy pairing a browser with the context manager that produced it.
+
+    The wrapper holds both objects so that a single ``close()`` call closes
+    the browser and also runs the context manager's ``__exit__``. Any
+    attribute the wrapper does not define itself is looked up on the wrapped
+    browser, so the wrapper can be used wherever the browser is expected.
+    """
+
+    def __init__(self, context_manager, browser):
+        """Store the already-entered context manager and its browser.
+
+        Args:
+            context_manager: Object whose ``__exit__`` must run on close.
+            browser: Browser object to close and to forward attributes to.
+        """
+        self._context_manager = context_manager
+        self._browser = browser
+        self._closed = False
+
+    def close(self) -> None:
+        """Close the browser, then exit the context manager.
+
+        Safe to call more than once: only the first call does any work.
+        """
+        if self._closed:
+            return
+        # Flip the flag before closing so a failed close is never retried.
+        self._closed = True
+        try:
+            self._browser.close()
+        finally:
+            # Must run even when browser.close() raised.
+            self._context_manager.__exit__(None, None, None)
+
+    def __getattr__(self, name):
+        """Forward lookups that fail on the wrapper to the wrapped browser.
+
+        Raises:
+            AttributeError: If ``name`` is one of the wrapper's own
+                attributes and has not been set on this instance.
+        """
+        # __getattr__ only runs after normal lookup has failed. If one of the
+        # wrapper's own attributes is missing (instance created without
+        # __init__, e.g. by copy or pickle), reading self._browser below would
+        # re-enter this method and recurse until RecursionError.
+        if name in ("_context_manager", "_browser", "_closed"):
+            raise AttributeError(name)
+        return getattr(self._browser, name)
+
+
 # Maximum search result pages to scrape per outcode (25 listings/page)
 MAX_PAGES_PER_OUTCODE = 40

@ -55,7 +69,7 @@ _EXTRACT_LISTINGS_JS = r"""() => {

    for (const card of listingCards) {
        const link = card.querySelector(
-            'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
+            'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]'
        );
        if (!link) continue;

@ -100,9 +114,9 @@ _EXTRACT_LISTINGS_JS = r"""() => {

        // Extract property type (e.g., "2 bed flat for sale" → "flat")
        let property_type = '';
-        const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
+        const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+for\s+sale/i);
        if (ptMatch) property_type = ptMatch[1].trim();
-        else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
+        else if (/\bstudio\s*(?:flat|apartment)?\s+for\s+sale/i.test(text)) property_type = 'Studio';

        // Keyword fallback when regex doesn't match current DOM format
        if (!property_type) {
@ -135,7 +149,7 @@ _EXTRACT_LISTINGS_JS = r"""() => {
    // Strategy 2: Fall back to href-based link matching with parent-walking
    if (results.length === 0) {
        const links = Array.from(document.querySelectorAll(
-            'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
+            'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]'
        ));

        for (const link of links) {
@ -184,9 +198,9 @@ _EXTRACT_LISTINGS_JS = r"""() => {

            // Extract property type
            let property_type = '';
-            const ptMatch2 = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
+            const ptMatch2 = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+for\s+sale/i);
            if (ptMatch2) property_type = ptMatch2[1].trim();
-            else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
+            else if (/\bstudio\s*(?:flat|apartment)?\s+for\s+sale/i.test(text)) property_type = 'Studio';

            // Keyword fallback when regex doesn't match current DOM format
            if (!property_type) {
@ -243,17 +257,20 @@ def launch_browser():
    """Launch Camoufox, navigate to Zoopla homepage, pass Cloudflare Turnstile,
    and dismiss cookie consent. Returns (browser, page) tuple.

-    Raises TurnstileError if Cloudflare cannot be passed within 60 seconds.
+    Raises TurnstileError if Cloudflare cannot be passed within two minutes.
    Caller must close browser when done."""
    from camoufox.pkgman import camoufox_path

-    # Verify camoufox is pre-installed — never download at runtime
-    camoufox_path(download_if_missing=False)
+    # Standalone local runs should not require the old container image to have
+    # pre-fetched Camoufox.
+    camoufox_path(download_if_missing=True)

    from camoufox.sync_api import Camoufox

    log.info("Launching Camoufox browser for Zoopla...")
-    browser = Camoufox(headless=True).__enter__()
+    camoufox = Camoufox(headless=True)
+    raw_browser = camoufox.__enter__()
+    browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
    page = browser.new_page()

    log.info("Navigating to Zoopla homepage...")
@ -261,7 +278,7 @@ def launch_browser():

    # Wait for Cloudflare Turnstile to resolve.
    # Try clicking the Turnstile checkbox if present (helps in some cases).
-    for i in range(20):
+    for i in range(40):
        if "Just a moment" not in page.title():
            break
        # Attempt to click the Turnstile checkbox in the challenge iframe
@ -280,7 +297,7 @@ def launch_browser():
    else:
        page.close()
        browser.close()
-        raise TurnstileError("Cloudflare Turnstile did not resolve after 60s")
+        raise TurnstileError("Cloudflare Turnstile did not resolve after 120s")

    log.info("Cloudflare passed — title: %s", page.title())
    time.sleep(2)
@ -298,13 +315,13 @@ def _ensure_not_challenged(page) -> None:
        return

    log.warning("Cloudflare challenge detected mid-session, waiting...")
-    for i in range(20):
+    for i in range(40):
        time.sleep(3)
        if "Just a moment" not in page.title():
            log.info("Cloudflare challenge resolved")
            return

-    raise TurnstileError("Cloudflare re-challenge did not resolve")
+    raise TurnstileError("Cloudflare re-challenge did not resolve after 120s")


 # ---------------------------------------------------------------------------
@ -312,21 +329,8 @@ def _ensure_not_challenged(page) -> None:
 # ---------------------------------------------------------------------------


-def _navigate_direct(page, url: str) -> bool:
-    """Navigate directly to a Zoopla search URL (skipping the homepage flow).
-
-    Used to load the second channel (e.g., RENT after BUY) for the same outcode
-    by swapping the path component. Falls back gracefully — returns False if
-    the page has no listings, so the caller can retry via the full search flow.
-    """
-    try:
-        page.goto(url, wait_until="domcontentloaded", timeout=30000)
-    except Exception as e:
-        log.debug("Direct navigation failed: %s", e)
-        return False
-    _ensure_not_challenged(page)
-
-    # Wait for listing content to hydrate
+def _wait_for_listing_content(page) -> None:
+    """Wait for rendered listing cards to contain usable text."""
    try:
        page.wait_for_function(
            """() => {
@ -343,100 +347,42 @@ def _navigate_direct(page, url: str) -> bool:
            timeout=8000,
        )
    except Exception:
-        # Check if the page has any listings at all
-        has_listings = page.query_selector('a[href*="/details/"]')
-        if not has_listings:
-            return False
        time.sleep(1.5)

-    return True

-
-def _navigate_search(page, outcode: str, channel: str) -> bool:
-    """Navigate to search results for an outcode via the homepage search flow.
+def _navigate_search(page, outcode: str) -> bool:
+    """Navigate directly to sale search results for an outcode.

    Returns True if results were found, False if no results or navigation failed.
    Raises TurnstileError if Cloudflare blocks us."""
-    # Navigate to homepage to reset search state
-    page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=30000)
-    time.sleep(0.5)
+    url = (
+        f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/"
+        f"?q={outcode}&search_source=home"
+    )
+    try:
+        page.goto(url, wait_until="domcontentloaded", timeout=30000)
+    except Exception as exc:
+        log.debug("Zoopla direct navigation failed for %s: %s", outcode, exc)
+        return False
+
    _ensure_not_challenged(page)

    # Dismiss cookie consent (may reappear after navigation)
-    page.evaluate(_DISMISS_COOKIES_JS)
-    time.sleep(0.3)
+    try:
+        page.evaluate(_DISMISS_COOKIES_JS)
+    except Exception:
+        pass

-    # Select Buy/Rent tab
-    if channel == "RENT":
-        rent_tab = page.query_selector(
-            'button:has-text("Rent"), [role="tab"]:has-text("Rent")'
-        )
-        if rent_tab:
-            rent_tab.click()
-            time.sleep(0.2)
-
-    # Find and fill search input
-    search_input = page.query_selector(
-        'input[name="autosuggest-input"]'
-    ) or page.query_selector('input[type="text"]')
-    if not search_input:
-        log.warning("Could not find search input on homepage")
-        return False
-
-    search_input.click()
-    time.sleep(0.1)
-    search_input.fill("")
-    search_input.type(outcode, delay=60)
-    time.sleep(1.2)
-
-    # Select first autocomplete suggestion
-    first_option = page.query_selector('[role="option"]')
-    if not first_option:
-        log.debug("No autocomplete suggestions for outcode %s", outcode)
-        return False
-
-    first_option.click()
-    time.sleep(0.2)
-
-    # Click search button
-    search_btn = page.query_selector('button:has-text("Search")')
-    if search_btn:
-        search_btn.click()
-    else:
-        search_input.press("Enter")
-
-    # Wait for results to load — try waiting for listings container, fall back to fixed wait
    try:
        page.wait_for_selector(
-            '[data-testid="regular-listings"], a[href*="/details/"]',
+            '[data-testid="regular-listings"], a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]',
            timeout=10000,
        )
    except Exception:
-        time.sleep(4)
-    _ensure_not_challenged(page)
+        if not page.query_selector('a[href*="/details/"]'):
+            return False

-    # Wait for client-side hydration to populate listing content (prices, addresses).
-    # The structural container appears in server-rendered HTML before React hydrates
-    # the actual card content — extracting too early yields empty price/address fields.
-    try:
-        page.wait_for_function(
-            """() => {
-                const cards = document.querySelectorAll(
-                    '[data-testid="regular-listings"] > div'
-                );
-                if (cards.length === 0) return false;
-                for (const card of cards) {
-                    const t = card.innerText || '';
-                    if (t.includes('\\u00a3') && t.length > 50) return true;
-                }
-                return false;
-            }""",
-            timeout=8000,
-        )
-    except Exception:
-        # Content never appeared — extraction will likely fail but let it try
-        log.debug("Listing content hydration wait timed out — prices may not have rendered")
-        time.sleep(2)
+    _wait_for_listing_content(page)

    return True

@ -516,18 +462,21 @@ def _extract_listings(page) -> list[dict]:
        return listings
    except Exception as e:
        log.warning("Failed to extract listings from DOM: %s", e)
-        zoopla_errors_total.labels(type="extract_failed").inc()
        return []


-def _paginate(page, total_results: int, channel: str) -> list[dict]:
+def _paginate(
+    page,
+    total_results: int,
+    max_properties: int | None = None,
+) -> list[dict]:
    """Extract listings from all pages of search results.

    Page 1 is already loaded. For subsequent pages, clicks the Next button
    or navigates via URL parameter ?pn=N."""
    all_listings = _extract_listings(page)
-    channel_label = "buy" if channel == "BUY" else "rent"
-    zoopla_pages_scraped.labels(channel=channel_label).inc()
+    if max_properties is not None and len(all_listings) >= max_properties:
+        return all_listings[:max_properties]

    if not all_listings or total_results <= len(all_listings):
        return all_listings
@ -550,24 +499,7 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
        try:
            page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
            _ensure_not_challenged(page)
-            # Wait for listing content instead of fixed sleep
-            try:
-                page.wait_for_function(
-                    """() => {
-                        const cards = document.querySelectorAll(
-                            '[data-testid="regular-listings"] > div'
-                        );
-                        if (cards.length === 0) return false;
-                        for (const card of cards) {
-                            const t = card.innerText || '';
-                            if (t.includes('\\u00a3') && t.length > 50) return true;
-                        }
-                        return false;
-                    }""",
-                    timeout=8000,
-                )
-            except Exception:
-                time.sleep(1.5)
+            _wait_for_listing_content(page)
        except TurnstileError:
            raise
        except Exception as e:
@ -585,8 +517,8 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
                seen_ids.add(listing["id"])
                all_listings.append(listing)
                new_count += 1
-
-        zoopla_pages_scraped.labels(channel=channel_label).inc()
+                if max_properties is not None and len(all_listings) >= max_properties:
+                    return all_listings[:max_properties]

        if new_count == 0:
            break  # No new listings on this page
@ -692,31 +624,8 @@ def _map_property_type(raw_type: str | None) -> str:
    return "Other"


-def _detect_rent_frequency(price_text: str) -> str:
-    """Detect rent frequency from Zoopla price text.
-
-    Zoopla price elements contain text like '£1,500 pcm', '£350 pw',
-    '£18,000 pa'. Defaults to 'monthly' if no frequency indicator found.
-
-    Checks monthly indicators (pcm) BEFORE weekly (pw) because Zoopla cards
-    often display both monthly and weekly prices in the same text. When the
-    JS extraction falls back to full card text, checking pcm first ensures
-    the captured monthly price gets the correct frequency label.
-    """
-    lower = price_text.lower()
-    if "pcm" in lower or "per month" in lower or "per calendar month" in lower:
-        return "monthly"
-    if "pw" in lower or "per week" in lower or "/w" in lower:
-        return "weekly"
-    if "pa" in lower or "per annum" in lower or "/y" in lower or "per year" in lower:
-        return "yearly"
-    # No indicator — default monthly (Zoopla standard)
-    return "monthly"
-
-
 def transform_property(
    raw: dict,
-    channel: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    search_outcode: str | None = None,
@ -783,13 +692,6 @@ def transform_property(
    if listing_url and not listing_url.startswith("http"):
        listing_url = ZOOPLA_BASE + listing_url

-    # Detect rent frequency from price text (e.g. "£1,500 pcm" vs "£350 pw")
-    if channel == "BUY":
-        frequency = ""
-    else:
-        price_text = raw.get("price_text", "")
-        frequency = _detect_rent_frequency(price_text)
-
    return {
        "id": f"zp_{listing_id}",
        "Bedrooms": bedrooms,
@ -803,7 +705,7 @@ def transform_property(
        "Property type": _map_property_type(raw.get("property_type")),
        "Property sub-type": normalize_sub_type(raw.get("property_type")),
        "price": int(price),
-        "price_frequency": frequency,
+        "price_frequency": "",
        "Price qualifier": "",
        "Total floor area (sqm)": floor_area_sqm,
        "Listing URL": listing_url,
@ -820,10 +722,9 @@ def transform_property(
 def search_outcode(
    page,
    outcode: str,
-    channel: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
-    base_search_url: str | None = None,
+    max_properties: int | None = None,
 ) -> tuple[list[dict], str | None]:
    """Search Zoopla for properties in one outcode.

@ -831,47 +732,37 @@ def search_outcode(
    search flow, extracts listings from rendered DOM, and transforms to the
    standard output schema.

-    If base_search_url is provided (from a previous channel search for the same
-    outcode), tries direct URL navigation first — skipping the slow homepage
-    search flow. Falls back to full navigation if direct fails.
-
-    Returns (properties, search_url) where search_url can be passed to the next
-    channel call for this outcode.
+    Returns (properties, search_url).

    Raises TurnstileError if Cloudflare blocks us mid-session.
    """
-    navigated = False
-    if base_search_url:
-        navigated = _navigate_direct(page, base_search_url)
-        if navigated:
-            log.debug("Zoopla %s %s: used direct URL navigation", outcode, channel)
-
-    if not navigated:
-        if not _navigate_search(page, outcode, channel):
-            return [], None
+    if not _navigate_search(page, outcode):
+        return [], None

    total_results = _get_result_count(page)

    # Always try extraction even if result count is 0 — the count regex may
    # not match Zoopla's current text format, but listings may still be in DOM
-    raw_listings = _paginate(page, max(total_results, 25), channel)
+    raw_listings = _paginate(
+        page,
+        max(total_results, 25),
+        max_properties=max_properties,
+    )
    if not raw_listings:
        if total_results > 0:
            log.debug(
                "Zoopla %s %s: page claims %d results but extraction found 0 — "
                "DOM selectors may need updating",
-                outcode, channel, total_results,
+                outcode, "BUY", total_results,
            )
        return [], None

-    channel_label = "buy" if channel == "BUY" else "rent"
    properties = []
    dropped = 0
    for raw in raw_listings:
-        transformed = transform_property(raw, channel, pc_index, pc_coords, search_outcode=outcode)
+        transformed = transform_property(raw, pc_index, pc_coords, search_outcode=outcode)
        if transformed:
            properties.append(transformed)
-            zoopla_properties_scraped.labels(channel=channel_label).inc()
        else:
            dropped += 1

@ -881,13 +772,13 @@ def search_outcode(
        log.debug(
            "Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
            "(no price/postcode/coords). Sample raw: price=%s address=%r",
-            outcode, channel, len(raw_listings), dropped,
+            outcode, "BUY", len(raw_listings), dropped,
            sample.get("price"), sample.get("address", ""),
        )
    elif dropped > len(raw_listings) // 2:
        log.debug(
            "Zoopla %s %s: %d/%d listings dropped in transform",
-            outcode, channel, dropped, len(raw_listings),
+            outcode, "BUY", dropped, len(raw_listings),
        )

    return properties, page.url