More fixes

2026-03-18 22:46:08 +00:00 · 2026-03-18 22:46:08 +00:00 · 6b12e21d50
commit 6b12e21d50
parent 15fa09430b
54 changed files with 1665 additions and 630 deletions
--- a/finder/Dockerfile
+++ b/finder/Dockerfile
@ -5,9 +5,14 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
 WORKDIR /app
 COPY pyproject.toml ./
 RUN uv pip install --system -r pyproject.toml
-RUN playwright install --with-deps chromium
+RUN playwright install-deps firefox
+RUN camoufox fetch \
+    && python -c "from camoufox.pkgman import camoufox_path; p = camoufox_path(download_if_missing=False); print('Camoufox verified at', p)"

 COPY *.py ./
 COPY property-data/arcgis_data.parquet /data/arcgis_data.parquet

+HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
+    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:1234/health')"
+
 CMD ["python3", "main.py"]
--- a/finder/constants.py
+++ b/finder/constants.py
@ -31,6 +31,11 @@ SCRAPE_OPENRENT = os.environ.get("SCRAPE_OPENRENT", "true").lower() in (
    "true",
    "yes",
 )
+# Zoopla scraping toggle, read from the SCRAPE_ZOOPLA environment variable
+# (default "true"). Enabled when the lowercased value is "1", "true" or "yes".
+SCRAPE_ZOOPLA = os.environ.get("SCRAPE_ZOOPLA", "true").lower() in (
+    "1",
+    "true",
+    "yes",
+)

 # URL to trigger server data reload after scrape (e.g. http://server:8001/api/reload)
 RELOAD_URL = os.environ.get("RELOAD_URL", "")
@ -47,6 +52,9 @@ HOMECOUK_PER_PAGE = 30  # max supported by the API
 # OpenRent
 OPENRENT_BASE = "https://www.openrent.co.uk"

+# Zoopla
+ZOOPLA_BASE = "https://www.zoopla.co.uk"
+
 PROPERTY_TYPE_MAP = {
    "Detached": "Detached",
    "Semi-Detached": "Semi-Detached",
--- a/finder/main.py
+++ b/finder/main.py
@ -14,6 +14,7 @@ from constants import (
    SCRAPE_HOMECOUK,
    SCRAPE_OPENRENT,
    SCRAPE_RIGHTMOVE,
+    SCRAPE_ZOOPLA,
 )
 from homecouk import load_cookies as load_homecouk_cookies
 from openrent import load_cookies as load_openrent_cookies
@ -48,6 +49,16 @@ log.setLevel(logging.DEBUG)
 logging.getLogger("httpx").setLevel(logging.WARNING)
 logging.getLogger("httpcore").setLevel(logging.WARNING)

+
+# Suppress noisy /metrics and /health request logs from werkzeug
+class _NoiseFilter(logging.Filter):
+    """Logging filter that drops records mentioning /metrics or /health GETs."""
+
+    # Substrings that mark a record as probe/scrape noise.
+    _MUTED = ("GET /metrics", "GET /health")
+
+    def filter(self, record):
+        """Return False (drop) when the rendered message contains a muted request."""
+        rendered = record.getMessage()
+        return not any(fragment in rendered for fragment in self._MUTED)
+
+
+logging.getLogger("werkzeug").addFilter(_NoiseFilter())
+
 # ---------------------------------------------------------------------------
 # Startup: load data
 # ---------------------------------------------------------------------------
@ -55,13 +66,14 @@ logging.getLogger("httpcore").setLevel(logging.WARNING)
 log.info("Loading arcgis data...")
 OUTCODES = load_outcodes()
 PC_INDEX = build_postcode_index()
-PC_COORDS = build_postcode_coords() if SCRAPE_OPENRENT else None
+PC_COORDS = build_postcode_coords() if (SCRAPE_OPENRENT or SCRAPE_ZOOPLA) else None
 log.info(
-    "Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s)",
+    "Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s, zoopla=%s)",
    len(OUTCODES),
    SCRAPE_RIGHTMOVE,
    SCRAPE_HOMECOUK,
    SCRAPE_OPENRENT,
+    SCRAPE_ZOOPLA,
 )

 # ---------------------------------------------------------------------------
@ -121,6 +133,11 @@ if SCHEDULE_HOUR >= 0:
 app = Flask(__name__)


+@app.route("/health")
+def health():
+    """Liveness probe: always answers HTTP 200 with the body "ok".
+
+    The container HEALTHCHECK in finder/Dockerfile requests this path.
+    """
+    return "ok", 200
+
+
@app.route("/run", methods=["POST"])
 def trigger_run():
    if _start_scrape():
@ -147,6 +164,7 @@ def get_status():
                "rightmove": status.rm_properties,
                "homecouk": status.hk_properties,
                "openrent": status.or_properties,
+                "zoopla": status.zp_properties,
            },
            "errors": status.errors[-20:],  # last 20 errors
            "elapsed_seconds": round(elapsed, 1),
@ -167,8 +185,10 @@ def get_debug():
            "scrape_rightmove": SCRAPE_RIGHTMOVE,
            "scrape_homecouk": SCRAPE_HOMECOUK,
            "scrape_openrent": SCRAPE_OPENRENT,
+            "scrape_zoopla": SCRAPE_ZOOPLA,
            "homecouk_cookies_available": hk_cookies is not None,
            "openrent_cookies_available": or_cookies is not None,
+            "zoopla_note": "browser-based (Camoufox), no cookies needed",
        }
    )

--- a/finder/metrics.py
+++ b/finder/metrics.py
@ -109,6 +109,28 @@ openrent_properties_scraped = Counter(
    ["channel"],
 )

+# ---------------------------------------------------------------------------
+# Counters — Zoopla
+# ---------------------------------------------------------------------------
+
+# Incremented by zoopla._paginate for each results page it processes (page 1
+# always; later pages only when they yield listings). Channel is "buy"/"rent".
+zoopla_pages_scraped = Counter(
+    "zoopla_pages_scraped",
+    "Search result pages scraped from Zoopla",
+    ["channel"],
+)
+
+# Labelled by error type; zoopla._extract_listings reports "extract_failed".
+zoopla_errors_total = Counter(
+    "zoopla_errors_total",
+    "Zoopla scraping errors",
+    ["type"],
+)
+
+# Incremented by zoopla.search_outcode for each listing that
+# transform_property accepts. Channel is "buy"/"rent".
+zoopla_properties_scraped = Counter(
+    "zoopla_properties_scraped",
+    "Properties scraped from Zoopla (before dedup)",
+    ["channel"],
+)
+
 # ---------------------------------------------------------------------------
 # Counters — FlareSolverr / cookie management
 # ---------------------------------------------------------------------------
@ -138,3 +160,8 @@ openrent_enabled = Gauge(
    "openrent_enabled",
    "Whether OpenRent scraping is currently active (1=yes, 0=no)",
 )
+
+# Set by scraper.run_scrape: 1 once the Camoufox browser has launched, 0 when
+# Zoopla is disabled by config or a browser launch/relaunch fails.
+zoopla_enabled = Gauge(
+    "zoopla_enabled",
+    "Whether Zoopla scraping is currently active (1=yes, 0=no)",
+)
--- a/finder/scraper.py
+++ b/finder/scraper.py
@ -17,6 +17,7 @@ from constants import (
    SCRAPE_HOMECOUK,
    SCRAPE_OPENRENT,
    SCRAPE_RIGHTMOVE,
+    SCRAPE_ZOOPLA,
    SEED,
 )
 from homecouk import CookiesExpiredError
@ -35,12 +36,16 @@ from metrics import (
    scrape_outcodes_total,
    scrape_properties_total,
    scrape_state,
+    zoopla_enabled,
 )
 from openrent import WafChallengeError
 from openrent import load_cookies as load_openrent_cookies
 from openrent import make_client as make_openrent_client
 from openrent import search_outcode as openrent_search_outcode
 from rightmove import resolve_outcode_id, search_outcode
+from zoopla import TurnstileError
+from zoopla import launch_browser as launch_zoopla_browser
+from zoopla import search_outcode as zoopla_search_outcode
 from spatial import PostcodeSpatialIndex
 from storage import write_parquet

@ -60,6 +65,7 @@ class ScrapeStatus:
    rm_properties: int = 0
    hk_properties: int = 0
    or_properties: int = 0
+    zp_properties: int = 0
    errors: list[str] = field(default_factory=list)
    started_at: float = 0.0
    finished_at: float = 0.0
@ -93,6 +99,9 @@ def _sync_gauges() -> None:
    scrape_properties_total.labels(channel=ch, source="openrent").set(
        status.or_properties
    )
+    scrape_properties_total.labels(channel=ch, source="zoopla").set(
+        status.zp_properties
+    )
    if status.started_at:
        end = status.finished_at if status.finished_at else time.time()
        scrape_elapsed_seconds.set(end - status.started_at)
@ -191,7 +200,7 @@ def run_scrape(
    random.seed(SEED)
    random.shuffle(shuffled)

-    if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT:
+    if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT and not SCRAPE_ZOOPLA:
        log.warning("All scrapers disabled — nothing to do")
        with status_lock:
            status.state = "done"
@ -239,8 +248,27 @@ def run_scrape(
            )
            openrent_enabled.set(0)

-    # Build postcode coords if OpenRent is active and caller didn't provide them
-    if or_client and pc_coords is None:
+    # Zoopla: uses Camoufox browser (no cookies/client pattern)
+    zp_browser = None
+    zp_page = None
+    zp_failed = False
+    if not SCRAPE_ZOOPLA:
+        log.info("Zoopla scraping DISABLED (SCRAPE_ZOOPLA=false)")
+        zoopla_enabled.set(0)
+    else:
+        try:
+            zp_browser, zp_page = launch_zoopla_browser()
+            log.info("Zoopla scraping ENABLED (Camoufox browser launched)")
+            zoopla_enabled.set(1)
+        except TurnstileError:
+            log.warning("Zoopla Cloudflare Turnstile failed — disabling Zoopla")
+            zoopla_enabled.set(0)
+        except Exception as e:
+            log.warning("Zoopla browser launch failed: %s — disabling Zoopla", e)
+            zoopla_enabled.set(0)
+
+    # Build postcode coords if OpenRent/Zoopla is active and caller didn't provide them
+    if (or_client or zp_page) and pc_coords is None:
        pc_coords = build_postcode_coords()

    try:
@ -256,6 +284,8 @@ def run_scrape(
            hk_dedup_count = 0  # home.co.uk skipped as cross-source duplicates
            or_count = 0  # OpenRent properties this channel
            or_dedup_count = 0  # OpenRent skipped as cross-source duplicates
+            zp_count = 0  # Zoopla properties this channel
+            zp_dedup_count = 0  # Zoopla skipped as cross-source duplicates

            with status_lock:
                status.channel = channel_name
@ -264,6 +294,7 @@ def run_scrape(
                status.rm_properties = 0
                status.hk_properties = 0
                status.or_properties = 0
+                status.zp_properties = 0

            channel_start = time.time()
            prev_prop_milestone = 0  # last 10k milestone we logged
@ -412,6 +443,63 @@ def run_scrape(
                        with status_lock:
                            status.errors.append(msg)

+                # --- Zoopla ---
+                if zp_page and not zp_failed:
+                    made_requests = True
+                    try:
+                        zp_props = zoopla_search_outcode(
+                            zp_page,
+                            outcode,
+                            channel_name,
+                            pc_index,
+                            pc_coords,
+                        )
+                        for p in zp_props:
+                            pid = p["id"]
+                            key = _dedup_key(p)
+                            if pid in all_properties or key in seen_dedup_keys:
+                                zp_dedup_count += 1
+                                cross_source_dedup_total.labels(
+                                    channel="buy" if channel_name == "BUY" else "rent",
+                                ).inc()
+                                continue
+                            all_properties[pid] = p
+                            seen_dedup_keys.add(key)
+                            zp_count += 1
+                        if zp_props:
+                            log.info(
+                                "Zoopla %s: +%d properties", outcode, len(zp_props)
+                            )
+                    except TurnstileError:
+                        log.warning(
+                            "Zoopla Cloudflare challenge failed — attempting browser relaunch"
+                        )
+                        try:
+                            zp_browser.close()
+                        except Exception:
+                            pass
+                        try:
+                            zp_browser, zp_page = launch_zoopla_browser()
+                            log.info("Zoopla browser relaunched, continuing")
+                        except Exception:
+                            log.warning(
+                                "Browser relaunch failed, disabling Zoopla for rest of scrape"
+                            )
+                            zp_page = None
+                            zp_browser = None
+                            zp_failed = True
+                            zoopla_enabled.set(0)
+                            with status_lock:
+                                status.errors.append(
+                                    "Zoopla Cloudflare challenge failed and browser relaunch failed"
+                                )
+                    except Exception as e:
+                        msg = f"Error scraping Zoopla {outcode}/{channel_name}: {e}"
+                        log.error(msg)
+                        scrape_errors_total.labels(source="zoopla").inc()
+                        with status_lock:
+                            status.errors.append(msg)
+
                with status_lock:
                    if channel_name == "BUY":
                        status.properties_buy = len(all_properties)
@ -420,6 +508,7 @@ def run_scrape(
                    status.rm_properties = rm_count
                    status.hk_properties = hk_count
                    status.or_properties = or_count
+                    status.zp_properties = zp_count
                    _sync_gauges()

                # Log progress every 100 outcodes
@ -444,12 +533,13 @@ def run_scrape(
                if current_milestone > prev_prop_milestone:
                    prev_prop_milestone = current_milestone
                    log.info(
-                        "%s %dk properties (rm: %d, hk: %d, or: %d) at outcode %d/%d [%s]",
+                        "%s %dk properties (rm: %d, hk: %d, or: %d, zp: %d) at outcode %d/%d [%s]",
                        channel_name,
                        current_milestone * 10,
                        rm_count,
                        hk_count,
                        or_count,
+                        zp_count,
                        done,
                        len(shuffled),
                        _fmt_elapsed(elapsed),
@ -472,13 +562,14 @@ def run_scrape(
                _sync_gauges()

            log.info(
-                "=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, cross-dedup: %d) ===",
+                "=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, zp: %d, cross-dedup: %d) ===",
                channel_name,
                len(deduped),
                rm_count,
                hk_count,
                or_count,
-                hk_dedup_count + or_dedup_count,
+                zp_count,
+                hk_dedup_count + or_dedup_count + zp_dedup_count,
            )

        with status_lock:
@ -525,3 +616,8 @@ def run_scrape(
            hk_client.close()
        if or_client:
            or_client.close()
+        if zp_browser:
+            try:
+                zp_browser.close()
+            except Exception:
+                pass
--- a/finder/storage.py
+++ b/finder/storage.py
@ -25,7 +25,11 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
        if fvd:
            try:
                dt = datetime.fromisoformat(fvd.replace("Z", "+00:00"))
-                listing_dates.append(dt.replace(tzinfo=None))
+                # Convert to UTC naive datetime for consistent storage
+                if dt.tzinfo is not None:
+                    from datetime import timezone
+                    dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
+                listing_dates.append(dt)
            except (ValueError, TypeError):
                listing_dates.append(None)
        else:
--- a/finder/zoopla.py
+++ b/finder/zoopla.py
@ -0,0 +1,520 @@
+"""Zoopla (zoopla.co.uk) scraper — buy and rental properties.
+
+Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
+blocks all HTTP clients (curl_cffi, httpx) and even Playwright with stealth
+patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
+
+Zoopla uses Next.js App Router with React Server Components (RSC). Search
+result data is server-rendered in an RSC stream, not available via
+__NEXT_DATA__ or a JSON API. URL-based location slugs return 0 results —
+the working flow requires typing into the autocomplete input, selecting a
+suggestion, and clicking Search.
+
+Architecture:
+  Unlike the other scrapers which use HTTP clients per outcode, Zoopla keeps
+  a single Camoufox browser alive for the entire scrape. For each outcode, it:
+    1. Clears and types the outcode into the search input
+    2. Selects the first autocomplete suggestion
+    3. Clicks Search
+    4. Extracts listing data from the rendered DOM
+    5. Handles pagination via ?pn=N parameter
+
+  The browser session replaces the cookie/client pattern used by other scrapers.
+"""
+
+import logging
+import re
+import time
+
+from constants import DELAY_BETWEEN_PAGES, PROPERTY_TYPE_MAP, ZOOPLA_BASE
+from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
+from spatial import PostcodeSpatialIndex
+
+log = logging.getLogger("zoopla")
+
+
+class TurnstileError(Exception):
+    """Raised when Cloudflare Turnstile challenge cannot be passed.
+
+    Raised by launch_browser() when the initial challenge never clears and by
+    _ensure_not_challenged() when a mid-session challenge never clears.
+    """
+
+
+# Maximum search result pages to scrape per outcode (25 listings/page)
+MAX_PAGES_PER_OUTCODE = 10
+
+# JavaScript to extract listings from the rendered DOM (run via page.evaluate).
+# Finds all detail links (/for-sale/, /new-homes/ and /to-rent/ details),
+# de-duplicates them by the numeric listing id in the URL, walks up at most
+# 10 ancestors looking for a card container whose text has a pound sign and
+# either "bed" or "sq ft", and parses price, beds, baths, receptions, floor
+# area, address, and tenure from the card text.
+# Returns a list of objects with keys: id, url, price, beds, baths,
+# receptions, floor_area_sqft, address, tenure (numeric fields are null when
+# not found; address and tenure are '' when not found).
+# NOTE(review): if no ancestor matches within 10 levels, the 10th ancestor is
+# still used as the card, so fields may be read from a wider container —
+# confirm this fallback is intended.
+_EXTRACT_LISTINGS_JS = r"""() => {
+    const links = Array.from(document.querySelectorAll(
+        'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
+    ));
+
+    const seen = new Set();
+    const results = [];
+
+    for (const link of links) {
+        const href = link.href;
+        const match = href.match(/\/details\/(\d+)\//);
+        if (!match) continue;
+
+        const id = match[1];
+        if (seen.has(id)) continue;
+        seen.add(id);
+
+        // Walk up to the listing card container
+        let card = link;
+        for (let j = 0; j < 10; j++) {
+            card = card.parentElement;
+            if (!card) break;
+            const text = card.innerText || '';
+            if (text.includes('\u00a3') && (text.includes('bed') || text.includes('sq ft'))) {
+                break;
+            }
+        }
+        if (!card) continue;
+
+        const text = card.innerText || '';
+        const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
+
+        const priceMatch = text.match(/\u00a3([\d,]+)/);
+        const bedsMatch = text.match(/(\d+)\s*beds?/i);
+        const bathsMatch = text.match(/(\d+)\s*baths?/i);
+        const recMatch = text.match(/(\d+)\s*reception/i);
+        const areaMatch = text.match(/([\d,]+)\s*sq\s*ft/i);
+
+        let address = '';
+        for (const line of lines) {
+            if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
+                (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
+                address = line;
+                break;
+            }
+        }
+
+        let tenure = '';
+        if (/freehold/i.test(text)) tenure = 'Freehold';
+        else if (/leasehold/i.test(text)) tenure = 'Leasehold';
+
+        results.push({
+            id: id,
+            url: href.replace(window.location.origin, ''),
+            price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
+            beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
+            baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
+            receptions: recMatch ? parseInt(recMatch[1]) : null,
+            floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
+            address: address,
+            tenure: tenure,
+        });
+    }
+
+    return results;
+}"""
+
+# JavaScript to dismiss the Usercentrics cookie consent overlay (shadow DOM).
+# Clicks the first shadow-root button whose text contains "Accept"; if there
+# is no such button (or no shadow root) the overlay element is removed from
+# the DOM instead. Evaluates to true when the overlay was handled and to
+# false when no '#usercentrics-cmp-ui' element exists.
+_DISMISS_COOKIES_JS = """() => {
+    const aside = document.querySelector('#usercentrics-cmp-ui');
+    if (aside && aside.shadowRoot) {
+        const btns = aside.shadowRoot.querySelectorAll('button');
+        for (const btn of btns) {
+            if (btn.innerText.includes('Accept')) { btn.click(); return true; }
+        }
+    }
+    if (aside) { aside.remove(); return true; }
+    return false;
+}"""
+
+
+# ---------------------------------------------------------------------------
+# Browser lifecycle
+# ---------------------------------------------------------------------------
+
+
+def launch_browser():
+    """Launch Camoufox, open the Zoopla homepage and clear Cloudflare Turnstile.
+
+    Once the challenge page is gone, the cookie-consent overlay is dismissed.
+
+    Returns:
+        A ``(browser, page)`` tuple. The caller must close ``browser`` when done.
+
+    Raises:
+        TurnstileError: the page title still contains "Just a moment" after
+            20 polls spaced 3 seconds apart (about 60 seconds).
+        Exception: anything raised while creating the page, navigating or
+            evaluating scripts; the browser is closed before it propagates.
+    """
+    from camoufox.pkgman import camoufox_path
+
+    # Verify camoufox is pre-installed — never download at runtime
+    camoufox_path(download_if_missing=False)
+
+    from camoufox.sync_api import Camoufox
+
+    log.info("Launching Camoufox browser for Zoopla...")
+    # NOTE(review): the context manager is entered by hand and then discarded,
+    # so its __exit__ never runs; callers only call browser.close(). Confirm
+    # that is enough to shut down the underlying driver process.
+    browser = Camoufox(headless=True).__enter__()
+
+    try:
+        page = browser.new_page()
+
+        log.info("Navigating to Zoopla homepage...")
+        page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000)
+
+        # Wait for Cloudflare Turnstile to resolve.
+        # Try clicking the Turnstile checkbox if present (helps in some cases).
+        for _ in range(20):
+            if "Just a moment" not in page.title():
+                break
+            # Attempt to click the Turnstile checkbox in the challenge iframe
+            for frame in page.frames:
+                if "challenges.cloudflare.com" in frame.url:
+                    try:
+                        iframe_el = page.query_selector(
+                            'iframe[src*="challenges.cloudflare"]'
+                        )
+                        if iframe_el:
+                            box = iframe_el.bounding_box()
+                            if box:
+                                page.mouse.click(
+                                    box["x"] + 30, box["y"] + box["height"] / 2
+                                )
+                    except Exception:
+                        # Best effort: the click is only a nudge, keep polling.
+                        pass
+                    break
+            time.sleep(3)
+        else:
+            raise TurnstileError("Cloudflare Turnstile did not resolve after 60s")
+
+        log.info("Cloudflare passed — title: %s", page.title())
+        time.sleep(2)
+
+        # Dismiss cookie consent
+        page.evaluate(_DISMISS_COOKIES_JS)
+        time.sleep(1)
+    except BaseException:
+        # Setup failed part-way (timeout, challenge, interrupt): the caller
+        # never receives the browser, so it must be closed here or it leaks.
+        try:
+            browser.close()
+        except Exception:
+            log.debug("Closing Camoufox after a failed launch also failed", exc_info=True)
+        raise
+
+    return browser, page
+
+
+def _ensure_not_challenged(page) -> None:
+    """Wait for a Cloudflare interstitial to clear, or give up.
+
+    Returns immediately when the page title does not contain "Just a moment".
+    Otherwise sleeps 3 seconds and re-checks the title, up to 20 times, and
+    raises TurnstileError if the challenge title is still showing afterwards.
+    """
+    challenge_title = "Just a moment"
+
+    def _challenged() -> bool:
+        return challenge_title in page.title()
+
+    if not _challenged():
+        return
+
+    log.warning("Cloudflare challenge detected mid-session, waiting...")
+    attempts_left = 20
+    while attempts_left:
+        attempts_left -= 1
+        time.sleep(3)
+        if not _challenged():
+            log.info("Cloudflare challenge resolved")
+            return
+
+    raise TurnstileError("Cloudflare re-challenge did not resolve")
+
+
+# ---------------------------------------------------------------------------
+# Search navigation
+# ---------------------------------------------------------------------------
+
+
+def _navigate_search(page, outcode: str, channel: str) -> bool:
+    """Navigate to search results for an outcode via the homepage search flow.
+
+    Loads the homepage, dismisses the cookie overlay, selects the Rent tab for
+    the "RENT" channel, types the outcode, picks the first autocomplete
+    suggestion and submits the search.
+
+    Returns True once the search has been submitted and the resulting page is
+    not a Cloudflare challenge (it does not check that any listings exist).
+    Returns False if the search input or an autocomplete suggestion cannot be
+    found.
+    Raises TurnstileError if Cloudflare blocks us."""
+    # Navigate to homepage to reset search state
+    page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=30000)
+    time.sleep(2)
+    _ensure_not_challenged(page)
+
+    # Dismiss cookie consent (may reappear after navigation)
+    page.evaluate(_DISMISS_COOKIES_JS)
+    time.sleep(1)
+
+    # Select Buy/Rent tab (nothing is clicked for any channel other than RENT)
+    if channel == "RENT":
+        rent_tab = page.query_selector(
+            'button:has-text("Rent"), [role="tab"]:has-text("Rent")'
+        )
+        if rent_tab:
+            rent_tab.click()
+            time.sleep(0.5)
+
+    # Find and fill search input; fall back to the first text input when the
+    # named autosuggest field is not present
+    search_input = page.query_selector(
+        'input[name="autosuggest-input"]'
+    ) or page.query_selector('input[type="text"]')
+    if not search_input:
+        log.warning("Could not find search input on homepage")
+        return False
+
+    search_input.click()
+    time.sleep(0.3)
+    search_input.fill("")
+    # Typed key by key (60 ms apart) rather than filled in one call, then
+    # given 2 s for the suggestion list to appear
+    search_input.type(outcode, delay=60)
+    time.sleep(2)
+
+    # Select first autocomplete suggestion
+    first_option = page.query_selector('[role="option"]')
+    if not first_option:
+        log.debug("No autocomplete suggestions for outcode %s", outcode)
+        return False
+
+    first_option.click()
+    time.sleep(0.5)
+
+    # Click search button, or press Enter in the input if no button is found
+    search_btn = page.query_selector('button:has-text("Search")')
+    if search_btn:
+        search_btn.click()
+    else:
+        search_input.press("Enter")
+
+    # Wait for results to load (fixed delay, then re-check for a challenge page)
+    time.sleep(6)
+    _ensure_not_challenged(page)
+
+    return True
+
+
+def _get_result_count(page) -> int:
+    """Extract the total results count from the page body text.
+
+    Uses the first "<number> result(s)" phrase in the body, where the number
+    may contain thousands separators. Returns 0 when no such phrase exists or
+    the body text cannot be read.
+    """
+    try:
+        body = page.inner_text("body")
+        # The number must start with a digit: a bare comma before the word
+        # (e.g. "homes, results") must not be taken as the count.
+        match = re.search(r"(\d[\d,]*)\s+results?", body)
+        if match:
+            return int(match.group(1).replace(",", ""))
+    except Exception as e:
+        # Best effort: an unreadable page is treated as "no results".
+        log.debug("Could not determine Zoopla result count: %s", e)
+    return 0
+
+
+# ---------------------------------------------------------------------------
+# Extraction and pagination
+# ---------------------------------------------------------------------------
+
+
+def _extract_listings(page) -> list[dict]:
+    """Run the listing-extraction script in the page and return its rows.
+
+    Any failure while evaluating the script is logged as a warning, counted
+    under the "extract_failed" error type, and reported as an empty list.
+    """
+    try:
+        rows = page.evaluate(_EXTRACT_LISTINGS_JS)
+    except Exception as exc:
+        log.warning("Failed to extract listings from DOM: %s", exc)
+        zoopla_errors_total.labels(type="extract_failed").inc()
+        rows = []
+    return rows
+
+
+def _page_url(url: str, page_num: int) -> str:
+    """Return *url* with its ``pn`` query parameter set to *page_num*.
+
+    Existing ``pn=<digits>`` parameters are dropped wherever they sit in the
+    query string, all other parameters keep their order and encoding, and a
+    fragment (if any) stays at the end of the URL.
+    """
+    from urllib.parse import urlsplit, urlunsplit
+
+    parts = urlsplit(url)
+    params = [
+        param
+        for param in parts.query.split("&")
+        if param and not re.fullmatch(r"pn=\d+", param)
+    ]
+    params.append(f"pn={page_num}")
+    return urlunsplit(parts._replace(query="&".join(params)))
+
+
+def _paginate(page, total_results: int, channel: str) -> list[dict]:
+    """Extract listings from all pages of search results.
+
+    Page 1 is already loaded. Later pages are loaded by navigating to the
+    page-1 URL with the ``pn`` query parameter set to the page number.
+    Collection stops when ``total_results`` listings have been gathered,
+    MAX_PAGES_PER_OUTCODE is reached, a page yields no listings or no new
+    ids, or a navigation fails.
+
+    Raises TurnstileError if a Cloudflare challenge does not clear.
+    """
+    all_listings = _extract_listings(page)
+    channel_label = "buy" if channel == "BUY" else "rent"
+    zoopla_pages_scraped.labels(channel=channel_label).inc()
+
+    if not all_listings or total_results <= len(all_listings):
+        return all_listings
+
+    seen_ids = {listing["id"] for listing in all_listings}
+    first_page_url = page.url
+    page_num = 2
+
+    while len(all_listings) < total_results and page_num <= MAX_PAGES_PER_OUTCODE:
+        time.sleep(DELAY_BETWEEN_PAGES)
+
+        # Always derive the target from the page-1 URL so parameters never
+        # accumulate from one page to the next.
+        next_url = _page_url(first_page_url, page_num)
+
+        try:
+            page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
+            time.sleep(4)
+            _ensure_not_challenged(page)
+        except TurnstileError:
+            raise
+        except Exception as e:
+            log.debug("Pagination navigation failed at page %d: %s", page_num, e)
+            break
+
+        page_listings = _extract_listings(page)
+        if not page_listings:
+            break
+
+        # Deduplicate within this outcode
+        new_count = 0
+        for listing in page_listings:
+            if listing["id"] not in seen_ids:
+                seen_ids.add(listing["id"])
+                all_listings.append(listing)
+                new_count += 1
+
+        zoopla_pages_scraped.labels(channel=channel_label).inc()
+
+        if new_count == 0:
+            break  # No new listings on this page
+
+        page_num += 1
+
+    return all_listings
+
+
+# ---------------------------------------------------------------------------
+# Property transformation
+# ---------------------------------------------------------------------------
+
+
+def _extract_postcode(text: str) -> str | None:
+    """Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'.
+
+    The first postcode-shaped token is returned upper-cased and normalised to
+    "<outcode> <incode>" with exactly one space, whatever spacing the text
+    used, so it can be used directly as a lookup key. Returns None when the
+    text contains no such token.
+    """
+    # Word boundaries stop the match from starting or ending inside a longer
+    # word or number; the optional gap is dropped and re-inserted as one space.
+    match = re.search(
+        r"\b([A-Z]{1,2}\d[A-Z0-9]?)\s*(\d[A-Z]{2})\b", text, re.IGNORECASE
+    )
+    if match is None:
+        return None
+    outward, inward = match.groups()
+    return f"{outward} {inward}".upper()
+
+
+def _extract_outcode(text: str) -> str | None:
+    """Extract a UK outcode from address text like 'Whitechapel Road, London E1'.
+
+    Only an outcode that forms the final token of the address is recognised;
+    it is returned upper-cased. An address that ends in a full postcode, or
+    in anything else, yields None.
+    """
+    # The end-anchored search also covers an outcode standing alone after the
+    # last comma, so no separate comma-split pass is needed.
+    match = re.search(r"\b([A-Z]{1,2}\d[A-Z0-9]?)\s*$", text.strip(), re.IGNORECASE)
+    return match.group(1).upper() if match else None
+
+
+def _map_property_type(raw_type: str | None) -> str:
+    """Translate a Zoopla property-type label into the canonical type name.
+
+    An exact PROPERTY_TYPE_MAP entry wins. Otherwise the lower-cased label is
+    tested against keyword groups in priority order and the first group with
+    a keyword contained in the label decides. Missing or unrecognised labels
+    map to "Other".
+    """
+    if not raw_type:
+        return "Other"
+
+    exact = PROPERTY_TYPE_MAP.get(raw_type)
+    if exact:
+        return exact
+
+    lowered = raw_type.lower()
+    # Order matters: "semi" is tested before "detached" so that
+    # "semi-detached" is never reported as "Detached".
+    keyword_rules = (
+        (("flat", "apartment", "maisonette", "studio"), "Flats/Maisonettes"),
+        (("semi",), "Semi-Detached"),
+        (("detached",), "Detached"),
+        (("terrace", "mews"), "Terraced"),
+        (("house",), "Detached"),
+    )
+    for keywords, canonical in keyword_rules:
+        if any(word in lowered for word in keywords):
+            return canonical
+    return "Other"
+
+
+def transform_property(
+    raw: dict,
+    channel: str,
+    pc_index: PostcodeSpatialIndex,
+    pc_coords: dict[str, tuple[float, float]],
+) -> dict | None:
+    """Convert one raw Zoopla search-card dict into the standard output schema.
+
+    Search cards carry no coordinates, so the location comes from the address
+    text: first an exact lookup of a full postcode in ``pc_coords``, then the
+    first ``pc_coords`` entry whose key starts with the trailing outcode.
+    ``pc_index`` is accepted but not used here.
+
+    Returns None when the listing has no price, cannot be located, or falls
+    outside the latitude 49..56 / longitude -7..2 box.
+    """
+    price = raw.get("price")
+    if not price:
+        return None
+
+    address = raw.get("address", "")
+    lat = lng = None
+
+    # Exact postcode lookup
+    postcode = _extract_postcode(address)
+    exact = pc_coords.get(postcode) if postcode else None
+    if exact:
+        lat, lng = exact
+
+    # Outcode-level fallback: borrow the first postcode in that outcode
+    if lat is None:
+        outcode = _extract_outcode(address)
+        if outcode:
+            prefix = f"{outcode} "
+            hit = next(
+                (item for item in pc_coords.items() if item[0].startswith(prefix)),
+                None,
+            )
+            if hit is not None:
+                postcode, (lat, lng) = hit
+
+    located = lat is not None and lng is not None and bool(postcode)
+    if not located:
+        return None
+
+    # Reject coordinates outside the expected bounding box
+    inside_box = 49 <= lat <= 56 and -7 <= lng <= 2
+    if not inside_box:
+        return None
+
+    bedrooms = raw.get("beds") or 0
+    bathrooms = raw.get("baths") or 0
+    receptions = raw.get("receptions") or 0
+
+    # Floor area arrives in square feet; the schema wants square metres
+    sqft = raw.get("floor_area_sqft")
+    floor_area_sqm = round(sqft * 0.092903, 1) if sqft else None
+
+    listing_id = raw.get("id", "")
+    listing_url = raw.get("url", "")
+    if listing_url and not listing_url.startswith("http"):
+        listing_url = f"{ZOOPLA_BASE}{listing_url}"
+
+    return {
+        "id": f"zp_{listing_id}",
+        "Bedrooms": bedrooms,
+        "Bathrooms": bathrooms,
+        "Number of bedrooms & living rooms": bedrooms + receptions,
+        "lon": lng,
+        "lat": lat,
+        "Postcode": postcode,
+        "Address per Property Register": address,
+        "Leasehold/Freehold": raw.get("tenure") or None,
+        "Property type": "Other",  # Not reliably extractable from Zoopla search cards
+        "Property sub-type": "",
+        "price": int(price),
+        "price_frequency": "" if channel == "BUY" else "monthly",
+        "Price qualifier": "",
+        "Total floor area (sqm)": floor_area_sqm,
+        "Listing URL": listing_url,
+        "Listing features": [],
+        "first_visible_date": "",
+    }
+
+
+# ---------------------------------------------------------------------------
+# Top-level search function (called by scraper.py)
+# ---------------------------------------------------------------------------
+
+
+def search_outcode(
+    page,
+    outcode: str,
+    channel: str,
+    pc_index: PostcodeSpatialIndex,
+    pc_coords: dict[str, tuple[float, float]],
+) -> list[dict]:
+    """Scrape Zoopla listings for a single outcode.
+
+    Drives the live Camoufox page (from launch_browser) through the search
+    flow, reads the reported result count, collects the listing cards across
+    result pages and converts each one to the standard output schema. Cards
+    that cannot be converted are dropped.
+
+    Returns an empty list when the search cannot be submitted or reports no
+    results. Raises TurnstileError if Cloudflare blocks us mid-session.
+    """
+    if not _navigate_search(page, outcode, channel):
+        return []
+
+    expected = _get_result_count(page)
+    if expected == 0:
+        return []
+
+    label = "buy" if channel == "BUY" else "rent"
+    results: list[dict] = []
+    for listing in _paginate(page, expected, channel) or []:
+        converted = transform_property(listing, channel, pc_index, pc_coords)
+        if not converted:
+            continue
+        results.append(converted)
+        # Counted per accepted listing, before any cross-source dedup
+        zoopla_properties_scraped.labels(channel=label).inc()
+
+    return results