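Fix Zoopla scraper: data-testid listing selectors with href fallback, more robust result-count parsing, first-extraction diagnostics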

2026-03-24 08:12:23 +00:00 · 2026-03-24 08:12:23 +00:00 · 4f61c702b1
commit 4f61c702b1
parent 13980a2887
2 changed files with 186 additions and 42 deletions
--- a/finder/zoopla.py
+++ b/finder/zoopla.py
@@ -41,17 +41,23 @@ class TurnstileError(Exception):
 MAX_PAGES_PER_OUTCODE = 10

 # JavaScript to extract listings from the rendered DOM.
-# Finds all detail links, walks up to the card container, and parses
-# price, beds, baths, floor area, address, and tenure from the card text.
+# Uses data-testid attributes as primary selectors (stable across deployments),
+# then falls back to href-based link matching with parent-walking.
 _EXTRACT_LISTINGS_JS = r"""() => {
-    const links = Array.from(document.querySelectorAll(
-        'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
-    ));
-
    const seen = new Set();
    const results = [];

-    for (const link of links) {
+    // Strategy 1: Use data-testid selectors (post-2025 redesign)
+    const listingCards = document.querySelectorAll(
+        '[data-testid="regular-listings"] > div, [data-testid="search-content"] li'
+    );
+
+    for (const card of listingCards) {
+        const link = card.querySelector(
+            'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
+        );
+        if (!link) continue;
+
        const href = link.href;
        const match = href.match(/\/details\/(\d+)\//);
        if (!match) continue;
@@ -60,53 +66,108 @@ _EXTRACT_LISTINGS_JS = r"""() => {
        if (seen.has(id)) continue;
        seen.add(id);

-        // Walk up to the listing card container
-        let card = link;
-        for (let j = 0; j < 10; j++) {
-            card = card.parentElement;
-            if (!card) break;
-            const text = card.innerText || '';
-            if (text.includes('\u00a3') && (text.includes('bed') || text.includes('sq ft'))) {
-                break;
+        const text = card.innerText || '';
+
+        // Try data-testid price element first, then regex
+        const priceEl = card.querySelector('[data-testid="listing-price"]');
+        const priceText = priceEl ? priceEl.innerText : text;
+        const priceMatch = priceText.match(/\u00a3([\d,]+)/);
+
+        // Try address element first, then regex
+        const addressEl = card.querySelector('address');
+        let address = addressEl ? addressEl.innerText.trim() : '';
+
+        if (!address) {
+            const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
+            for (const line of lines) {
+                if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
+                    (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
+                    address = line;
+                    break;
+                }
            }
        }
-        if (!card) continue;

-        const text = card.innerText || '';
-        const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
-
-        const priceMatch = text.match(/\u00a3([\d,]+)/);
        const bedsMatch = text.match(/(\d+)\s*beds?/i);
        const bathsMatch = text.match(/(\d+)\s*baths?/i);
        const recMatch = text.match(/(\d+)\s*reception/i);
-        const areaMatch = text.match(/([\d,]+)\s*sq\s*ft/i);
-
-        let address = '';
-        for (const line of lines) {
-            if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
-                (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
-                address = line;
-                break;
-            }
-        }
+        const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);

        let tenure = '';
        if (/freehold/i.test(text)) tenure = 'Freehold';
        else if (/leasehold/i.test(text)) tenure = 'Leasehold';

        results.push({
-            id: id,
-            url: href.replace(window.location.origin, ''),
+            id, url: href.replace(window.location.origin, ''),
            price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
            beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
            baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
            receptions: recMatch ? parseInt(recMatch[1]) : null,
            floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
-            address: address,
-            tenure: tenure,
+            address, tenure,
        });
    }

+    // Strategy 2: Fall back to href-based link matching with parent-walking
+    if (results.length === 0) {
+        const links = Array.from(document.querySelectorAll(
+            'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
+        ));
+
+        for (const link of links) {
+            const href = link.href;
+            const match = href.match(/\/details\/(\d+)\//);
+            if (!match) continue;
+
+            const id = match[1];
+            if (seen.has(id)) continue;
+            seen.add(id);
+
+            let card = link;
+            for (let j = 0; j < 15; j++) {
+                card = card.parentElement;
+                if (!card) break;
+                const t = card.innerText || '';
+                if (t.includes('\u00a3') && (t.includes('bed') || t.includes('Bath') || t.includes('sq ft'))) {
+                    break;
+                }
+            }
+            if (!card) continue;
+
+            const text = card.innerText || '';
+            const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
+
+            const priceMatch = text.match(/\u00a3([\d,]+)/);
+            const bedsMatch = text.match(/(\d+)\s*beds?/i);
+            const bathsMatch = text.match(/(\d+)\s*baths?/i);
+            const recMatch = text.match(/(\d+)\s*reception/i);
+            const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
+
+            let address = '';
+            for (const line of lines) {
+                if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
+                    (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
+                    address = line;
+                    break;
+                }
+            }
+
+            let tenure = '';
+            if (/freehold/i.test(text)) tenure = 'Freehold';
+            else if (/leasehold/i.test(text)) tenure = 'Leasehold';
+
+            results.push({
+                id, url: href.replace(window.location.origin, ''),
+                price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
+                beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
+                baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
+                receptions: recMatch ? parseInt(recMatch[1]) : null,
+                floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
+                address, tenure,
+            });
+        }
+    }
+
    return results;
 }"""

@@ -255,18 +316,44 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
    else:
        search_input.press("Enter")

-    # Wait for results to load
-    time.sleep(6)
+    # Wait for results to load — try waiting for listings container, fall back to fixed wait
+    try:
+        page.wait_for_selector(
+            '[data-testid="regular-listings"], a[href*="/details/"]',
+            timeout=10000,
+        )
+    except Exception:
+        time.sleep(4)
    _ensure_not_challenged(page)

    return True


 def _get_result_count(page) -> int:
-    """Extract the total results count from the page body text."""
+    """Extract the total results count from the page.
+
+    Tries __ZAD_TARGETING__ JSON first (most reliable), then body text regex
+    matching both "N results" and "N properties" patterns."""
+    try:
+        # Try the ZAD targeting JSON script tag first
+        count = page.evaluate("""() => {
+            const s = document.querySelector('#__ZAD_TARGETING__');
+            if (s) {
+                try {
+                    const d = JSON.parse(s.textContent);
+                    if (d.search_results_count != null) return d.search_results_count;
+                } catch(e) {}
+            }
+            return null;
+        }""")
+        if count is not None and count > 0:
+            return count
+    except Exception:
+        pass
+
    try:
        body = page.inner_text("body")
-        match = re.search(r"([\d,]+)\s+results?", body)
+        match = re.search(r"([\d,]+)\s+(?:results?|properties)", body)
        if match:
            return int(match.group(1).replace(",", ""))
    except Exception:
@@ -279,10 +366,42 @@ def _get_result_count(page) -> int:
 # ---------------------------------------------------------------------------


+_first_extraction_logged = False
+
+
 def _extract_listings(page) -> list[dict]:
    """Extract listing data from the current search results page DOM."""
+    global _first_extraction_logged
    try:
-        return page.evaluate(_EXTRACT_LISTINGS_JS)
+        listings = page.evaluate(_EXTRACT_LISTINGS_JS)
+
+        # Log diagnostic info on the very first extraction attempt
+        if not _first_extraction_logged:
+            _first_extraction_logged = True
+            try:
+                diag = page.evaluate("""() => {
+                    const details = document.querySelectorAll('a[href*="/details/"]');
+                    const testids = document.querySelectorAll('[data-testid]');
+                    const testidNames = [...new Set([...testids].map(e => e.dataset.testid))];
+                    return {
+                        url: location.href,
+                        title: document.title,
+                        detailLinks: details.length,
+                        testids: testidNames.slice(0, 30),
+                        bodySnippet: document.body?.innerText?.slice(0, 500) || '',
+                    };
+                }""")
+                log.info(
+                    "Zoopla first-page diagnostic: url=%s title=%s detailLinks=%d "
+                    "testids=%s bodySnippet=%.200s",
+                    diag.get("url"), diag.get("title"), diag.get("detailLinks", 0),
+                    diag.get("testids", []), diag.get("bodySnippet", ""),
+                )
+            except Exception:
+                pass
+            log.info("Zoopla first extraction: %d listings found", len(listings))
+
+        return listings
    except Exception as e:
        log.warning("Failed to extract listings from DOM: %s", e)
        zoopla_errors_total.labels(type="extract_failed").inc()
@@ -502,19 +621,40 @@ def search_outcode(
        return []

    total_results = _get_result_count(page)
-    if total_results == 0:
-        return []

-    raw_listings = _paginate(page, total_results, channel)
+    # Always try extraction even if result count is 0 — the count regex may
+    # not match Zoopla's current text format, but listings may still be in DOM
+    raw_listings = _paginate(page, max(total_results, 25), channel)
    if not raw_listings:
+        if total_results > 0:
+            log.debug(
+                "Zoopla %s %s: page claims %d results but extraction found 0 — "
+                "DOM selectors may need updating",
+                outcode, channel, total_results,
+            )
        return []

    channel_label = "buy" if channel == "BUY" else "rent"
    properties = []
+    dropped = 0
    for raw in raw_listings:
        transformed = transform_property(raw, channel, pc_index, pc_coords)
        if transformed:
            properties.append(transformed)
            zoopla_properties_scraped.labels(channel=channel_label).inc()
+        else:
+            dropped += 1
+
+    if dropped and not properties:
+        log.debug(
+            "Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
+            "(no price/postcode/coords)",
+            outcode, channel, len(raw_listings), dropped,
+        )
+    elif dropped > len(raw_listings) // 2:
+        log.debug(
+            "Zoopla %s %s: %d/%d listings dropped in transform",
+            outcode, channel, dropped, len(raw_listings),
+        )

    return properties