Fix Zoopla scraper: update listing extraction and result-count parsing

2026-03-24 08:12:23 +00:00 · 2026-03-24 08:12:23 +00:00 · 4f61c702b1
commit 4f61c702b1
parent 13980a2887
2 changed files with 186 additions and 42 deletions
--- a/Dockerfile.finder
+++ b/Dockerfile.finder
@ -5,6 +5,10 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
 WORKDIR /app
 COPY finder/pyproject.toml ./
 RUN uv pip install --system -r pyproject.toml
 RUN playwright install-deps chromium firefox
 RUN playwright install chromium
 RUN camoufox fetch \
    && python -c "from camoufox.pkgman import camoufox_path; p = camoufox_path(download_if_missing=False); print('Camoufox verified at', p)"
 COPY finder/*.py ./
 COPY property-data/arcgis_data.parquet /data/arcgis_data.parquet
--- a/finder/zoopla.py
+++ b/finder/zoopla.py
@ -41,17 +41,23 @@ class TurnstileError(Exception):
 MAX_PAGES_PER_OUTCODE = 10
 # JavaScript to extract listings from the rendered DOM.
-# Finds all detail links, walks up to the card container, and parses
+# Uses data-testid attributes as primary selectors (stable across deployments),
-# price, beds, baths, floor area, address, and tenure from the card text.
+# then falls back to href-based link matching with parent-walking.
 _EXTRACT_LISTINGS_JS = r"""() => {
    const links = Array.from(document.querySelectorAll(
        'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
    ));
    const seen = new Set();
    const results = [];
-    for (const link of links) {
+    // Strategy 1: Use data-testid selectors (post-2025 redesign)
    const listingCards = document.querySelectorAll(
        '[data-testid="regular-listings"] > div, [data-testid="search-content"] li'
    );
    for (const card of listingCards) {
        const link = card.querySelector(
            'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
        );
        if (!link) continue;
        const href = link.href;
        const match = href.match(/\/details\/(\d+)\//);
        if (!match) continue;
@ -60,53 +66,108 @@ _EXTRACT_LISTINGS_JS = r"""() => {
        if (seen.has(id)) continue;
        seen.add(id);
-        // Walk up to the listing card container
+        const text = card.innerText || '';
-        let card = link;
+
-        for (let j = 0; j < 10; j++) {
+        // Try data-testid price element first, then regex
-            card = card.parentElement;
+        const priceEl = card.querySelector('[data-testid="listing-price"]');
-            if (!card) break;
+        const priceText = priceEl ? priceEl.innerText : text;
-            const text = card.innerText || '';
+        const priceMatch = priceText.match(/\u00a3([\d,]+)/);
-            if (text.includes('\u00a3') && (text.includes('bed') || text.includes('sq ft'))) {
+
-                break;
+        // Try address element first, then regex
        const addressEl = card.querySelector('address');
        let address = addressEl ? addressEl.innerText.trim() : '';
        if (!address) {
            const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
            for (const line of lines) {
                if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
                    (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
                    address = line;
                    break;
                }
            }
        }
        if (!card) continue;
        const text = card.innerText || '';
        const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
        const priceMatch = text.match(/\u00a3([\d,]+)/);
        const bedsMatch = text.match(/(\d+)\s*beds?/i);
        const bathsMatch = text.match(/(\d+)\s*baths?/i);
        const recMatch = text.match(/(\d+)\s*reception/i);
-        const areaMatch = text.match(/([\d,]+)\s*sq\s*ft/i);
+        const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
        let address = '';
        for (const line of lines) {
            if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
                (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
                address = line;
                break;
            }
        }
        let tenure = '';
        if (/freehold/i.test(text)) tenure = 'Freehold';
        else if (/leasehold/i.test(text)) tenure = 'Leasehold';
        results.push({
-            id: id,
+            id, url: href.replace(window.location.origin, ''),
            url: href.replace(window.location.origin, ''),
            price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
            beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
            baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
            receptions: recMatch ? parseInt(recMatch[1]) : null,
            floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
-            address: address,
+            address, tenure,
            tenure: tenure,
        });
    }
    // Strategy 2: Fall back to href-based link matching with parent-walking
    if (results.length === 0) {
        const links = Array.from(document.querySelectorAll(
            'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
        ));
        for (const link of links) {
            const href = link.href;
            const match = href.match(/\/details\/(\d+)\//);
            if (!match) continue;
            const id = match[1];
            if (seen.has(id)) continue;
            seen.add(id);
            let card = link;
            for (let j = 0; j < 15; j++) {
                card = card.parentElement;
                if (!card) break;
                const t = card.innerText || '';
                if (t.includes('\u00a3') && (t.includes('bed') || t.includes('Bath') || t.includes('sq ft'))) {
                    break;
                }
            }
            if (!card) continue;
            const text = card.innerText || '';
            const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
            const priceMatch = text.match(/\u00a3([\d,]+)/);
            const bedsMatch = text.match(/(\d+)\s*beds?/i);
            const bathsMatch = text.match(/(\d+)\s*baths?/i);
            const recMatch = text.match(/(\d+)\s*reception/i);
            const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
            let address = '';
            for (const line of lines) {
                if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
                    (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
                    address = line;
                    break;
                }
            }
            let tenure = '';
            if (/freehold/i.test(text)) tenure = 'Freehold';
            else if (/leasehold/i.test(text)) tenure = 'Leasehold';
            results.push({
                id, url: href.replace(window.location.origin, ''),
                price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
                beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
                baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
                receptions: recMatch ? parseInt(recMatch[1]) : null,
                floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
                address, tenure,
            });
        }
    }
    return results;
 }"""
@ -255,18 +316,44 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
    else:
        search_input.press("Enter")
-    # Wait for results to load
+    # Wait for results to load — try waiting for listings container, fall back to fixed wait
-    time.sleep(6)
+    try:
        page.wait_for_selector(
            '[data-testid="regular-listings"], a[href*="/details/"]',
            timeout=10000,
        )
    except Exception:
        time.sleep(4)
    _ensure_not_challenged(page)
    return True
 def _get_result_count(page) -> int:
-    """Extract the total results count from the page body text."""
+    """Extract the total results count from the page.
    Tries __ZAD_TARGETING__ JSON first (most reliable), then body text regex
    matching both "N results" and "N properties" patterns."""
    try:
        # Try the ZAD targeting JSON script tag first
        count = page.evaluate("""() => {
            const s = document.querySelector('#__ZAD_TARGETING__');
            if (s) {
                try {
                    const d = JSON.parse(s.textContent);
                    if (d.search_results_count != null) return d.search_results_count;
                } catch(e) {}
            }
            return null;
        }""")
        if count is not None and count > 0:
            return count
    except Exception:
        pass
    try:
        body = page.inner_text("body")
-        match = re.search(r"([\d,]+)\s+results?", body)
+        match = re.search(r"([\d,]+)\s+(?:results?|properties)", body)
        if match:
            return int(match.group(1).replace(",", ""))
    except Exception:
@ -279,10 +366,42 @@ def _get_result_count(page) -> int:
 # ---------------------------------------------------------------------------
 _first_extraction_logged = False
 def _extract_listings(page) -> list[dict]:
    """Extract listing data from the current search results page DOM."""
    global _first_extraction_logged
    try:
-        return page.evaluate(_EXTRACT_LISTINGS_JS)
+        listings = page.evaluate(_EXTRACT_LISTINGS_JS)
        # Log diagnostic info on the very first extraction attempt
        if not _first_extraction_logged:
            _first_extraction_logged = True
            try:
                diag = page.evaluate("""() => {
                    const details = document.querySelectorAll('a[href*="/details/"]');
                    const testids = document.querySelectorAll('[data-testid]');
                    const testidNames = [...new Set([...testids].map(e => e.dataset.testid))];
                    return {
                        url: location.href,
                        title: document.title,
                        detailLinks: details.length,
                        testids: testidNames.slice(0, 30),
                        bodySnippet: document.body?.innerText?.slice(0, 500) || '',
                    };
                }""")
                log.info(
                    "Zoopla first-page diagnostic: url=%s title=%s detailLinks=%d "
                    "testids=%s bodySnippet=%.200s",
                    diag.get("url"), diag.get("title"), diag.get("detailLinks", 0),
                    diag.get("testids", []), diag.get("bodySnippet", ""),
                )
            except Exception:
                pass
            log.info("Zoopla first extraction: %d listings found", len(listings))
        return listings
    except Exception as e:
        log.warning("Failed to extract listings from DOM: %s", e)
        zoopla_errors_total.labels(type="extract_failed").inc()
@ -502,19 +621,40 @@ def search_outcode(
        return []
    total_results = _get_result_count(page)
    if total_results == 0:
        return []
-    raw_listings = _paginate(page, total_results, channel)
+    # Always try extraction even if result count is 0 — the count regex may
    # not match Zoopla's current text format, but listings may still be in DOM
    raw_listings = _paginate(page, max(total_results, 25), channel)
    if not raw_listings:
        if total_results > 0:
            log.debug(
                "Zoopla %s %s: page claims %d results but extraction found 0 — "
                "DOM selectors may need updating",
                outcode, channel, total_results,
            )
        return []
    channel_label = "buy" if channel == "BUY" else "rent"
    properties = []
    dropped = 0
    for raw in raw_listings:
        transformed = transform_property(raw, channel, pc_index, pc_coords)
        if transformed:
            properties.append(transformed)
            zoopla_properties_scraped.labels(channel=channel_label).inc()
        else:
            dropped += 1
    if dropped and not properties:
        log.debug(
            "Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
            "(no price/postcode/coords)",
            outcode, channel, len(raw_listings), dropped,
        )
    elif dropped > len(raw_listings) // 2:
        log.debug(
            "Zoopla %s %s: %d/%d listings dropped in transform",
            outcode, channel, dropped, len(raw_listings),
        )
    return properties