diff --git a/Dockerfile.finder b/Dockerfile.finder index 52ef745..c0ab6d6 100644 --- a/Dockerfile.finder +++ b/Dockerfile.finder @@ -5,6 +5,10 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv WORKDIR /app COPY finder/pyproject.toml ./ RUN uv pip install --system -r pyproject.toml +RUN playwright install-deps chromium firefox +RUN playwright install chromium +RUN camoufox fetch \ + && python -c "from camoufox.pkgman import camoufox_path; p = camoufox_path(download_if_missing=False); print('Camoufox verified at', p)" COPY finder/*.py ./ COPY property-data/arcgis_data.parquet /data/arcgis_data.parquet diff --git a/finder/zoopla.py b/finder/zoopla.py index ecd8a4b..052794f 100644 --- a/finder/zoopla.py +++ b/finder/zoopla.py @@ -41,17 +41,23 @@ class TurnstileError(Exception): MAX_PAGES_PER_OUTCODE = 10 # JavaScript to extract listings from the rendered DOM. -# Finds all detail links, walks up to the card container, and parses -# price, beds, baths, floor area, address, and tenure from the card text. +# Uses data-testid attributes as primary selectors (stable across deployments), +# then falls back to href-based link matching with parent-walking. _EXTRACT_LISTINGS_JS = r"""() => { - const links = Array.from(document.querySelectorAll( - 'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]' - )); - const seen = new Set(); const results = []; - for (const link of links) { + // Strategy 1: Use data-testid selectors (post-2025 redesign) + const listingCards = document.querySelectorAll( + '[data-testid="regular-listings"] > div, [data-testid="search-content"] li' + ); + + for (const card of listingCards) { + const link = card.querySelector( + 'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]' + ); + if (!link) continue; + const href = link.href; const match = href.match(/\/details\/(\d+)\//); if (!match) continue; @@ -60,53 +66,108 @@ _EXTRACT_LISTINGS_JS = r"""() => { if (seen.has(id)) continue; seen.add(id); - // Walk up to the listing card container - let card = link; - for (let j = 0; j < 10; j++) { - card = card.parentElement; - if (!card) break; - const text = card.innerText || ''; - if (text.includes('\u00a3') && (text.includes('bed') || text.includes('sq ft'))) { - break; + const text = card.innerText || ''; + + // Try data-testid price element first, then regex + const priceEl = card.querySelector('[data-testid="listing-price"]'); + const priceText = priceEl ? priceEl.innerText : text; + const priceMatch = priceText.match(/\u00a3([\d,]+)/); + + // Try address element first, then regex + const addressEl = card.querySelector('address'); + let address = addressEl ? addressEl.innerText.trim() : ''; + + if (!address) { + const lines = text.split('\n').map(l => l.trim()).filter(Boolean); + for (const line of lines) { + if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) || + (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) { + address = line; + break; + } } } - if (!card) continue; - const text = card.innerText || ''; - const lines = text.split('\n').map(l => l.trim()).filter(Boolean); - - const priceMatch = text.match(/\u00a3([\d,]+)/); const bedsMatch = text.match(/(\d+)\s*beds?/i); const bathsMatch = text.match(/(\d+)\s*baths?/i); const recMatch = text.match(/(\d+)\s*reception/i); - const areaMatch = text.match(/([\d,]+)\s*sq\s*ft/i); - - let address = ''; - for (const line of lines) { - if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) || - (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) { - address = line; - break; - } - } + const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i); let tenure = ''; if (/freehold/i.test(text)) tenure = 'Freehold'; else if (/leasehold/i.test(text)) tenure = 'Leasehold'; results.push({ - id: id, - url: href.replace(window.location.origin, ''), + id, url: href.replace(window.location.origin, ''), price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null, beds: bedsMatch ? parseInt(bedsMatch[1]) : null, baths: bathsMatch ? parseInt(bathsMatch[1]) : null, receptions: recMatch ? parseInt(recMatch[1]) : null, floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null, - address: address, - tenure: tenure, + address, tenure, }); } + // Strategy 2: Fall back to href-based link matching with parent-walking + if (results.length === 0) { + const links = Array.from(document.querySelectorAll( + 'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]' + )); + + for (const link of links) { + const href = link.href; + const match = href.match(/\/details\/(\d+)\//); + if (!match) continue; + + const id = match[1]; + if (seen.has(id)) continue; + seen.add(id); + + let card = link; + for (let j = 0; j < 15; j++) { + card = card.parentElement; + if (!card) break; + const t = card.innerText || ''; + if (t.includes('\u00a3') && (t.includes('bed') || t.includes('Bath') || t.includes('sq ft'))) { + break; + } + } + if (!card) continue; + + const text = card.innerText || ''; + const lines = text.split('\n').map(l => l.trim()).filter(Boolean); + + const priceMatch = text.match(/\u00a3([\d,]+)/); + const bedsMatch = text.match(/(\d+)\s*beds?/i); + const bathsMatch = text.match(/(\d+)\s*baths?/i); + const recMatch = text.match(/(\d+)\s*reception/i); + const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i); + + let address = ''; + for (const line of lines) { + if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) || + (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) { + address = line; + break; + } + } + + let tenure = ''; + if (/freehold/i.test(text)) tenure = 'Freehold'; + else if (/leasehold/i.test(text)) tenure = 'Leasehold'; + + results.push({ + id, url: href.replace(window.location.origin, ''), + price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null, + beds: bedsMatch ? parseInt(bedsMatch[1]) : null, + baths: bathsMatch ? parseInt(bathsMatch[1]) : null, + receptions: recMatch ? parseInt(recMatch[1]) : null, + floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null, + address, tenure, + }); + } + } + return results; }""" @@ -255,18 +316,44 @@ def _navigate_search(page, outcode: str, channel: str) -> bool: else: search_input.press("Enter") - # Wait for results to load - time.sleep(6) + # Wait for results to load — try waiting for listings container, fall back to fixed wait + try: + page.wait_for_selector( + '[data-testid="regular-listings"], a[href*="/details/"]', + timeout=10000, + ) + except Exception: + time.sleep(4) _ensure_not_challenged(page) return True def _get_result_count(page) -> int: - """Extract the total results count from the page body text.""" + """Extract the total results count from the page. + + Tries __ZAD_TARGETING__ JSON first (most reliable), then body text regex + matching both "N results" and "N properties" patterns.""" + try: + # Try the ZAD targeting JSON script tag first + count = page.evaluate("""() => { + const s = document.querySelector('#__ZAD_TARGETING__'); + if (s) { + try { + const d = JSON.parse(s.textContent); + if (d.search_results_count != null) return d.search_results_count; + } catch(e) {} + } + return null; + }""") + if count is not None and count > 0: + return count + except Exception: + pass + try: body = page.inner_text("body") - match = re.search(r"([\d,]+)\s+results?", body) + match = re.search(r"([\d,]+)\s+(?:results?|properties)", body) if match: return int(match.group(1).replace(",", "")) except Exception: @@ -279,10 +366,42 @@ def _get_result_count(page) -> int: # --------------------------------------------------------------------------- +_first_extraction_logged = False + + def _extract_listings(page) -> list[dict]: """Extract listing data from the current search results page DOM.""" + global _first_extraction_logged try: - return page.evaluate(_EXTRACT_LISTINGS_JS) + listings = page.evaluate(_EXTRACT_LISTINGS_JS) + + # Log diagnostic info on the very first extraction attempt + if not _first_extraction_logged: + _first_extraction_logged = True + try: + diag = page.evaluate("""() => { + const details = document.querySelectorAll('a[href*="/details/"]'); + const testids = document.querySelectorAll('[data-testid]'); + const testidNames = [...new Set([...testids].map(e => e.dataset.testid))]; + return { + url: location.href, + title: document.title, + detailLinks: details.length, + testids: testidNames.slice(0, 30), + bodySnippet: document.body?.innerText?.slice(0, 500) || '', + }; + }""") + log.info( + "Zoopla first-page diagnostic: url=%s title=%s detailLinks=%d " + "testids=%s bodySnippet=%.200s", + diag.get("url"), diag.get("title"), diag.get("detailLinks", 0), + diag.get("testids", []), diag.get("bodySnippet", ""), + ) + except Exception: + pass + log.info("Zoopla first extraction: %d listings found", len(listings)) + + return listings except Exception as e: log.warning("Failed to extract listings from DOM: %s", e) zoopla_errors_total.labels(type="extract_failed").inc() @@ -502,19 +621,40 @@ def search_outcode( return [] total_results = _get_result_count(page) - if total_results == 0: - return [] - raw_listings = _paginate(page, total_results, channel) + # Always try extraction even if result count is 0 — the count regex may + # not match Zoopla's current text format, but listings may still be in DOM + raw_listings = _paginate(page, max(total_results, 25), channel) if not raw_listings: + if total_results > 0: + log.debug( + "Zoopla %s %s: page claims %d results but extraction found 0 — " + "DOM selectors may need updating", + outcode, channel, total_results, + ) return [] channel_label = "buy" if channel == "BUY" else "rent" properties = [] + dropped = 0 for raw in raw_listings: transformed = transform_property(raw, channel, pc_index, pc_coords) if transformed: properties.append(transformed) zoopla_properties_scraped.labels(channel=channel_label).inc() + else: + dropped += 1 + + if dropped and not properties: + log.debug( + "Zoopla %s %s: extracted %d raw listings but all %d dropped in transform " + "(no price/postcode/coords)", + outcode, channel, len(raw_listings), dropped, + ) + elif dropped > len(raw_listings) // 2: + log.debug( + "Zoopla %s %s: %d/%d listings dropped in transform", + outcode, channel, dropped, len(raw_listings), + ) return properties