This commit is contained in:
Andras Schmelczer 2026-05-17 10:16:30 +01:00
parent 47d89f6fad
commit 017902b8e6
82 changed files with 331466 additions and 54841 deletions

View file

@ -1,4 +1,4 @@
"""Zoopla (zoopla.co.uk) scraper — buy and rental properties.
"""Zoopla (zoopla.co.uk) scraper — sale properties.
Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
blocks all HTTP clients (curl_cffi, httpx) and even Playwright with stealth
@ -6,18 +6,14 @@ patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
Zoopla uses Next.js App Router with React Server Components (RSC). Search
result data is server-rendered in an RSC stream, not available via
__NEXT_DATA__ or a JSON API. URL-based location slugs return 0 results —
the working flow requires typing into the autocomplete input, selecting a
suggestion, and clicking Search.
__NEXT_DATA__ or a JSON API.
Architecture:
Unlike the other scrapers which use HTTP clients per outcode, Zoopla keeps
a single Camoufox browser alive for the entire scrape. For each outcode, it:
1. Clears and types the outcode into the search input
2. Selects the first autocomplete suggestion
3. Clicks Search
4. Extracts listing data from the rendered DOM
5. Handles pagination via ?pn=N parameter
1. Navigates directly to the sale search URL
2. Extracts listing data from the rendered DOM
3. Handles pagination via ?pn=N parameter
The browser session replaces the cookie/client pattern used by other scrapers.
"""
@ -27,7 +23,6 @@ import re
import time
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
from spatial import PostcodeSpatialIndex
from transform import normalize_sub_type, validate_floor_area
@ -38,6 +33,25 @@ class TurnstileError(Exception):
"""Raised when Cloudflare Turnstile challenge cannot be passed."""
class _ManagedCamoufoxBrowser:
def __init__(self, context_manager, browser):
self._context_manager = context_manager
self._browser = browser
self._closed = False
def close(self) -> None:
if self._closed:
return
self._closed = True
try:
self._browser.close()
finally:
self._context_manager.__exit__(None, None, None)
def __getattr__(self, name):
return getattr(self._browser, name)
# Maximum search result pages to scrape per outcode (25 listings/page)
MAX_PAGES_PER_OUTCODE = 40
@ -55,7 +69,7 @@ _EXTRACT_LISTINGS_JS = r"""() => {
for (const card of listingCards) {
const link = card.querySelector(
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]'
);
if (!link) continue;
@ -100,9 +114,9 @@ _EXTRACT_LISTINGS_JS = r"""() => {
// Extract property type (e.g., "2 bed flat for sale" "flat")
let property_type = '';
const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+for\s+sale/i);
if (ptMatch) property_type = ptMatch[1].trim();
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
else if (/\bstudio\s*(?:flat|apartment)?\s+for\s+sale/i.test(text)) property_type = 'Studio';
// Keyword fallback when regex doesn't match current DOM format
if (!property_type) {
@ -135,7 +149,7 @@ _EXTRACT_LISTINGS_JS = r"""() => {
// Strategy 2: Fall back to href-based link matching with parent-walking
if (results.length === 0) {
const links = Array.from(document.querySelectorAll(
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]'
));
for (const link of links) {
@ -184,9 +198,9 @@ _EXTRACT_LISTINGS_JS = r"""() => {
// Extract property type
let property_type = '';
const ptMatch2 = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
const ptMatch2 = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+for\s+sale/i);
if (ptMatch2) property_type = ptMatch2[1].trim();
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
else if (/\bstudio\s*(?:flat|apartment)?\s+for\s+sale/i.test(text)) property_type = 'Studio';
// Keyword fallback when regex doesn't match current DOM format
if (!property_type) {
@ -243,17 +257,20 @@ def launch_browser():
"""Launch Camoufox, navigate to Zoopla homepage, pass Cloudflare Turnstile,
and dismiss cookie consent. Returns (browser, page) tuple.
Raises TurnstileError if Cloudflare cannot be passed within 60 seconds.
Raises TurnstileError if Cloudflare cannot be passed within two minutes.
Caller must close browser when done."""
from camoufox.pkgman import camoufox_path
# Verify camoufox is pre-installed — never download at runtime
camoufox_path(download_if_missing=False)
# Standalone local runs should not require the old container image to have
# pre-fetched Camoufox.
camoufox_path(download_if_missing=True)
from camoufox.sync_api import Camoufox
log.info("Launching Camoufox browser for Zoopla...")
browser = Camoufox(headless=True).__enter__()
camoufox = Camoufox(headless=True)
raw_browser = camoufox.__enter__()
browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
page = browser.new_page()
log.info("Navigating to Zoopla homepage...")
@ -261,7 +278,7 @@ def launch_browser():
# Wait for Cloudflare Turnstile to resolve.
# Try clicking the Turnstile checkbox if present (helps in some cases).
for i in range(20):
for i in range(40):
if "Just a moment" not in page.title():
break
# Attempt to click the Turnstile checkbox in the challenge iframe
@ -280,7 +297,7 @@ def launch_browser():
else:
page.close()
browser.close()
raise TurnstileError("Cloudflare Turnstile did not resolve after 60s")
raise TurnstileError("Cloudflare Turnstile did not resolve after 120s")
log.info("Cloudflare passed — title: %s", page.title())
time.sleep(2)
@ -298,13 +315,13 @@ def _ensure_not_challenged(page) -> None:
return
log.warning("Cloudflare challenge detected mid-session, waiting...")
for i in range(20):
for i in range(40):
time.sleep(3)
if "Just a moment" not in page.title():
log.info("Cloudflare challenge resolved")
return
raise TurnstileError("Cloudflare re-challenge did not resolve")
raise TurnstileError("Cloudflare re-challenge did not resolve after 120s")
# ---------------------------------------------------------------------------
@ -312,21 +329,8 @@ def _ensure_not_challenged(page) -> None:
# ---------------------------------------------------------------------------
def _navigate_direct(page, url: str) -> bool:
"""Navigate directly to a Zoopla search URL (skipping the homepage flow).
Used to load the second channel (e.g., RENT after BUY) for the same outcode
by swapping the path component. Falls back gracefully — returns False if
the page has no listings, so the caller can retry via the full search flow.
"""
try:
page.goto(url, wait_until="domcontentloaded", timeout=30000)
except Exception as e:
log.debug("Direct navigation failed: %s", e)
return False
_ensure_not_challenged(page)
# Wait for listing content to hydrate
def _wait_for_listing_content(page) -> None:
"""Wait for rendered listing cards to contain usable text."""
try:
page.wait_for_function(
"""() => {
@ -343,100 +347,42 @@ def _navigate_direct(page, url: str) -> bool:
timeout=8000,
)
except Exception:
# Check if the page has any listings at all
has_listings = page.query_selector('a[href*="/details/"]')
if not has_listings:
return False
time.sleep(1.5)
return True
def _navigate_search(page, outcode: str, channel: str) -> bool:
"""Navigate to search results for an outcode via the homepage search flow.
def _navigate_search(page, outcode: str) -> bool:
"""Navigate directly to sale search results for an outcode.
Returns True if results were found, False if no results or navigation failed.
Raises TurnstileError if Cloudflare blocks us."""
# Navigate to homepage to reset search state
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=30000)
time.sleep(0.5)
url = (
f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/"
f"?q={outcode}&search_source=home"
)
try:
page.goto(url, wait_until="domcontentloaded", timeout=30000)
except Exception as exc:
log.debug("Zoopla direct navigation failed for %s: %s", outcode, exc)
return False
_ensure_not_challenged(page)
# Dismiss cookie consent (may reappear after navigation)
page.evaluate(_DISMISS_COOKIES_JS)
time.sleep(0.3)
try:
page.evaluate(_DISMISS_COOKIES_JS)
except Exception:
pass
# Select Buy/Rent tab
if channel == "RENT":
rent_tab = page.query_selector(
'button:has-text("Rent"), [role="tab"]:has-text("Rent")'
)
if rent_tab:
rent_tab.click()
time.sleep(0.2)
# Find and fill search input
search_input = page.query_selector(
'input[name="autosuggest-input"]'
) or page.query_selector('input[type="text"]')
if not search_input:
log.warning("Could not find search input on homepage")
return False
search_input.click()
time.sleep(0.1)
search_input.fill("")
search_input.type(outcode, delay=60)
time.sleep(1.2)
# Select first autocomplete suggestion
first_option = page.query_selector('[role="option"]')
if not first_option:
log.debug("No autocomplete suggestions for outcode %s", outcode)
return False
first_option.click()
time.sleep(0.2)
# Click search button
search_btn = page.query_selector('button:has-text("Search")')
if search_btn:
search_btn.click()
else:
search_input.press("Enter")
# Wait for results to load — try waiting for listings container, fall back to fixed wait
try:
page.wait_for_selector(
'[data-testid="regular-listings"], a[href*="/details/"]',
'[data-testid="regular-listings"], a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]',
timeout=10000,
)
except Exception:
time.sleep(4)
_ensure_not_challenged(page)
if not page.query_selector('a[href*="/details/"]'):
return False
# Wait for client-side hydration to populate listing content (prices, addresses).
# The structural container appears in server-rendered HTML before React hydrates
# the actual card content — extracting too early yields empty price/address fields.
try:
page.wait_for_function(
"""() => {
const cards = document.querySelectorAll(
'[data-testid="regular-listings"] > div'
);
if (cards.length === 0) return false;
for (const card of cards) {
const t = card.innerText || '';
if (t.includes('\\u00a3') && t.length > 50) return true;
}
return false;
}""",
timeout=8000,
)
except Exception:
# Content never appeared — extraction will likely fail but let it try
log.debug("Listing content hydration wait timed out — prices may not have rendered")
time.sleep(2)
_wait_for_listing_content(page)
return True
@ -516,18 +462,21 @@ def _extract_listings(page) -> list[dict]:
return listings
except Exception as e:
log.warning("Failed to extract listings from DOM: %s", e)
zoopla_errors_total.labels(type="extract_failed").inc()
return []
def _paginate(page, total_results: int, channel: str) -> list[dict]:
def _paginate(
page,
total_results: int,
max_properties: int | None = None,
) -> list[dict]:
"""Extract listings from all pages of search results.
Page 1 is already loaded. For subsequent pages, clicks the Next button
or navigates via URL parameter ?pn=N."""
all_listings = _extract_listings(page)
channel_label = "buy" if channel == "BUY" else "rent"
zoopla_pages_scraped.labels(channel=channel_label).inc()
if max_properties is not None and len(all_listings) >= max_properties:
return all_listings[:max_properties]
if not all_listings or total_results <= len(all_listings):
return all_listings
@ -550,24 +499,7 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
try:
page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
_ensure_not_challenged(page)
# Wait for listing content instead of fixed sleep
try:
page.wait_for_function(
"""() => {
const cards = document.querySelectorAll(
'[data-testid="regular-listings"] > div'
);
if (cards.length === 0) return false;
for (const card of cards) {
const t = card.innerText || '';
if (t.includes('\\u00a3') && t.length > 50) return true;
}
return false;
}""",
timeout=8000,
)
except Exception:
time.sleep(1.5)
_wait_for_listing_content(page)
except TurnstileError:
raise
except Exception as e:
@ -585,8 +517,8 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
seen_ids.add(listing["id"])
all_listings.append(listing)
new_count += 1
zoopla_pages_scraped.labels(channel=channel_label).inc()
if max_properties is not None and len(all_listings) >= max_properties:
return all_listings[:max_properties]
if new_count == 0:
break # No new listings on this page
@ -692,31 +624,8 @@ def _map_property_type(raw_type: str | None) -> str:
return "Other"
def _detect_rent_frequency(price_text: str) -> str:
"""Detect rent frequency from Zoopla price text.
Zoopla price elements contain text like '£1,500 pcm', '£350 pw',
'£18,000 pa'. Defaults to 'monthly' if no frequency indicator found.
Checks monthly indicators (pcm) BEFORE weekly (pw) because Zoopla cards
often display both monthly and weekly prices in the same text. When the
JS extraction falls back to full card text, checking pcm first ensures
the captured monthly price gets the correct frequency label.
"""
lower = price_text.lower()
if "pcm" in lower or "per month" in lower or "per calendar month" in lower:
return "monthly"
if "pw" in lower or "per week" in lower or "/w" in lower:
return "weekly"
if "pa" in lower or "per annum" in lower or "/y" in lower or "per year" in lower:
return "yearly"
# No indicator — default monthly (Zoopla standard)
return "monthly"
def transform_property(
raw: dict,
channel: str,
pc_index: PostcodeSpatialIndex,
pc_coords: dict[str, tuple[float, float]],
search_outcode: str | None = None,
@ -783,13 +692,6 @@ def transform_property(
if listing_url and not listing_url.startswith("http"):
listing_url = ZOOPLA_BASE + listing_url
# Detect rent frequency from price text (e.g. "£1,500 pcm" vs "£350 pw")
if channel == "BUY":
frequency = ""
else:
price_text = raw.get("price_text", "")
frequency = _detect_rent_frequency(price_text)
return {
"id": f"zp_{listing_id}",
"Bedrooms": bedrooms,
@ -803,7 +705,7 @@ def transform_property(
"Property type": _map_property_type(raw.get("property_type")),
"Property sub-type": normalize_sub_type(raw.get("property_type")),
"price": int(price),
"price_frequency": frequency,
"price_frequency": "",
"Price qualifier": "",
"Total floor area (sqm)": floor_area_sqm,
"Listing URL": listing_url,
@ -820,10 +722,9 @@ def transform_property(
def search_outcode(
page,
outcode: str,
channel: str,
pc_index: PostcodeSpatialIndex,
pc_coords: dict[str, tuple[float, float]],
base_search_url: str | None = None,
max_properties: int | None = None,
) -> tuple[list[dict], str | None]:
"""Search Zoopla for properties in one outcode.
@ -831,47 +732,37 @@ def search_outcode(
search flow, extracts listings from rendered DOM, and transforms to the
standard output schema.
If base_search_url is provided (from a previous channel search for the same
outcode), tries direct URL navigation first — skipping the slow homepage
search flow. Falls back to full navigation if direct fails.
Returns (properties, search_url) where search_url can be passed to the next
channel call for this outcode.
Returns (properties, search_url).
Raises TurnstileError if Cloudflare blocks us mid-session.
"""
navigated = False
if base_search_url:
navigated = _navigate_direct(page, base_search_url)
if navigated:
log.debug("Zoopla %s %s: used direct URL navigation", outcode, channel)
if not navigated:
if not _navigate_search(page, outcode, channel):
return [], None
if not _navigate_search(page, outcode):
return [], None
total_results = _get_result_count(page)
# Always try extraction even if result count is 0 — the count regex may
# not match Zoopla's current text format, but listings may still be in DOM
raw_listings = _paginate(page, max(total_results, 25), channel)
raw_listings = _paginate(
page,
max(total_results, 25),
max_properties=max_properties,
)
if not raw_listings:
if total_results > 0:
log.debug(
"Zoopla %s %s: page claims %d results but extraction found 0 — "
"DOM selectors may need updating",
outcode, channel, total_results,
outcode, "BUY", total_results,
)
return [], None
channel_label = "buy" if channel == "BUY" else "rent"
properties = []
dropped = 0
for raw in raw_listings:
transformed = transform_property(raw, channel, pc_index, pc_coords, search_outcode=outcode)
transformed = transform_property(raw, pc_index, pc_coords, search_outcode=outcode)
if transformed:
properties.append(transformed)
zoopla_properties_scraped.labels(channel=channel_label).inc()
else:
dropped += 1
@ -881,13 +772,13 @@ def search_outcode(
log.debug(
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
"(no price/postcode/coords). Sample raw: price=%s address=%r",
outcode, channel, len(raw_listings), dropped,
outcode, "BUY", len(raw_listings), dropped,
sample.get("price"), sample.get("address", ""),
)
elif dropped > len(raw_listings) // 2:
log.debug(
"Zoopla %s %s: %d/%d listings dropped in transform",
outcode, channel, dropped, len(raw_listings),
outcode, "BUY", dropped, len(raw_listings),
)
return properties, page.url