all good
This commit is contained in:
parent
47d89f6fad
commit
017902b8e6
82 changed files with 331466 additions and 54841 deletions
283
finder/zoopla.py
283
finder/zoopla.py
|
|
@ -1,4 +1,4 @@
|
|||
"""Zoopla (zoopla.co.uk) scraper — buy and rental properties.
|
||||
"""Zoopla (zoopla.co.uk) scraper — sale properties.
|
||||
|
||||
Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
|
||||
blocks all HTTP clients (curl_cffi, httpx) and even Playwright with stealth
|
||||
|
|
@ -6,18 +6,14 @@ patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
|
|||
|
||||
Zoopla uses Next.js App Router with React Server Components (RSC). Search
|
||||
result data is server-rendered in an RSC stream, not available via
|
||||
__NEXT_DATA__ or a JSON API. URL-based location slugs return 0 results —
|
||||
the working flow requires typing into the autocomplete input, selecting a
|
||||
suggestion, and clicking Search.
|
||||
__NEXT_DATA__ or a JSON API.
|
||||
|
||||
Architecture:
|
||||
Unlike the other scrapers which use HTTP clients per outcode, Zoopla keeps
|
||||
a single Camoufox browser alive for the entire scrape. For each outcode, it:
|
||||
1. Clears and types the outcode into the search input
|
||||
2. Selects the first autocomplete suggestion
|
||||
3. Clicks Search
|
||||
4. Extracts listing data from the rendered DOM
|
||||
5. Handles pagination via ?pn=N parameter
|
||||
1. Navigates directly to the sale search URL
|
||||
2. Extracts listing data from the rendered DOM
|
||||
3. Handles pagination via ?pn=N parameter
|
||||
|
||||
The browser session replaces the cookie/client pattern used by other scrapers.
|
||||
"""
|
||||
|
|
@ -27,7 +23,6 @@ import re
|
|||
import time
|
||||
|
||||
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import normalize_sub_type, validate_floor_area
|
||||
|
||||
|
|
@ -38,6 +33,25 @@ class TurnstileError(Exception):
|
|||
"""Raised when Cloudflare Turnstile challenge cannot be passed."""
|
||||
|
||||
|
||||
class _ManagedCamoufoxBrowser:
    """Browser proxy that also owns the context manager which created it.

    The wrapper is built from a context manager whose ``__enter__`` was
    called by hand, together with the browser object that call returned.
    ``close()`` shuts the browser and then exits that context manager, so
    callers only need to keep one object around. Any attribute the wrapper
    does not define itself is forwarded to the wrapped browser.
    """

    # Names that belong to the wrapper itself and are never forwarded.
    _OWN_ATTRS = frozenset({"_context_manager", "_browser", "_closed"})

    def __init__(self, context_manager, browser):
        """Store the entered context manager and the browser it produced."""
        self._context_manager = context_manager
        self._browser = browser
        self._closed = False

    def close(self) -> None:
        """Close the browser and exit its context manager (idempotent).

        The context manager is exited even if ``browser.close()`` raises;
        the exception from ``browser.close()`` still propagates.
        """
        if self._closed:
            return
        # Flip the flag first so a failing close is not retried later.
        self._closed = True
        try:
            self._browser.close()
        finally:
            self._context_manager.__exit__(None, None, None)

    def __enter__(self):
        """Return the wrapper so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close on leaving a ``with`` block; exceptions are not suppressed."""
        self.close()

    def __getattr__(self, name):
        """Forward unknown attributes to the wrapped browser.

        ``__getattr__`` only runs after normal lookup has failed. If the
        missing name is one of the wrapper's own fields (instance created
        via ``__new__``, ``copy`` or unpickling, so ``__init__`` never ran),
        forwarding would read ``self._browser`` and re-enter this method
        forever. Raise ``AttributeError`` for those names instead.
        """
        if name in self._OWN_ATTRS:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return getattr(self._browser, name)
|
||||
|
||||
|
||||
# Maximum search result pages to scrape per outcode (25 listings/page)
|
||||
MAX_PAGES_PER_OUTCODE = 40
|
||||
|
||||
|
|
@ -55,7 +69,7 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
|
||||
for (const card of listingCards) {
|
||||
const link = card.querySelector(
|
||||
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
|
||||
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]'
|
||||
);
|
||||
if (!link) continue;
|
||||
|
||||
|
|
@ -100,9 +114,9 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
|
||||
// Extract property type (e.g., "2 bed flat for sale" → "flat")
|
||||
let property_type = '';
|
||||
const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
|
||||
const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+for\s+sale/i);
|
||||
if (ptMatch) property_type = ptMatch[1].trim();
|
||||
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
|
||||
else if (/\bstudio\s*(?:flat|apartment)?\s+for\s+sale/i.test(text)) property_type = 'Studio';
|
||||
|
||||
// Keyword fallback when regex doesn't match current DOM format
|
||||
if (!property_type) {
|
||||
|
|
@ -135,7 +149,7 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
// Strategy 2: Fall back to href-based link matching with parent-walking
|
||||
if (results.length === 0) {
|
||||
const links = Array.from(document.querySelectorAll(
|
||||
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
|
||||
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]'
|
||||
));
|
||||
|
||||
for (const link of links) {
|
||||
|
|
@ -184,9 +198,9 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
|
||||
// Extract property type
|
||||
let property_type = '';
|
||||
const ptMatch2 = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
|
||||
const ptMatch2 = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+for\s+sale/i);
|
||||
if (ptMatch2) property_type = ptMatch2[1].trim();
|
||||
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
|
||||
else if (/\bstudio\s*(?:flat|apartment)?\s+for\s+sale/i.test(text)) property_type = 'Studio';
|
||||
|
||||
// Keyword fallback when regex doesn't match current DOM format
|
||||
if (!property_type) {
|
||||
|
|
@ -243,17 +257,20 @@ def launch_browser():
|
|||
"""Launch Camoufox, navigate to Zoopla homepage, pass Cloudflare Turnstile,
|
||||
and dismiss cookie consent. Returns (browser, page) tuple.
|
||||
|
||||
Raises TurnstileError if Cloudflare cannot be passed within 60 seconds.
|
||||
Raises TurnstileError if Cloudflare cannot be passed within two minutes.
|
||||
Caller must close browser when done."""
|
||||
from camoufox.pkgman import camoufox_path
|
||||
|
||||
# Verify camoufox is pre-installed — never download at runtime
|
||||
camoufox_path(download_if_missing=False)
|
||||
# Standalone local runs should not require the old container image to have
|
||||
# pre-fetched Camoufox.
|
||||
camoufox_path(download_if_missing=True)
|
||||
|
||||
from camoufox.sync_api import Camoufox
|
||||
|
||||
log.info("Launching Camoufox browser for Zoopla...")
|
||||
browser = Camoufox(headless=True).__enter__()
|
||||
camoufox = Camoufox(headless=True)
|
||||
raw_browser = camoufox.__enter__()
|
||||
browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
|
||||
page = browser.new_page()
|
||||
|
||||
log.info("Navigating to Zoopla homepage...")
|
||||
|
|
@ -261,7 +278,7 @@ def launch_browser():
|
|||
|
||||
# Wait for Cloudflare Turnstile to resolve.
|
||||
# Try clicking the Turnstile checkbox if present (helps in some cases).
|
||||
for i in range(20):
|
||||
for i in range(40):
|
||||
if "Just a moment" not in page.title():
|
||||
break
|
||||
# Attempt to click the Turnstile checkbox in the challenge iframe
|
||||
|
|
@ -280,7 +297,7 @@ def launch_browser():
|
|||
else:
|
||||
page.close()
|
||||
browser.close()
|
||||
raise TurnstileError("Cloudflare Turnstile did not resolve after 60s")
|
||||
raise TurnstileError("Cloudflare Turnstile did not resolve after 120s")
|
||||
|
||||
log.info("Cloudflare passed — title: %s", page.title())
|
||||
time.sleep(2)
|
||||
|
|
@ -298,13 +315,13 @@ def _ensure_not_challenged(page) -> None:
|
|||
return
|
||||
|
||||
log.warning("Cloudflare challenge detected mid-session, waiting...")
|
||||
for i in range(20):
|
||||
for i in range(40):
|
||||
time.sleep(3)
|
||||
if "Just a moment" not in page.title():
|
||||
log.info("Cloudflare challenge resolved")
|
||||
return
|
||||
|
||||
raise TurnstileError("Cloudflare re-challenge did not resolve")
|
||||
raise TurnstileError("Cloudflare re-challenge did not resolve after 120s")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -312,21 +329,8 @@ def _ensure_not_challenged(page) -> None:
|
|||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _navigate_direct(page, url: str) -> bool:
|
||||
"""Navigate directly to a Zoopla search URL (skipping the homepage flow).
|
||||
|
||||
Used to load the second channel (e.g., RENT after BUY) for the same outcode
|
||||
by swapping the path component. Falls back gracefully — returns False if
|
||||
the page has no listings, so the caller can retry via the full search flow.
|
||||
"""
|
||||
try:
|
||||
page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||||
except Exception as e:
|
||||
log.debug("Direct navigation failed: %s", e)
|
||||
return False
|
||||
_ensure_not_challenged(page)
|
||||
|
||||
# Wait for listing content to hydrate
|
||||
def _wait_for_listing_content(page) -> None:
|
||||
"""Wait for rendered listing cards to contain usable text."""
|
||||
try:
|
||||
page.wait_for_function(
|
||||
"""() => {
|
||||
|
|
@ -343,100 +347,42 @@ def _navigate_direct(page, url: str) -> bool:
|
|||
timeout=8000,
|
||||
)
|
||||
except Exception:
|
||||
# Check if the page has any listings at all
|
||||
has_listings = page.query_selector('a[href*="/details/"]')
|
||||
if not has_listings:
|
||||
return False
|
||||
time.sleep(1.5)
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def _navigate_search(page, outcode: str, channel: str) -> bool:
|
||||
"""Navigate to search results for an outcode via the homepage search flow.
|
||||
def _navigate_search(page, outcode: str) -> bool:
|
||||
"""Navigate directly to sale search results for an outcode.
|
||||
|
||||
Returns True if results were found, False if no results or navigation failed.
|
||||
Raises TurnstileError if Cloudflare blocks us."""
|
||||
# Navigate to homepage to reset search state
|
||||
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=30000)
|
||||
time.sleep(0.5)
|
||||
url = (
|
||||
f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/"
|
||||
f"?q={outcode}&search_source=home"
|
||||
)
|
||||
try:
|
||||
page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||||
except Exception as exc:
|
||||
log.debug("Zoopla direct navigation failed for %s: %s", outcode, exc)
|
||||
return False
|
||||
|
||||
_ensure_not_challenged(page)
|
||||
|
||||
# Dismiss cookie consent (may reappear after navigation)
|
||||
page.evaluate(_DISMISS_COOKIES_JS)
|
||||
time.sleep(0.3)
|
||||
try:
|
||||
page.evaluate(_DISMISS_COOKIES_JS)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Select Buy/Rent tab
|
||||
if channel == "RENT":
|
||||
rent_tab = page.query_selector(
|
||||
'button:has-text("Rent"), [role="tab"]:has-text("Rent")'
|
||||
)
|
||||
if rent_tab:
|
||||
rent_tab.click()
|
||||
time.sleep(0.2)
|
||||
|
||||
# Find and fill search input
|
||||
search_input = page.query_selector(
|
||||
'input[name="autosuggest-input"]'
|
||||
) or page.query_selector('input[type="text"]')
|
||||
if not search_input:
|
||||
log.warning("Could not find search input on homepage")
|
||||
return False
|
||||
|
||||
search_input.click()
|
||||
time.sleep(0.1)
|
||||
search_input.fill("")
|
||||
search_input.type(outcode, delay=60)
|
||||
time.sleep(1.2)
|
||||
|
||||
# Select first autocomplete suggestion
|
||||
first_option = page.query_selector('[role="option"]')
|
||||
if not first_option:
|
||||
log.debug("No autocomplete suggestions for outcode %s", outcode)
|
||||
return False
|
||||
|
||||
first_option.click()
|
||||
time.sleep(0.2)
|
||||
|
||||
# Click search button
|
||||
search_btn = page.query_selector('button:has-text("Search")')
|
||||
if search_btn:
|
||||
search_btn.click()
|
||||
else:
|
||||
search_input.press("Enter")
|
||||
|
||||
# Wait for results to load — try waiting for listings container, fall back to fixed wait
|
||||
try:
|
||||
page.wait_for_selector(
|
||||
'[data-testid="regular-listings"], a[href*="/details/"]',
|
||||
'[data-testid="regular-listings"], a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]',
|
||||
timeout=10000,
|
||||
)
|
||||
except Exception:
|
||||
time.sleep(4)
|
||||
_ensure_not_challenged(page)
|
||||
if not page.query_selector('a[href*="/details/"]'):
|
||||
return False
|
||||
|
||||
# Wait for client-side hydration to populate listing content (prices, addresses).
|
||||
# The structural container appears in server-rendered HTML before React hydrates
|
||||
# the actual card content — extracting too early yields empty price/address fields.
|
||||
try:
|
||||
page.wait_for_function(
|
||||
"""() => {
|
||||
const cards = document.querySelectorAll(
|
||||
'[data-testid="regular-listings"] > div'
|
||||
);
|
||||
if (cards.length === 0) return false;
|
||||
for (const card of cards) {
|
||||
const t = card.innerText || '';
|
||||
if (t.includes('\\u00a3') && t.length > 50) return true;
|
||||
}
|
||||
return false;
|
||||
}""",
|
||||
timeout=8000,
|
||||
)
|
||||
except Exception:
|
||||
# Content never appeared — extraction will likely fail but let it try
|
||||
log.debug("Listing content hydration wait timed out — prices may not have rendered")
|
||||
time.sleep(2)
|
||||
_wait_for_listing_content(page)
|
||||
|
||||
return True
|
||||
|
||||
|
|
@ -516,18 +462,21 @@ def _extract_listings(page) -> list[dict]:
|
|||
return listings
|
||||
except Exception as e:
|
||||
log.warning("Failed to extract listings from DOM: %s", e)
|
||||
zoopla_errors_total.labels(type="extract_failed").inc()
|
||||
return []
|
||||
|
||||
|
||||
def _paginate(page, total_results: int, channel: str) -> list[dict]:
|
||||
def _paginate(
|
||||
page,
|
||||
total_results: int,
|
||||
max_properties: int | None = None,
|
||||
) -> list[dict]:
|
||||
"""Extract listings from all pages of search results.
|
||||
|
||||
Page 1 is already loaded. For subsequent pages, clicks the Next button
|
||||
or navigates via URL parameter ?pn=N."""
|
||||
all_listings = _extract_listings(page)
|
||||
channel_label = "buy" if channel == "BUY" else "rent"
|
||||
zoopla_pages_scraped.labels(channel=channel_label).inc()
|
||||
if max_properties is not None and len(all_listings) >= max_properties:
|
||||
return all_listings[:max_properties]
|
||||
|
||||
if not all_listings or total_results <= len(all_listings):
|
||||
return all_listings
|
||||
|
|
@ -550,24 +499,7 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
|
|||
try:
|
||||
page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
|
||||
_ensure_not_challenged(page)
|
||||
# Wait for listing content instead of fixed sleep
|
||||
try:
|
||||
page.wait_for_function(
|
||||
"""() => {
|
||||
const cards = document.querySelectorAll(
|
||||
'[data-testid="regular-listings"] > div'
|
||||
);
|
||||
if (cards.length === 0) return false;
|
||||
for (const card of cards) {
|
||||
const t = card.innerText || '';
|
||||
if (t.includes('\\u00a3') && t.length > 50) return true;
|
||||
}
|
||||
return false;
|
||||
}""",
|
||||
timeout=8000,
|
||||
)
|
||||
except Exception:
|
||||
time.sleep(1.5)
|
||||
_wait_for_listing_content(page)
|
||||
except TurnstileError:
|
||||
raise
|
||||
except Exception as e:
|
||||
|
|
@ -585,8 +517,8 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
|
|||
seen_ids.add(listing["id"])
|
||||
all_listings.append(listing)
|
||||
new_count += 1
|
||||
|
||||
zoopla_pages_scraped.labels(channel=channel_label).inc()
|
||||
if max_properties is not None and len(all_listings) >= max_properties:
|
||||
return all_listings[:max_properties]
|
||||
|
||||
if new_count == 0:
|
||||
break # No new listings on this page
|
||||
|
|
@ -692,31 +624,8 @@ def _map_property_type(raw_type: str | None) -> str:
|
|||
return "Other"
|
||||
|
||||
|
||||
def _detect_rent_frequency(price_text: str) -> str:
    """Detect rent frequency from Zoopla price text.

    Zoopla price elements contain text like '£1,500 pcm', '£350 pw',
    '£18,000 pa'. Defaults to 'monthly' if no frequency indicator is found.

    Monthly indicators (pcm) are checked BEFORE weekly (pw) because Zoopla
    cards often display both monthly and weekly prices in the same text.
    When the JS extraction falls back to full card text, checking pcm first
    ensures the captured monthly price gets the correct frequency label.

    The two-letter abbreviations 'pw' and 'pa' only count when they are not
    part of a longer word. A plain substring test would classify any card
    text containing e.g. 'apartment' or 'spacious' as yearly.

    Args:
        price_text: Raw price text (or full card text) for one listing.
            ``None`` is treated as empty text.

    Returns:
        One of 'monthly', 'weekly' or 'yearly'.
    """
    lower = (price_text or "").lower()

    if "pcm" in lower or "per month" in lower or "per calendar month" in lower:
        return "monthly"

    # Letter-only lookarounds (rather than \b) so '£350pw' still matches:
    # a digit directly before the abbreviation is fine, a letter is not.
    if (
        re.search(r"(?<![a-z])pw(?![a-z])", lower)
        or "per week" in lower
        or "/w" in lower
    ):
        return "weekly"

    if (
        re.search(r"(?<![a-z])pa(?![a-z])", lower)
        or "per annum" in lower
        or "/y" in lower
        or "per year" in lower
    ):
        return "yearly"

    # No indicator — default monthly (Zoopla standard)
    return "monthly"
|
||||
|
||||
|
||||
def transform_property(
|
||||
raw: dict,
|
||||
channel: str,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
pc_coords: dict[str, tuple[float, float]],
|
||||
search_outcode: str | None = None,
|
||||
|
|
@ -783,13 +692,6 @@ def transform_property(
|
|||
if listing_url and not listing_url.startswith("http"):
|
||||
listing_url = ZOOPLA_BASE + listing_url
|
||||
|
||||
# Detect rent frequency from price text (e.g. "£1,500 pcm" vs "£350 pw")
|
||||
if channel == "BUY":
|
||||
frequency = ""
|
||||
else:
|
||||
price_text = raw.get("price_text", "")
|
||||
frequency = _detect_rent_frequency(price_text)
|
||||
|
||||
return {
|
||||
"id": f"zp_{listing_id}",
|
||||
"Bedrooms": bedrooms,
|
||||
|
|
@ -803,7 +705,7 @@ def transform_property(
|
|||
"Property type": _map_property_type(raw.get("property_type")),
|
||||
"Property sub-type": normalize_sub_type(raw.get("property_type")),
|
||||
"price": int(price),
|
||||
"price_frequency": frequency,
|
||||
"price_frequency": "",
|
||||
"Price qualifier": "",
|
||||
"Total floor area (sqm)": floor_area_sqm,
|
||||
"Listing URL": listing_url,
|
||||
|
|
@ -820,10 +722,9 @@ def transform_property(
|
|||
def search_outcode(
|
||||
page,
|
||||
outcode: str,
|
||||
channel: str,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
pc_coords: dict[str, tuple[float, float]],
|
||||
base_search_url: str | None = None,
|
||||
max_properties: int | None = None,
|
||||
) -> tuple[list[dict], str | None]:
|
||||
"""Search Zoopla for properties in one outcode.
|
||||
|
||||
|
|
@ -831,47 +732,37 @@ def search_outcode(
|
|||
search flow, extracts listings from rendered DOM, and transforms to the
|
||||
standard output schema.
|
||||
|
||||
If base_search_url is provided (from a previous channel search for the same
|
||||
outcode), tries direct URL navigation first — skipping the slow homepage
|
||||
search flow. Falls back to full navigation if direct fails.
|
||||
|
||||
Returns (properties, search_url) where search_url can be passed to the next
|
||||
channel call for this outcode.
|
||||
Returns (properties, search_url).
|
||||
|
||||
Raises TurnstileError if Cloudflare blocks us mid-session.
|
||||
"""
|
||||
navigated = False
|
||||
if base_search_url:
|
||||
navigated = _navigate_direct(page, base_search_url)
|
||||
if navigated:
|
||||
log.debug("Zoopla %s %s: used direct URL navigation", outcode, channel)
|
||||
|
||||
if not navigated:
|
||||
if not _navigate_search(page, outcode, channel):
|
||||
return [], None
|
||||
if not _navigate_search(page, outcode):
|
||||
return [], None
|
||||
|
||||
total_results = _get_result_count(page)
|
||||
|
||||
# Always try extraction even if result count is 0 — the count regex may
|
||||
# not match Zoopla's current text format, but listings may still be in DOM
|
||||
raw_listings = _paginate(page, max(total_results, 25), channel)
|
||||
raw_listings = _paginate(
|
||||
page,
|
||||
max(total_results, 25),
|
||||
max_properties=max_properties,
|
||||
)
|
||||
if not raw_listings:
|
||||
if total_results > 0:
|
||||
log.debug(
|
||||
"Zoopla %s %s: page claims %d results but extraction found 0 — "
|
||||
"DOM selectors may need updating",
|
||||
outcode, channel, total_results,
|
||||
outcode, "BUY", total_results,
|
||||
)
|
||||
return [], None
|
||||
|
||||
channel_label = "buy" if channel == "BUY" else "rent"
|
||||
properties = []
|
||||
dropped = 0
|
||||
for raw in raw_listings:
|
||||
transformed = transform_property(raw, channel, pc_index, pc_coords, search_outcode=outcode)
|
||||
transformed = transform_property(raw, pc_index, pc_coords, search_outcode=outcode)
|
||||
if transformed:
|
||||
properties.append(transformed)
|
||||
zoopla_properties_scraped.labels(channel=channel_label).inc()
|
||||
else:
|
||||
dropped += 1
|
||||
|
||||
|
|
@ -881,13 +772,13 @@ def search_outcode(
|
|||
log.debug(
|
||||
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
|
||||
"(no price/postcode/coords). Sample raw: price=%s address=%r",
|
||||
outcode, channel, len(raw_listings), dropped,
|
||||
outcode, "BUY", len(raw_listings), dropped,
|
||||
sample.get("price"), sample.get("address", ""),
|
||||
)
|
||||
elif dropped > len(raw_listings) // 2:
|
||||
log.debug(
|
||||
"Zoopla %s %s: %d/%d listings dropped in transform",
|
||||
outcode, channel, dropped, len(raw_listings),
|
||||
outcode, "BUY", dropped, len(raw_listings),
|
||||
)
|
||||
|
||||
return properties, page.url
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue