Add back finder
This commit is contained in:
parent
5e5d9f9a1c
commit
48c13fbcdd
23 changed files with 57347 additions and 0 deletions
893
finder/zoopla.py
Normal file
893
finder/zoopla.py
Normal file
|
|
@ -0,0 +1,893 @@
|
|||
"""Zoopla (zoopla.co.uk) scraper — buy and rental properties.
|
||||
|
||||
Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
|
||||
blocks all HTTP clients (curl_cffi, httpx) and even Playwright with stealth
|
||||
patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
|
||||
|
||||
Zoopla uses Next.js App Router with React Server Components (RSC). Search
|
||||
result data is server-rendered in an RSC stream, not available via
|
||||
__NEXT_DATA__ or a JSON API. URL-based location slugs return 0 results —
|
||||
the working flow requires typing into the autocomplete input, selecting a
|
||||
suggestion, and clicking Search.
|
||||
|
||||
Architecture:
|
||||
Unlike the other scrapers which use HTTP clients per outcode, Zoopla keeps
|
||||
a single Camoufox browser alive for the entire scrape. For each outcode, it:
|
||||
1. Clears and types the outcode into the search input
|
||||
2. Selects the first autocomplete suggestion
|
||||
3. Clicks Search
|
||||
4. Extracts listing data from the rendered DOM
|
||||
5. Handles pagination via ?pn=N parameter
|
||||
|
||||
The browser session replaces the cookie/client pattern used by other scrapers.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
|
||||
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import normalize_sub_type, validate_floor_area
|
||||
|
||||
# Module-level logger; every message emitted by this scraper goes through the "zoopla" channel.
log = logging.getLogger("zoopla")
|
||||
|
||||
|
||||
class TurnstileError(Exception):
    """Signals that a Cloudflare Turnstile challenge could not be cleared.

    Raised by the browser-launch and navigation helpers in this module when the
    challenge page is still showing after their polling loop gives up.
    """
|
||||
|
||||
|
||||
# Maximum search result pages to scrape per outcode (25 listings/page).
# Used by _paginate() as a hard upper bound on the ?pn=N page number it requests.
MAX_PAGES_PER_OUTCODE = 40
|
||||
|
||||
# JavaScript to extract listings from the rendered DOM.
# Uses data-testid attributes as primary selectors (stable across deployments),
# then falls back to href-based link matching with parent-walking.
#
# Evaluated in the page by _extract_listings(). Returns an array of plain
# objects with the keys: id, url, price, price_text, beds, baths, receptions,
# floor_area_sqft, address, tenure, property_type. Numeric fields are null when
# the card text does not contain them; beds/baths/receptions above 20 are
# discarded as null. When no [data-testid="listing-price"] element exists,
# price_text is the whole card text.
#
# The literal is a raw string so that regex escapes (\d, \u00a3, \n) reach the
# JavaScript engine untouched.
_EXTRACT_LISTINGS_JS = r"""() => {
    const seen = new Set();
    const results = [];

    // Strategy 1: Use data-testid selectors (post-2025 redesign)
    const listingCards = document.querySelectorAll(
        '[data-testid="regular-listings"] > div, [data-testid="search-content"] li'
    );

    for (const card of listingCards) {
        const link = card.querySelector(
            'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
        );
        if (!link) continue;

        const href = link.href;
        const match = href.match(/\/details\/(\d+)\//);
        if (!match) continue;

        const id = match[1];
        if (seen.has(id)) continue;
        seen.add(id);

        const text = card.innerText || '';

        // Try data-testid price element first, then regex
        const priceEl = card.querySelector('[data-testid="listing-price"]');
        const priceText = priceEl ? priceEl.innerText : text;
        const priceMatch = priceText.match(/\u00a3([\d,]+)/);

        // Try address element first, then regex
        const addressEl = card.querySelector('address');
        let address = addressEl ? addressEl.innerText.trim() : '';

        if (!address) {
            const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
            for (const line of lines) {
                if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
                    (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
                    address = line;
                    break;
                }
            }
        }

        const bedsMatch = text.match(/(\d+)\s*beds?/i);
        const bathsMatch = text.match(/(\d+)\s*baths?/i);
        const recMatch = text.match(/(\d+)\s*reception/i);
        const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);

        let tenure = '';
        if (/leasehold/i.test(text)) tenure = 'Leasehold';
        else if (/freehold/i.test(text)) tenure = 'Freehold';

        // Extract property type (e.g., "2 bed flat for sale" → "flat")
        let property_type = '';
        const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
        if (ptMatch) property_type = ptMatch[1].trim();
        else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';

        // Keyword fallback when regex doesn't match current DOM format
        if (!property_type) {
            const lower = text.toLowerCase();
            if (/\bstudio\b/.test(lower)) property_type = 'Studio';
            else if (/\bpenthouse\b/.test(lower)) property_type = 'Penthouse';
            else if (/\bmaisonette\b/.test(lower)) property_type = 'Maisonette';
            else if (/\bapartment\b/.test(lower)) property_type = 'Apartment';
            else if (/\bflat\b/.test(lower)) property_type = 'Flat';
            else if (/\bsemi[- ]?detached\b/.test(lower)) property_type = 'Semi-Detached';
            else if (/\bdetached\b/.test(lower)) property_type = 'Detached';
            else if (/\bterraced?\b/.test(lower)) property_type = 'Terraced';
            else if (/\bbungalow\b/.test(lower)) property_type = 'Bungalow';
            else if (/\bcottage\b/.test(lower)) property_type = 'Cottage';
            else if (/\bhouse\b/.test(lower)) property_type = 'House';
        }

        results.push({
            id, url: href.replace(window.location.origin, ''),
            price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
            price_text: priceText.trim(),
            beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
            baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
            receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
            floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
            address, tenure, property_type,
        });
    }

    // Strategy 2: Fall back to href-based link matching with parent-walking
    if (results.length === 0) {
        const links = Array.from(document.querySelectorAll(
            'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
        ));

        for (const link of links) {
            const href = link.href;
            const match = href.match(/\/details\/(\d+)\//);
            if (!match) continue;

            const id = match[1];
            if (seen.has(id)) continue;
            seen.add(id);

            let card = link;
            for (let j = 0; j < 15; j++) {
                card = card.parentElement;
                if (!card) break;
                const t = card.innerText || '';
                if (t.includes('\u00a3') && (t.includes('bed') || t.includes('Bath') || t.includes('sq ft'))) {
                    break;
                }
            }
            if (!card) continue;

            const text = card.innerText || '';
            const lines = text.split('\n').map(l => l.trim()).filter(Boolean);

            const priceEl2 = card.querySelector('[data-testid="listing-price"]');
            const priceText2 = priceEl2 ? priceEl2.innerText : text;
            const priceMatch = priceText2.match(/\u00a3([\d,]+)/);
            const bedsMatch = text.match(/(\d+)\s*beds?/i);
            const bathsMatch = text.match(/(\d+)\s*baths?/i);
            const recMatch = text.match(/(\d+)\s*reception/i);
            const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);

            let address = '';
            for (const line of lines) {
                if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
                    (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
                    address = line;
                    break;
                }
            }

            let tenure = '';
            if (/leasehold/i.test(text)) tenure = 'Leasehold';
            else if (/freehold/i.test(text)) tenure = 'Freehold';

            // Extract property type
            let property_type = '';
            const ptMatch2 = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
            if (ptMatch2) property_type = ptMatch2[1].trim();
            else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';

            // Keyword fallback when regex doesn't match current DOM format
            if (!property_type) {
                const lower = text.toLowerCase();
                if (/\bstudio\b/.test(lower)) property_type = 'Studio';
                else if (/\bpenthouse\b/.test(lower)) property_type = 'Penthouse';
                else if (/\bmaisonette\b/.test(lower)) property_type = 'Maisonette';
                else if (/\bapartment\b/.test(lower)) property_type = 'Apartment';
                else if (/\bflat\b/.test(lower)) property_type = 'Flat';
                else if (/\bsemi[- ]?detached\b/.test(lower)) property_type = 'Semi-Detached';
                else if (/\bdetached\b/.test(lower)) property_type = 'Detached';
                else if (/\bterraced?\b/.test(lower)) property_type = 'Terraced';
                else if (/\bbungalow\b/.test(lower)) property_type = 'Bungalow';
                else if (/\bcottage\b/.test(lower)) property_type = 'Cottage';
                else if (/\bhouse\b/.test(lower)) property_type = 'House';
            }

            results.push({
                id, url: href.replace(window.location.origin, ''),
                price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
                price_text: priceText2.trim(),
                beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
                baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
                receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
                floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
                address, tenure, property_type,
            });
        }
    }

    return results;
}"""
|
||||
|
||||
# JavaScript to dismiss the Usercentrics cookie consent overlay (shadow DOM).
# Looks up the #usercentrics-cmp-ui element; if it has a shadow root, clicks
# the first button whose text contains "Accept". If no such button is found
# but the element exists, the element is removed from the DOM instead.
# Evaluates to true when the overlay was clicked away or removed, false when
# the element is not present.
_DISMISS_COOKIES_JS = """() => {
    const aside = document.querySelector('#usercentrics-cmp-ui');
    if (aside && aside.shadowRoot) {
        const btns = aside.shadowRoot.querySelectorAll('button');
        for (const btn of btns) {
            if (btn.innerText.includes('Accept')) { btn.click(); return true; }
        }
    }
    if (aside) { aside.remove(); return true; }
    return false;
}"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Browser lifecycle
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def launch_browser():
    """Launch Camoufox, open the Zoopla homepage and clear its entry gates.

    Navigates to the homepage, polls until the Cloudflare Turnstile
    interstitial (recognised by "Just a moment" in the page title) is gone,
    then dismisses the cookie consent overlay.

    Returns:
        A ``(browser, page)`` tuple. The caller must close the browser when
        done.

    Raises:
        TurnstileError: if the challenge title is still present after 20 polls
            spaced 3 seconds apart (about 60 seconds).
        Exception: any error raised while opening the page or navigating is
            re-raised after the browser has been closed.
    """
    from camoufox.pkgman import camoufox_path

    # Verify camoufox is pre-installed — never download at runtime
    camoufox_path(download_if_missing=False)

    from camoufox.sync_api import Camoufox

    log.info("Launching Camoufox browser for Zoopla...")
    browser = Camoufox(headless=True).__enter__()
    try:
        page = browser.new_page()

        log.info("Navigating to Zoopla homepage...")
        page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000)

        # Wait for Cloudflare Turnstile to resolve.
        # Try clicking the Turnstile checkbox if present (helps in some cases).
        for _ in range(20):
            if "Just a moment" not in page.title():
                break
            # Attempt to click the Turnstile checkbox in the challenge iframe
            for frame in page.frames:
                if "challenges.cloudflare.com" in frame.url:
                    try:
                        iframe_el = page.query_selector('iframe[src*="challenges.cloudflare"]')
                        if iframe_el:
                            box = iframe_el.bounding_box()
                            if box:
                                page.mouse.click(box["x"] + 30, box["y"] + box["height"] / 2)
                    except Exception as click_err:
                        # Best effort only: the click is a nudge, the title
                        # poll above decides whether the challenge cleared.
                        log.debug("Turnstile checkbox click failed: %s", click_err)
                    break
            time.sleep(3)
        else:
            page.close()
            raise TurnstileError("Cloudflare Turnstile did not resolve after 60s")

        log.info("Cloudflare passed — title: %s", page.title())
        time.sleep(2)

        # Dismiss cookie consent
        page.evaluate(_DISMISS_COOKIES_JS)
        time.sleep(1)
    except BaseException:
        # The caller only receives the browser handle on success, so on any
        # failure (including Ctrl-C during the wait) it must be closed here or
        # nobody will be able to close it.
        try:
            browser.close()
        except Exception as close_err:
            log.debug("Failed to close browser after launch error: %s", close_err)
        raise

    return browser, page
|
||||
|
||||
|
||||
def _ensure_not_challenged(page) -> None:
    """Block until a mid-session Cloudflare challenge page has gone away.

    Returns immediately when the page title does not contain "Just a moment".
    Otherwise re-checks the title up to 20 times, sleeping 3 seconds before
    each check.

    Raises:
        TurnstileError: if the challenge title is still present after the
            last check.
    """
    challenge_marker = "Just a moment"
    if challenge_marker in page.title():
        log.warning("Cloudflare challenge detected mid-session, waiting...")
        polls_remaining = 20
        while polls_remaining > 0:
            polls_remaining -= 1
            time.sleep(3)
            if challenge_marker not in page.title():
                log.info("Cloudflare challenge resolved")
                return

        raise TurnstileError("Cloudflare re-challenge did not resolve")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Search navigation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _navigate_direct(page, url: str) -> bool:
    """Open a Zoopla search URL directly, bypassing the homepage search flow.

    Intended for loading the second channel (e.g., RENT after BUY) of an
    outcode whose search URL is already known.

    Returns:
        True when the page loaded and either the listing cards rendered a
        price within 8 seconds or at least one details link is present;
        False when navigation raised or no details link exists, so the caller
        can fall back to the full search flow.

    Raises:
        TurnstileError: propagated from the Cloudflare challenge check.
    """
    try:
        page.goto(url, wait_until="domcontentloaded", timeout=30000)
    except Exception as e:
        log.debug("Direct navigation failed: %s", e)
        return False
    _ensure_not_challenged(page)

    # Predicate run in the page: true once some listing card contains a pound
    # sign and more than 50 characters of text, i.e. its content has hydrated.
    cards_hydrated_js = """() => {
        const cards = document.querySelectorAll(
            '[data-testid="regular-listings"] > div'
        );
        if (cards.length === 0) return false;
        for (const card of cards) {
            const t = card.innerText || '';
            if (t.includes('\\u00a3') && t.length > 50) return true;
        }
        return false;
    }"""

    try:
        page.wait_for_function(cards_hydrated_js, timeout=8000)
    except Exception:
        # The wait failed: only carry on if the page has any listing link at all
        if page.query_selector('a[href*="/details/"]'):
            time.sleep(1.5)
        else:
            return False

    return True
|
||||
|
||||
|
||||
def _navigate_search(page, outcode: str, channel: str) -> bool:
    """Navigate to search results for an outcode via the homepage search flow.

    Steps: load the homepage, dismiss the cookie overlay, switch to the Rent
    tab when ``channel == "RENT"``, type the outcode into the search input,
    click the first autocomplete suggestion, submit the search, then wait for
    the results container and for listing card content to render.

    Returns True once the search has been submitted — including when the
    waits for results/hydration time out, in which case extraction is still
    attempted by the caller. Returns False if the search input or an
    autocomplete suggestion could not be found.

    Raises TurnstileError if Cloudflare blocks us."""
    # Navigate to homepage to reset search state
    page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=30000)
    time.sleep(0.5)
    _ensure_not_challenged(page)

    # Dismiss cookie consent (may reappear after navigation)
    page.evaluate(_DISMISS_COOKIES_JS)
    time.sleep(0.3)

    # Select Buy/Rent tab (any other channel value keeps the page's default tab)
    if channel == "RENT":
        rent_tab = page.query_selector(
            'button:has-text("Rent"), [role="tab"]:has-text("Rent")'
        )
        if rent_tab:
            rent_tab.click()
            time.sleep(0.2)

    # Find and fill search input; fall back to the first text input if the
    # named autosuggest field is not present
    search_input = page.query_selector(
        'input[name="autosuggest-input"]'
    ) or page.query_selector('input[type="text"]')
    if not search_input:
        log.warning("Could not find search input on homepage")
        return False

    search_input.click()
    time.sleep(0.1)
    search_input.fill("")
    # Typed key by key (60 ms apart) rather than filled in one go
    search_input.type(outcode, delay=60)
    # Fixed pause before looking for autocomplete suggestions
    time.sleep(1.2)

    # Select first autocomplete suggestion
    first_option = page.query_selector('[role="option"]')
    if not first_option:
        log.debug("No autocomplete suggestions for outcode %s", outcode)
        return False

    first_option.click()
    time.sleep(0.2)

    # Click search button, or submit with Enter when no button is found
    search_btn = page.query_selector('button:has-text("Search")')
    if search_btn:
        search_btn.click()
    else:
        search_input.press("Enter")

    # Wait for results to load — try waiting for listings container, fall back to fixed wait
    try:
        page.wait_for_selector(
            '[data-testid="regular-listings"], a[href*="/details/"]',
            timeout=10000,
        )
    except Exception:
        time.sleep(4)
    _ensure_not_challenged(page)

    # Wait for client-side hydration to populate listing content (prices, addresses).
    # The structural container appears in server-rendered HTML before React hydrates
    # the actual card content — extracting too early yields empty price/address fields.
    try:
        page.wait_for_function(
            """() => {
                const cards = document.querySelectorAll(
                    '[data-testid="regular-listings"] > div'
                );
                if (cards.length === 0) return false;
                for (const card of cards) {
                    const t = card.innerText || '';
                    if (t.includes('\\u00a3') && t.length > 50) return true;
                }
                return false;
            }""",
            timeout=8000,
        )
    except Exception:
        # Content never appeared — extraction will likely fail but let it try
        log.debug("Listing content hydration wait timed out — prices may not have rendered")
        time.sleep(2)

    return True
|
||||
|
||||
|
||||
def _coerce_result_count(value) -> int:
    """Convert a JSON-decoded count (int, float or numeric string) to int; 0 if unusable."""
    if value is None or isinstance(value, bool):
        return 0
    if isinstance(value, str):
        value = value.replace(",", "").strip()
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return 0


def _get_result_count(page) -> int:
    """Extract the total results count from the page.

    Tries the ``search_results_count`` field of the ``#__ZAD_TARGETING__``
    JSON script tag first, accepting either a number or a numeric string.
    If that yields nothing positive, falls back to the first "N results" /
    "N properties" phrase in the body text.

    Returns:
        The count as an ``int``, or 0 when neither source provides one.
    """
    try:
        # Try the ZAD targeting JSON script tag first
        count = page.evaluate("""() => {
            const s = document.querySelector('#__ZAD_TARGETING__');
            if (s) {
                try {
                    const d = JSON.parse(s.textContent);
                    if (d.search_results_count != null) return d.search_results_count;
                } catch(e) {}
            }
            return null;
        }""")
    except Exception as e:
        log.debug("Could not read __ZAD_TARGETING__ result count: %s", e)
        count = None

    parsed = _coerce_result_count(count)
    if parsed > 0:
        return parsed

    try:
        body = page.inner_text("body")
    except Exception as e:
        log.debug("Could not read body text for result count: %s", e)
        return 0

    # Require a leading digit so stray punctuation such as ", results" cannot
    # match and hide a real count later in the text.
    match = re.search(r"(\d[\d,]*)\s+(?:results?|properties)", body or "")
    if match:
        return int(match.group(1).replace(",", ""))
    return 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Extraction and pagination
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Process-wide flag: set to True by _extract_listings() on its first call so
# that the page diagnostic is logged once rather than for every results page.
_first_extraction_logged = False
|
||||
|
||||
|
||||
def _extract_listings(page) -> list[dict]:
    """Run the listing-extraction script against the current results page.

    On the first call in the process, additionally logs a diagnostic snapshot
    of the page (URL, title, number of details links, data-testid names and
    the start of the body text) plus the number of listings found.

    Returns:
        The list produced by ``_EXTRACT_LISTINGS_JS``, or an empty list when
        evaluation fails (the failure is logged and counted in
        ``zoopla_errors_total``).
    """
    global _first_extraction_logged

    diagnostic_js = """() => {
        const details = document.querySelectorAll('a[href*="/details/"]');
        const testids = document.querySelectorAll('[data-testid]');
        const testidNames = [...new Set([...testids].map(e => e.dataset.testid))];
        return {
            url: location.href,
            title: document.title,
            detailLinks: details.length,
            testids: testidNames.slice(0, 30),
            bodySnippet: document.body?.innerText?.slice(0, 500) || '',
        };
    }"""

    try:
        found = page.evaluate(_EXTRACT_LISTINGS_JS)

        if _first_extraction_logged:
            return found

        # Very first extraction attempt: record what the page looked like
        _first_extraction_logged = True
        try:
            snapshot = page.evaluate(diagnostic_js)
            log.info(
                "Zoopla first-page diagnostic: url=%s title=%s detailLinks=%d "
                "testids=%s bodySnippet=%.200s",
                snapshot.get("url"),
                snapshot.get("title"),
                snapshot.get("detailLinks", 0),
                snapshot.get("testids", []),
                snapshot.get("bodySnippet", ""),
            )
        except Exception:
            # The diagnostic is optional; never let it affect extraction
            pass
        log.info("Zoopla first extraction: %d listings found", len(found))
        return found
    except Exception as e:
        log.warning("Failed to extract listings from DOM: %s", e)
        zoopla_errors_total.labels(type="extract_failed").inc()
        return []
|
||||
|
||||
|
||||
def _page_url(search_url: str, page_num: int) -> str:
    """Return ``search_url`` with its ``pn`` query parameter set to ``page_num``."""
    # Drop any existing pn=N while keeping the query string well-formed: a
    # leading "?pn=N&" must leave the "?" behind, otherwise the remaining
    # parameters would end up glued to the path with "&".
    stripped = re.sub(
        r"([?&])pn=\d+(&|$)",
        lambda m: m.group(1) if m.group(2) else "",
        search_url,
    )
    separator = "&" if "?" in stripped else "?"
    return f"{stripped}{separator}pn={page_num}"


def _paginate(page, total_results: int, channel: str) -> list[dict]:
    """Extract listings from all pages of search results.

    Page 1 must already be loaded. Subsequent pages are loaded by navigating
    to the page-1 URL with ``pn=N`` set, starting at N=2.

    Pagination stops when ``total_results`` listings have been collected,
    ``MAX_PAGES_PER_OUTCODE`` is exceeded, navigation fails, a page yields no
    listings, or a page yields no listing ids that were not already seen.

    Args:
        page: Live browser page showing the first results page.
        total_results: Number of listings the search is expected to contain.
        channel: "BUY" is labelled "buy" in metrics; anything else "rent".

    Returns:
        Raw listing dicts, de-duplicated by ``id``.

    Raises:
        TurnstileError: if a Cloudflare challenge does not clear while paging.
    """
    all_listings = _extract_listings(page)
    channel_label = "buy" if channel == "BUY" else "rent"
    zoopla_pages_scraped.labels(channel=channel_label).inc()

    if not all_listings or total_results <= len(all_listings):
        return all_listings

    seen_ids = {listing["id"] for listing in all_listings}
    first_page_url = page.url
    page_num = 2

    # Predicate run in the page: true once some listing card contains a pound
    # sign and more than 50 characters of text.
    cards_hydrated_js = """() => {
        const cards = document.querySelectorAll(
            '[data-testid="regular-listings"] > div'
        );
        if (cards.length === 0) return false;
        for (const card of cards) {
            const t = card.innerText || '';
            if (t.includes('\\u00a3') && t.length > 50) return true;
        }
        return false;
    }"""

    while len(all_listings) < total_results and page_num <= MAX_PAGES_PER_OUTCODE:
        time.sleep(DELAY_BETWEEN_PAGES)

        next_url = _page_url(first_page_url, page_num)

        try:
            page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
            _ensure_not_challenged(page)
            # Wait for listing content instead of fixed sleep
            try:
                page.wait_for_function(cards_hydrated_js, timeout=8000)
            except Exception:
                time.sleep(1.5)
        except TurnstileError:
            # A Cloudflare block must reach the caller; do not treat it as an
            # ordinary navigation failure.
            raise
        except Exception as e:
            log.debug("Pagination navigation failed at page %d: %s", page_num, e)
            break

        page_listings = _extract_listings(page)
        if not page_listings:
            break

        # Deduplicate within this outcode
        new_count = 0
        for listing in page_listings:
            if listing["id"] not in seen_ids:
                seen_ids.add(listing["id"])
                all_listings.append(listing)
                new_count += 1

        zoopla_pages_scraped.labels(channel=channel_label).inc()

        if new_count == 0:
            break  # No new listings on this page

        page_num += 1

    return all_listings
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Property transformation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Cached outcode → (postcode, lat, lng) lookups to avoid repeated O(n) scans
# over 2.26M postcodes. Populated lazily on first lookup per outcode.
# A value of None records that a scan found no postcode for that outcode.
# Keyed by outcode only, so entries are reused regardless of which pc_coords
# mapping is passed to _resolve_outcode_coords().
_outcode_coords_cache: dict[str, tuple[str, float, float] | None] = {}
|
||||
|
||||
|
||||
def _resolve_outcode_coords(
|
||||
outcode: str, pc_coords: dict[str, tuple[float, float]]
|
||||
) -> tuple[str, float, float] | None:
|
||||
"""Find first postcode + coords for an outcode. Result is cached."""
|
||||
if outcode in _outcode_coords_cache:
|
||||
return _outcode_coords_cache[outcode]
|
||||
|
||||
prefix = outcode + " "
|
||||
for pcd, (lat, lng) in pc_coords.items():
|
||||
if pcd.startswith(prefix) or (
|
||||
len(outcode) >= 4
|
||||
and pcd.startswith(outcode)
|
||||
and len(pcd) > len(outcode)
|
||||
):
|
||||
_outcode_coords_cache[outcode] = (pcd, lat, lng)
|
||||
return (pcd, lat, lng)
|
||||
|
||||
_outcode_coords_cache[outcode] = None
|
||||
return None
|
||||
|
||||
|
||||
def _extract_postcode(text: str) -> str | None:
|
||||
"""Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'.
|
||||
Normalizes to include a space before the 3-char incode."""
|
||||
match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE)
|
||||
if match:
|
||||
raw = match.group(1).upper().strip()
|
||||
# Ensure space before incode (last 3 chars): "SW1A1AA" → "SW1A 1AA"
|
||||
if " " not in raw and len(raw) >= 5:
|
||||
return raw[:-3] + " " + raw[-3:]
|
||||
return raw
|
||||
return None
|
||||
|
||||
|
||||
def _extract_outcode(text: str) -> str | None:
|
||||
"""Extract a UK outcode from address text like 'Whitechapel Road, London E1'."""
|
||||
# Look for outcode at end of string or after last comma
|
||||
match = re.search(r"\b([A-Z]{1,2}\d[A-Z0-9]?)\s*$", text.strip(), re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(1).upper()
|
||||
# Try after comma
|
||||
parts = text.split(",")
|
||||
if len(parts) > 1:
|
||||
last = parts[-1].strip()
|
||||
match = re.match(r"^([A-Z]{1,2}\d[A-Z0-9]?)$", last, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(1).upper()
|
||||
return None
|
||||
|
||||
|
||||
def _map_property_type(raw_type: str | None) -> str:
|
||||
"""Map Zoopla property type text to canonical type."""
|
||||
if not raw_type:
|
||||
return "Other"
|
||||
# Exact match (handles Rightmove-style capitalised values)
|
||||
canonical = PROPERTY_TYPE_MAP.get(raw_type)
|
||||
if canonical:
|
||||
return canonical
|
||||
# Title-case match (handles regex-extracted lowercase like "town house" → "Town House")
|
||||
canonical = PROPERTY_TYPE_MAP.get(raw_type.title())
|
||||
if canonical:
|
||||
return canonical
|
||||
# Lowercase match (e.g., "Townhouse" → "townhouse")
|
||||
canonical = PROPERTY_TYPE_MAP.get(raw_type.lower())
|
||||
if canonical:
|
||||
return canonical
|
||||
# Normalize delimiters (underscores/hyphens → spaces) and try again
|
||||
normalized = re.sub(r"[-_]+", " ", raw_type).strip().title()
|
||||
canonical = PROPERTY_TYPE_MAP.get(normalized)
|
||||
if canonical:
|
||||
return canonical
|
||||
# Keyword fallback
|
||||
lower = raw_type.lower()
|
||||
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower:
|
||||
return "Flats/Maisonettes"
|
||||
if "semi" in lower and "detach" in lower:
|
||||
return "Semi-Detached"
|
||||
if "detach" in lower:
|
||||
return "Detached"
|
||||
if "terrace" in lower or "mews" in lower:
|
||||
return "Terraced"
|
||||
if "house" in lower:
|
||||
return "Detached"
|
||||
return "Other"
|
||||
|
||||
|
||||
# Frequency markers, matched against lower-cased price text. The abbreviations
# must not be embedded in a longer word: a bare substring test for "pa" would
# also fire on "apartment", "parking" or "space". Digits may touch the
# abbreviation ("£18,000pa"), so letter lookarounds are used instead of \b.
_RENT_MONTHLY_RE = re.compile(r"(?<![a-z])pcm(?![a-z])|per\s+(?:calendar\s+)?month")
_RENT_WEEKLY_RE = re.compile(r"(?<![a-z])pw(?![a-z])|per\s+week|/w")
_RENT_YEARLY_RE = re.compile(r"(?<![a-z])pa(?![a-z])|per\s+annum|per\s+year|/y")


def _detect_rent_frequency(price_text: str) -> str:
    """Detect rent frequency from Zoopla price text.

    Zoopla price elements contain text like '£1,500 pcm', '£350 pw',
    '£18,000 pa'. Defaults to 'monthly' if no frequency indicator found.

    Checks monthly indicators (pcm) BEFORE weekly (pw) because Zoopla cards
    often display both monthly and weekly prices in the same text. When the
    JS extraction falls back to full card text, checking pcm first ensures
    the captured monthly price gets the correct frequency label.

    Returns:
        "monthly", "weekly" or "yearly".
    """
    lower = (price_text or "").lower()
    if _RENT_MONTHLY_RE.search(lower):
        return "monthly"
    if _RENT_WEEKLY_RE.search(lower):
        return "weekly"
    if _RENT_YEARLY_RE.search(lower):
        return "yearly"
    # No indicator — default monthly (Zoopla standard)
    return "monthly"
|
||||
|
||||
|
||||
def transform_property(
    raw: dict,
    channel: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    search_outcode: str | None = None,
) -> dict | None:
    """Convert one raw Zoopla listing into the standard output record.

    Search cards carry no coordinates, so latitude/longitude are looked up
    from the address: first by full postcode in ``pc_coords``, then by the
    outcode at the end of the address, and finally by ``search_outcode``.
    When an outcode fallback is used, the record's postcode is the one
    returned by that lookup.

    Args:
        raw: Listing dict as produced by the DOM extraction script.
        channel: "BUY" yields an empty price frequency; any other value has
            the frequency detected from ``raw["price_text"]``.
        pc_index: Not used by this function.
        pc_coords: Mapping of postcode to ``(lat, lng)``.
        search_outcode: Outcode being searched, used as the last-resort
            location.

    Returns:
        The output record, or ``None`` when the listing has no positive price,
        no location could be resolved, or the coordinates fall outside the
        accepted latitude/longitude box.
    """
    price = raw.get("price")
    if not price or int(price) <= 0:
        return None

    address = raw.get("address", "")

    # Resolve postcode and coordinates from address
    lat = lng = None
    postcode = _extract_postcode(address)
    exact = pc_coords.get(postcode) if postcode else None
    if exact:
        lat, lng = exact

    if lat is None:
        # Outcode-level fallbacks: the address text first, then the outcode
        # we know we're searching. Stop at the first one that resolves.
        for fallback_outcode in (_extract_outcode(address), search_outcode):
            if not fallback_outcode:
                continue
            resolved = _resolve_outcode_coords(fallback_outcode, pc_coords)
            if resolved:
                postcode, lat, lng = resolved
                break

    if lat is None or lng is None or not postcode:
        return None

    # Reject coordinates outside the bounding box used for England
    if not (49 <= lat <= 56 and -7 <= lng <= 2):
        return None

    raw_beds = raw.get("beds") or 0
    raw_baths = raw.get("baths") or 0
    beds_implausible = raw_beds > MAX_BEDROOMS
    baths_implausible = raw_baths > MAX_BEDROOMS
    bedrooms = 0 if beds_implausible else raw_beds
    bathrooms = 0 if baths_implausible else raw_baths
    if beds_implausible or baths_implausible:
        log.warning(
            "Zoopla %s: implausible beds=%d baths=%d (capped to 0)",
            raw.get("id", "?"), raw_beds, raw_baths,
        )
    receptions = raw.get("receptions") or 0

    # Floor area: convert sq ft to sq m
    sqft = raw.get("floor_area_sqft")
    floor_area_sqm = validate_floor_area(round(sqft * 0.092903, 1)) if sqft else None

    listing_id = raw.get("id", "")
    listing_url = raw.get("url", "")
    if listing_url and not listing_url.startswith("http"):
        # Relative path from the page: make it absolute
        listing_url = ZOOPLA_BASE + listing_url

    # Rent frequency comes from the price text (e.g. "£1,500 pcm" vs "£350 pw")
    frequency = "" if channel == "BUY" else _detect_rent_frequency(raw.get("price_text", ""))

    raw_type = raw.get("property_type")
    return {
        "id": f"zp_{listing_id}",
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        "Number of bedrooms & living rooms": bedrooms + receptions,
        "lon": lng,
        "lat": lat,
        "Postcode": postcode,
        "Address per Property Register": address,
        "Leasehold/Freehold": raw.get("tenure") or None,
        "Property type": _map_property_type(raw_type),
        "Property sub-type": normalize_sub_type(raw_type),
        "price": int(price),
        "price_frequency": frequency,
        "Price qualifier": "",
        "Total floor area (sqm)": floor_area_sqm,
        "Listing URL": listing_url,
        "Listing features": [],
        "first_visible_date": "",
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Top-level search function (called by scraper.py)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def search_outcode(
    page,
    outcode: str,
    channel: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    base_search_url: str | None = None,
) -> tuple[list[dict], str | None]:
    """Search Zoopla for properties in one outcode.

    Takes a live Camoufox Page (from launch_browser). Navigates through the
    search flow, extracts listings from rendered DOM, and transforms to the
    standard output schema.

    If base_search_url is provided (from a previous channel search for the same
    outcode), tries direct URL navigation first — skipping the slow homepage
    search flow. Falls back to full navigation if direct fails.

    Returns (properties, search_url) where search_url is the URL of the FIRST
    results page and can be passed to the next channel call for this outcode.
    Returns ([], None) when navigation fails or no listings are extracted.

    Raises TurnstileError if Cloudflare blocks us mid-session.
    """
    navigated = False
    if base_search_url:
        navigated = _navigate_direct(page, base_search_url)
        if navigated:
            log.debug("Zoopla %s %s: used direct URL navigation", outcode, channel)

    if not navigated:
        if not _navigate_search(page, outcode, channel):
            return [], None

    # Capture the first-page URL before paginating: _paginate() leaves the
    # browser on the last "?pn=N" page it visited, and handing that URL to the
    # next channel would start its scrape part-way through the results.
    search_url = page.url

    total_results = _get_result_count(page)

    # Always try extraction even if result count is 0 — the count regex may
    # not match Zoopla's current text format, but listings may still be in DOM
    raw_listings = _paginate(page, max(total_results, 25), channel)
    if not raw_listings:
        if total_results > 0:
            log.debug(
                "Zoopla %s %s: page claims %d results but extraction found 0 — "
                "DOM selectors may need updating",
                outcode, channel, total_results,
            )
        return [], None

    channel_label = "buy" if channel == "BUY" else "rent"
    properties = []
    dropped = 0
    for raw in raw_listings:
        transformed = transform_property(raw, channel, pc_index, pc_coords, search_outcode=outcode)
        if transformed:
            properties.append(transformed)
            zoopla_properties_scraped.labels(channel=channel_label).inc()
        else:
            dropped += 1

    if dropped and not properties:
        # Log a sample raw listing to diagnose which fields are missing
        sample = raw_listings[0]
        log.debug(
            "Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
            "(no price/postcode/coords). Sample raw: price=%s address=%r",
            outcode, channel, len(raw_listings), dropped,
            sample.get("price"), sample.get("address", ""),
        )
    elif dropped > len(raw_listings) // 2:
        log.debug(
            "Zoopla %s %s: %d/%d listings dropped in transform",
            outcode, channel, dropped, len(raw_listings),
        )

    return properties, search_url
|
||||
Loading…
Add table
Add a link
Reference in a new issue