Morning improvements
This commit is contained in:
parent
3e9fba5303
commit
53fff3efaa
41 changed files with 2438 additions and 637 deletions
319
scripts/zoopla_experiment.py
Executable file
319
scripts/zoopla_experiment.py
Executable file
|
|
@ -0,0 +1,319 @@
|
|||
#!/usr/bin/env -S uv run --project ../finder
|
||||
"""Zoopla scraping experiment — Playwright with stealth + network interception.
|
||||
|
||||
Zoopla uses Next.js App Router with React Server Components. The listing data
|
||||
is NOT in __NEXT_DATA__ or the initial HTML — it's fetched client-side after
|
||||
hydration. This means we need a real browser that:
|
||||
1. Passes Cloudflare's bot detection
|
||||
2. Executes JavaScript to trigger the client-side data fetch
|
||||
3. Intercepts the network response OR scrapes the rendered DOM
|
||||
|
||||
Usage:
|
||||
uv run --project finder scripts/zoopla_experiment.py [OUTCODE]
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Console logging for the experiment: INFO and above, with a short
# wall-clock timestamp and a fixed-width level column.
logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)
log = logging.getLogger("zoopla-exp")

# Root of every Zoopla URL built by this script.
ZOOPLA_BASE = "https://www.zoopla.co.uk"

# Maps a channel name to the URL path segment used in Zoopla search URLs.
CHANNELS = dict(
    BUY="for-sale",
    RENT="to-rent",
)
|
||||
|
||||
|
||||
# CSS selectors that may match a rendered listing card on the search page.
_LISTING_WAIT_SELECTOR = (
    '[data-testid="search-result"], [data-testid="regular-listings"], '
    '.listing-results, .css-kdnlof, [class*="ListingCard"], '
    '[class*="listing"], [class*="PropertyCard"]'
)

# Browser-side predicate: true once at least one detail link for the channel
# (passed in as `segment`, e.g. "for-sale") exists in the DOM.
_DETAIL_LINKS_PRESENT_JS = (
    '(segment) => document.querySelectorAll('
    '`a[href*="/${segment}/details/"]`).length > 0'
)

# Browser-side extraction of sample listing cards from the search page.
# Receives the channel URL segment as its single argument.
_LISTING_CARDS_JS = """(segment) => {
    // Try to find listing cards via various selectors
    const selectors = [
        '[data-testid="search-result"]',
        '[data-testid="regular-listings"] > div',
        `a[href*="/${segment}/details/"]`,
        '[class*="ListingCard"]',
        '[class*="listing-result"]',
    ];

    for (const sel of selectors) {
        const elements = document.querySelectorAll(sel);
        if (elements.length > 2) {
            return {
                selector: sel,
                count: elements.length,
                // Get text and href from first 3
                samples: Array.from(elements).slice(0, 3).map(el => ({
                    text: el.innerText?.substring(0, 300),
                    href: el.href || el.querySelector('a')?.href || '',
                    html: el.outerHTML?.substring(0, 500),
                }))
            };
        }
    }

    // Fallback: find all links to listing detail pages
    const links = Array.from(document.querySelectorAll('a[href*="/details/"]'));
    if (links.length > 0) {
        return {
            selector: 'a[href*="/details/"]',
            count: links.length,
            samples: links.slice(0, 5).map(el => ({
                text: el.innerText?.substring(0, 300),
                href: el.href,
                parentText: el.closest('div, li, article')?.innerText?.substring(0, 500) || '',
            }))
        };
    }

    // Last resort: get page structure
    return {
        selector: 'none',
        count: 0,
        bodyText: document.body?.innerText?.substring(0, 2000),
        title: document.title,
    };
}"""

# Browser-side extraction of the main fields from a listing detail page.
_DETAIL_PAGE_JS = """() => {
    const result = {};

    // Price
    const priceEl = document.querySelector('[data-testid="price"]')
        || document.querySelector('[class*="price"]');
    result.price = priceEl?.innerText || '';

    // Address
    const addrEl = document.querySelector('[data-testid="address-label"]')
        || document.querySelector('h1') || document.querySelector('address');
    result.address = addrEl?.innerText || '';

    // Key features
    const features = Array.from(document.querySelectorAll('[data-testid="listing_feature"] li, [class*="feature"] li'));
    result.features = features.map(f => f.innerText).slice(0, 15);

    // Bedrooms/bathrooms from icons or text
    const specs = document.querySelectorAll('[data-testid="beds-label"], [data-testid="baths-label"], [class*="bed"], [class*="bath"]');
    result.specs = Array.from(specs).map(s => s.innerText).slice(0, 5);

    // Description
    const desc = document.querySelector('[data-testid="listing_description"], [class*="description"]');
    result.description = desc?.innerText?.substring(0, 500) || '';

    // Agent
    const agent = document.querySelector('[data-testid="agent-details"], [class*="agent"]');
    result.agent = agent?.innerText?.substring(0, 200) || '';

    // Full page text summary
    result.pageTitle = document.title;
    result.bodyPreview = document.body?.innerText?.substring(0, 1000);

    return result;
}"""


def _wait_for_cloudflare(page, max_attempts=20, poll_ms=3000):
    """Poll the page until the Cloudflare interstitial is gone; return success."""
    log.info("Waiting for Cloudflare challenge to resolve ...")
    for attempt in range(1, max_attempts + 1):
        try:
            content = page.content()
            title = page.title()
        except Exception as exc:
            # NOTE(review): content()/title() can fail while a navigation is
            # in flight (e.g. the challenge redirect); treat as "not ready yet".
            log.info("  Page not readable yet (%d/%d): %s", attempt, max_attempts, exc)
            page.wait_for_timeout(poll_ms)
            continue
        if "Just a moment" in content and "challenge" in content.lower():
            log.info(
                "  Cloudflare challenge still active (%d/%d) title=%s",
                attempt, max_attempts, title,
            )
            # wait_for_timeout (not time.sleep) keeps Playwright events flowing.
            page.wait_for_timeout(poll_ms)
        else:
            log.info("  Challenge resolved! title=%s", title)
            return True

    log.error("Cloudflare challenge did not resolve")
    # Dump page content for debugging
    print("\n=== Cloudflare challenge page ===")
    print(page.content()[:3000])
    return False


def _wait_for_listings(page, url_segment):
    """Wait (best effort) for listing cards, then for detail links, to render."""
    log.info("Waiting for listing content to render ...")
    try:
        page.wait_for_selector(_LISTING_WAIT_SELECTOR, timeout=15000)
        log.info("Listing elements found in DOM!")
        return
    except Exception:
        log.warning("No listing elements found by selector. Trying to wait for prices...")

    try:
        page.wait_for_function(_DETAIL_LINKS_PRESENT_JS, arg=url_segment, timeout=15000)
        log.info("Listing links found in DOM!")
    except Exception:
        log.warning("No listing links either. Page may still be loading or we're blocked.")


def _report_intercepted(intercepted_data):
    """Print a structural summary and a truncated dump of each captured response."""
    if not intercepted_data:
        return
    print(f"\n=== Intercepted {len(intercepted_data)} API responses ===")
    for item in intercepted_data:
        print(f"\nURL: {item['url'][:150]}")
        data = item["data"]
        if isinstance(data, dict):
            print(f"Keys: {list(data.keys())[:15]}")
            # Look for listings inside
            for k, v in data.items():
                if isinstance(v, list) and len(v) > 2 and isinstance(v[0], dict):
                    print(f"  {k}: list of {len(v)} items, [0] keys={list(v[0].keys())[:10]}")
        elif isinstance(data, list) and data:
            print(f"Array of {len(data)} items")
            if isinstance(data[0], dict):
                print(f"  [0] keys: {list(data[0].keys())[:15]}")
        print(json.dumps(data, indent=2, default=str, ensure_ascii=False)[:2000])


def _extract_from_dom(page, url_segment):
    """Scrape the rendered search page; return the listing detail paths found."""
    log.info("Extracting from rendered DOM ...")

    # Get full page content after hydration
    content = page.content()

    # Find listing URLs for the requested channel (for-sale / to-rent)
    listing_urls = re.findall(
        rf'href="(/{re.escape(url_segment)}/details/\d+/[^"]*)"', content
    )
    log.info("Found %d listing detail links", len(listing_urls))

    # Find prices
    prices = re.findall(r'£([\d,]+)', content)
    log.info("Found %d price strings", len(prices))
    if prices:
        log.info("Prices: %s", prices[:10])

    # Try to extract structured listing data from the page
    listings = page.evaluate(_LISTING_CARDS_JS, url_segment)
    print("\n=== DOM Extraction Results ===")
    print(json.dumps(listings, indent=2, ensure_ascii=False)[:5000])
    return listing_urls


def _report_session(context, page):
    """Print the Zoopla cookies and user agent of the current browser session."""
    zoopla_cookies = {
        c["name"]: c["value"]
        for c in context.cookies()
        if ".zoopla.co.uk" in c.get("domain", "")
    }
    ua = page.evaluate("navigator.userAgent")

    print("\n=== Session Info ===")
    print(f"Cookies ({len(zoopla_cookies)}): {list(zoopla_cookies.keys())}")
    print(f"User-Agent: {ua}")

    if zoopla_cookies:
        # Show (truncated) cookie values for potential reuse
        print("\n=== Reusable cookie env vars ===")
        for name, value in zoopla_cookies.items():
            print(f"  {name}={value[:50]}...")


def _scrape_detail_page(page, detail_url):
    """Open one listing detail page and print the fields extracted from it."""
    log.info("--- Fetching detail page: %s ---", detail_url)
    page.wait_for_timeout(2000)

    try:
        page.goto(detail_url, wait_until="domcontentloaded", timeout=30000)
    except Exception as exc:
        log.error("Detail page navigation failed: %s", exc)
        return
    page.wait_for_timeout(5000)  # Let it hydrate

    detail = page.evaluate(_DETAIL_PAGE_JS)
    print("\n=== Detail Page Data ===")
    print(json.dumps(detail, indent=2, ensure_ascii=False)[:3000])


def run_playwright_stealth(outcode: str, channel: str = "BUY") -> None:
    """Use Playwright with stealth patches to scrape Zoopla.

    Strategy:
      1. Launch stealth browser to bypass Cloudflare
      2. Navigate to search page
      3. Wait for listings to render (client-side hydration)
      4. Try two extraction methods:
         a. Intercept network requests for API data (cleanest)
         b. Parse the rendered DOM (fallback)
      5. Print session cookies / user agent, then probe the first detail page

    Results are printed to stdout and progress is logged; nothing is returned.

    Args:
        outcode: Postcode outcode to search; lower-cased into the URL.
        channel: Key of ``CHANNELS`` ("BUY" or "RENT").

    Raises:
        KeyError: If ``channel`` is not a key of ``CHANNELS``.
    """
    from playwright.sync_api import sync_playwright
    from playwright_stealth import Stealth

    url_segment = CHANNELS[channel]
    search_url = f"{ZOOPLA_BASE}/{url_segment}/properties/{outcode.lower()}/"
    log.info("Target: %s", search_url)

    intercepted_data = []

    def handle_response(response):
        """Capture any API responses that look like listing data."""
        url = response.url
        # Look for API/data endpoints
        if not any(kw in url for kw in ("/api/", "graphql", "search", "listing", "property")):
            return
        try:
            if "application/json" in response.headers.get("content-type", ""):
                body = response.json()
                intercepted_data.append({"url": url, "data": body})
                log.info("  [intercepted] %s (%s)", url[:100], type(body).__name__)
        except Exception as exc:
            # Best effort: the body may be unavailable or not valid JSON.
            log.debug("  [skipped] %s: %s", url[:100], exc)

    with sync_playwright() as p:
        # Launch with stealth-friendly args
        browser = p.chromium.launch(
            headless=True,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--no-sandbox",
                "--disable-dev-shm-usage",
                "--disable-web-security",
                "--lang=en-GB",
            ],
        )
        # try/finally so the browser is closed on every exit path.
        try:
            context = browser.new_context(
                locale="en-GB",
                timezone_id="Europe/London",
                viewport={"width": 1920, "height": 1080},
                user_agent=(
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
                ),
            )
            page = context.new_page()

            # Apply stealth patches (Linux platform, Chrome UA)
            stealth = Stealth(
                navigator_platform_override="Linux x86_64",
                navigator_languages_override=("en-GB", "en"),
            )
            stealth.apply_stealth_sync(page)

            # Listen for responses to intercept API data
            page.on("response", handle_response)

            # Navigate
            log.info("Navigating to %s ...", search_url)
            try:
                page.goto(search_url, wait_until="domcontentloaded", timeout=60000)
            except Exception as e:
                log.error("Navigation failed: %s", e)
                return

            if not _wait_for_cloudflare(page):
                return

            _wait_for_listings(page, url_segment)

            # Give hydration a moment. Must be a Playwright wait: time.sleep()
            # would stall event dispatch and late responses would be missed.
            page.wait_for_timeout(3000)

            # --- Extraction Method A: Check intercepted network data ---
            _report_intercepted(intercepted_data)

            # --- Extraction Method B: Parse rendered DOM ---
            listing_urls = _extract_from_dom(page, url_segment)

            # Also extract cookies for potential reuse
            _report_session(context, page)

            # --- Try a detail page if we found any listing URLs ---
            if listing_urls:
                _scrape_detail_page(page, f"{ZOOPLA_BASE}{listing_urls[0]}")
        finally:
            browser.close()
|
||||
|
||||
|
||||
def main():
    """Run the BUY-channel experiment for the outcode given on the command line.

    The first command-line argument, if present, is the outcode; otherwise
    "E1" is used.
    """
    cli_args = sys.argv[1:]
    outcode = cli_args[0] if cli_args else "E1"
    channel = "BUY"

    log.info("=== Zoopla Scraping Experiment (Playwright Stealth) ===")
    log.info("Outcode: %s, Channel: %s", outcode, channel)
    run_playwright_stealth(outcode, channel)
    log.info("=== Done ===")
|
||||
|
||||
|
||||
# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue