1285 lines
45 KiB
Python
1285 lines
45 KiB
Python
"""Zoopla (zoopla.co.uk) scraper — sale properties.
|
|
|
|
Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
|
|
blocks non-browser HTTP clients and even Playwright with stealth patches. Only
|
|
Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
|
|
|
|
Zoopla uses Next.js App Router with React Server Components (RSC). Search
|
|
result data is server-rendered in an RSC stream, not available via
|
|
__NEXT_DATA__ or a JSON API.
|
|
|
|
Architecture:
|
|
Unlike the other scrapers which use HTTP clients per outcode, Zoopla keeps
|
|
a single Camoufox browser alive for the entire scrape. For each outcode, it:
|
|
1. Navigates directly to the sale search URL
|
|
2. Extracts listing data from the rendered DOM
|
|
3. Handles pagination via ?pn=N parameter
|
|
|
|
The browser session replaces the cookie/client pattern used by other scrapers.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import re
|
|
import signal
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
|
|
|
|
import httpx
|
|
|
|
from constants import (
|
|
DATA_DIR,
|
|
DELAY_BETWEEN_PAGES,
|
|
MAX_BEDROOMS,
|
|
PROPERTY_TYPE_MAP,
|
|
ZOOPLA_BASE,
|
|
)
|
|
from spatial import PostcodeSpatialIndex
|
|
from transform import (
|
|
clean_listing_address,
|
|
extract_full_postcode,
|
|
normalize_sub_type,
|
|
parse_int_value,
|
|
validate_floor_area,
|
|
)
|
|
|
|
log = logging.getLogger("zoopla")
|
|
|
|
|
|
class TurnstileError(Exception):
    """Signal that the Cloudflare Turnstile challenge could not be passed."""
|
|
|
|
|
|
def _pid_exists(pid: int) -> bool:
|
|
try:
|
|
os.kill(pid, 0)
|
|
except ProcessLookupError:
|
|
return False
|
|
except PermissionError:
|
|
return True
|
|
return True
|
|
|
|
|
|
def _proc_ppid(pid: int) -> int | None:
|
|
try:
|
|
for line in Path(f"/proc/{pid}/status").read_text().splitlines():
|
|
if line.startswith("PPid:"):
|
|
return int(line.split()[1])
|
|
except (OSError, ValueError):
|
|
return None
|
|
return None
|
|
|
|
|
|
def _proc_descendants(root_pid: int) -> set[int]:
    """Collect every transitive child pid of *root_pid* by scanning ``/proc``.

    The result never includes *root_pid* itself unless the parent links form
    a cycle back to it. Returns an empty set when ``/proc`` does not exist.
    """
    proc_root = Path("/proc")
    if not proc_root.exists():
        return set()

    # Map each parent pid to the pids that name it as their PPid.
    kids_of: dict[int, list[int]] = {}
    for entry in proc_root.iterdir():
        if entry.name.isdigit():
            child = int(entry.name)
            parent = _proc_ppid(child)
            if parent is not None:
                kids_of.setdefault(parent, []).append(child)

    # Depth-first walk from the root's direct children.
    found: set[int] = set()
    pending = list(kids_of.get(root_pid, []))
    while pending:
        current = pending.pop()
        if current not in found:
            found.add(current)
            pending.extend(kids_of.get(current, []))
    return found
|
|
|
|
|
|
def _terminate_process_tree(root_pid: int, label: str) -> None:
|
|
if root_pid <= 0 or root_pid == os.getpid():
|
|
return
|
|
|
|
pids = _proc_descendants(root_pid) | {root_pid}
|
|
for sig, sig_name, delay in (
|
|
(signal.SIGTERM, "SIGTERM", 1.0),
|
|
(signal.SIGKILL, "SIGKILL", 0.5),
|
|
):
|
|
alive = [pid for pid in sorted(pids, reverse=True) if _pid_exists(pid)]
|
|
if not alive:
|
|
return
|
|
log.warning("%s: sending %s to %d process(es)", label, sig_name, len(alive))
|
|
for pid in alive:
|
|
try:
|
|
os.kill(pid, sig)
|
|
except ProcessLookupError:
|
|
pass
|
|
except OSError as exc:
|
|
log.debug("%s: could not signal pid %d: %s", label, pid, exc)
|
|
time.sleep(delay)
|
|
|
|
alive = [pid for pid in sorted(pids) if _pid_exists(pid)]
|
|
if alive:
|
|
log.warning("%s: process(es) still alive after force close: %s", label, alive)
|
|
|
|
|
|
def _process_cmdline(pid: int) -> str:
|
|
try:
|
|
raw = Path(f"/proc/{pid}/cmdline").read_bytes()
|
|
except OSError:
|
|
return ""
|
|
return raw.replace(b"\0", b" ").decode(errors="replace")
|
|
|
|
|
|
def _profile_in_live_process(profile_dir: Path) -> bool:
    """Report whether any process listed in ``/proc`` mentions *profile_dir*.

    The check is a plain substring match of ``str(profile_dir)`` against each
    process command line. Returns False when ``/proc`` does not exist.
    """
    proc_root = Path("/proc")
    if not proc_root.exists():
        return False

    wanted = str(profile_dir)
    return any(
        entry.name.isdigit() and wanted in _process_cmdline(int(entry.name))
        for entry in proc_root.iterdir()
    )
|
|
|
|
|
|
def _remove_stale_profile_locks(profile_dir: Path) -> None:
    """Delete leftover lock files from *profile_dir* if no process is using it.

    Nothing is touched while a live process still references the profile.
    Removal is best-effort: failures are logged at debug level and skipped.
    """
    if _profile_in_live_process(profile_dir):
        return

    for lock_name in (".parentlock", "parent.lock", "lock"):
        candidate = profile_dir / lock_name
        try:
            # is_symlink() also catches dangling symlinks, for which exists() is False.
            present = candidate.exists() or candidate.is_symlink()
            if not present:
                continue
            candidate.unlink()
            log.warning("Removed stale Zoopla profile lock: %s", candidate)
        except OSError as exc:
            log.debug("Could not remove Zoopla profile lock %s: %s", candidate, exc)
|
|
|
|
|
|
def _exception_detail(exc: BaseException) -> str:
|
|
detail = " ".join(str(exc).split())
|
|
if not detail:
|
|
detail = repr(exc)
|
|
return f"{type(exc).__name__}: {detail}"
|
|
|
|
|
|
class _ManagedCamoufoxBrowser:
|
|
def __init__(self, context_manager, browser):
|
|
self._context_manager = context_manager
|
|
self._browser = browser
|
|
self._closed = False
|
|
|
|
def close(self) -> None:
|
|
if self._closed:
|
|
return
|
|
self._closed = True
|
|
self._browser.close()
|
|
# Camoufox.__exit__ calls browser.close() itself. The context is already
|
|
# closed here, so clear it to avoid a second blocking close attempt.
|
|
self._context_manager.browser = None
|
|
self._context_manager.__exit__(None, None, None)
|
|
|
|
def force_close(self) -> None:
|
|
self._closed = True
|
|
pid = self._driver_pid()
|
|
if pid is None:
|
|
log.warning("Zoopla force-close requested but Playwright driver pid is unknown")
|
|
return
|
|
_terminate_process_tree(pid, "Zoopla browser force-close")
|
|
_remove_stale_profile_locks(_zoopla_profile_dir())
|
|
|
|
def _driver_pid(self) -> int | None:
|
|
connection = getattr(self._context_manager, "_connection", None)
|
|
transport = getattr(connection, "_transport", None)
|
|
proc = getattr(transport, "_proc", None)
|
|
pid = getattr(proc, "pid", None)
|
|
return pid if isinstance(pid, int) else None
|
|
|
|
def __getattr__(self, name):
|
|
return getattr(self._browser, name)
|
|
|
|
|
|
# JavaScript to extract listings from the rendered DOM.
|
|
# Uses data-testid attributes as primary selectors (stable across deployments),
|
|
# then falls back to href-based link matching with parent-walking.
|
|
_EXTRACT_LISTINGS_JS = r"""() => {
|
|
const seen = new Set();
|
|
const results = [];
|
|
|
|
// Strategy 1: Use data-testid selectors (post-2025 redesign)
|
|
const listingCards = document.querySelectorAll(
|
|
'[data-testid="regular-listings"] > div, [data-testid="search-content"] li'
|
|
);
|
|
|
|
for (const card of listingCards) {
|
|
const link = card.querySelector(
|
|
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]'
|
|
);
|
|
if (!link) continue;
|
|
|
|
const href = link.href;
|
|
const match = href.match(/\/details\/(\d+)\//);
|
|
if (!match) continue;
|
|
|
|
const id = match[1];
|
|
if (seen.has(id)) continue;
|
|
seen.add(id);
|
|
|
|
const text = card.innerText || '';
|
|
|
|
// Try data-testid price element first, then regex
|
|
const priceEl = card.querySelector('[data-testid="listing-price"]');
|
|
const priceText = priceEl ? priceEl.innerText : text;
|
|
const priceMatch = priceText.match(/\u00a3([\d,]+)/);
|
|
|
|
// Try address element first, then regex
|
|
const addressEl = card.querySelector('address');
|
|
let address = addressEl ? addressEl.innerText.trim() : '';
|
|
|
|
if (!address) {
|
|
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
|
|
for (const line of lines) {
|
|
if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
|
|
(line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
|
|
address = line;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
|
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
|
const recMatch = text.match(/(\d+)\s*reception/i);
|
|
const areaSqftMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))/i);
|
|
const areaSqmMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))/i);
|
|
|
|
let tenure = '';
|
|
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
|
else if (/freehold/i.test(text)) tenure = 'Freehold';
|
|
|
|
// Extract property type (e.g., "2 bed flat for sale" → "flat")
|
|
let property_type = '';
|
|
const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+for\s+sale/i);
|
|
if (ptMatch) property_type = ptMatch[1].trim();
|
|
else if (/\bstudio\s*(?:flat|apartment)?\s+for\s+sale/i.test(text)) property_type = 'Studio';
|
|
|
|
// Keyword fallback when regex doesn't match current DOM format
|
|
if (!property_type) {
|
|
const lower = text.toLowerCase();
|
|
if (/\bstudio\b/.test(lower)) property_type = 'Studio';
|
|
else if (/\bpenthouse\b/.test(lower)) property_type = 'Penthouse';
|
|
else if (/\bmaisonette\b/.test(lower)) property_type = 'Maisonette';
|
|
else if (/\bapartment\b/.test(lower)) property_type = 'Apartment';
|
|
else if (/\bflat\b/.test(lower)) property_type = 'Flat';
|
|
else if (/\bsemi[- ]?detached\b/.test(lower)) property_type = 'Semi-Detached';
|
|
else if (/\bdetached\b/.test(lower)) property_type = 'Detached';
|
|
else if (/\bterraced?\b/.test(lower)) property_type = 'Terraced';
|
|
else if (/\bbungalow\b/.test(lower)) property_type = 'Bungalow';
|
|
else if (/\bcottage\b/.test(lower)) property_type = 'Cottage';
|
|
else if (/\bhouse\b/.test(lower)) property_type = 'House';
|
|
}
|
|
|
|
results.push({
|
|
id, url: href.replace(window.location.origin, ''),
|
|
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
|
price_text: priceText.trim(),
|
|
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
|
|
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
|
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
|
floor_area_sqft: areaSqftMatch ? parseInt(areaSqftMatch[1].replace(/,/g, '')) : null,
|
|
floor_area_sqm: areaSqmMatch ? parseFloat(areaSqmMatch[1].replace(/,/g, '')) : null,
|
|
address, tenure, property_type,
|
|
});
|
|
}
|
|
|
|
// Strategy 2: Fall back to href-based link matching with parent-walking
|
|
if (results.length === 0) {
|
|
const links = Array.from(document.querySelectorAll(
|
|
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]'
|
|
));
|
|
|
|
for (const link of links) {
|
|
const href = link.href;
|
|
const match = href.match(/\/details\/(\d+)\//);
|
|
if (!match) continue;
|
|
|
|
const id = match[1];
|
|
if (seen.has(id)) continue;
|
|
seen.add(id);
|
|
|
|
let card = link;
|
|
for (let j = 0; j < 15; j++) {
|
|
card = card.parentElement;
|
|
if (!card) break;
|
|
const t = card.innerText || '';
|
|
if (t.includes('\u00a3') && (t.includes('bed') || t.includes('Bath') || t.includes('sq ft'))) {
|
|
break;
|
|
}
|
|
}
|
|
if (!card) continue;
|
|
|
|
const text = card.innerText || '';
|
|
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
|
|
|
|
const priceEl2 = card.querySelector('[data-testid="listing-price"]');
|
|
const priceText2 = priceEl2 ? priceEl2.innerText : text;
|
|
const priceMatch = priceText2.match(/\u00a3([\d,]+)/);
|
|
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
|
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
|
const recMatch = text.match(/(\d+)\s*reception/i);
|
|
const areaSqftMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))/i);
|
|
const areaSqmMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))/i);
|
|
|
|
let address = '';
|
|
for (const line of lines) {
|
|
if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
|
|
(line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
|
|
address = line;
|
|
break;
|
|
}
|
|
}
|
|
|
|
let tenure = '';
|
|
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
|
else if (/freehold/i.test(text)) tenure = 'Freehold';
|
|
|
|
// Extract property type
|
|
let property_type = '';
|
|
const ptMatch2 = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+for\s+sale/i);
|
|
if (ptMatch2) property_type = ptMatch2[1].trim();
|
|
else if (/\bstudio\s*(?:flat|apartment)?\s+for\s+sale/i.test(text)) property_type = 'Studio';
|
|
|
|
// Keyword fallback when regex doesn't match current DOM format
|
|
if (!property_type) {
|
|
const lower = text.toLowerCase();
|
|
if (/\bstudio\b/.test(lower)) property_type = 'Studio';
|
|
else if (/\bpenthouse\b/.test(lower)) property_type = 'Penthouse';
|
|
else if (/\bmaisonette\b/.test(lower)) property_type = 'Maisonette';
|
|
else if (/\bapartment\b/.test(lower)) property_type = 'Apartment';
|
|
else if (/\bflat\b/.test(lower)) property_type = 'Flat';
|
|
else if (/\bsemi[- ]?detached\b/.test(lower)) property_type = 'Semi-Detached';
|
|
else if (/\bdetached\b/.test(lower)) property_type = 'Detached';
|
|
else if (/\bterraced?\b/.test(lower)) property_type = 'Terraced';
|
|
else if (/\bbungalow\b/.test(lower)) property_type = 'Bungalow';
|
|
else if (/\bcottage\b/.test(lower)) property_type = 'Cottage';
|
|
else if (/\bhouse\b/.test(lower)) property_type = 'House';
|
|
}
|
|
|
|
results.push({
|
|
id, url: href.replace(window.location.origin, ''),
|
|
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
|
price_text: priceText2.trim(),
|
|
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
|
|
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
|
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
|
floor_area_sqft: areaSqftMatch ? parseInt(areaSqftMatch[1].replace(/,/g, '')) : null,
|
|
floor_area_sqm: areaSqmMatch ? parseFloat(areaSqmMatch[1].replace(/,/g, '')) : null,
|
|
address, tenure, property_type,
|
|
});
|
|
}
|
|
}
|
|
|
|
return results;
|
|
}"""
|
|
|
|
# JavaScript to dismiss the Usercentrics cookie consent overlay (shadow DOM).
|
|
_DISMISS_COOKIES_JS = """() => {
|
|
const aside = document.querySelector('#usercentrics-cmp-ui');
|
|
if (aside && aside.shadowRoot) {
|
|
const btns = aside.shadowRoot.querySelectorAll('button');
|
|
for (const btn of btns) {
|
|
if (btn.innerText.includes('Accept')) { btn.click(); return true; }
|
|
}
|
|
}
|
|
if (aside) { aside.remove(); return true; }
|
|
return false;
|
|
}"""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Browser lifecycle
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
_FALSE_ENV_VALUES = {"0", "false", "no", "off"}
|
|
_TRUE_ENV_VALUES = {"1", "true", "yes", "on"}
|
|
|
|
|
|
def _env_bool_or_virtual(name: str, default: bool | str) -> bool | str:
|
|
raw = os.environ.get(name)
|
|
if raw is None:
|
|
return default
|
|
|
|
value = raw.strip().lower()
|
|
if value == "virtual":
|
|
return "virtual"
|
|
if value in _TRUE_ENV_VALUES:
|
|
return True
|
|
if value in _FALSE_ENV_VALUES:
|
|
return False
|
|
raise ValueError(
|
|
f"{name} must be one of 1/0, true/false, yes/no, on/off, or virtual"
|
|
)
|
|
|
|
|
|
def _visible_display_available() -> bool:
|
|
if sys.platform.startswith("linux"):
|
|
return bool(os.environ.get("DISPLAY") or os.environ.get("WAYLAND_DISPLAY"))
|
|
return True
|
|
|
|
|
|
def _zoopla_headless_mode() -> bool | str:
    """Resolve the headless setting: ``ZOOPLA_HEADLESS`` or a display-based default.

    The default is a visible browser, so the person running the scrape can
    complete a Cloudflare challenge. When no display is available the default
    flips to headless, and a challenge then fails fast with an actionable
    error instead of waiting for a click nobody can make.
    """
    headless_by_default: bool | str = not _visible_display_available()
    return _env_bool_or_virtual("ZOOPLA_HEADLESS", headless_by_default)
|
|
|
|
|
|
def _zoopla_profile_dir() -> Path:
|
|
raw = os.environ.get("ZOOPLA_PROFILE_DIR")
|
|
if raw:
|
|
return Path(raw).expanduser().resolve()
|
|
return (DATA_DIR / ".runtime" / "zoopla-profile").expanduser().resolve()
|
|
|
|
|
|
def _challenge_timeout_seconds() -> int:
|
|
raw = os.environ.get("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS")
|
|
if raw is None:
|
|
return 300
|
|
try:
|
|
timeout = int(raw)
|
|
except ValueError as exc:
|
|
raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be an integer") from exc
|
|
if timeout < 1:
|
|
raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be greater than zero")
|
|
return timeout
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Gluetun IP rotation
|
|
# ---------------------------------------------------------------------------
|
|
#
|
|
# When Cloudflare Turnstile fires mid-scrape, the cheapest unblocker is to
|
|
# swap the egress IP via Gluetun's HTTP control server. We stop and re-start
|
|
# the VPN, poll until the public IP changes, drop the stale cf_clearance
|
|
# cookies (bound to the previous IP), then reload and re-check the challenge.
|
|
|
|
|
|
_GLUETUN_API_KEY = "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K"
|
|
|
|
|
|
def _gluetun_base_url() -> str:
|
|
return os.environ.get("GLUETUN_URL", "http://gluetun:8000").rstrip("/")
|
|
|
|
|
|
def _gluetun_api_key() -> str | None:
|
|
return _GLUETUN_API_KEY
|
|
|
|
|
|
def _gluetun_max_rotations() -> int:
|
|
raw = os.environ.get("GLUETUN_MAX_ROTATIONS", "3")
|
|
try:
|
|
value = int(raw)
|
|
except ValueError as exc:
|
|
raise ValueError("GLUETUN_MAX_ROTATIONS must be an integer") from exc
|
|
return max(value, 0)
|
|
|
|
|
|
def _gluetun_client() -> httpx.Client:
    """Build an ``httpx.Client`` for the Gluetun control server.

    The ``X-API-Key`` header is attached only when an API key is available.
    The caller is responsible for closing the client.
    """
    key = _gluetun_api_key()
    auth_headers = {"X-API-Key": key} if key else {}
    return httpx.Client(headers=auth_headers)
|
|
|
|
|
|
def _gluetun_public_ip(client: httpx.Client) -> str | None:
    """Ask Gluetun for the current public IP.

    Issues ``GET /v1/publicip/ip`` with a 5 s timeout and reads ``public_ip``
    (or, failing that, ``ip``) from the JSON body. Returns None on transport
    errors, non-200 responses or an unparseable body.
    """
    endpoint = f"{_gluetun_base_url()}/v1/publicip/ip"
    try:
        response = client.get(endpoint, timeout=5.0)
        if response.status_code != 200:
            return None
        payload = response.json()
    except (httpx.HTTPError, ValueError):
        return None
    return payload.get("public_ip") or payload.get("ip")
|
|
|
|
|
|
def _gluetun_set_vpn_status(client: httpx.Client, status: str) -> bool:
    """PUT ``{'status': status}`` to ``/v1/vpn/status``.

    Returns True for any response below HTTP 400. Transport errors and
    4xx/5xx responses are logged as warnings and yield False; 401 gets a
    dedicated hint about the API key's permissions.
    """
    endpoint = f"{_gluetun_base_url()}/v1/vpn/status"
    try:
        resp = client.put(endpoint, json={"status": status}, timeout=15.0)
    except httpx.HTTPError as exc:
        log.warning("Gluetun vpn/status %s failed: %s", status, exc)
        return False

    code = resp.status_code
    if code < 400:
        return True
    if code == 401:
        log.warning(
            "Gluetun vpn/status %s: 401 Unauthorized — the API key must be "
            "authorised for 'PUT /v1/vpn/status' in Gluetun's auth config.toml",
            status,
        )
    else:
        log.warning(
            "Gluetun vpn/status %s returned HTTP %d: %s",
            status, code, resp.text[:200],
        )
    return False
|
|
|
|
|
|
def _rotate_gluetun_ip(wait_seconds: int = 45) -> bool:
    """Restart Gluetun's VPN and wait for the public IP to change.

    Sequence: record the current IP, set the VPN to ``stopped``, pause 2 s,
    set it to ``running``, then poll the public IP every 2 s until it differs
    from the recorded one or *wait_seconds* elapse.

    Whenever the ``running`` request was not confirmed, one extra recovery
    start is attempted on the way out so a failed rotation does not leave the
    VPN down.

    Returns:
        True if a new IP was observed within *wait_seconds*, else False.
    """
    with _gluetun_client() as client:
        previous_ip = _gluetun_public_ip(client)
        log.info("Requesting Gluetun IP rotation (current IP: %s)", previous_ip or "unknown")

        vpn_restarted = False
        try:
            if not _gluetun_set_vpn_status(client, "stopped"):
                return False
            time.sleep(2)
            vpn_restarted = _gluetun_set_vpn_status(client, "running")
            if not vpn_restarted:
                return False

            give_up_at = time.monotonic() + wait_seconds
            while time.monotonic() < give_up_at:
                time.sleep(2)
                current_ip = _gluetun_public_ip(client)
                if current_ip and current_ip != previous_ip:
                    log.info("Gluetun rotated IP: %s -> %s", previous_ip or "?", current_ip)
                    return True
        finally:
            # Reached on every exit path; only acts when the restart was not
            # confirmed (stop failed, start failed, or an exception hit first).
            if not vpn_restarted:
                log.warning(
                    "Gluetun VPN may be stopped after failed rotation; attempting recovery start"
                )
                if not _gluetun_set_vpn_status(client, "running"):
                    log.error(
                        "Gluetun VPN recovery start failed; manual intervention required"
                    )

        log.warning("Gluetun IP did not change within %ds", wait_seconds)
        return False
|
|
|
|
|
|
def _clear_cloudflare_cookies(page) -> None:
|
|
"""Drop cf_clearance / __cf_bm which are bound to the previous egress IP."""
|
|
try:
|
|
context = page.context
|
|
except Exception:
|
|
return
|
|
for name in ("cf_clearance", "__cf_bm"):
|
|
try:
|
|
context.clear_cookies(name=name)
|
|
except Exception as exc:
|
|
log.debug("Could not clear cookie %s: %s", name, exc)
|
|
|
|
|
|
def _rotate_and_retry_challenge(page, max_rotations: int) -> bool:
|
|
"""Rotate IP and reload until the challenge clears. Returns True on success."""
|
|
for attempt in range(1, max_rotations + 1):
|
|
log.warning(
|
|
"Cloudflare Turnstile challenge — rotating Gluetun IP (attempt %d/%d)",
|
|
attempt, max_rotations,
|
|
)
|
|
if not _rotate_gluetun_ip():
|
|
continue
|
|
|
|
_clear_cloudflare_cookies(page)
|
|
|
|
try:
|
|
page.reload(wait_until="domcontentloaded", timeout=30000)
|
|
except Exception as exc:
|
|
log.warning("Reload after IP rotation failed: %s", exc)
|
|
continue
|
|
|
|
time.sleep(2)
|
|
if not _is_turnstile_challenge(page):
|
|
log.info("Cloudflare challenge cleared after Gluetun rotation")
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def _is_turnstile_challenge(page) -> bool:
|
|
try:
|
|
if "just a moment" in page.title().lower():
|
|
return True
|
|
except Exception:
|
|
pass
|
|
|
|
try:
|
|
return bool(
|
|
page.query_selector(
|
|
'iframe[src*="challenges.cloudflare.com"], '
|
|
'input[name="cf-turnstile-response"]'
|
|
)
|
|
)
|
|
except Exception:
|
|
return False
|
|
|
|
|
|
def _wait_for_turnstile(page, headless_mode: bool | str) -> None:
    """Get *page* past a Cloudflare Turnstile challenge, or raise.

    Returns immediately when no challenge is showing. Otherwise:

    1. Try Gluetun IP rotation (skipped when the rotation budget is 0).
    2. In headless or virtual-display mode nobody can click the challenge, so
       give up with an actionable error.
    3. In a visible browser, bring the page to the front and poll every 3 s
       until the challenge disappears or the timeout expires.

    Raises:
        TurnstileError: If the challenge is still present after all of the above.
    """
    if not _is_turnstile_challenge(page):
        return

    # Rotation comes first: it works in any mode and is the only option in
    # headless/unattended runs.
    max_rotations = _gluetun_max_rotations()
    if max_rotations > 0 and _rotate_and_retry_challenge(page, max_rotations):
        return

    profile_dir = _zoopla_profile_dir()
    unattended = headless_mode is True or headless_mode == "virtual"
    if unattended:
        raise TurnstileError(
            "Cloudflare Turnstile persisted after "
            f"{max_rotations} Gluetun IP rotation(s). "
            "Run Zoopla from a desktop session with ZOOPLA_HEADLESS=0 "
            f"to solve manually; the session will be saved in {profile_dir}."
        )

    timeout = _challenge_timeout_seconds()
    log.warning(
        "Gluetun rotation insufficient — falling back to interactive solve. "
        "Complete the Cloudflare challenge in the Zoopla browser window; "
        "waiting up to %ds. Profile: %s",
        timeout,
        profile_dir,
    )
    try:
        page.bring_to_front()
    except Exception:
        # Focusing the window is a convenience only.
        pass

    give_up_at = time.monotonic() + timeout
    while time.monotonic() < give_up_at:
        time.sleep(3)
        if _is_turnstile_challenge(page):
            continue
        log.info("Cloudflare challenge resolved")
        return

    raise TurnstileError(
        f"Cloudflare Turnstile was not completed after {timeout}s"
    )
|
|
|
|
|
|
def launch_browser():
    """Launch Camoufox, open the Zoopla homepage and prepare it for scraping.

    Steps: make sure the Camoufox binary is present, prepare the persistent
    profile directory (removing stale locks), start the browser, navigate to
    the homepage, pass Cloudflare Turnstile and dismiss the cookie consent.

    Returns:
        A ``(browser, page)`` tuple; ``browser`` is a
        ``_ManagedCamoufoxBrowser``. The caller must close the browser when done.

    Raises:
        TurnstileError: If Cloudflare cannot be completed.
        Exception: Any other start-up failure is re-raised after the browser
            has been closed.
    """
    from camoufox.pkgman import camoufox_path

    # Standalone local runs should not require the old container image to have
    # pre-fetched Camoufox.
    camoufox_path(download_if_missing=True)

    from camoufox.sync_api import Camoufox

    headless_mode = _zoopla_headless_mode()
    profile_dir = _zoopla_profile_dir()
    profile_dir.mkdir(parents=True, exist_ok=True)
    _remove_stale_profile_locks(profile_dir)

    log.info(
        "Launching Camoufox browser for Zoopla (headless=%s, profile=%s)...",
        headless_mode,
        profile_dir,
    )
    camoufox = Camoufox(
        headless=headless_mode,
        persistent_context=True,
        user_data_dir=str(profile_dir),
        locale=["en-GB", "en"],
        enable_cache=True,
    )
    raw_browser = camoufox.__enter__()
    browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)

    # From here on the browser is running: every failure, including one while
    # obtaining the page, must go through the cleanup below or it is leaked.
    page = None
    try:
        page = raw_browser.pages[0] if raw_browser.pages else raw_browser.new_page()

        log.info("Navigating to Zoopla homepage...")
        page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000)
        _wait_for_turnstile(page, headless_mode)

        log.info("Zoopla browser ready — title: %s", page.title())
        time.sleep(2)

        # Dismiss cookie consent
        page.evaluate(_DISMISS_COOKIES_JS)
        time.sleep(1)
    except Exception:
        try:
            if page is not None:
                page.close()
        finally:
            browser.close()
        raise

    return browser, page
|
|
|
|
|
|
def _ensure_not_challenged(page) -> None:
    """Run the Turnstile check on *page* using the configured headless mode.

    Returns normally once the page is free of a challenge; a TurnstileError
    from the underlying wait propagates to the caller.
    """
    headless_mode = _zoopla_headless_mode()
    _wait_for_turnstile(page, headless_mode)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Search navigation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _wait_for_listing_content(page) -> None:
|
|
"""Wait for rendered listing cards to contain usable text."""
|
|
try:
|
|
page.wait_for_function(
|
|
"""() => {
|
|
const cards = document.querySelectorAll(
|
|
'[data-testid="regular-listings"] > div'
|
|
);
|
|
if (cards.length === 0) return false;
|
|
for (const card of cards) {
|
|
const t = card.innerText || '';
|
|
if (t.includes('\\u00a3') && t.length > 50) return true;
|
|
}
|
|
return false;
|
|
}""",
|
|
timeout=8000,
|
|
)
|
|
except Exception:
|
|
time.sleep(1.5)
|
|
|
|
|
|
def _navigate_search(page, outcode: str) -> bool:
    """Navigate directly to the sale search results for an outcode.

    After the page loads, the Turnstile check runs, the cookie overlay is
    dismissed (best-effort) and the listing container or a details link is
    awaited for up to 10 s.

    Returns:
        True if listings appear to be present; False if the wait timed out
        and the page has no details link at all.

    Raises:
        RuntimeError: If the navigation itself fails.
        TurnstileError: If Cloudflare blocks us.
    """
    search_url = (
        f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/"
        f"?q={outcode}&search_source=home"
    )
    try:
        page.goto(search_url, wait_until="domcontentloaded", timeout=30000)
    except Exception as exc:
        detail = _exception_detail(exc)
        log.warning("Zoopla direct navigation failed for %s: %s", outcode, detail)
        raise RuntimeError(
            f"Zoopla direct navigation failed for {outcode}: {detail}"
        ) from exc

    _ensure_not_challenged(page)

    # The consent overlay may reappear after navigation; ignore failures.
    try:
        page.evaluate(_DISMISS_COOKIES_JS)
    except Exception:
        pass

    results_selector = (
        '[data-testid="regular-listings"], a[href*="/for-sale/details/"], '
        'a[href*="/new-homes/details/"]'
    )
    try:
        page.wait_for_selector(results_selector, timeout=10000)
    except Exception:
        # Timed out: only carry on if some details link exists anyway.
        any_detail_link = page.query_selector('a[href*="/details/"]')
        if not any_detail_link:
            return False

    _wait_for_listing_content(page)
    return True
|
|
|
|
|
|
def _get_result_count(page) -> int:
|
|
"""Extract the total results count from the page.
|
|
|
|
Tries __ZAD_TARGETING__ JSON first (most reliable), then body text regex
|
|
matching both "N results" and "N properties" patterns."""
|
|
try:
|
|
# Try the ZAD targeting JSON script tag first
|
|
count = page.evaluate("""() => {
|
|
const s = document.querySelector('#__ZAD_TARGETING__');
|
|
if (s) {
|
|
try {
|
|
const d = JSON.parse(s.textContent);
|
|
if (d.search_results_count != null) return d.search_results_count;
|
|
} catch(e) {}
|
|
}
|
|
return null;
|
|
}""")
|
|
if count is not None and count > 0:
|
|
return count
|
|
except Exception:
|
|
pass
|
|
|
|
try:
|
|
body = page.inner_text("body")
|
|
match = re.search(r"([\d,]+)\s+(?:results?|properties)", body)
|
|
if match:
|
|
return int(match.group(1).replace(",", ""))
|
|
except Exception:
|
|
pass
|
|
return 0
|
|
|
|
|
|
def _url_with_page(url: str, page_num: int) -> str:
|
|
parsed = urlparse(url)
|
|
query = [(key, value) for key, value in parse_qsl(parsed.query) if key != "pn"]
|
|
query.append(("pn", str(page_num)))
|
|
return urlunparse(parsed._replace(query=urlencode(query)))
|
|
|
|
|
|
def _find_next_page_url(page) -> str | None:
|
|
"""Return the rendered pagination next URL, if Zoopla exposes one."""
|
|
try:
|
|
href = page.evaluate(
|
|
"""() => {
|
|
const links = Array.from(document.querySelectorAll('a[href]'));
|
|
const next = links.find((link) => {
|
|
const text = (link.innerText || link.textContent || '')
|
|
.trim()
|
|
.toLowerCase();
|
|
const label = (link.getAttribute('aria-label') || '').toLowerCase();
|
|
const rel = (link.getAttribute('rel') || '').toLowerCase();
|
|
return rel.includes('next')
|
|
|| label.includes('next')
|
|
|| text === 'next'
|
|
|| text === 'next page';
|
|
});
|
|
if (!next) return null;
|
|
const href = next.href || '';
|
|
if (!href.includes('/for-sale/') && !href.includes('/new-homes/')) {
|
|
return null;
|
|
}
|
|
return href;
|
|
}"""
|
|
)
|
|
except Exception as exc:
|
|
detail = _exception_detail(exc)
|
|
log.warning("Zoopla next-page detection failed: %s", detail)
|
|
raise RuntimeError(f"Zoopla next-page detection failed: {detail}") from exc
|
|
if not href:
|
|
return None
|
|
return urljoin(ZOOPLA_BASE, href)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Extraction and pagination
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Set once the first extraction has emitted its one-off diagnostic log lines.
_first_extraction_logged = False


def _extract_listings(page) -> list[dict]:
    """Extract listing data from the current search results page DOM.

    On the first call in the process, also logs a diagnostic snapshot (URL,
    title, details-link count, data-testid names, body snippet) and the number
    of listings found. Failure to collect the snapshot is ignored.

    Raises:
        RuntimeError: If the extraction fails.
    """
    global _first_extraction_logged

    diagnostic_js = """() => {
        const details = document.querySelectorAll('a[href*="/details/"]');
        const testids = document.querySelectorAll('[data-testid]');
        const testidNames = [...new Set([...testids].map(e => e.dataset.testid))];
        return {
            url: location.href,
            title: document.title,
            detailLinks: details.length,
            testids: testidNames.slice(0, 30),
            bodySnippet: document.body?.innerText?.slice(0, 500) || '',
        };
    }"""

    try:
        listings = page.evaluate(_EXTRACT_LISTINGS_JS)
        if _first_extraction_logged:
            return listings

        # Very first extraction attempt: record what the page looked like.
        _first_extraction_logged = True
        try:
            snapshot = page.evaluate(diagnostic_js)
            log.info(
                "Zoopla first-page diagnostic: url=%s title=%s detailLinks=%d "
                "testids=%s bodySnippet=%.200s",
                snapshot.get("url"), snapshot.get("title"), snapshot.get("detailLinks", 0),
                snapshot.get("testids", []), snapshot.get("bodySnippet", ""),
            )
        except Exception:
            pass
        log.info("Zoopla first extraction: %d listings found", len(listings))
        return listings
    except Exception as e:
        detail = _exception_detail(e)
        log.warning("Failed to extract listings from DOM: %s", detail)
        raise RuntimeError(f"Zoopla DOM extraction failed: {detail}") from e
|
|
|
|
|
|
def _paginate(
    page,
    total_results: int,
    max_properties: int | None = None,
) -> list[dict]:
    """Collect listings from every page of the current search.

    The first results page must already be loaded in *page*. Each further
    page is reached through the "next" link found by
    ``_find_next_page_url``; when no link is found and the advertised
    *total_results* has not been reached yet, the ``pn=N`` URL built by
    ``_url_with_page`` is used instead.

    Listings are de-duplicated by ``"id"`` across pages 2 and later. When
    *max_properties* is given, at most that many listings are returned.

    Raises:
        TurnstileError: re-raised unchanged from the navigation helpers.
        RuntimeError: if navigation fails, or if a page is empty or contains
            only already-seen listings while fewer than *total_results*
            listings have been collected.
    """
    collected = _extract_listings(page)
    capped = max_properties is not None

    if capped and len(collected) >= max_properties:
        return collected[:max_properties]
    if not collected:
        return collected

    known_ids = {item["id"] for item in collected}
    page_num = 2

    def count_satisfied() -> bool:
        # True once we hold at least as many listings as the page advertised.
        return total_results > 0 and len(collected) >= total_results

    def still_short() -> bool:
        # True while the advertised count says listings are still missing.
        return total_results > len(collected)

    while True:
        target_url = _find_next_page_url(page)
        if not target_url:
            if count_satisfied():
                break
            # No rendered link: fall back to the explicit page parameter.
            target_url = _url_with_page(page.url, page_num)

        time.sleep(DELAY_BETWEEN_PAGES)

        try:
            page.goto(target_url, wait_until="domcontentloaded", timeout=30000)
            _ensure_not_challenged(page)
            _wait_for_listing_content(page)
        except TurnstileError:
            raise
        except Exception as e:
            detail = _exception_detail(e)
            log.warning(
                "Zoopla pagination navigation failed at page %d: %s",
                page_num,
                detail,
            )
            raise RuntimeError(
                f"Zoopla pagination navigation failed at page {page_num}: {detail}"
            ) from e

        batch = _extract_listings(page)
        if not batch:
            if still_short():
                raise RuntimeError(
                    "Zoopla pagination stopped with no listings on page "
                    f"{page_num}; collected {len(collected)} of "
                    f"{total_results} advertised results"
                )
            break

        # Deduplicate within this outcode
        fresh = 0
        for item in batch:
            item_id = item["id"]
            if item_id in known_ids:
                continue
            known_ids.add(item_id)
            collected.append(item)
            fresh += 1
            if capped and len(collected) >= max_properties:
                return collected[:max_properties]

        if not fresh:
            if still_short():
                raise RuntimeError(
                    "Zoopla pagination repeated results on page "
                    f"{page_num}; collected {len(collected)} of "
                    f"{total_results} advertised results"
                )
            break

        page_num += 1

        if count_satisfied() and not _find_next_page_url(page):
            break

    return collected
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Property transformation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Cached outcode → (postcode, lat, lng) lookups to avoid repeated O(n) scans
|
|
# over 2.26M postcodes. Populated lazily on first lookup per outcode.
|
|
_outcode_coords_cache: dict[str, tuple[str, float, float] | None] = {}
|
|
|
|
|
|
def _resolve_outcode_coords(
|
|
outcode: str, pc_coords: dict[str, tuple[float, float]]
|
|
) -> tuple[str, float, float] | None:
|
|
"""Find first postcode + coords for an outcode. Result is cached."""
|
|
if outcode in _outcode_coords_cache:
|
|
return _outcode_coords_cache[outcode]
|
|
|
|
prefix = outcode + " "
|
|
for pcd, (lat, lng) in pc_coords.items():
|
|
if pcd.startswith(prefix) or (
|
|
len(outcode) >= 4
|
|
and pcd.startswith(outcode)
|
|
and len(pcd) > len(outcode)
|
|
):
|
|
_outcode_coords_cache[outcode] = (pcd, lat, lng)
|
|
return (pcd, lat, lng)
|
|
|
|
_outcode_coords_cache[outcode] = None
|
|
return None
|
|
|
|
|
|
def _extract_outcode(text: str) -> str | None:
|
|
"""Extract a UK outcode from address text like 'Whitechapel Road, London E1'."""
|
|
# Look for outcode at end of string or after last comma
|
|
match = re.search(r"\b([A-Z]{1,2}\d[A-Z0-9]?)\s*$", text.strip(), re.IGNORECASE)
|
|
if match:
|
|
return match.group(1).upper()
|
|
# Try after comma
|
|
parts = text.split(",")
|
|
if len(parts) > 1:
|
|
last = parts[-1].strip()
|
|
match = re.match(r"^([A-Z]{1,2}\d[A-Z0-9]?)$", last, re.IGNORECASE)
|
|
if match:
|
|
return match.group(1).upper()
|
|
return None
|
|
|
|
|
|
def _map_property_type(raw_type: str | None) -> str:
    """Translate Zoopla's property type text into a canonical type.

    ``PROPERTY_TYPE_MAP`` is consulted with several spellings of *raw_type*
    in turn; the first truthy hit wins. If none matches, a keyword search on
    the lower-cased text decides. Empty input, and anything that matches
    nothing, yields ``"Other"``.
    """
    if not raw_type:
        return "Other"

    # Spellings are built lazily and tried in this order: as given,
    # Title Case ("town house" -> "Town House"), lower case
    # ("Townhouse" -> "townhouse"), then hyphens/underscores turned into
    # spaces and Title-Cased.
    spellings = (
        lambda value: value,
        lambda value: value.title(),
        lambda value: value.lower(),
        lambda value: re.sub(r"[-_]+", " ", value).strip().title(),
    )
    for respell in spellings:
        mapped = PROPERTY_TYPE_MAP.get(respell(raw_type))
        if mapped:
            return mapped

    # Keyword fallback
    text = raw_type.lower()

    # Flat-like wording that must not be classed as a flat.
    not_really_flats = (
        "block of apartment",
        "house of multiple occupation",
        "private halls",
        "retirement",
        "serviced apartment",
    )
    for phrase in not_really_flats:
        if phrase in text:
            return "Other"

    flat_words = ("flat", "apartment", "maisonette", "studio", "penthouse")
    for word in flat_words:
        if word in text:
            return "Flats/Maisonettes"

    if "detach" in text:
        return "Semi-Detached" if "semi" in text else "Detached"
    if "terrace" in text or "mews" in text:
        return "Terraced"
    if "house" in text:
        return "Detached"
    return "Other"
|
|
|
|
|
|
def _listing_floor_area_sqm(raw: dict) -> float | None:
    """Return the validated floor area of *raw* in square metres, if any.

    ``floor_area_sqm`` is preferred; otherwise ``floor_area_sqft`` is
    converted. A value that cannot be read as a number yields ``None``
    instead of raising, so a malformed optional field does not discard the
    whole listing.
    """
    try:
        if raw.get("floor_area_sqm"):
            area = float(raw["floor_area_sqm"])
        elif raw.get("floor_area_sqft"):
            # Convert sq ft to sq m
            area = float(raw["floor_area_sqft"]) * 0.092903
        else:
            return None
    except (TypeError, ValueError):
        log.debug(
            "Zoopla %s: unparseable floor area sqm=%r sqft=%r",
            raw.get("id", "?"),
            raw.get("floor_area_sqm"),
            raw.get("floor_area_sqft"),
        )
        return None
    return validate_floor_area(round(area, 1))


def transform_property(
    raw: dict,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    search_outcode: str | None = None,
) -> dict | None:
    """Transform a raw Zoopla listing dict into the standard output schema.

    Zoopla search cards do not include coordinates, so lat/lng are resolved
    from the address text, trying in order:

    1. a full postcode found in the address ("address"),
    2. an outcode at the end of the address ("address_outcode"),
    3. *search_outcode*, the outcode being searched ("search_outcode").

    For 2 and 3 the first postcode known for that outcode stands in for the
    listing's postcode. The winning source is reported under
    ``"Postcode source"``.

    Args:
        raw: Listing dict as produced by the DOM extraction.
        pc_index: Not used by this function.
        pc_coords: Mapping of postcode to ``(lat, lng)``.
        search_outcode: Outcode of the search the listing came from.

    Returns:
        The transformed listing, or ``None`` when no coordinates could be
        resolved or they fall outside the accepted bounding box.
    """
    price = parse_int_value(raw.get("price")) or 0

    address = raw.get("address", "") or ""

    # Resolve postcode and coordinates from address
    extracted_postcode = extract_full_postcode(address)
    postcode = extracted_postcode
    postcode_source = "address" if extracted_postcode else None
    lat = lng = None

    if postcode:
        coords = pc_coords.get(postcode)
        if coords:
            lat, lng = coords

    if lat is None:
        # Try outcode-level fallback from address text
        addr_outcode = _extract_outcode(address)
        if addr_outcode:
            result = _resolve_outcode_coords(addr_outcode, pc_coords)
            if result:
                postcode, lat, lng = result
                postcode_source = "address_outcode"

    # Final fallback: use the outcode we know we're searching
    if lat is None and search_outcode:
        result = _resolve_outcode_coords(search_outcode, pc_coords)
        if result:
            postcode, lat, lng = result
            postcode_source = "search_outcode"

    if lat is None or lng is None or not postcode:
        return None

    # Sanity-check the coordinates against a coarse lat/lng bounding box
    # (49..56 N, 7 W..2 E); anything outside it is discarded.
    if not (49 <= lat <= 56 and -7 <= lng <= 2):
        return None

    raw_beds = parse_int_value(raw.get("beds")) or 0
    raw_baths = parse_int_value(raw.get("baths")) or 0
    bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
    bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0
    if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
        log.warning(
            "Zoopla %s: implausible beds=%d baths=%d (capped to 0)",
            raw.get("id", "?"), raw_beds, raw_baths,
        )
    # Parsed like beds/baths so that a textual count cannot break the
    # "bedrooms + receptions" sum below.
    receptions = parse_int_value(raw.get("receptions")) or 0

    floor_area_sqm = _listing_floor_area_sqm(raw)

    listing_id = raw.get("id", "")
    listing_url = raw.get("url", "")
    if listing_url and not listing_url.startswith("http"):
        listing_url = ZOOPLA_BASE + listing_url

    return {
        "id": f"zp_{listing_id}",
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        "Number of bedrooms & living rooms": bedrooms + receptions,
        "lon": lng,
        "lat": lat,
        "Postcode": postcode,
        "Postcode source": postcode_source or "unknown",
        "Extracted postcode": extracted_postcode,
        "Inferred postcode": postcode if postcode_source != "address" else None,
        "Listing raw address": address,
        "Address per Property Register": clean_listing_address(address),
        "Leasehold/Freehold": raw.get("tenure") or None,
        "Property type": _map_property_type(raw.get("property_type")),
        "Property sub-type": normalize_sub_type(raw.get("property_type")),
        "price": price,
        "price_frequency": "",
        "Price qualifier": "",
        "Total floor area (sqm)": floor_area_sqm,
        "Listing URL": listing_url,
        "Listing features": [],
        "first_visible_date": "",
    }
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Top-level search function (called by scraper.py)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def search_outcode(
    page,
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    max_properties: int | None = None,
) -> tuple[list[dict], str | None]:
    """Scrape one outcode's sale listings from Zoopla.

    Using an already-open browser page, this navigates to the outcode's
    search, gathers the raw listings from every results page and converts
    each one with ``transform_property``. Listings whose conversion fails or
    returns nothing are dropped (failures are logged as warnings).

    Returns:
        ``(properties, search_url)`` where ``search_url`` is the page URL
        after pagination; ``([], None)`` when navigation fails or no raw
        listings were extracted.

    Errors raised by the navigation and pagination helpers (for example
    ``TurnstileError`` when Cloudflare blocks the session) propagate to the
    caller.
    """
    if not _navigate_search(page, outcode):
        return [], None

    advertised = _get_result_count(page)

    # Extraction is attempted even when the count is 0: the count regex may
    # simply not match Zoopla's current wording while listings are present.
    raw_listings = _paginate(page, advertised, max_properties=max_properties)

    if not raw_listings:
        if advertised > 0:
            log.debug(
                "Zoopla %s %s: page claims %d results but extraction found 0 — "
                "DOM selectors may need updating",
                outcode, "BUY", advertised,
            )
        return [], None

    def convert(raw: dict) -> dict | None:
        # One bad listing must not abort the whole outcode.
        try:
            return transform_property(
                raw, pc_index, pc_coords, search_outcode=outcode
            )
        except Exception as exc:
            log.warning(
                "Zoopla %s property %s failed to transform: %s",
                outcode,
                raw.get("id", "?"),
                exc,
            )
            return None

    converted = [convert(raw) for raw in raw_listings]
    properties = [item for item in converted if item]
    dropped = len(converted) - len(properties)
    total_raw = len(raw_listings)

    if dropped and not properties:
        # Log a sample raw listing to diagnose which fields are missing
        sample = raw_listings[0]
        log.debug(
            "Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
            "(no postcode/coords). Sample raw: price=%s address=%r",
            outcode, "BUY", total_raw, dropped,
            sample.get("price"), sample.get("address", ""),
        )
    elif dropped > total_raw // 2:
        log.debug(
            "Zoopla %s %s: %d/%d listings dropped in transform",
            outcode, "BUY", dropped, total_raw,
        )

    return properties, page.url
|