1608 lines
60 KiB
Python
1608 lines
60 KiB
Python
"""Zoopla (zoopla.co.uk) scraper — sale properties.

Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
blocks non-browser HTTP clients and even Playwright with stealth patches. Only
Camoufox (an anti-fingerprinting Firefox fork) passes reliably.

Zoopla uses Next.js App Router with React Server Components (RSC). Search
result data is server-rendered in an RSC stream, not available via
__NEXT_DATA__ or a JSON API.

Architecture:
    Unlike the other scrapers, which use HTTP clients per outcode, Zoopla keeps
    a single Camoufox browser alive for the entire scrape. For each outcode, it:
    1. Navigates directly to the sale search URL
    2. Extracts listing data from the rendered DOM
    3. Handles pagination via the ?pn=N parameter

The browser session replaces the cookie/client pattern used by other scrapers.
"""
|
|
|
|
import logging
|
|
import os
|
|
import re
|
|
import signal
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
|
|
|
|
import httpx
|
|
|
|
from constants import (
|
|
DATA_DIR,
|
|
DELAY_BETWEEN_PAGES,
|
|
GLUETUN_API_KEY,
|
|
GLUETUN_CONTROL_URL,
|
|
GLUETUN_MAX_ROTATIONS,
|
|
GLUETUN_PROXY,
|
|
MAX_BEDROOMS,
|
|
PROPERTY_TYPE_MAP,
|
|
ZOOPLA_BASE,
|
|
ZOOPLA_DETAIL_GOTO_TIMEOUT_MS,
|
|
)
|
|
from spatial import PostcodeSpatialIndex
|
|
from transform import (
|
|
build_register_address,
|
|
extract_full_postcode,
|
|
extract_outcode,
|
|
fix_coords,
|
|
normalize_sub_type,
|
|
parse_int_value,
|
|
resolve_listing_postcode,
|
|
validate_floor_area,
|
|
)
|
|
|
|
# Module-level logger; every message in this file is emitted under the
# "zoopla" logger name.
log = logging.getLogger("zoopla")
|
|
|
|
|
|
class TurnstileError(Exception):
    """Signal that a Cloudflare Turnstile challenge could not be cleared."""
|
|
|
|
|
|
def _pid_exists(pid: int) -> bool:
|
|
try:
|
|
os.kill(pid, 0)
|
|
except ProcessLookupError:
|
|
return False
|
|
except PermissionError:
|
|
return True
|
|
return True
|
|
|
|
|
|
def _proc_ppid(pid: int) -> int | None:
|
|
try:
|
|
for line in Path(f"/proc/{pid}/status").read_text().splitlines():
|
|
if line.startswith("PPid:"):
|
|
return int(line.split()[1])
|
|
except (OSError, ValueError):
|
|
return None
|
|
return None
|
|
|
|
|
|
def _proc_descendants(root_pid: int) -> set[int]:
    """Collect every transitive child pid of ``root_pid`` by scanning ``/proc``.

    ``root_pid`` itself is not part of the result. Returns an empty set when
    ``/proc`` does not exist.
    """
    proc_root = Path("/proc")
    if not proc_root.exists():
        return set()

    # Build a parent -> direct children map from one pass over /proc.
    child_map: dict[int, list[int]] = {}
    numeric_entries = (entry for entry in proc_root.iterdir() if entry.name.isdigit())
    for entry in numeric_entries:
        child_pid = int(entry.name)
        parent_pid = _proc_ppid(child_pid)
        if parent_pid is None:
            continue
        child_map.setdefault(parent_pid, []).append(child_pid)

    # Depth-first walk from the root's direct children.
    found: set[int] = set()
    pending = list(child_map.get(root_pid, []))
    while pending:
        current = pending.pop()
        if current not in found:
            found.add(current)
            pending.extend(child_map.get(current, []))
    return found
|
|
|
|
|
|
def _terminate_process_tree(root_pid: int, label: str) -> None:
|
|
if root_pid <= 0 or root_pid == os.getpid():
|
|
return
|
|
|
|
pids = _proc_descendants(root_pid) | {root_pid}
|
|
for sig, sig_name, delay in (
|
|
(signal.SIGTERM, "SIGTERM", 1.0),
|
|
(signal.SIGKILL, "SIGKILL", 0.5),
|
|
):
|
|
alive = [pid for pid in sorted(pids, reverse=True) if _pid_exists(pid)]
|
|
if not alive:
|
|
return
|
|
log.warning("%s: sending %s to %d process(es)", label, sig_name, len(alive))
|
|
for pid in alive:
|
|
try:
|
|
os.kill(pid, sig)
|
|
except ProcessLookupError:
|
|
pass
|
|
except OSError as exc:
|
|
log.debug("%s: could not signal pid %d: %s", label, pid, exc)
|
|
time.sleep(delay)
|
|
|
|
alive = [pid for pid in sorted(pids) if _pid_exists(pid)]
|
|
if alive:
|
|
log.warning("%s: process(es) still alive after force close: %s", label, alive)
|
|
|
|
|
|
def _process_cmdline(pid: int) -> str:
|
|
try:
|
|
raw = Path(f"/proc/{pid}/cmdline").read_bytes()
|
|
except OSError:
|
|
return ""
|
|
return raw.replace(b"\0", b" ").decode(errors="replace")
|
|
|
|
|
|
def _profile_in_live_process(profile_dir: Path) -> bool:
    """Report whether any running process mentions ``profile_dir`` in its argv.

    This is a substring match of ``str(profile_dir)`` against each process
    command line found under ``/proc``. Returns False when ``/proc`` is absent.
    """
    proc_root = Path("/proc")
    if not proc_root.exists():
        return False

    needle = str(profile_dir)
    return any(
        needle in _process_cmdline(int(entry.name))
        for entry in proc_root.iterdir()
        if entry.name.isdigit()
    )
|
|
|
|
|
|
def _remove_stale_profile_locks(profile_dir: Path) -> None:
    """Delete leftover browser lock files from ``profile_dir``.

    Nothing is touched while some live process still references the profile
    directory. Removal failures are logged at debug level and otherwise
    ignored.
    """
    if _profile_in_live_process(profile_dir):
        return

    candidates = [profile_dir / name for name in (".parentlock", "parent.lock", "lock")]
    for lock_path in candidates:
        try:
            # is_symlink() also catches a dangling symlink, for which
            # exists() is False.
            present = lock_path.exists() or lock_path.is_symlink()
            if not present:
                continue
            lock_path.unlink()
            log.warning("Removed stale Zoopla profile lock: %s", lock_path)
        except OSError as exc:
            log.debug("Could not remove Zoopla profile lock %s: %s", lock_path, exc)
|
|
|
|
|
|
def _exception_detail(exc: BaseException) -> str:
|
|
detail = " ".join(str(exc).split())
|
|
if not detail:
|
|
detail = repr(exc)
|
|
return f"{type(exc).__name__}: {detail}"
|
|
|
|
|
|
class _ManagedCamoufoxBrowser:
|
|
def __init__(self, context_manager, browser):
|
|
self._context_manager = context_manager
|
|
self._browser = browser
|
|
self._closed = False
|
|
|
|
def close(self) -> None:
|
|
if self._closed:
|
|
return
|
|
self._closed = True
|
|
self._browser.close()
|
|
# Camoufox.__exit__ calls browser.close() itself. The context is already
|
|
# closed here, so clear it to avoid a second blocking close attempt.
|
|
self._context_manager.browser = None
|
|
self._context_manager.__exit__(None, None, None)
|
|
|
|
def force_close(self) -> None:
|
|
self._closed = True
|
|
pid = self._driver_pid()
|
|
if pid is None:
|
|
log.warning("Zoopla force-close requested but Playwright driver pid is unknown")
|
|
return
|
|
_terminate_process_tree(pid, "Zoopla browser force-close")
|
|
_remove_stale_profile_locks(_zoopla_profile_dir())
|
|
|
|
def _driver_pid(self) -> int | None:
|
|
connection = getattr(self._context_manager, "_connection", None)
|
|
transport = getattr(connection, "_transport", None)
|
|
proc = getattr(transport, "_proc", None)
|
|
pid = getattr(proc, "pid", None)
|
|
return pid if isinstance(pid, int) else None
|
|
|
|
def __getattr__(self, name):
|
|
return getattr(self._browser, name)
|
|
|
|
|
|
# JavaScript to extract listings from the rendered DOM.
|
|
# Uses data-testid attributes as primary selectors (stable across deployments),
|
|
# then falls back to href-based link matching with parent-walking.
|
|
_EXTRACT_LISTINGS_JS = r"""() => {
|
|
const seen = new Set();
|
|
const results = [];
|
|
|
|
// Strategy 1: Use data-testid selectors (post-2025 redesign)
|
|
const listingCards = document.querySelectorAll(
|
|
'[data-testid="regular-listings"] > div, [data-testid="search-content"] li'
|
|
);
|
|
|
|
for (const card of listingCards) {
|
|
const link = card.querySelector(
|
|
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]'
|
|
);
|
|
if (!link) continue;
|
|
|
|
const href = link.href;
|
|
const match = href.match(/\/details\/(\d+)\//);
|
|
if (!match) continue;
|
|
|
|
const id = match[1];
|
|
if (seen.has(id)) continue;
|
|
seen.add(id);
|
|
|
|
const text = card.innerText || '';
|
|
|
|
// Try data-testid price element first, then regex
|
|
const priceEl = card.querySelector('[data-testid="listing-price"]');
|
|
const priceText = priceEl ? priceEl.innerText : text;
|
|
const priceMatch = priceText.match(/\u00a3([\d,]+)/);
|
|
|
|
// Try address element first, then regex
|
|
const addressEl = card.querySelector('address');
|
|
let address = addressEl ? addressEl.innerText.trim() : '';
|
|
|
|
if (!address) {
|
|
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
|
|
for (const line of lines) {
|
|
if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
|
|
(line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
|
|
address = line;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
|
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
|
const recMatch = text.match(/(\d+)\s*reception/i);
|
|
const areaSqftMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))/i);
|
|
const areaSqmMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))/i);
|
|
|
|
let tenure = '';
|
|
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
|
else if (/freehold/i.test(text)) tenure = 'Freehold';
|
|
|
|
// Extract property type (e.g., "2 bed flat for sale" → "flat")
|
|
let property_type = '';
|
|
const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+for\s+sale/i);
|
|
if (ptMatch) property_type = ptMatch[1].trim();
|
|
else if (/\bstudio\s*(?:flat|apartment)?\s+for\s+sale/i.test(text)) property_type = 'Studio';
|
|
|
|
// Keyword fallback when regex doesn't match current DOM format
|
|
if (!property_type) {
|
|
const lower = text.toLowerCase();
|
|
if (/\bstudio\b/.test(lower)) property_type = 'Studio';
|
|
else if (/\bpenthouse\b/.test(lower)) property_type = 'Penthouse';
|
|
else if (/\bmaisonette\b/.test(lower)) property_type = 'Maisonette';
|
|
else if (/\bapartment\b/.test(lower)) property_type = 'Apartment';
|
|
else if (/\bflat\b/.test(lower)) property_type = 'Flat';
|
|
else if (/\bsemi[- ]?detached\b/.test(lower)) property_type = 'Semi-Detached';
|
|
else if (/\bdetached\b/.test(lower)) property_type = 'Detached';
|
|
else if (/\bterraced?\b/.test(lower)) property_type = 'Terraced';
|
|
else if (/\bbungalow\b/.test(lower)) property_type = 'Bungalow';
|
|
else if (/\bcottage\b/.test(lower)) property_type = 'Cottage';
|
|
else if (/\bhouse\b/.test(lower)) property_type = 'House';
|
|
}
|
|
|
|
results.push({
|
|
id, url: href.replace(window.location.origin, ''),
|
|
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
|
price_text: priceText.trim(),
|
|
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
|
|
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
|
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
|
floor_area_sqft: areaSqftMatch ? parseInt(areaSqftMatch[1].replace(/,/g, '')) : null,
|
|
floor_area_sqm: areaSqmMatch ? parseFloat(areaSqmMatch[1].replace(/,/g, '')) : null,
|
|
address, tenure, property_type,
|
|
});
|
|
}
|
|
|
|
// Strategy 2: Fall back to href-based link matching with parent-walking
|
|
if (results.length === 0) {
|
|
const links = Array.from(document.querySelectorAll(
|
|
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]'
|
|
));
|
|
|
|
for (const link of links) {
|
|
const href = link.href;
|
|
const match = href.match(/\/details\/(\d+)\//);
|
|
if (!match) continue;
|
|
|
|
const id = match[1];
|
|
if (seen.has(id)) continue;
|
|
seen.add(id);
|
|
|
|
let card = link;
|
|
for (let j = 0; j < 15; j++) {
|
|
card = card.parentElement;
|
|
if (!card) break;
|
|
const t = card.innerText || '';
|
|
if (t.includes('\u00a3') && (t.includes('bed') || t.includes('Bath') || t.includes('sq ft'))) {
|
|
break;
|
|
}
|
|
}
|
|
if (!card) continue;
|
|
|
|
const text = card.innerText || '';
|
|
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
|
|
|
|
const priceEl2 = card.querySelector('[data-testid="listing-price"]');
|
|
const priceText2 = priceEl2 ? priceEl2.innerText : text;
|
|
const priceMatch = priceText2.match(/\u00a3([\d,]+)/);
|
|
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
|
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
|
const recMatch = text.match(/(\d+)\s*reception/i);
|
|
const areaSqftMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))/i);
|
|
const areaSqmMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))/i);
|
|
|
|
let address = '';
|
|
for (const line of lines) {
|
|
if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
|
|
(line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
|
|
address = line;
|
|
break;
|
|
}
|
|
}
|
|
|
|
let tenure = '';
|
|
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
|
else if (/freehold/i.test(text)) tenure = 'Freehold';
|
|
|
|
// Extract property type
|
|
let property_type = '';
|
|
const ptMatch2 = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+for\s+sale/i);
|
|
if (ptMatch2) property_type = ptMatch2[1].trim();
|
|
else if (/\bstudio\s*(?:flat|apartment)?\s+for\s+sale/i.test(text)) property_type = 'Studio';
|
|
|
|
// Keyword fallback when regex doesn't match current DOM format
|
|
if (!property_type) {
|
|
const lower = text.toLowerCase();
|
|
if (/\bstudio\b/.test(lower)) property_type = 'Studio';
|
|
else if (/\bpenthouse\b/.test(lower)) property_type = 'Penthouse';
|
|
else if (/\bmaisonette\b/.test(lower)) property_type = 'Maisonette';
|
|
else if (/\bapartment\b/.test(lower)) property_type = 'Apartment';
|
|
else if (/\bflat\b/.test(lower)) property_type = 'Flat';
|
|
else if (/\bsemi[- ]?detached\b/.test(lower)) property_type = 'Semi-Detached';
|
|
else if (/\bdetached\b/.test(lower)) property_type = 'Detached';
|
|
else if (/\bterraced?\b/.test(lower)) property_type = 'Terraced';
|
|
else if (/\bbungalow\b/.test(lower)) property_type = 'Bungalow';
|
|
else if (/\bcottage\b/.test(lower)) property_type = 'Cottage';
|
|
else if (/\bhouse\b/.test(lower)) property_type = 'House';
|
|
}
|
|
|
|
results.push({
|
|
id, url: href.replace(window.location.origin, ''),
|
|
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
|
price_text: priceText2.trim(),
|
|
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
|
|
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
|
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
|
floor_area_sqft: areaSqftMatch ? parseInt(areaSqftMatch[1].replace(/,/g, '')) : null,
|
|
floor_area_sqm: areaSqmMatch ? parseFloat(areaSqmMatch[1].replace(/,/g, '')) : null,
|
|
address, tenure, property_type,
|
|
});
|
|
}
|
|
}
|
|
|
|
return results;
|
|
}"""
|
|
|
|
# JavaScript to dismiss the Usercentrics cookie consent overlay (shadow DOM).
# It clicks the first shadow-root button whose text contains "Accept"; if no
# such button is found but the overlay element exists, the element is removed
# from the document instead. Evaluates to true when either action was taken
# and false when the overlay is absent.
_DISMISS_COOKIES_JS = """() => {
    const aside = document.querySelector('#usercentrics-cmp-ui');
    if (aside && aside.shadowRoot) {
        const btns = aside.shadowRoot.querySelectorAll('button');
        for (const btn of btns) {
            if (btn.innerText.includes('Accept')) { btn.click(); return true; }
        }
    }
    if (aside) { aside.remove(); return true; }
    return false;
}"""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Browser lifecycle
# ---------------------------------------------------------------------------
|
|
|
|
|
|
_FALSE_ENV_VALUES = {"0", "false", "no", "off"}
|
|
_TRUE_ENV_VALUES = {"1", "true", "yes", "on"}
|
|
|
|
|
|
def _env_bool_or_virtual(name: str, default: bool | str) -> bool | str:
|
|
raw = os.environ.get(name)
|
|
if raw is None:
|
|
return default
|
|
|
|
value = raw.strip().lower()
|
|
if value == "virtual":
|
|
return "virtual"
|
|
if value in _TRUE_ENV_VALUES:
|
|
return True
|
|
if value in _FALSE_ENV_VALUES:
|
|
return False
|
|
raise ValueError(
|
|
f"{name} must be one of 1/0, true/false, yes/no, on/off, or virtual"
|
|
)
|
|
|
|
|
|
def _visible_display_available() -> bool:
|
|
if sys.platform.startswith("linux"):
|
|
return bool(os.environ.get("DISPLAY") or os.environ.get("WAYLAND_DISPLAY"))
|
|
return True
|
|
|
|
|
|
def _zoopla_headless_mode() -> bool | str:
    """Resolve the headless setting from ``ZOOPLA_HEADLESS``.

    The default prefers a visible browser so the person running the scrape
    can complete a Cloudflare challenge. In display-less Linux shells the
    default is headless, so startup still works and a challenge fails fast
    with an actionable error.
    """
    has_display = _visible_display_available()
    return _env_bool_or_virtual("ZOOPLA_HEADLESS", not has_display)
|
|
|
|
|
|
def _zoopla_profile_dir() -> Path:
|
|
raw = os.environ.get("ZOOPLA_PROFILE_DIR")
|
|
if raw:
|
|
return Path(raw).expanduser().resolve()
|
|
return (DATA_DIR / ".runtime" / "zoopla-profile").expanduser().resolve()
|
|
|
|
|
|
def _challenge_timeout_seconds() -> int:
|
|
raw = os.environ.get("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS")
|
|
if raw is None:
|
|
return 300
|
|
try:
|
|
timeout = int(raw)
|
|
except ValueError as exc:
|
|
raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be an integer") from exc
|
|
if timeout < 1:
|
|
raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be greater than zero")
|
|
return timeout
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gluetun IP rotation
# ---------------------------------------------------------------------------
#
# When Cloudflare Turnstile fires mid-scrape, the cheapest unblocker is to
# swap the egress IP via Gluetun's HTTP control server. We stop and restart
# the VPN, poll until the public IP changes, drop the stale cf_clearance
# cookies (bound to the previous IP), then reload and re-check the challenge.
|
|
|
|
|
|
def _gluetun_base_url() -> str:
    """Return ``GLUETUN_CONTROL_URL`` with any trailing slashes removed."""
    control_url = GLUETUN_CONTROL_URL
    return control_url.rstrip("/")
|
|
|
|
|
|
def _gluetun_api_key() -> str | None:
    """Return the configured ``GLUETUN_API_KEY`` value unchanged."""
    api_key = GLUETUN_API_KEY
    return api_key
|
|
|
|
|
|
def _gluetun_max_rotations() -> int:
    """Return ``GLUETUN_MAX_ROTATIONS`` clamped so it is never negative."""
    configured = GLUETUN_MAX_ROTATIONS
    return configured if configured > 0 else 0
|
|
|
|
|
|
def _gluetun_client() -> httpx.Client:
    """Create an HTTP client for Gluetun's control server.

    The client talks to the control server directly (not through the VPN
    proxy) and sends an ``X-API-Key`` header only when an API key is set.
    """
    api_key = _gluetun_api_key()
    headers = {"X-API-Key": api_key} if api_key else {}
    return httpx.Client(headers=headers)
|
|
|
|
|
|
def _gluetun_public_ip(client: httpx.Client) -> str | None:
    """Ask the Gluetun control server for the current public IP.

    Args:
        client: Client created for the control server.

    Returns:
        The ``public_ip`` (or, failing that, ``ip``) field of the JSON
        response, or None when the request fails, the status is not 200, the
        body is not valid JSON, or the JSON is not an object.
    """
    try:
        resp = client.get(f"{_gluetun_base_url()}/v1/publicip/ip", timeout=5.0)
        if resp.status_code != 200:
            return None
        data = resp.json()
    except (httpx.HTTPError, ValueError):
        return None
    if not isinstance(data, dict):
        # A list/string/number body has no .get(); treat it like any other
        # unusable response instead of raising AttributeError mid-rotation.
        return None
    return data.get("public_ip") or data.get("ip")
|
|
|
|
|
|
def _gluetun_set_vpn_status(client: httpx.Client, status: str) -> bool:
    """PUT ``{'status': status}`` to ``/v1/vpn/status``.

    Returns False (after logging a warning) when the request raises an httpx
    error or the response status is 400 or above; True otherwise.
    """
    endpoint = f"{_gluetun_base_url()}/v1/vpn/status"
    try:
        resp = client.put(endpoint, json={"status": status}, timeout=15.0)
    except httpx.HTTPError as exc:
        log.warning("Gluetun vpn/status %s failed: %s", status, exc)
        return False

    code = resp.status_code
    if code == 401:
        # Called out separately because the remedy is a Gluetun config change.
        log.warning(
            "Gluetun vpn/status %s: 401 Unauthorized — the API key must be "
            "authorised for 'PUT /v1/vpn/status' in Gluetun's auth config.toml",
            status,
        )
        return False
    if code >= 400:
        log.warning(
            "Gluetun vpn/status %s returned HTTP %d: %s",
            status, code, resp.text[:200],
        )
        return False
    return True
|
|
|
|
|
|
def _rotate_gluetun_ip(wait_seconds: int = 45) -> bool:
    """Restart Gluetun's VPN and wait for the public IP to change.

    Args:
        wait_seconds: How long to poll for a new IP after the restart.

    Returns:
        True if an IP different from the one seen before the restart was
        observed within ``wait_seconds``; False otherwise.
    """
    with _gluetun_client() as client:
        previous_ip = _gluetun_public_ip(client)
        log.info("Requesting Gluetun IP rotation (current IP: %s)", previous_ip or "unknown")

        stop_attempted = False
        restart_confirmed = False
        try:
            # Flag is raised before the call: a stop request that errors may
            # still have taken effect, so recovery must run in that case too.
            stop_attempted = True
            if not _gluetun_set_vpn_status(client, "stopped"):
                return False
            time.sleep(2)
            restart_confirmed = _gluetun_set_vpn_status(client, "running")
            if not restart_confirmed:
                return False

            give_up_at = time.monotonic() + wait_seconds
            while time.monotonic() < give_up_at:
                time.sleep(2)
                current_ip = _gluetun_public_ip(client)
                if not current_ip or current_ip == previous_ip:
                    continue
                log.info("Gluetun rotated IP: %s -> %s", previous_ip or "?", current_ip)
                return True
        finally:
            # Never leave the VPN stopped if the restart was not confirmed.
            if stop_attempted and not restart_confirmed:
                log.warning(
                    "Gluetun VPN may be stopped after failed rotation; attempting recovery start"
                )
                if not _gluetun_set_vpn_status(client, "running"):
                    log.error(
                        "Gluetun VPN recovery start failed; manual intervention required"
                    )

        log.warning("Gluetun IP did not change within %ds", wait_seconds)
        return False
|
|
|
|
|
|
def _clear_cloudflare_cookies(page) -> None:
|
|
"""Drop cf_clearance / __cf_bm which are bound to the previous egress IP."""
|
|
try:
|
|
context = page.context
|
|
except Exception:
|
|
return
|
|
for name in ("cf_clearance", "__cf_bm"):
|
|
try:
|
|
context.clear_cookies(name=name)
|
|
except Exception as exc:
|
|
log.debug("Could not clear cookie %s: %s", name, exc)
|
|
|
|
|
|
def _rotate_and_retry_challenge(page, max_rotations: int) -> bool:
|
|
"""Rotate IP and reload until the challenge clears. Returns True on success."""
|
|
for attempt in range(1, max_rotations + 1):
|
|
log.warning(
|
|
"Cloudflare Turnstile challenge — rotating Gluetun IP (attempt %d/%d)",
|
|
attempt, max_rotations,
|
|
)
|
|
if not _rotate_gluetun_ip():
|
|
continue
|
|
|
|
_clear_cloudflare_cookies(page)
|
|
|
|
try:
|
|
page.reload(wait_until="domcontentloaded", timeout=30000)
|
|
except Exception as exc:
|
|
log.warning("Reload after IP rotation failed: %s", exc)
|
|
continue
|
|
|
|
time.sleep(2)
|
|
if not _is_turnstile_challenge(page):
|
|
log.info("Cloudflare challenge cleared after Gluetun rotation")
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def _is_turnstile_challenge(page) -> bool:
|
|
try:
|
|
if "just a moment" in page.title().lower():
|
|
return True
|
|
except Exception:
|
|
pass
|
|
|
|
try:
|
|
return bool(
|
|
page.query_selector(
|
|
'iframe[src*="challenges.cloudflare.com"], '
|
|
'input[name="cf-turnstile-response"]'
|
|
)
|
|
)
|
|
except Exception:
|
|
return False
|
|
|
|
|
|
def _wait_for_turnstile(page, headless_mode: bool | str) -> None:
    """Get past a Cloudflare challenge on ``page``, or raise.

    Returns immediately when no challenge is showing. Otherwise Gluetun IP
    rotation is tried first (it works in any mode and is the only option for
    headless/unattended runs). If that fails, a visible browser waits for a
    person to solve the challenge.

    Args:
        page: Page to inspect.
        headless_mode: True, False or ``"virtual"``; True and ``"virtual"``
            rule out the interactive fallback.

    Raises:
        TurnstileError: The challenge persists in a headless/virtual run, or
            was not solved manually before the timeout.
    """
    if not _is_turnstile_challenge(page):
        return

    max_rotations = _gluetun_max_rotations()
    if max_rotations > 0 and _rotate_and_retry_challenge(page, max_rotations):
        return

    profile_dir = _zoopla_profile_dir()
    unattended = headless_mode is True or headless_mode == "virtual"
    if unattended:
        raise TurnstileError(
            "Cloudflare Turnstile persisted after "
            f"{max_rotations} Gluetun IP rotation(s). "
            "Run Zoopla from a desktop session with ZOOPLA_HEADLESS=0 "
            f"to solve manually; the session will be saved in {profile_dir}."
        )

    timeout = _challenge_timeout_seconds()
    log.warning(
        "Gluetun rotation insufficient — falling back to interactive solve. "
        "Complete the Cloudflare challenge in the Zoopla browser window; "
        "waiting up to %ds. Profile: %s",
        timeout,
        profile_dir,
    )
    try:
        page.bring_to_front()
    except Exception:
        # Raising the window is a convenience only.
        pass

    give_up_at = time.monotonic() + timeout
    while time.monotonic() < give_up_at:
        time.sleep(3)
        if _is_turnstile_challenge(page):
            continue
        log.info("Cloudflare challenge resolved")
        return

    raise TurnstileError(
        f"Cloudflare Turnstile was not completed after {timeout}s"
    )
|
|
|
|
|
|
def launch_browser():
    """Launch Camoufox, open the Zoopla homepage, pass Cloudflare Turnstile
    and dismiss cookie consent.

    Returns:
        A ``(browser, page)`` tuple. The caller must close the browser when
        done.

    Raises:
        TurnstileError: Cloudflare could not be completed.
        Exception: Any other startup failure. The browser is closed before
            the original exception is re-raised; errors raised by that
            cleanup are logged and never replace the original.
    """
    from camoufox.pkgman import camoufox_path

    # Standalone local runs should not require the old container image to have
    # pre-fetched Camoufox.
    camoufox_path(download_if_missing=True)

    from camoufox.sync_api import Camoufox

    headless_mode = _zoopla_headless_mode()
    profile_dir = _zoopla_profile_dir()
    profile_dir.mkdir(parents=True, exist_ok=True)
    _remove_stale_profile_locks(profile_dir)

    # Route the browser through the Gluetun VPN proxy when configured. (geoip
    # fingerprint alignment is intentionally not enabled: it needs the optional
    # camoufox[geoip] extra and would spoof to the VPN exit's country, which
    # fights the en-GB locale unless the exit is in the UK.)
    proxy_options: dict = {}
    if GLUETUN_PROXY:
        proxy_options = {"proxy": {"server": GLUETUN_PROXY}}

    log.info(
        "Launching Camoufox browser for Zoopla (headless=%s, profile=%s, proxy=%s)...",
        headless_mode,
        profile_dir,
        GLUETUN_PROXY or "direct",
    )
    camoufox = Camoufox(
        headless=headless_mode,
        persistent_context=True,
        user_data_dir=str(profile_dir),
        locale=["en-GB", "en"],
        enable_cache=True,
        **proxy_options,
    )
    raw_browser = camoufox.__enter__()
    browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)

    # Page acquisition sits inside the try so that a failure there also
    # closes the browser that was just started.
    page = None
    try:
        page = raw_browser.pages[0] if raw_browser.pages else raw_browser.new_page()

        log.info("Navigating to Zoopla homepage...")
        page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000)
        _wait_for_turnstile(page, headless_mode)

        log.info("Zoopla browser ready — title: %s", page.title())
        time.sleep(2)

        # Dismiss cookie consent
        page.evaluate(_DISMISS_COOKIES_JS)
        time.sleep(1)
    except Exception:
        # Best-effort cleanup. Each step is isolated so a failing close can
        # neither skip the next step nor mask the exception being re-raised
        # (callers rely on seeing TurnstileError).
        if page is not None:
            try:
                page.close()
            except Exception as cleanup_exc:
                log.debug(
                    "Zoopla page close failed during launch cleanup: %s",
                    _exception_detail(cleanup_exc),
                )
        try:
            browser.close()
        except Exception as cleanup_exc:
            log.warning(
                "Zoopla browser close failed during launch cleanup: %s",
                _exception_detail(cleanup_exc),
            )
        raise

    return browser, page
|
|
|
|
|
|
def _ensure_not_challenged(page) -> None:
    """Run the Turnstile check on ``page`` using the configured headless mode.

    Returns once no challenge is showing; ``_wait_for_turnstile`` raises
    TurnstileError otherwise.
    """
    headless_mode = _zoopla_headless_mode()
    _wait_for_turnstile(page, headless_mode)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Search navigation
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _wait_for_listing_content(page) -> None:
|
|
"""Wait for rendered listing cards to contain usable text."""
|
|
try:
|
|
page.wait_for_function(
|
|
"""() => {
|
|
const cards = document.querySelectorAll(
|
|
'[data-testid="regular-listings"] > div'
|
|
);
|
|
if (cards.length === 0) return false;
|
|
for (const card of cards) {
|
|
const t = card.innerText || '';
|
|
if (t.includes('\\u00a3') && t.length > 50) return true;
|
|
}
|
|
return false;
|
|
}""",
|
|
timeout=8000,
|
|
)
|
|
except Exception:
|
|
time.sleep(1.5)
|
|
|
|
|
|
def _navigate_search(page, outcode: str) -> bool:
    """Open the sale search results for ``outcode`` in ``page``.

    Returns:
        True when listing markup (or at least one detail link) showed up;
        False when the page loaded but nothing listing-like appeared.

    Raises:
        RuntimeError: The navigation itself failed.
        TurnstileError: Cloudflare blocked the page.
    """
    search_url = (
        f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/"
        f"?q={outcode}&search_source=home"
    )
    try:
        page.goto(search_url, wait_until="domcontentloaded", timeout=30000)
    except Exception as exc:
        detail = _exception_detail(exc)
        log.warning("Zoopla direct navigation failed for %s: %s", outcode, detail)
        raise RuntimeError(
            f"Zoopla direct navigation failed for {outcode}: {detail}"
        ) from exc

    _ensure_not_challenged(page)

    # Dismiss cookie consent (may reappear after navigation)
    try:
        page.evaluate(_DISMISS_COOKIES_JS)
    except Exception:
        pass

    results_selector = (
        '[data-testid="regular-listings"], a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]'
    )
    try:
        page.wait_for_selector(results_selector, timeout=10000)
    except Exception:
        # Last chance: any detail link at all still counts as "has results".
        any_detail_link = page.query_selector('a[href*="/details/"]')
        if not any_detail_link:
            return False

    _wait_for_listing_content(page)
    return True
|
|
|
|
|
|
def _get_result_count(page) -> int:
|
|
"""Extract the total results count from the page.
|
|
|
|
Tries __ZAD_TARGETING__ JSON first (most reliable), then body text regex
|
|
matching both "N results" and "N properties" patterns."""
|
|
try:
|
|
# Try the ZAD targeting JSON script tag first
|
|
count = page.evaluate("""() => {
|
|
const s = document.querySelector('#__ZAD_TARGETING__');
|
|
if (s) {
|
|
try {
|
|
const d = JSON.parse(s.textContent);
|
|
if (d.search_results_count != null) return d.search_results_count;
|
|
} catch(e) {}
|
|
}
|
|
return null;
|
|
}""")
|
|
if count is not None and count > 0:
|
|
return count
|
|
except Exception:
|
|
pass
|
|
|
|
try:
|
|
body = page.inner_text("body")
|
|
match = re.search(r"([\d,]+)\s+(?:results?|properties)", body)
|
|
if match:
|
|
return int(match.group(1).replace(",", ""))
|
|
except Exception:
|
|
pass
|
|
return 0
|
|
|
|
|
|
def _url_with_page(url: str, page_num: int) -> str:
|
|
parsed = urlparse(url)
|
|
query = [(key, value) for key, value in parse_qsl(parsed.query) if key != "pn"]
|
|
query.append(("pn", str(page_num)))
|
|
return urlunparse(parsed._replace(query=urlencode(query)))
|
|
|
|
|
|
def _find_next_page_url(page) -> str | None:
|
|
"""Return the rendered pagination next URL, if Zoopla exposes one."""
|
|
try:
|
|
href = page.evaluate(
|
|
"""() => {
|
|
const links = Array.from(document.querySelectorAll('a[href]'));
|
|
const next = links.find((link) => {
|
|
const text = (link.innerText || link.textContent || '')
|
|
.trim()
|
|
.toLowerCase();
|
|
const label = (link.getAttribute('aria-label') || '').toLowerCase();
|
|
const rel = (link.getAttribute('rel') || '').toLowerCase();
|
|
return rel.includes('next')
|
|
|| label.includes('next')
|
|
|| text === 'next'
|
|
|| text === 'next page';
|
|
});
|
|
if (!next) return null;
|
|
const href = next.href || '';
|
|
if (!href.includes('/for-sale/') && !href.includes('/new-homes/')) {
|
|
return null;
|
|
}
|
|
return href;
|
|
}"""
|
|
)
|
|
except Exception as exc:
|
|
detail = _exception_detail(exc)
|
|
log.warning("Zoopla next-page detection failed: %s", detail)
|
|
raise RuntimeError(f"Zoopla next-page detection failed: {detail}") from exc
|
|
if not href:
|
|
return None
|
|
return urljoin(ZOOPLA_BASE, href)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Extraction and pagination
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Set to True once the one-off first-extraction diagnostic has been logged.
_first_extraction_logged = False


def _extract_listings(page) -> list[dict]:
    """Extract listing data from the current search results page DOM.

    On the first call in the process, a diagnostic snapshot of the page (URL,
    title, detail-link count, data-testid names, body snippet) is also
    logged. A failure while gathering that snapshot is ignored.

    Raises:
        RuntimeError: The extraction itself failed.
    """
    global _first_extraction_logged
    try:
        listings = page.evaluate(_EXTRACT_LISTINGS_JS)
        if _first_extraction_logged:
            return listings

        # Log diagnostic info on the very first extraction attempt only.
        _first_extraction_logged = True
        try:
            diag = page.evaluate("""() => {
                const details = document.querySelectorAll('a[href*="/details/"]');
                const testids = document.querySelectorAll('[data-testid]');
                const testidNames = [...new Set([...testids].map(e => e.dataset.testid))];
                return {
                    url: location.href,
                    title: document.title,
                    detailLinks: details.length,
                    testids: testidNames.slice(0, 30),
                    bodySnippet: document.body?.innerText?.slice(0, 500) || '',
                };
            }""")
            log.info(
                "Zoopla first-page diagnostic: url=%s title=%s detailLinks=%d "
                "testids=%s bodySnippet=%.200s",
                diag.get("url"), diag.get("title"), diag.get("detailLinks", 0),
                diag.get("testids", []), diag.get("bodySnippet", ""),
            )
        except Exception:
            pass
        log.info("Zoopla first extraction: %d listings found", len(listings))
        return listings
    except Exception as e:
        detail = _exception_detail(e)
        log.warning("Failed to extract listings from DOM: %s", detail)
        raise RuntimeError(f"Zoopla DOM extraction failed: {detail}") from e
|
|
|
|
|
|
def _paginate(
    page,
    total_results: int,
    max_properties: int | None = None,
    fetch_detail=None,
    detail_cap: int = 0,
    detail_state: dict | None = None,
    detail_deadline: float | None = None,
) -> list[dict]:
    """Extract listings from all pages of search results.

    Page 1 is already loaded. For subsequent pages, follow Zoopla's rendered
    next link when present, otherwise advance via the pn=N URL parameter while
    the advertised result count says more listings remain.

    When ``fetch_detail`` is supplied, each listing has its detail page fetched
    (up to ``detail_cap`` fresh loads per outcode, counted in the shared
    ``detail_state`` dict, and only until ``detail_deadline``, a
    ``time.monotonic()`` timestamp) and the parsed geo stored under
    ``listing['_detail']`` for ``transform_property``. The detail page is the
    only source of the listing's UPRN, full street address and precise
    postcode, so it is fetched even when the search card already pins a full
    postcode. Cached detail results are always attached but cost neither a cap
    slot nor a delay.

    ``max_properties`` truncates the result; listings beyond it never have
    their detail page loaded. Listings whose card carries no URL are kept but
    get no ``_detail`` entry.

    Raises ``RuntimeError`` when navigating to a later page fails, or when a
    later page is empty / contains only already-seen listings while fewer than
    ``total_results`` listings have been collected. ``TurnstileError`` raised
    during navigation propagates unchanged."""

    def _maybe_fetch(listing: dict) -> None:
        if fetch_detail is None or detail_state is None:
            return
        url = listing.get("url", "")
        if not url:
            # Without a detail link there is nothing to load; fetching "" would
            # hit the site root and cache a bogus result under an empty key.
            return
        cached = _detail_cache_key(url) in _detail_cache
        if not cached:
            # Fresh loads are bounded by the per-outcode cap and the wall-clock
            # deadline so detail fetching never starves the SIGALRM budget that
            # also guards the search pagination for this outcode.
            if detail_state["fetched"] >= detail_cap:
                return
            if detail_deadline is not None and time.monotonic() >= detail_deadline:
                return
        listing["_detail"] = fetch_detail(url)
        if not cached:
            detail_state["fetched"] += 1
            time.sleep(DELAY_BETWEEN_PAGES)

    all_listings = _extract_listings(page)

    # Apply the property cap BEFORE loading detail pages so cap slots, deadline
    # budget and politeness delays are not spent on listings we then discard.
    first_page_is_enough = (
        max_properties is not None and len(all_listings) >= max_properties
    )
    if first_page_is_enough:
        all_listings = all_listings[:max_properties]

    for listing in all_listings:
        _maybe_fetch(listing)

    if first_page_is_enough or not all_listings:
        return all_listings

    seen_ids = {listing["id"] for listing in all_listings}
    page_num = 2

    while True:
        next_url = _find_next_page_url(page)
        if not next_url:
            # No rendered next link: stop once the advertised total is reached,
            # otherwise guess the next page via the pn=N parameter. This is the
            # single termination check for the "all results collected" case.
            if total_results > 0 and len(all_listings) >= total_results:
                break
            next_url = _url_with_page(page.url, page_num)

        time.sleep(DELAY_BETWEEN_PAGES)

        try:
            page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
            _ensure_not_challenged(page)
            _wait_for_listing_content(page)
        except TurnstileError:
            # Cloudflare blocks must reach the caller untouched.
            raise
        except Exception as e:
            detail = _exception_detail(e)
            log.warning(
                "Zoopla pagination navigation failed at page %d: %s",
                page_num,
                detail,
            )
            raise RuntimeError(
                f"Zoopla pagination navigation failed at page {page_num}: {detail}"
            ) from e

        page_listings = _extract_listings(page)
        if not page_listings:
            if total_results > len(all_listings):
                raise RuntimeError(
                    "Zoopla pagination stopped with no listings on page "
                    f"{page_num}; collected {len(all_listings)} of "
                    f"{total_results} advertised results"
                )
            break

        # Deduplicate within this outcode
        new_count = 0
        for listing in page_listings:
            if listing["id"] not in seen_ids:
                seen_ids.add(listing["id"])
                all_listings.append(listing)
                _maybe_fetch(listing)
                new_count += 1
                if max_properties is not None and len(all_listings) >= max_properties:
                    return all_listings[:max_properties]

        if new_count == 0:
            # A page of pure repeats means pagination is looping.
            if total_results > len(all_listings):
                raise RuntimeError(
                    "Zoopla pagination repeated results on page "
                    f"{page_num}; collected {len(all_listings)} of "
                    f"{total_results} advertised results"
                )
            break

        page_num += 1

    return all_listings
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Property transformation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Cached outcode → (postcode, lat, lng) lookups to avoid repeated O(n) scans
|
|
# over 2.26M postcodes. Populated lazily on first lookup per outcode.
|
|
_outcode_coords_cache: dict[str, tuple[str, float, float] | None] = {}
|
|
|
|
|
|
def _resolve_outcode_coords(
|
|
outcode: str, pc_coords: dict[str, tuple[float, float]]
|
|
) -> tuple[str, float, float] | None:
|
|
"""Find first postcode + coords for an outcode. Result is cached."""
|
|
if outcode in _outcode_coords_cache:
|
|
return _outcode_coords_cache[outcode]
|
|
|
|
prefix = outcode + " "
|
|
for pcd, (lat, lng) in pc_coords.items():
|
|
if pcd.startswith(prefix) or (
|
|
len(outcode) >= 4
|
|
and pcd.startswith(outcode)
|
|
and len(pcd) > len(outcode)
|
|
):
|
|
_outcode_coords_cache[outcode] = (pcd, lat, lng)
|
|
return (pcd, lat, lng)
|
|
|
|
_outcode_coords_cache[outcode] = None
|
|
return None
|
|
|
|
|
|
def _extract_outcode(text: str) -> str | None:
|
|
"""Extract a UK outcode from address text like 'Whitechapel Road, London E1'."""
|
|
# Look for outcode at end of string or after last comma
|
|
match = re.search(r"\b([A-Z]{1,2}\d[A-Z0-9]?)\s*$", text.strip(), re.IGNORECASE)
|
|
if match:
|
|
return match.group(1).upper()
|
|
# Try after comma
|
|
parts = text.split(",")
|
|
if len(parts) > 1:
|
|
last = parts[-1].strip()
|
|
match = re.match(r"^([A-Z]{1,2}\d[A-Z0-9]?)$", last, re.IGNORECASE)
|
|
if match:
|
|
return match.group(1).upper()
|
|
return None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Detail-page geocoding
|
|
# ---------------------------------------------------------------------------
|
|
#
|
|
# Zoopla search result cards only expose an outcode-level display address (e.g.
|
|
# "South Street, Bromley BR1"); the full postcode and precise coordinates exist
|
|
# only on each listing's detail page (/for-sale/details/{id}/). The detail page
|
|
# is a Next.js App Router route whose React Server Components flight stream
|
|
# embeds the property's own location object, e.g.
|
|
# "location":{"outcode":"NR29","coordinates":{"latitude":52.716,"longitude":1.614},
|
|
# "uprn":"10023461458","postalCode":"NR29 4RG",...}
|
|
# plus a twin "address":{"fullAddress":...,"latitude":...,"longitude":...,
|
|
# "outcode":...,"postcode":...,"uprn":...} feeding the map widgets.
|
|
# Nearby points of interest (stations, schools, EV chargers) and comparable
|
|
# listings carry their own "coordinates" too, but never inside the property's
|
|
# own "location" / "address":{"fullAddress" wrapper — so the wrapper, not a
|
|
# loose coordinates object, is what we anchor on (see parse_detail_geo).
|
|
|
|
# listingId -> parsed detail dict (or None). Failures are cached too, so a
|
|
# broken listing is not re-fetched within a run (the same listing reappears
|
|
# across overlapping outcode searches).
|
|
_detail_cache: dict[str, dict | None] = {}
|
|
|
|
_LISTING_ID_RE = re.compile(r"/details/(\d+)/?")
|
|
|
|
# The property's own location is carried by a `"location":{...}` wrapper and a
|
|
# twin `"address":{"fullAddress":...}` widget object. We anchor on those
|
|
# wrappers (and capture their full object body, which contains exactly one
|
|
# nested object — `coordinates`) rather than scanning for loose coordinate
|
|
# objects: nearby points of interest (stations/schools/EV chargers) and
|
|
# comparable/"similar" listings also embed coordinates, but never inside the
|
|
# property's own `"location"` / `"address":{"fullAddress"` wrapper, so the
|
|
# wrapper is the discriminator. Field order and an optional `uprn` are tolerated.
|
|
_DETAIL_LOCATION_RE = re.compile(r'"location":\{((?:[^{}]|\{[^{}]*\})*)\}')
|
|
_DETAIL_ADDRESS_RE = re.compile(r'"address":\{"fullAddress":"([^"]*)"((?:[^{}]|\{[^{}]*\})*)\}')
|
|
_DETAIL_COORDS_IN_BODY_RE = re.compile(
|
|
r'"coordinates":\{"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)\}'
|
|
)
|
|
_DETAIL_LATLNG_IN_BODY_RE = re.compile(
|
|
r'"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)'
|
|
)
|
|
_DETAIL_OUTCODE_IN_BODY_RE = re.compile(r'"outcode":"([A-Z0-9]+)"')
|
|
# The location object spells it "postalCode"; the address twin uses "postcode".
|
|
_DETAIL_POSTCODE_IN_BODY_RE = re.compile(r'"(?:postalCode|postcode)":"([A-Z0-9 ]+)"')
|
|
# The UPRN (Unique Property Reference Number) appears in both the location and
|
|
# address objects and is the linchpin for an exact listing->EPC join (EPC open
|
|
# data is ~99% UPRN-keyed). propertyNumberOrName carries the house number/name
|
|
# (e.g. "12", "Martham Mill") only in the location object.
|
|
_DETAIL_UPRN_IN_BODY_RE = re.compile(r'"uprn":"(\d+)"')
|
|
_DETAIL_NUMBER_OR_NAME_IN_BODY_RE = re.compile(r'"propertyNumberOrName":"([^"]*)"')
|
|
|
|
|
|
def parse_detail_geo(html: str, search_outcode: str | None = None) -> dict | None:
    """Parse the property's own location out of a Zoopla detail page.

    Browser-free by design: the caller supplies the page HTML as a string
    (``page.content()``) and this function only does text parsing, which keeps
    it unit-testable.

    The result is a dict with the keys ``lat``, ``lng``, ``postcode``,
    ``outcode``, ``source``, ``uprn``, ``number_or_name`` and ``full_address``.
    ``lat``/``lng`` are always set; ``source`` is ``"detail_location"`` or
    ``"detail_address_obj"`` depending on which wrapper supplied the
    coordinates; the remaining fields may be ``None``. ``None`` is returned
    when ``html`` is empty or no wrapper with usable coordinates is found.

    ``uprn`` allows an exact listing->EPC join, while ``number_or_name`` and
    ``full_address`` provide a register-style address for the Price Paid join.
    Coordinates must fall inside the England bounding box, and a postcode is
    kept only if it agrees with the outcode of the object it came from.
    ``search_outcode`` is merely a tie-break between several ``location``
    objects on pages that also embed comparable listings. The data model is
    described in the "Detail-page geocoding" comment block above the regex
    constants."""
    if not html:
        return None

    # The RSC flight payload sits inside escaped JS string literals; undo the
    # quote and slash escaping so the wrapper regexes can match.
    buf = html.replace('\\"', '"').replace("\\u002F", "/").replace("\\/", "/")

    def england_coords(lat_text: str, lng_text: str) -> tuple[float, float] | None:
        # Normalise via fix_coords, then bounds-check against England.
        lat, lng = fix_coords(float(lat_text), float(lng_text))
        if not (49 <= lat <= 56 and -7 <= lng <= 2):
            return None
        return lat, lng

    def first_group(pattern, body: str) -> str | None:
        found = pattern.search(body)
        return found.group(1) if found else None

    def build(body: str, coords, source: str, full_address: str | None = None) -> dict:
        # Every field is read from the same object body as the coordinates, so
        # the postcode should agree with the outcode; discard it if it doesn't.
        outcode = first_group(_DETAIL_OUTCODE_IN_BODY_RE, body)
        postcode_text = first_group(_DETAIL_POSTCODE_IN_BODY_RE, body)
        postcode = (
            extract_full_postcode(postcode_text) if postcode_text is not None else None
        )
        if postcode and outcode and extract_outcode(postcode) != outcode.upper():
            postcode = None
        name_text = first_group(_DETAIL_NUMBER_OR_NAME_IN_BODY_RE, body)
        number_or_name = (name_text.strip() or None) if name_text is not None else None
        lat, lng = coords[0], coords[1]
        return {
            "lat": lat,
            "lng": lng,
            "postcode": postcode,
            "outcode": outcode,
            "source": source,
            "uprn": first_group(_DETAIL_UPRN_IN_BODY_RE, body),
            "number_or_name": number_or_name,
            "full_address": full_address,
        }

    def attach_full_address(result: dict | None) -> dict | None:
        # The street address with house number is carried by the `address`
        # map-widget twin, not by the `location` wrapper. Use the twin whose
        # uprn equals this property's uprn. Without a uprn to compare, take the
        # first twin (document order = primary listing). When a uprn exists but
        # no twin matches, leave the address unset instead of risking a
        # comparable listing's address.
        if result is None or result.get("full_address"):
            return result
        wanted_uprn = result.get("uprn")
        fallback = None
        for twin in _DETAIL_ADDRESS_RE.finditer(buf):
            street = twin.group(1)
            if not street:
                continue
            if fallback is None:
                fallback = street
            twin_uprn = _DETAIL_UPRN_IN_BODY_RE.search(twin.group(2))
            if wanted_uprn and twin_uprn and twin_uprn.group(1) == wanted_uprn:
                result["full_address"] = street
                return result
        if wanted_uprn is None:
            result["full_address"] = fallback
        return result

    # Strategy 1: the `location` wrapper (authoritative). The primary listing
    # precedes comparables in the flight stream, so the first usable wrapper is
    # the default, but one whose outcode equals the searched outcode wins.
    primary = None
    for wrapper in _DETAIL_LOCATION_RE.finditer(buf):
        body = wrapper.group(1)
        point = _DETAIL_COORDS_IN_BODY_RE.search(body)
        coords = england_coords(point.group(1), point.group(2)) if point else None
        if not coords:
            continue
        candidate = build(body, coords, "detail_location")
        if primary is None:
            primary = candidate
        candidate_outcode = candidate["outcode"]
        if (
            search_outcode
            and candidate_outcode
            and candidate_outcode.upper() == search_outcode.upper()
        ):
            return attach_full_address(candidate)
    if primary is not None:
        return attach_full_address(primary)

    # Strategy 2: fall back to the `address` map-widget twin.
    for twin in _DETAIL_ADDRESS_RE.finditer(buf):
        body = twin.group(2)
        point = _DETAIL_LATLNG_IN_BODY_RE.search(body)
        if not point:
            continue
        coords = england_coords(point.group(1), point.group(2))
        if coords:
            return build(
                body,
                coords,
                "detail_address_obj",
                full_address=twin.group(1) or None,
            )

    return None
|
|
|
|
|
|
def _detail_cache_key(listing_url: str) -> str:
    """Return the detail-cache key for ``listing_url``.

    The key is the numeric listing id captured by ``_LISTING_ID_RE``; when the
    URL contains no such id, the URL itself is used as the key."""
    found = _LISTING_ID_RE.search(listing_url)
    if found is None:
        return listing_url
    return found.group(1)
|
|
|
|
|
|
def _fetch_listing_detail(
    detail_page,
    listing_url: str,
    search_outcode: str | None = None,
) -> dict | None:
    """Navigate ``detail_page`` to a listing and return its parsed geo dict.

    The outcome is stored in ``_detail_cache`` under ``_detail_cache_key`` and
    served from there on later calls; a failed load is stored as ``None`` too.
    Relative listing URLs are prefixed with ``ZOOPLA_BASE``.

    Navigation and parsing errors are logged at debug level and turned into
    ``None`` so the caller can fall back to outcode-level resolution. A
    ``TurnstileError`` is re-raised instead, keeping the scraper's
    "Cloudflare ends the run" contract intact. The navigation timeout is
    ``ZOOPLA_DETAIL_GOTO_TIMEOUT_MS`` so that a single slow detail page cannot
    consume the per-outcode budget."""
    cache_key = _detail_cache_key(listing_url)
    if cache_key in _detail_cache:
        return _detail_cache[cache_key]

    if listing_url.startswith("http"):
        url = listing_url
    else:
        url = ZOOPLA_BASE + listing_url

    try:
        detail_page.goto(
            url,
            wait_until="domcontentloaded",
            timeout=ZOOPLA_DETAIL_GOTO_TIMEOUT_MS,
        )
        _ensure_not_challenged(detail_page)
        parsed = parse_detail_geo(detail_page.content(), search_outcode=search_outcode)
    except TurnstileError:
        # Must be listed before the generic handler so it is not swallowed.
        raise
    except Exception as exc:
        log.debug("Zoopla detail fetch failed %s: %s", url, _exception_detail(exc))
        parsed = None

    _detail_cache[cache_key] = parsed
    return parsed
|
|
|
|
|
|
def _map_property_type(raw_type: str | None) -> str:
    """Translate Zoopla's property-type text into the canonical type.

    ``PROPERTY_TYPE_MAP`` is consulted first with progressively normalised
    spellings of ``raw_type``; if none is mapped, keyword rules decide.
    Empty/``None`` input and anything unrecognised yield ``"Other"``."""
    if not raw_type:
        return "Other"

    def lookup_keys():
        # Produced lazily, in priority order.
        yield raw_type  # exact (handles Rightmove-style capitalised values)
        yield raw_type.title()  # "town house" -> "Town House"
        yield raw_type.lower()  # "Townhouse" -> "townhouse"
        # underscores/hyphens -> spaces, then title-case
        yield re.sub(r"[-_]+", " ", raw_type).strip().title()

    for key in lookup_keys():
        canonical = PROPERTY_TYPE_MAP.get(key)
        if canonical:
            return canonical

    # Keyword fallback on the lower-cased text.
    lower = raw_type.lower()

    def mentions(*words: str) -> bool:
        return any(word in lower for word in words)

    # Flat-like wording that must not be classified as an ordinary flat.
    if mentions(
        "block of apartment",
        "house of multiple occupation",
        "private halls",
        "retirement",
        "serviced apartment",
    ):
        return "Other"
    if mentions("flat", "apartment", "maisonette", "studio", "penthouse"):
        return "Flats/Maisonettes"
    if mentions("detach"):
        return "Semi-Detached" if mentions("semi") else "Detached"
    if mentions("terrace", "mews"):
        return "Terraced"
    # A bare "house" with no other qualifier is treated as detached.
    return "Detached" if mentions("house") else "Other"
|
|
|
|
|
|
def transform_property(
    raw: dict,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    search_outcode: str | None = None,
    detail: dict | None = None,
) -> dict | None:
    """Transform a raw Zoopla listing dict into the standard output schema.

    Zoopla search cards only expose an outcode-level address, so precise
    location comes from the listing's detail page (see ``parse_detail_geo`` /
    ``_fetch_listing_detail``), passed in as ``detail``. Location is resolved
    by the first strategy that succeeds:

    (A) detail-page coordinates -> nearest postcode via ``pc_index``
        (mirroring rightmove/onthemarket);
    (B) detail-page postcode looked up in ``pc_coords``;
    (C) a full postcode in the search-card address looked up in ``pc_coords``;
    (D) the outcode from the card address, then ``search_outcode``, resolved
        with ``_resolve_outcode_coords`` (the first postcode found in that
        outcode, so per-listing precision is lost).

    Returns ``None`` when no postcode/coordinates could be resolved or the
    coordinates fall outside the England bounding box. Optional numeric fields
    (receptions, floor area) that cannot be parsed degrade to ``0`` / ``None``
    instead of discarding the listing."""

    def _as_float(value) -> float | None:
        # Optional numeric field: unparsable text must not sink the listing.
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    price = parse_int_value(raw.get("price")) or 0

    address = raw.get("address", "") or ""

    extracted_postcode = extract_full_postcode(address)
    detail = detail or {}
    detail_postcode = extract_full_postcode(detail.get("postcode"))
    # Detail-page address fields: the UPRN keys an exact EPC join, and the
    # full street address / house number-or-name beat the outcode-level card
    # address for the Price-Paid join. All three are absent unless the detail
    # page was fetched, so every consumer must tolerate None.
    detail_uprn = detail.get("uprn") or None
    detail_full_address = detail.get("full_address") or None
    detail_number_or_name = detail.get("number_or_name") or None

    postcode = postcode_source = inferred_postcode = None
    lat = lng = None

    # (A) Best: detail-page coordinates -> nearest postcode (authoritative).
    detail_lat, detail_lng = detail.get("lat"), detail.get("lng")
    if detail_lat is not None and detail_lng is not None:
        fixed_lat, fixed_lng = fix_coords(detail_lat, detail_lng)
        if 49 <= fixed_lat <= 56 and -7 <= fixed_lng <= 2:
            nearest = pc_index.nearest(fixed_lat, fixed_lng)
            if nearest:
                lat, lng, inferred_postcode = fixed_lat, fixed_lng, nearest
                candidate = detail_postcode or extracted_postcode
                postcode, resolved_source = resolve_listing_postcode(candidate, nearest)
                postcode_source = (
                    "detail_address"
                    if resolved_source == "address"
                    else "detail_coordinates"
                )

    # (B) Detail-page postcode without usable coordinates -> geocode it.
    if lat is None and detail_postcode and detail_postcode in pc_coords:
        lat, lng = pc_coords[detail_postcode]
        postcode = inferred_postcode = detail_postcode
        postcode_source = "detail_address"

    # (C) Full postcode in the search-card address -> geocode it.
    if lat is None and extracted_postcode and extracted_postcode in pc_coords:
        lat, lng = pc_coords[extracted_postcode]
        postcode = extracted_postcode
        postcode_source = "address"

    # (D) Last resort: outcode-level position (loses per-listing precision).
    if lat is None:
        addr_outcode = _extract_outcode(address)
        if addr_outcode:
            result = _resolve_outcode_coords(addr_outcode, pc_coords)
            if result:
                postcode, lat, lng = result
                postcode_source = "address_outcode"

    if lat is None and search_outcode:
        result = _resolve_outcode_coords(search_outcode, pc_coords)
        if result:
            postcode, lat, lng = result
            postcode_source = "search_outcode"

    if lat is None or lng is None or not postcode:
        return None

    # Validate coordinates are in England
    if not (49 <= lat <= 56 and -7 <= lng <= 2):
        return None

    raw_beds = parse_int_value(raw.get("beds")) or 0
    raw_baths = parse_int_value(raw.get("baths")) or 0
    bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
    bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0
    if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
        log.warning(
            "Zoopla %s: implausible beds=%d baths=%d (capped to 0)",
            raw.get("id", "?"), raw_beds, raw_baths,
        )
    # Parsed like beds/baths so a textual count ("2") cannot raise a TypeError
    # in the bedrooms + receptions sum below.
    receptions = parse_int_value(raw.get("receptions")) or 0

    # Floor area: use the square-metre figure when present, otherwise convert
    # square feet to square metres.
    raw_sqm = raw.get("floor_area_sqm")
    if raw_sqm:
        area = _as_float(raw_sqm)
    else:
        sqft = raw.get("floor_area_sqft")
        sqft_value = _as_float(sqft) if sqft else None
        area = sqft_value * 0.092903 if sqft_value is not None else None
    floor_area_sqm = validate_floor_area(round(area, 1)) if area is not None else None

    listing_id = raw.get("id", "")
    listing_url = raw.get("url", "")
    if listing_url and not listing_url.startswith("http"):
        listing_url = ZOOPLA_BASE + listing_url

    return {
        "id": f"zp_{listing_id}",
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        "Number of bedrooms & living rooms": bedrooms + receptions,
        "lon": lng,
        "lat": lat,
        "Postcode": postcode,
        "Postcode source": postcode_source or "unknown",
        "Extracted postcode": extracted_postcode,
        "Inferred postcode": (
            inferred_postcode
            if inferred_postcode is not None
            else (postcode if postcode_source != "address" else None)
        ),
        "Listing raw address": detail_full_address or address,
        "Address per Property Register": build_register_address(
            detail_full_address or address, detail_number_or_name
        ),
        "UPRN": detail_uprn,
        "Property number or name": detail_number_or_name,
        "Leasehold/Freehold": raw.get("tenure") or None,
        "Property type": _map_property_type(raw.get("property_type")),
        "Property sub-type": normalize_sub_type(raw.get("property_type")),
        "price": price,
        "price_frequency": "",
        "Price qualifier": "",
        "Total floor area (sqm)": floor_area_sqm,
        "Listing URL": listing_url,
        "Listing features": [],
        "first_visible_date": "",
    }
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Top-level search function (called by scraper.py)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def search_outcode(
    page,
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    max_properties: int | None = None,
    detail_page=None,
    detail_cap: int = 0,
    detail_budget_seconds: float | None = None,
) -> tuple[list[dict], str | None]:
    """Search Zoopla for sale properties in a single outcode.

    ``page`` is a live Camoufox Page (from launch_browser). The function drives
    the search flow, pulls listings out of the rendered DOM across all result
    pages and converts them to the standard output schema.

    Detail pages are loaded only when both ``detail_page`` (a second browser
    tab) and a positive ``detail_cap`` are given; then at most ``detail_cap``
    listings per outcode get their detail page fetched for a precise postcode
    (see ``_fetch_listing_detail``). ``detail_budget_seconds`` bounds the
    wall-clock time spent on those fetches so the per-outcode timeout, which
    also covers search pagination, is not starved.

    Returns ``(properties, search_url)``; ``([], None)`` when the search could
    not be opened or no listing was extracted.

    Raises TurnstileError if Cloudflare blocks us mid-session.
    """
    if not _navigate_search(page, outcode):
        return [], None

    total_results = _get_result_count(page)

    details_enabled = detail_page is not None and detail_cap > 0

    def load_detail(url):
        return _fetch_listing_detail(detail_page, url, search_outcode=outcode)

    detail_deadline = None
    if details_enabled and detail_budget_seconds is not None:
        detail_deadline = time.monotonic() + detail_budget_seconds

    # Extraction is attempted even when the result count is 0: the count regex
    # may not match Zoopla's current wording while listings are still in the DOM.
    raw_listings = _paginate(
        page,
        total_results,
        max_properties=max_properties,
        fetch_detail=load_detail if details_enabled else None,
        detail_cap=detail_cap,
        detail_state={"fetched": 0},
        detail_deadline=detail_deadline,
    )
    if not raw_listings:
        if total_results > 0:
            log.debug(
                "Zoopla %s %s: page claims %d results but extraction found 0 — "
                "DOM selectors may need updating",
                outcode, "BUY", total_results,
            )
        return [], None

    properties = []
    for raw in raw_listings:
        try:
            transformed = transform_property(
                raw,
                pc_index,
                pc_coords,
                search_outcode=outcode,
                detail=raw.get("_detail"),
            )
        except Exception as exc:
            # One bad listing must not abort the outcode; it counts as dropped.
            log.warning(
                "Zoopla %s property %s failed to transform: %s",
                outcode,
                raw.get("id", "?"),
                exc,
            )
            continue
        if transformed:
            properties.append(transformed)

    # Every raw listing is either kept or dropped.
    dropped = len(raw_listings) - len(properties)

    if dropped and not properties:
        # Log a sample raw listing to diagnose which fields are missing
        sample = raw_listings[0]
        log.debug(
            "Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
            "(no postcode/coords). Sample raw: price=%s address=%r",
            outcode, "BUY", len(raw_listings), dropped,
            sample.get("price"), sample.get("address", ""),
        )
    elif dropped > len(raw_listings) // 2:
        log.debug(
            "Zoopla %s %s: %d/%d listings dropped in transform",
            outcode, "BUY", dropped, len(raw_listings),
        )

    return properties, page.url
|