diff --git a/Dockerfile b/Dockerfile
index 4f6c9a1..481d385 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
# Stage 1: Build frontend
-FROM node:20-slim AS frontend
+FROM node:22-slim AS frontend
WORKDIR /app/frontend
COPY frontend/package.json frontend/package-lock.json ./
RUN npm ci
@@ -7,7 +7,7 @@ COPY frontend/ ./
RUN npm run build:no-prerender
# Stage 2: Build Rust server
-FROM rust:1.83-bookworm AS server
+FROM rust:1.84-bookworm AS server
WORKDIR /app
COPY server-rs/ server-rs/
WORKDIR /app/server-rs
diff --git a/finder/Dockerfile b/finder/Dockerfile
index c975550..00c0344 100644
--- a/finder/Dockerfile
+++ b/finder/Dockerfile
@@ -5,9 +5,14 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
WORKDIR /app
COPY pyproject.toml ./
RUN uv pip install --system -r pyproject.toml
-RUN playwright install --with-deps chromium
+RUN playwright install-deps firefox
+RUN camoufox fetch \
+ && python -c "from camoufox.pkgman import camoufox_path; p = camoufox_path(download_if_missing=False); print('Camoufox verified at', p)"
COPY *.py ./
COPY property-data/arcgis_data.parquet /data/arcgis_data.parquet
+HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
+ CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:1234/health')"
+
CMD ["python3", "main.py"]
diff --git a/finder/constants.py b/finder/constants.py
index 1f863e5..3c75f9d 100644
--- a/finder/constants.py
+++ b/finder/constants.py
@@ -31,6 +31,11 @@ SCRAPE_OPENRENT = os.environ.get("SCRAPE_OPENRENT", "true").lower() in (
"true",
"yes",
)
+SCRAPE_ZOOPLA = os.environ.get("SCRAPE_ZOOPLA", "true").lower() in (
+ "1",
+ "true",
+ "yes",
+)
# URL to trigger server data reload after scrape (e.g. http://server:8001/api/reload)
RELOAD_URL = os.environ.get("RELOAD_URL", "")
@@ -47,6 +52,9 @@ HOMECOUK_PER_PAGE = 30 # max supported by the API
# OpenRent
OPENRENT_BASE = "https://www.openrent.co.uk"
+# Zoopla
+ZOOPLA_BASE = "https://www.zoopla.co.uk"
+
PROPERTY_TYPE_MAP = {
"Detached": "Detached",
"Semi-Detached": "Semi-Detached",
diff --git a/finder/main.py b/finder/main.py
index 3174d00..b68f824 100644
--- a/finder/main.py
+++ b/finder/main.py
@@ -14,6 +14,7 @@ from constants import (
SCRAPE_HOMECOUK,
SCRAPE_OPENRENT,
SCRAPE_RIGHTMOVE,
+ SCRAPE_ZOOPLA,
)
from homecouk import load_cookies as load_homecouk_cookies
from openrent import load_cookies as load_openrent_cookies
@@ -48,6 +49,16 @@ log.setLevel(logging.DEBUG)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)
+
+# Suppress noisy /metrics and /health request logs from werkzeug
+class _NoiseFilter(logging.Filter):
+    """Logging filter that drops records for the frequently polled endpoints."""
+
+    # Substrings looked for in the rendered message; any hit drops the record.
+    _NOISY_FRAGMENTS = ("GET /metrics", "GET /health")
+
+    def filter(self, record):
+        """Return False (drop) if the rendered message mentions a noisy request."""
+        rendered = record.getMessage()
+        return not any(fragment in rendered for fragment in self._NOISY_FRAGMENTS)
+
+
+logging.getLogger("werkzeug").addFilter(_NoiseFilter())
+
# ---------------------------------------------------------------------------
# Startup: load data
# ---------------------------------------------------------------------------
@@ -55,13 +66,14 @@ logging.getLogger("httpcore").setLevel(logging.WARNING)
log.info("Loading arcgis data...")
OUTCODES = load_outcodes()
PC_INDEX = build_postcode_index()
-PC_COORDS = build_postcode_coords() if SCRAPE_OPENRENT else None
+PC_COORDS = build_postcode_coords() if (SCRAPE_OPENRENT or SCRAPE_ZOOPLA) else None
log.info(
- "Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s)",
+ "Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s, zoopla=%s)",
len(OUTCODES),
SCRAPE_RIGHTMOVE,
SCRAPE_HOMECOUK,
SCRAPE_OPENRENT,
+ SCRAPE_ZOOPLA,
)
# ---------------------------------------------------------------------------
@@ -121,6 +133,11 @@ if SCHEDULE_HOUR >= 0:
app = Flask(__name__)
+@app.route("/health")
+def health():
+    """Liveness endpoint: always answers the body ``ok`` with HTTP status 200.
+
+    It performs no checks of its own, so a 200 only shows that the Flask
+    app is serving requests."""
+    return "ok", 200
+
+
@app.route("/run", methods=["POST"])
def trigger_run():
if _start_scrape():
@@ -147,6 +164,7 @@ def get_status():
"rightmove": status.rm_properties,
"homecouk": status.hk_properties,
"openrent": status.or_properties,
+ "zoopla": status.zp_properties,
},
"errors": status.errors[-20:], # last 20 errors
"elapsed_seconds": round(elapsed, 1),
@@ -167,8 +185,10 @@ def get_debug():
"scrape_rightmove": SCRAPE_RIGHTMOVE,
"scrape_homecouk": SCRAPE_HOMECOUK,
"scrape_openrent": SCRAPE_OPENRENT,
+ "scrape_zoopla": SCRAPE_ZOOPLA,
"homecouk_cookies_available": hk_cookies is not None,
"openrent_cookies_available": or_cookies is not None,
+ "zoopla_note": "browser-based (Camoufox), no cookies needed",
}
)
diff --git a/finder/metrics.py b/finder/metrics.py
index 134cc7f..df8ae26 100644
--- a/finder/metrics.py
+++ b/finder/metrics.py
@@ -109,6 +109,28 @@ openrent_properties_scraped = Counter(
["channel"],
)
+# ---------------------------------------------------------------------------
+# Counters — Zoopla
+# ---------------------------------------------------------------------------
+
+# Incremented once per search-result page whose listings were extracted.
+# The "channel" label is "buy" or "rent".
+zoopla_pages_scraped = Counter(
+    "zoopla_pages_scraped",
+    "Search result pages scraped from Zoopla",
+    ["channel"],
+)
+
+# The "type" label names the failure category, e.g. "extract_failed" when
+# evaluating the DOM-extraction script raises.
+zoopla_errors_total = Counter(
+    "zoopla_errors_total",
+    "Zoopla scraping errors",
+    ["type"],
+)
+
+# Incremented per listing that was successfully transformed to the output
+# schema. The "channel" label is "buy" or "rent".
+zoopla_properties_scraped = Counter(
+    "zoopla_properties_scraped",
+    "Properties scraped from Zoopla (before dedup)",
+    ["channel"],
+)
+
+
# ---------------------------------------------------------------------------
# Counters — FlareSolverr / cookie management
# ---------------------------------------------------------------------------
@@ -138,3 +160,8 @@ openrent_enabled = Gauge(
"openrent_enabled",
"Whether OpenRent scraping is currently active (1=yes, 0=no)",
)
+
+zoopla_enabled = Gauge(
+ "zoopla_enabled",
+ "Whether Zoopla scraping is currently active (1=yes, 0=no)",
+)
diff --git a/finder/scraper.py b/finder/scraper.py
index 8728008..b5e9d3e 100644
--- a/finder/scraper.py
+++ b/finder/scraper.py
@@ -17,6 +17,7 @@ from constants import (
SCRAPE_HOMECOUK,
SCRAPE_OPENRENT,
SCRAPE_RIGHTMOVE,
+ SCRAPE_ZOOPLA,
SEED,
)
from homecouk import CookiesExpiredError
@@ -35,12 +36,16 @@ from metrics import (
scrape_outcodes_total,
scrape_properties_total,
scrape_state,
+ zoopla_enabled,
)
from openrent import WafChallengeError
from openrent import load_cookies as load_openrent_cookies
from openrent import make_client as make_openrent_client
from openrent import search_outcode as openrent_search_outcode
from rightmove import resolve_outcode_id, search_outcode
+from zoopla import TurnstileError
+from zoopla import launch_browser as launch_zoopla_browser
+from zoopla import search_outcode as zoopla_search_outcode
from spatial import PostcodeSpatialIndex
from storage import write_parquet
@@ -60,6 +65,7 @@ class ScrapeStatus:
rm_properties: int = 0
hk_properties: int = 0
or_properties: int = 0
+ zp_properties: int = 0
errors: list[str] = field(default_factory=list)
started_at: float = 0.0
finished_at: float = 0.0
@@ -93,6 +99,9 @@ def _sync_gauges() -> None:
scrape_properties_total.labels(channel=ch, source="openrent").set(
status.or_properties
)
+ scrape_properties_total.labels(channel=ch, source="zoopla").set(
+ status.zp_properties
+ )
if status.started_at:
end = status.finished_at if status.finished_at else time.time()
scrape_elapsed_seconds.set(end - status.started_at)
@@ -191,7 +200,7 @@ def run_scrape(
random.seed(SEED)
random.shuffle(shuffled)
- if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT:
+ if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT and not SCRAPE_ZOOPLA:
log.warning("All scrapers disabled — nothing to do")
with status_lock:
status.state = "done"
@@ -239,8 +248,27 @@ def run_scrape(
)
openrent_enabled.set(0)
- # Build postcode coords if OpenRent is active and caller didn't provide them
- if or_client and pc_coords is None:
+ # Zoopla: uses Camoufox browser (no cookies/client pattern)
+ zp_browser = None
+ zp_page = None
+ zp_failed = False
+ if not SCRAPE_ZOOPLA:
+ log.info("Zoopla scraping DISABLED (SCRAPE_ZOOPLA=false)")
+ zoopla_enabled.set(0)
+ else:
+ try:
+ zp_browser, zp_page = launch_zoopla_browser()
+ log.info("Zoopla scraping ENABLED (Camoufox browser launched)")
+ zoopla_enabled.set(1)
+ except TurnstileError:
+ log.warning("Zoopla Cloudflare Turnstile failed — disabling Zoopla")
+ zoopla_enabled.set(0)
+ except Exception as e:
+ log.warning("Zoopla browser launch failed: %s — disabling Zoopla", e)
+ zoopla_enabled.set(0)
+
+ # Build postcode coords if OpenRent/Zoopla is active and caller didn't provide them
+ if (or_client or zp_page) and pc_coords is None:
pc_coords = build_postcode_coords()
try:
@@ -256,6 +284,8 @@ def run_scrape(
hk_dedup_count = 0 # home.co.uk skipped as cross-source duplicates
or_count = 0 # OpenRent properties this channel
or_dedup_count = 0 # OpenRent skipped as cross-source duplicates
+ zp_count = 0 # Zoopla properties this channel
+ zp_dedup_count = 0 # Zoopla skipped as cross-source duplicates
with status_lock:
status.channel = channel_name
@@ -264,6 +294,7 @@ def run_scrape(
status.rm_properties = 0
status.hk_properties = 0
status.or_properties = 0
+ status.zp_properties = 0
channel_start = time.time()
prev_prop_milestone = 0 # last 10k milestone we logged
@@ -412,6 +443,63 @@ def run_scrape(
with status_lock:
status.errors.append(msg)
+ # --- Zoopla ---
+ if zp_page and not zp_failed:
+ made_requests = True
+ try:
+ zp_props = zoopla_search_outcode(
+ zp_page,
+ outcode,
+ channel_name,
+ pc_index,
+ pc_coords,
+ )
+ for p in zp_props:
+ pid = p["id"]
+ key = _dedup_key(p)
+ if pid in all_properties or key in seen_dedup_keys:
+ zp_dedup_count += 1
+ cross_source_dedup_total.labels(
+ channel="buy" if channel_name == "BUY" else "rent",
+ ).inc()
+ continue
+ all_properties[pid] = p
+ seen_dedup_keys.add(key)
+ zp_count += 1
+ if zp_props:
+ log.info(
+ "Zoopla %s: +%d properties", outcode, len(zp_props)
+ )
+ except TurnstileError:
+ log.warning(
+ "Zoopla Cloudflare challenge failed — attempting browser relaunch"
+ )
+ try:
+ zp_browser.close()
+ except Exception:
+ pass
+ try:
+ zp_browser, zp_page = launch_zoopla_browser()
+ log.info("Zoopla browser relaunched, continuing")
+ except Exception:
+ log.warning(
+ "Browser relaunch failed, disabling Zoopla for rest of scrape"
+ )
+ zp_page = None
+ zp_browser = None
+ zp_failed = True
+ zoopla_enabled.set(0)
+ with status_lock:
+ status.errors.append(
+ "Zoopla Cloudflare challenge failed and browser relaunch failed"
+ )
+ except Exception as e:
+ msg = f"Error scraping Zoopla {outcode}/{channel_name}: {e}"
+ log.error(msg)
+ scrape_errors_total.labels(source="zoopla").inc()
+ with status_lock:
+ status.errors.append(msg)
+
with status_lock:
if channel_name == "BUY":
status.properties_buy = len(all_properties)
@@ -420,6 +508,7 @@ def run_scrape(
status.rm_properties = rm_count
status.hk_properties = hk_count
status.or_properties = or_count
+ status.zp_properties = zp_count
_sync_gauges()
# Log progress every 100 outcodes
@@ -444,12 +533,13 @@ def run_scrape(
if current_milestone > prev_prop_milestone:
prev_prop_milestone = current_milestone
log.info(
- "%s %dk properties (rm: %d, hk: %d, or: %d) at outcode %d/%d [%s]",
+ "%s %dk properties (rm: %d, hk: %d, or: %d, zp: %d) at outcode %d/%d [%s]",
channel_name,
current_milestone * 10,
rm_count,
hk_count,
or_count,
+ zp_count,
done,
len(shuffled),
_fmt_elapsed(elapsed),
@@ -472,13 +562,14 @@ def run_scrape(
_sync_gauges()
log.info(
- "=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, cross-dedup: %d) ===",
+ "=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, zp: %d, cross-dedup: %d) ===",
channel_name,
len(deduped),
rm_count,
hk_count,
or_count,
- hk_dedup_count + or_dedup_count,
+ zp_count,
+ hk_dedup_count + or_dedup_count + zp_dedup_count,
)
with status_lock:
@@ -525,3 +616,8 @@ def run_scrape(
hk_client.close()
if or_client:
or_client.close()
+ if zp_browser:
+ try:
+ zp_browser.close()
+ except Exception:
+ pass
diff --git a/finder/storage.py b/finder/storage.py
index 1004bee..9854188 100644
--- a/finder/storage.py
+++ b/finder/storage.py
@@ -25,7 +25,11 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
if fvd:
try:
dt = datetime.fromisoformat(fvd.replace("Z", "+00:00"))
- listing_dates.append(dt.replace(tzinfo=None))
+ # Convert to UTC naive datetime for consistent storage
+ if dt.tzinfo is not None:
+ from datetime import timezone
+ dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
+ listing_dates.append(dt)
except (ValueError, TypeError):
listing_dates.append(None)
else:
diff --git a/finder/zoopla.py b/finder/zoopla.py
new file mode 100644
index 0000000..4cddc17
--- /dev/null
+++ b/finder/zoopla.py
@@ -0,0 +1,520 @@
+"""Zoopla (zoopla.co.uk) scraper — buy and rental properties.
+
+Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
+blocks all HTTP clients (curl_cffi, httpx) and even Playwright with stealth
+patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
+
+Zoopla uses Next.js App Router with React Server Components (RSC). Search
+result data is server-rendered in an RSC stream, not available via
+__NEXT_DATA__ or a JSON API. URL-based location slugs return 0 results —
+the working flow requires typing into the autocomplete input, selecting a
+suggestion, and clicking Search.
+
+Architecture:
+ Unlike the other scrapers which use HTTP clients per outcode, Zoopla keeps
+ a single Camoufox browser alive for the entire scrape. For each outcode, it:
+ 1. Clears and types the outcode into the search input
+ 2. Selects the first autocomplete suggestion
+ 3. Clicks Search
+ 4. Extracts listing data from the rendered DOM
+ 5. Handles pagination via ?pn=N parameter
+
+ The browser session replaces the cookie/client pattern used by other scrapers.
+"""
+
+import logging
+import re
+import time
+
+from constants import DELAY_BETWEEN_PAGES, PROPERTY_TYPE_MAP, ZOOPLA_BASE
+from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
+from spatial import PostcodeSpatialIndex
+
+log = logging.getLogger("zoopla")
+
+
+class TurnstileError(Exception):
+    """Raised when Cloudflare Turnstile challenge cannot be passed.
+
+    In this module that means the page title still contains
+    "Just a moment" after the polling loop in ``launch_browser`` or
+    ``_ensure_not_challenged`` has run out of attempts."""
+
+
+# Maximum search result pages to scrape per outcode (25 listings/page)
+MAX_PAGES_PER_OUTCODE = 10
+
+# JavaScript to extract listings from the rendered DOM.
+# Finds all detail links, walks up to the card container, and parses
+# price, beds, baths, receptions, floor area, address, and tenure from the
+# card text. Each listing ID is emitted at most once per page.
+_EXTRACT_LISTINGS_JS = r"""() => {
+ const links = Array.from(document.querySelectorAll(
+ 'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
+ ));
+
+ const seen = new Set();
+ const results = [];
+
+ for (const link of links) {
+ const href = link.href;
+ const match = href.match(/\/details\/(\d+)\//);
+ if (!match) continue;
+
+ const id = match[1];
+ if (seen.has(id)) continue;
+ seen.add(id);
+
+ // Walk up to the listing card container
+ let card = link;
+ for (let j = 0; j < 10; j++) {
+ card = card.parentElement;
+ if (!card) break;
+ const text = card.innerText || '';
+ if (text.includes('\u00a3') && (text.includes('bed') || text.includes('sq ft'))) {
+ break;
+ }
+ }
+ if (!card) continue;
+
+ const text = card.innerText || '';
+ const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
+
+ const priceMatch = text.match(/\u00a3([\d,]+)/);
+ const bedsMatch = text.match(/(\d+)\s*beds?/i);
+ const bathsMatch = text.match(/(\d+)\s*baths?/i);
+ const recMatch = text.match(/(\d+)\s*reception/i);
+ const areaMatch = text.match(/([\d,]+)\s*sq\s*ft/i);
+
+ let address = '';
+ for (const line of lines) {
+ if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
+ (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
+ address = line;
+ break;
+ }
+ }
+
+ let tenure = '';
+ if (/freehold/i.test(text)) tenure = 'Freehold';
+ else if (/leasehold/i.test(text)) tenure = 'Leasehold';
+
+ results.push({
+ id: id,
+ url: href.replace(window.location.origin, ''),
+ price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
+ beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
+ baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
+ receptions: recMatch ? parseInt(recMatch[1]) : null,
+ floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
+ address: address,
+ tenure: tenure,
+ });
+ }
+
+ return results;
+}"""
+
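+# JavaScript to dismiss the Usercentrics cookie consent overlay (shadow DOM).
+# Clicks the first button whose text contains "Accept" if one is found in the
+# shadow root; otherwise removes the overlay element. Evaluates to true when
+# either action was taken, false when no overlay is present.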
+_DISMISS_COOKIES_JS = """() => {
+ const aside = document.querySelector('#usercentrics-cmp-ui');
+ if (aside && aside.shadowRoot) {
+ const btns = aside.shadowRoot.querySelectorAll('button');
+ for (const btn of btns) {
+ if (btn.innerText.includes('Accept')) { btn.click(); return true; }
+ }
+ }
+ if (aside) { aside.remove(); return true; }
+ return false;
+}"""
+
+
+# ---------------------------------------------------------------------------
+# Browser lifecycle
+# ---------------------------------------------------------------------------
+
+
+def launch_browser():
+    """Launch Camoufox, navigate to Zoopla homepage, pass Cloudflare Turnstile,
+    and dismiss cookie consent.
+
+    Returns:
+        ``(browser, page)`` tuple. Caller must close the browser when done.
+
+    Raises:
+        TurnstileError: the page title still contains "Just a moment" after
+            20 polls spaced 3 seconds apart (about 60 seconds).
+        Exception: anything raised while opening the page, navigating or
+            dismissing the cookie overlay. The browser is closed before the
+            exception propagates, so a failed launch does not leak it.
+    """
+    from camoufox.pkgman import camoufox_path
+
+    # Verify camoufox is pre-installed — never download at runtime
+    camoufox_path(download_if_missing=False)
+
+    from camoufox.sync_api import Camoufox
+
+    log.info("Launching Camoufox browser for Zoopla...")
+    # NOTE(review): __enter__ is called without a matching __exit__; callers
+    # only ever call browser.close(). Confirm that close() releases everything
+    # the Camoufox context manager started.
+    browser = Camoufox(headless=True).__enter__()
+
+    try:
+        page = browser.new_page()
+
+        log.info("Navigating to Zoopla homepage...")
+        page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000)
+
+        # Wait for Cloudflare Turnstile to resolve.
+        # Try clicking the Turnstile checkbox if present (helps in some cases).
+        for _ in range(20):
+            if "Just a moment" not in page.title():
+                break
+            # Only attempt the click when a challenge frame is attached.
+            if any("challenges.cloudflare.com" in frame.url for frame in page.frames):
+                try:
+                    iframe_el = page.query_selector('iframe[src*="challenges.cloudflare"]')
+                    box = iframe_el.bounding_box() if iframe_el else None
+                    if box:
+                        page.mouse.click(box["x"] + 30, box["y"] + box["height"] / 2)
+                except Exception as e:
+                    # Best effort: the click is only a nudge, keep polling.
+                    log.debug("Turnstile checkbox click failed: %s", e)
+            time.sleep(3)
+        else:
+            raise TurnstileError("Cloudflare Turnstile did not resolve after 60s")
+
+        log.info("Cloudflare passed — title: %s", page.title())
+        time.sleep(2)
+
+        # Dismiss cookie consent
+        page.evaluate(_DISMISS_COOKIES_JS)
+        time.sleep(1)
+    except BaseException:
+        # Any failure after the browser started must not leave it running.
+        try:
+            browser.close()
+        except Exception as close_err:
+            log.debug("Closing browser after failed launch also failed: %s", close_err)
+        raise
+
+    return browser, page
+
+
+def _ensure_not_challenged(page) -> None:
+    """Block until a mid-session Cloudflare challenge page goes away.
+
+    Returns immediately when the page title does not contain
+    "Just a moment". Otherwise polls the title up to 20 times, sleeping
+    3 seconds before each poll.
+
+    Raises TurnstileError if the challenge title is still present after the
+    final poll."""
+    challenge_marker = "Just a moment"
+
+    if challenge_marker in page.title():
+        log.warning("Cloudflare challenge detected mid-session, waiting...")
+        polls_left = 20
+        while polls_left > 0:
+            time.sleep(3)
+            if challenge_marker not in page.title():
+                log.info("Cloudflare challenge resolved")
+                return
+            polls_left -= 1
+
+        raise TurnstileError("Cloudflare re-challenge did not resolve")
+
+
+# ---------------------------------------------------------------------------
+# Search navigation
+# ---------------------------------------------------------------------------
+
+
+def _navigate_search(page, outcode: str, channel: str) -> bool:
+    """Navigate to search results for an outcode via the homepage search flow.
+
+    Loads the homepage, switches to the Rent tab when ``channel`` is
+    "RENT", types the outcode into the autocomplete input, picks the first
+    suggestion and submits the search.
+
+    Returns True once the search has been submitted (it does not check
+    whether any results came back), False if the search input or an
+    autocomplete suggestion could not be found.
+    Raises TurnstileError if Cloudflare blocks us."""
+    # Navigate to homepage to reset search state
+    page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=30000)
+    time.sleep(2)
+    _ensure_not_challenged(page)
+
+    # Dismiss cookie consent (may reappear after navigation)
+    page.evaluate(_DISMISS_COOKIES_JS)
+    time.sleep(1)
+
+    # Select Buy/Rent tab; nothing is clicked for any channel other than RENT
+    if channel == "RENT":
+        rent_tab = page.query_selector(
+            'button:has-text("Rent"), [role="tab"]:has-text("Rent")'
+        )
+        if rent_tab:
+            rent_tab.click()
+            time.sleep(0.5)
+
+    # Find and fill search input, falling back to the first text input
+    search_input = page.query_selector(
+        'input[name="autosuggest-input"]'
+    ) or page.query_selector('input[type="text"]')
+    if not search_input:
+        log.warning("Could not find search input on homepage")
+        return False
+
+    search_input.click()
+    time.sleep(0.3)
+    # Clear any previous value, then type key by key with a 60 ms delay
+    search_input.fill("")
+    search_input.type(outcode, delay=60)
+    time.sleep(2)
+
+    # Select first autocomplete suggestion
+    first_option = page.query_selector('[role="option"]')
+    if not first_option:
+        log.debug("No autocomplete suggestions for outcode %s", outcode)
+        return False
+
+    first_option.click()
+    time.sleep(0.5)
+
+    # Click search button, or submit with Enter when no button is found
+    search_btn = page.query_selector('button:has-text("Search")')
+    if search_btn:
+        search_btn.click()
+    else:
+        search_input.press("Enter")
+
+    # Wait for results to load (fixed delay, then re-check for a challenge)
+    time.sleep(6)
+    _ensure_not_challenged(page)
+
+    return True
+
+
+def _get_result_count(page) -> int:
+    """Extract the total results count from the page body text.
+
+    Looks for the first "<number> result(s)" phrase, where the number may
+    contain thousands separators (e.g. "1,234 results").
+
+    Returns the count as an int, or 0 when no such phrase is found or the
+    body text cannot be read."""
+    try:
+        body = page.inner_text("body")
+        # The number must start with a digit. A bare comma before the word
+        # "results" would otherwise match and hide a real count later on.
+        match = re.search(r"(\d[\d,]*)\s+results?", body)
+        if match:
+            return int(match.group(1).replace(",", ""))
+    except Exception:
+        # Best effort: an unreadable page is treated as "no results".
+        pass
+    return 0
+
+
+# ---------------------------------------------------------------------------
+# Extraction and pagination
+# ---------------------------------------------------------------------------
+
+
+def _extract_listings(page) -> list[dict]:
+    """Run the DOM-extraction script on the current search results page.
+
+    Returns whatever the script evaluates to. If evaluation raises, logs a
+    warning, bumps the "extract_failed" error counter and returns an empty
+    list."""
+    try:
+        listings = page.evaluate(_EXTRACT_LISTINGS_JS)
+    except Exception as exc:
+        log.warning("Failed to extract listings from DOM: %s", exc)
+        zoopla_errors_total.labels(type="extract_failed").inc()
+        listings = []
+    return listings
+
+
+def _page_url(url: str, page_num: int) -> str:
+    """Return *url* with its ``pn`` query parameter set to *page_num*.
+
+    Any existing ``pn`` value is replaced. Other query parameters and the
+    fragment are kept."""
+    # Function-scope import keeps this block self-contained.
+    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
+
+    parts = urlsplit(url)
+    params = [
+        (key, value)
+        for key, value in parse_qsl(parts.query, keep_blank_values=True)
+        if key != "pn"
+    ]
+    params.append(("pn", str(page_num)))
+    return urlunsplit(parts._replace(query=urlencode(params)))
+
+
+def _paginate(page, total_results: int, channel: str) -> list[dict]:
+    """Extract listings from all pages of search results.
+
+    Page 1 is already loaded. Subsequent pages are loaded by navigating to
+    the page-1 URL with the ``pn`` query parameter set, up to
+    MAX_PAGES_PER_OUTCODE pages in total.
+
+    Stops early when enough listings have been collected, a navigation
+    fails, a page yields no listings, or a page yields only listings
+    already seen.
+
+    Returns the raw listing dicts, de-duplicated by "id".
+    Raises TurnstileError if a Cloudflare challenge does not clear."""
+    all_listings = _extract_listings(page)
+    channel_label = "buy" if channel == "BUY" else "rent"
+    zoopla_pages_scraped.labels(channel=channel_label).inc()
+
+    if not all_listings or total_results <= len(all_listings):
+        return all_listings
+
+    seen_ids = {listing["id"] for listing in all_listings}
+    # Captured once: every later page URL is derived from the page-1 URL.
+    first_page_url = page.url
+
+    for page_num in range(2, MAX_PAGES_PER_OUTCODE + 1):
+        if len(all_listings) >= total_results:
+            break
+
+        time.sleep(DELAY_BETWEEN_PAGES)
+
+        try:
+            page.goto(
+                _page_url(first_page_url, page_num),
+                wait_until="domcontentloaded",
+                timeout=30000,
+            )
+            time.sleep(4)
+            _ensure_not_challenged(page)
+        except TurnstileError:
+            # The caller decides whether to relaunch the browser.
+            raise
+        except Exception as e:
+            log.debug("Pagination navigation failed at page %d: %s", page_num, e)
+            break
+
+        page_listings = _extract_listings(page)
+        if not page_listings:
+            break
+
+        # Deduplicate within this outcode
+        new_count = 0
+        for listing in page_listings:
+            if listing["id"] not in seen_ids:
+                seen_ids.add(listing["id"])
+                all_listings.append(listing)
+                new_count += 1
+
+        zoopla_pages_scraped.labels(channel=channel_label).inc()
+
+        if new_count == 0:
+            break  # No new listings on this page
+
+    return all_listings
+
+
+# ---------------------------------------------------------------------------
+# Property transformation
+# ---------------------------------------------------------------------------
+
+
+def _extract_postcode(text: str) -> str | None:
+    """Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'.
+
+    Returns the first postcode found, upper-cased and normalised to
+    "OUTCODE INCODE" with exactly one space (so "e149ss" becomes
+    "E14 9SS"), or None when the text contains no postcode."""
+    # Word boundaries stop the pattern from matching inside longer tokens.
+    match = re.search(
+        r"\b([A-Z]{1,2}\d[A-Z\d]?)\s*(\d[A-Z]{2})\b", text, re.IGNORECASE
+    )
+    if match:
+        return f"{match.group(1)} {match.group(2)}".upper()
+    return None
+
+
+def _extract_outcode(text: str) -> str | None:
+    """Extract a UK outcode from address text like 'Whitechapel Road, London E1'.
+
+    A bare outcode at the end of the text wins. Failing that, the outward
+    part of the first full postcode in the text is used, so
+    '... Canary Wharf E14 9SS' yields 'E14'.
+
+    Returns the outcode upper-cased, or None when neither form is present."""
+    stripped = text.strip()
+
+    # Outcode as the final token. This also covers "..., E1" after a comma.
+    trailing = re.search(r"\b([A-Z]{1,2}\d[A-Z0-9]?)\s*$", stripped, re.IGNORECASE)
+    if trailing:
+        return trailing.group(1).upper()
+
+    # Fall back to the outward half of a full postcode.
+    full = re.search(
+        r"\b([A-Z]{1,2}\d[A-Z\d]?)\s*\d[A-Z]{2}\b", stripped, re.IGNORECASE
+    )
+    if full:
+        return full.group(1).upper()
+    return None
+
+
+def _map_property_type(raw_type: str | None) -> str:
+    """Translate a Zoopla property-type string into the canonical type name.
+
+    An exact PROPERTY_TYPE_MAP entry is used when it exists. Otherwise
+    keywords are checked, case-insensitively, in a fixed order: flat-like
+    terms, detached (without "semi"), semi, terrace/mews, then house.
+    Returns "Other" for empty input or when nothing matches."""
+    if not raw_type:
+        return "Other"
+
+    mapped = PROPERTY_TYPE_MAP.get(raw_type)
+    if mapped:
+        return mapped
+
+    text = raw_type.lower()
+    has_semi = "semi" in text
+
+    if any(word in text for word in ("flat", "apartment", "maisonette", "studio")):
+        return "Flats/Maisonettes"
+    if "detached" in text and not has_semi:
+        return "Detached"
+    if has_semi:
+        return "Semi-Detached"
+    if any(word in text for word in ("terrace", "mews")):
+        return "Terraced"
+    # A plain "house" with no other qualifier is classed as Detached.
+    return "Detached" if "house" in text else "Other"
+
+
+def transform_property(
+    raw: dict,
+    channel: str,
+    pc_index: PostcodeSpatialIndex,
+    pc_coords: dict[str, tuple[float, float]],
+) -> dict | None:
+    """Convert one raw Zoopla listing dict into the standard output schema.
+
+    The raw listing carries no coordinates, so they are looked up in
+    ``pc_coords``: first by the full postcode found in the address, then by
+    the first ``pc_coords`` key that starts with the address's outcode plus
+    a space. ``pc_index`` is accepted but not used here.
+
+    Returns None when the listing has no price, no coordinates or postcode
+    could be resolved, or the coordinates fall outside the accepted
+    bounding box."""
+    price = raw.get("price")
+    if not price:
+        return None
+
+    address = raw.get("address", "")
+
+    # Exact postcode lookup.
+    # NOTE(review): values are unpacked as (lat, lng) — confirm against the
+    # builder of pc_coords.
+    postcode = _extract_postcode(address)
+    lat = lng = None
+    exact = pc_coords.get(postcode) if postcode else None
+    if exact:
+        lat, lng = exact
+
+    # Outcode-level fallback: take the first known postcode in that outcode.
+    if lat is None:
+        outcode = _extract_outcode(address)
+        if outcode:
+            prefix = outcode + " "
+            hit = next(
+                ((pcd, xy) for pcd, xy in pc_coords.items() if pcd.startswith(prefix)),
+                None,
+            )
+            if hit is not None:
+                postcode, (lat, lng) = hit
+
+    if lat is None or lng is None or not postcode:
+        return None
+
+    # Reject anything outside latitude 49..56, longitude -7..2.
+    inside_box = 49 <= lat <= 56 and -7 <= lng <= 2
+    if not inside_box:
+        return None
+
+    bedrooms = raw.get("beds") or 0
+    bathrooms = raw.get("baths") or 0
+    receptions = raw.get("receptions") or 0
+
+    # Floor area arrives in square feet; store square metres.
+    sqft = raw.get("floor_area_sqft")
+    floor_area_sqm = round(sqft * 0.092903, 1) if sqft else None
+
+    listing_id = raw.get("id", "")
+    listing_url = raw.get("url", "")
+    if listing_url and not listing_url.startswith("http"):
+        # Relative path from the page: make it absolute.
+        listing_url = ZOOPLA_BASE + listing_url
+
+    return {
+        "id": f"zp_{listing_id}",
+        "Bedrooms": bedrooms,
+        "Bathrooms": bathrooms,
+        "Number of bedrooms & living rooms": bedrooms + receptions,
+        "lon": lng,
+        "lat": lat,
+        "Postcode": postcode,
+        "Address per Property Register": address,
+        "Leasehold/Freehold": raw.get("tenure") or None,
+        "Property type": "Other",  # Not reliably extractable from Zoopla search cards
+        "Property sub-type": "",
+        "price": int(price),
+        "price_frequency": "" if channel == "BUY" else "monthly",
+        "Price qualifier": "",
+        "Total floor area (sqm)": floor_area_sqm,
+        "Listing URL": listing_url,
+        "Listing features": [],
+        "first_visible_date": "",
+    }
+
+
+# ---------------------------------------------------------------------------
+# Top-level search function (called by scraper.py)
+# ---------------------------------------------------------------------------
+
+
+def search_outcode(
+    page,
+    outcode: str,
+    channel: str,
+    pc_index: PostcodeSpatialIndex,
+    pc_coords: dict[str, tuple[float, float]],
+) -> list[dict]:
+    """Scrape Zoopla listings for a single outcode.
+
+    Uses the live Camoufox page returned by launch_browser: runs the search
+    flow, reads the result count, collects raw listings across result pages
+    and converts each one to the standard output schema. Listings that
+    cannot be converted are dropped.
+
+    Returns the converted listings; the list is empty when navigation
+    fails, the result count is zero, or no listings were extracted.
+
+    Raises TurnstileError if Cloudflare blocks us mid-session.
+    """
+    if not _navigate_search(page, outcode, channel):
+        return []
+
+    total_results = _get_result_count(page)
+    raw_listings = _paginate(page, total_results, channel) if total_results != 0 else []
+
+    channel_label = "rent" if channel != "BUY" else "buy"
+    properties = []
+    for raw in raw_listings:
+        converted = transform_property(raw, channel, pc_index, pc_coords)
+        if not converted:
+            continue
+        properties.append(converted)
+        # Counted per converted listing, before cross-source dedup.
+        zoopla_properties_scraped.labels(channel=channel_label).inc()
+
+    return properties
diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx
index 057ebbd..5e2a1ce 100644
--- a/frontend/src/App.tsx
+++ b/frontend/src/App.tsx
@@ -395,6 +395,7 @@ export default function App() {
onUnsaveProperty={user ? savedProperties.deleteProperty : undefined}
isPropertySaved={user ? savedProperties.isPropertySaved : undefined}
getSavedPropertyId={user ? savedProperties.getSavedPropertyId : undefined}
+ deferTutorial={showLicenseSuccess}
/>
)}
{showAuthModal && (
diff --git a/frontend/src/components/account/AccountPage.tsx b/frontend/src/components/account/AccountPage.tsx
index 0e5d34d..6cf8735 100644
--- a/frontend/src/components/account/AccountPage.tsx
+++ b/frontend/src/components/account/AccountPage.tsx
@@ -2,7 +2,7 @@ import { useState, useCallback, useEffect, useRef } from 'react';
import type { AuthUser } from '../../hooks/useAuth';
import type { SavedSearch } from '../../hooks/useSavedSearches';
import type { SavedProperty, SavedPropertyData } from '../../hooks/useSavedProperties';
-import { apiUrl, authHeaders, assertOk, shortenUrl } from '../../lib/api';
+import { apiUrl, authHeaders, assertOk, shortenUrl, prewarmScreenshot } from '../../lib/api';
import { copyToClipboard } from '../../lib/clipboard';
import { formatRelativeTime, formatNumber } from '../../lib/format';
import { summarizeParams } from '../../lib/url-state';
@@ -172,6 +172,7 @@ function SavedSearchesTab({
const handleShare = useCallback(
async (params: string, id: string) => {
+ prewarmScreenshot(params);
setSharingId(id);
try {
const shortUrl = await shortenUrl(params);
@@ -213,7 +214,7 @@ function SavedSearchesTab({
{searches.map((search) => (
{search.screenshotUrl ? (
![]()
)}
-
+
{search.name}
@@ -238,14 +239,14 @@ function SavedSearchesTab({
{summarizeParams(search.params)}
-
+
onUpdateNotes(search.id, notes)}
/>
-
+