"""Zoopla scraping via FlareSolverr (no browser/VNC needed).
FlareSolverr solves Zoopla's Cloudflare challenge and returns the rendered HTML,
which still contains the React Server Components flight stream — so the existing
pure parsers work unchanged:
- the search page yields the outcode's listing detail URLs, and
- each detail page's flight stream carries the property's location object
(postcode + coordinates) that ``parse_detail_geo`` extracts, plus the
listing fields (price/beds/baths/tenure/floor area) parsed here.
Verified live (2026-05-30) against Zoopla through the Gluetun VPN: a warm
FlareSolverr session solves the SW9 search + detail pages and the flight data
is present (e.g. detail 73326946 -> SW9 0HD @ 51.477238,-0.116819).
This is selected by constants.ZOOPLA_FETCHER == "flaresolverr"; the Camoufox
path in zoopla.py remains for ZOOPLA_FETCHER == "camoufox".
"""
import logging
import re
import time
from constants import DELAY_BETWEEN_PAGES, ZOOPLA_BASE
from flaresolverr import FlareSolverrError, FlareSolverrSession
from spatial import PostcodeSpatialIndex
from zoopla import _url_with_page, parse_detail_geo, transform_property
# Module logger, registered under the fixed name "zoopla".
log = logging.getLogger("zoopla")
# Safety bound on how many search-result pages to walk per outcode.
_MAX_SERP_PAGES = 60
# Matches a listing detail path under /for-sale/ or /new-homes/, i.e.
# "/for-sale/details/<digits>/" or "/new-homes/details/<digits>/".
_DETAIL_PATH_RE = re.compile(r"/(?:for-sale|new-homes)/details/\d+/")
# Captures the numeric listing ID (group 1) from a "/details/<digits>/" path.
_LISTING_ID_RE = re.compile(r"/details/(\d+)/")
def _int(pattern: str, buf: str) -> int | None:
match = re.search(pattern, buf)
return int(match.group(1)) if match else None
def parse_detail_listing(html: str) -> dict:
"""Extract the non-location listing fields from a Zoopla detail page.
Mirrors the fields the Camoufox SERP-card extractor produced, read from the
detail page's flight stream (validated against real Zoopla detail HTML).
All fields are best-effort; missing ones default to None so a listing with
a known location is still emitted."""
buf = html.replace('\\"', '"').replace("\\/", "/")
price = _int(r'"internalValue":(\d+)', buf)
if price is None:
price = _int(r'"priceUnformatted":(\d+)', buf)
tenure_match = re.search(r'"tenure":"([a-zA-Z]+)"', buf)
tenure = tenure_match.group(1).title() if tenure_match else None
# Address + property type come from the page
, e.g.
# "Caldwell Street, Stockwell SW9, 4 bed property for sale, £995,000 - Zoopla"
address = None
property_type = None
title_match = re.search(r'"children":"([^"]*? for sale[^"]*?)"', buf)
if title_match:
title = title_match.group(1)
addr_match = re.match(r"(.+?),\s*\d+\s*bed", title)
if addr_match:
address = addr_match.group(1).strip()
type_match = re.search(r"\d+\s*bed\s+([\w\s-]+?)\s+for sale", title)
if type_match:
property_type = type_match.group(1).strip()
explicit_type = re.search(r'"propertyType":"([^"]+)"', buf)
if explicit_type:
property_type = explicit_type.group(1)
return {
"price": price,
"beds": _int(r'"numBedrooms":(\d+)', buf),
"baths": _int(r'"numBaths":(\d+)', buf),
"receptions": _int(r'"numLivingRooms":(\d+)', buf),
"floor_area_sqft": _int(r'"sizeSqft":(\d+)', buf),
"tenure": tenure,
"property_type": property_type,
"address": address,
}
def _enumerate_detail_paths(fs: FlareSolverrSession, outcode: str, limit: int | None) -> list[str]:
    """Walk the outcode's search-result pages and collect listing detail paths.

    Pages are fetched in order until a page yields no previously unseen
    listing, ``limit`` paths have been collected, or ``_MAX_SERP_PAGES`` pages
    have been walked. Exceptions raised by ``fs.get`` are not caught here.

    Args:
        fs: Session whose ``get(url)`` returns the page HTML as a string.
        outcode: Outcode to search, e.g. ``"SW9"``.
        limit: Maximum number of paths to return; ``None`` means no limit.
            Zero or a negative value returns an empty list without fetching
            any page.

    Returns:
        Detail paths in first-seen order, de-duplicated by listing ID.
    """
    # The limit check in the loop runs only after a path has been appended, so
    # without this guard a limit of 0 would still fetch a page and return one
    # path.
    if limit is not None and limit <= 0:
        return []

    base = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"
    seen: list[str] = []
    seen_ids: set[str] = set()
    for page_num in range(1, _MAX_SERP_PAGES + 1):
        url = base if page_num == 1 else _url_with_page(base, page_num)
        html = fs.get(url)
        new = 0
        for path in _DETAIL_PATH_RE.findall(html):
            # De-duplicate on the numeric listing ID so the same listing under
            # /for-sale/ and /new-homes/ is only collected once.
            id_match = _LISTING_ID_RE.search(path)
            listing_id = id_match.group(1) if id_match else path
            if listing_id in seen_ids:
                continue
            seen_ids.add(listing_id)
            seen.append(path)
            new += 1
            if limit is not None and len(seen) >= limit:
                return seen
        # A page that adds nothing new ends the walk.
        if new == 0:
            break
        time.sleep(DELAY_BETWEEN_PAGES)
    return seen
def search_outcode(
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    fs: FlareSolverrSession,
    max_properties: int | None = None,
    detail_cap: int = 0,
    detail_budget_seconds: float | None = None,
) -> tuple[list[dict], str | None]:
    """Scrape one outcode via FlareSolverr and return ``(properties, search_url)``.

    The detail page of every enumerated listing is fetched, so the number of
    listings is bounded by the smaller of ``max_properties`` and
    ``detail_cap`` (a ``detail_cap`` of 0 or less means no cap). A truthy
    ``detail_budget_seconds`` limits the wall-clock time spent on detail
    pages; once it is exceeded the remaining listings are skipped.

    A listing whose fetch or transform raises is logged and counted as
    dropped, as is one for which ``transform_property`` returns a falsy value.
    """
    search_url = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"

    # Combine the two optional caps into one effective limit (None = unlimited).
    cap = None
    if detail_cap and detail_cap > 0:
        cap = detail_cap
    if max_properties is not None:
        cap = max_properties if cap is None else min(cap, max_properties)

    detail_paths = _enumerate_detail_paths(fs, outcode, cap)
    if not detail_paths:
        return [], search_url

    stop_at = None
    if detail_budget_seconds:
        stop_at = time.monotonic() + detail_budget_seconds

    kept: list[dict] = []
    dropped = 0
    for detail_path in detail_paths:
        if stop_at is not None and time.monotonic() >= stop_at:
            log.info("Zoopla %s: detail-fetch budget reached after %d", outcode, len(kept))
            break

        found = _LISTING_ID_RE.search(detail_path)
        listing_id = found.group(1) if found else detail_path

        # Stays None when the fetch or the transform below raises.
        prop = None
        try:
            page = fs.get(ZOOPLA_BASE + detail_path)
            geo = parse_detail_geo(page, search_outcode=outcode)
            raw = {"id": listing_id, "url": detail_path}
            raw.update(parse_detail_listing(page))
            prop = transform_property(
                raw, pc_index, pc_coords, search_outcode=outcode, detail=geo
            )
        except FlareSolverrError as exc:
            log.warning("Zoopla %s detail %s fetch failed: %s", outcode, listing_id, exc)
        except Exception as exc:  # noqa: BLE001 - never let one listing kill the outcode
            log.warning("Zoopla %s detail %s transform failed: %s", outcode, listing_id, exc)

        if not prop:
            dropped += 1
        else:
            kept.append(prop)
        # Pause between detail fetches.
        time.sleep(DELAY_BETWEEN_PAGES)

    log.info("Zoopla %s: %d listings (%d dropped)", outcode, len(kept), dropped)
    return kept, search_url