# perfect-postcode/finder/zoopla_flaresolverr.py

"""Zoopla scraping via FlareSolverr (no browser/VNC needed).

FlareSolverr solves Zoopla's Cloudflare challenge and returns the rendered
HTML, which still contains the React Server Components flight stream — so the
existing pure parsers work unchanged:
  - the search page yields the outcode's listing detail URLs, and
  - each detail page's flight stream carries the property's location object
    (postcode + coordinates) that ``parse_detail_geo`` extracts, plus the
    listing fields (price/beds/baths/tenure/floor area) parsed here.

Verified live (2026-05-30) against Zoopla through the Gluetun VPN: a warm
FlareSolverr session solves the SW9 search + detail pages and the flight data
is present (e.g. detail 73326946 -> SW9 0HD @ 51.477238,-0.116819).

This is selected by constants.ZOOPLA_FETCHER == "flaresolverr"; the Camoufox
path in zoopla.py remains for ZOOPLA_FETCHER == "camoufox".
"""

import logging
import re
import time

from constants import DELAY_BETWEEN_PAGES, ZOOPLA_BASE
from flaresolverr import FlareSolverrError, FlareSolverrSession
from spatial import PostcodeSpatialIndex
from zoopla import _url_with_page, parse_detail_geo, transform_property

# Module logger, registered under the name "zoopla".
log = logging.getLogger("zoopla")

# Safety bound on how many search-result pages to walk per outcode.
_MAX_SERP_PAGES = 60

# Matches a listing detail path under /for-sale/ or /new-homes/,
# e.g. "/for-sale/details/73326946/".
_DETAIL_PATH_RE = re.compile(r"/(?:for-sale|new-homes)/details/\d+/")
# Captures the numeric listing id (group 1) from a "/details/NNN/" segment.
_LISTING_ID_RE = re.compile(r"/details/(\d+)/")


def _int(pattern: str, buf: str) -> int | None:
    match = re.search(pattern, buf)
    return int(match.group(1)) if match else None


def parse_detail_listing(html: str) -> dict:
    """Pull the non-location listing fields out of a Zoopla detail page.

    The values are read with regexes from the page's flight stream, after its
    escaped quotes and slashes have been unescaped. They mirror the fields the
    Camoufox SERP-card extractor produced. Every field is best-effort: one
    that cannot be found is returned as None rather than raising, so a listing
    whose location is known can still be emitted.

    Returns a dict with the keys ``price``, ``beds``, ``baths``,
    ``receptions``, ``floor_area_sqft``, ``tenure``, ``property_type`` and
    ``address`` (in that order)."""
    # Undo the \" and \/ escaping so the plain-JSON patterns below can match.
    stream = html.replace('\\"', '"').replace("\\/", "/")

    # "internalValue" is tried first; "priceUnformatted" is the fallback.
    price = _int(r'"internalValue":(\d+)', stream)
    if price is None:
        price = _int(r'"priceUnformatted":(\d+)', stream)

    tenure = None
    tenure_found = re.search(r'"tenure":"([a-zA-Z]+)"', stream)
    if tenure_found:
        tenure = tenure_found.group(1).title()

    # Address and property type are parsed out of the page title text, e.g.
    # "Caldwell Street, Stockwell SW9, 4 bed property for sale, £995,000 - Zoopla"
    address = None
    property_type = None
    heading = re.search(r'"children":"([^"]*? for sale[^"]*?)"', stream)
    if heading:
        text = heading.group(1)
        address_found = re.match(r"(.+?),\s*\d+\s*bed", text)
        if address_found:
            address = address_found.group(1).strip()
        type_found = re.search(r"\d+\s*bed\s+([\w\s-]+?)\s+for sale", text)
        if type_found:
            property_type = type_found.group(1).strip()

    # An explicit "propertyType" field overrides the type inferred from the title.
    explicit = re.search(r'"propertyType":"([^"]+)"', stream)
    if explicit:
        property_type = explicit.group(1)

    # Plain integer fields, keyed by output name (order is the output order).
    int_patterns = {
        "beds": r'"numBedrooms":(\d+)',
        "baths": r'"numBaths":(\d+)',
        "receptions": r'"numLivingRooms":(\d+)',
        "floor_area_sqft": r'"sizeSqft":(\d+)',
    }
    listing: dict = {"price": price}
    for field, pattern in int_patterns.items():
        listing[field] = _int(pattern, stream)
    listing["tenure"] = tenure
    listing["property_type"] = property_type
    listing["address"] = address
    return listing


def _enumerate_detail_paths(fs: FlareSolverrSession, outcode: str, limit: int | None) -> list[str]:
    """Collect listing detail paths from the outcode's search-result pages.

    Pages are fetched through ``fs.get`` starting at page 1, with a
    ``DELAY_BETWEEN_PAGES`` pause before each following page. Paths are
    de-duplicated by listing id (first occurrence kept) and returned in the
    order they were encountered.

    The walk ends as soon as one of these holds:
      - ``limit`` is not None and that many paths have been collected,
      - a page contributes no listing id that was not already seen, or
      - ``_MAX_SERP_PAGES`` pages have been fetched."""
    first_page_url = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"
    paths: list[str] = []
    known_ids: set[str] = set()
    for page_num in range(1, _MAX_SERP_PAGES + 1):
        if page_num == 1:
            page_url = first_page_url
        else:
            page_url = _url_with_page(first_page_url, page_num)
        count_before = len(paths)
        for candidate in _DETAIL_PATH_RE.findall(fs.get(page_url)):
            found = _LISTING_ID_RE.search(candidate)
            # Fall back to the whole path as the de-dup key if no id is found.
            key = found.group(1) if found else candidate
            if key not in known_ids:
                known_ids.add(key)
                paths.append(candidate)
                if limit is not None and len(paths) >= limit:
                    return paths
        if len(paths) == count_before:
            # Nothing new on this page: treat it as the end of the results.
            break
        time.sleep(DELAY_BETWEEN_PAGES)
    return paths


def search_outcode(
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    fs: FlareSolverrSession,
    max_properties: int | None = None,
    detail_cap: int = 0,
    detail_budget_seconds: float | None = None,
) -> tuple[list[dict], str | None]:
    """Scrape a single outcode through FlareSolverr.

    The postcode lives on the detail page, so every enumerated listing has its
    detail page fetched. The number of listings enumerated is therefore capped
    by the smaller of ``max_properties`` (when not None) and ``detail_cap``
    (when positive). A truthy ``detail_budget_seconds`` additionally stops the
    detail loop once that much wall-clock time has elapsed.

    A listing whose fetch or transform raises, or whose transform returns a
    falsy value, is logged/counted as dropped and does not abort the outcode.

    Returns ``(properties, search_url)``."""
    search_url = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"

    # Effective limit = the tightest of the bounds that are actually set.
    bounds = []
    if detail_cap and detail_cap > 0:
        bounds.append(detail_cap)
    if max_properties is not None:
        bounds.append(max_properties)
    limit = min(bounds) if bounds else None

    paths = _enumerate_detail_paths(fs, outcode, limit)
    if not paths:
        return [], search_url

    deadline = None
    if detail_budget_seconds:
        deadline = time.monotonic() + detail_budget_seconds

    properties: list[dict] = []
    attempted = 0
    for path in paths:
        if deadline is not None and time.monotonic() >= deadline:
            log.info("Zoopla %s: detail-fetch budget reached after %d", outcode, len(properties))
            break
        attempted += 1
        found = _LISTING_ID_RE.search(path)
        listing_id = found.group(1) if found else path
        prop = None
        try:
            html = fs.get(ZOOPLA_BASE + path)
            geo = parse_detail_geo(html, search_outcode=outcode)
            raw = {"id": listing_id, "url": path}
            raw.update(parse_detail_listing(html))
            prop = transform_property(
                raw, pc_index, pc_coords, search_outcode=outcode, detail=geo
            )
        except FlareSolverrError as exc:
            log.warning("Zoopla %s detail %s fetch failed: %s", outcode, listing_id, exc)
        except Exception as exc:  # noqa: BLE001 - never let one listing kill the outcode
            log.warning("Zoopla %s detail %s transform failed: %s", outcode, listing_id, exc)
        if prop:
            properties.append(prop)
        # Pause after every attempted detail fetch, successful or not.
        time.sleep(DELAY_BETWEEN_PAGES)

    # Every attempted listing was either kept or dropped.
    dropped = attempted - len(properties)
    log.info("Zoopla %s: %d listings (%d dropped)", outcode, len(properties), dropped)
    return properties, search_url