164 lines
6.6 KiB
Python
164 lines
6.6 KiB
Python
"""Zoopla scraping via FlareSolverr (no browser/VNC needed).
|
|
|
|
FlareSolverr solves Zoopla's Cloudflare challenge and returns the rendered HTML,
which still contains the React Server Components flight stream — so the existing
pure parsers work unchanged:
- the search page yields the outcode's listing detail URLs, and
- each detail page's flight stream carries the property's location object
  (postcode + coordinates) that ``parse_detail_geo`` extracts, plus the
  listing fields (price/beds/baths/tenure/floor area) parsed here.
|
|
|
|
Verified live (2026-05-30) against Zoopla through the Gluetun VPN: a warm
FlareSolverr session solves the SW9 search + detail pages and the flight data
is present (e.g. detail 73326946 -> SW9 0HD @ 51.477238,-0.116819).
|
|
|
|
This is selected by constants.ZOOPLA_FETCHER == "flaresolverr"; the Camoufox
path in zoopla.py remains for ZOOPLA_FETCHER == "camoufox".
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
import time
|
|
|
|
from constants import DELAY_BETWEEN_PAGES, ZOOPLA_BASE
|
|
from flaresolverr import FlareSolverrError, FlareSolverrSession
|
|
from spatial import PostcodeSpatialIndex
|
|
from zoopla import _url_with_page, parse_detail_geo, transform_property
|
|
|
|
# Logger is registered under the literal name "zoopla" (not __name__).
log = logging.getLogger("zoopla")

# Hard ceiling on search-result pages walked for one outcode, so pagination
# always terminates even if every page keeps producing unseen listings.
_MAX_SERP_PAGES = 60

# Relative path of a listing detail page (for-sale or new-homes) with a numeric id.
_DETAIL_PATH_RE = re.compile(r"/(?:for-sale|new-homes)/details/\d+/")
# Captures the numeric listing id out of a detail path.
_LISTING_ID_RE = re.compile(r"/details/(\d+)/")
|
|
|
|
|
|
def _int(pattern: str, buf: str) -> int | None:
|
|
match = re.search(pattern, buf)
|
|
return int(match.group(1)) if match else None
|
|
|
|
|
|
def parse_detail_listing(html: str) -> dict:
    """Pull the non-location listing fields out of a Zoopla detail page.

    Produces the same fields the Camoufox SERP-card extractor produced, but
    read from the detail page's flight stream. Every field is best-effort:
    anything that cannot be found is None, so a listing whose location is
    known can still be emitted.

    Returns a dict with the keys ``price``, ``beds``, ``baths``,
    ``receptions``, ``floor_area_sqft``, ``tenure``, ``property_type`` and
    ``address``.
    """
    # Undo the string escaping (\" and \/) so the patterns below can match
    # plain quotes and slashes.
    text = html.replace('\\"', '"').replace("\\/", "/")

    # Price: first key that is present wins.
    price = None
    for price_pattern in (r'"internalValue":(\d+)', r'"priceUnformatted":(\d+)'):
        price = _int(price_pattern, text)
        if price is not None:
            break

    tenure_hit = re.search(r'"tenure":"([a-zA-Z]+)"', text)
    tenure = None if tenure_hit is None else tenure_hit.group(1).title()

    # Address + property type are taken from the page <title>, e.g.
    # "Caldwell Street, Stockwell SW9, 4 bed property for sale, £995,000 - Zoopla"
    address = property_type = None
    headline = re.search(r'"children":"([^"]*? for sale[^"]*?)"', text)
    if headline is not None:
        title = headline.group(1)
        if (addr_hit := re.match(r"(.+?),\s*\d+\s*bed", title)) is not None:
            address = addr_hit.group(1).strip()
        if (kind_hit := re.search(r"\d+\s*bed\s+([\w\s-]+?)\s+for sale", title)) is not None:
            property_type = kind_hit.group(1).strip()

    # An explicit propertyType value overrides whatever the title suggested.
    declared_type = re.search(r'"propertyType":"([^"]+)"', text)
    if declared_type is not None:
        property_type = declared_type.group(1)

    listing = {"price": price}
    listing["beds"] = _int(r'"numBedrooms":(\d+)', text)
    listing["baths"] = _int(r'"numBaths":(\d+)', text)
    listing["receptions"] = _int(r'"numLivingRooms":(\d+)', text)
    listing["floor_area_sqft"] = _int(r'"sizeSqft":(\d+)', text)
    listing["tenure"] = tenure
    listing["property_type"] = property_type
    listing["address"] = address
    return listing
|
|
|
|
|
|
def _enumerate_detail_paths(fs: FlareSolverrSession, outcode: str, limit: int | None) -> list[str]:
    """Walk the outcode's search-result pages and collect listing detail paths.

    Args:
        fs: Session whose ``get(url)`` returns the page HTML as text.
        outcode: Outcode to search (e.g. ``"SW9"``); lower-cased for the URL
            path and used verbatim as the ``q`` parameter.
        limit: Maximum number of paths to return, or None for no limit. A
            value of zero or less returns an empty list without fetching
            anything.

    Returns:
        Detail paths in first-seen order, de-duplicated by listing id.
        Pagination stops at the first page that contributes no new listing,
        once ``limit`` is reached, or after ``_MAX_SERP_PAGES`` pages.

    Any exception raised by ``fs.get`` propagates to the caller.
    """
    # The in-loop limit check only runs after a path has been appended, so a
    # non-positive limit must be rejected up front; otherwise it would cost a
    # page fetch and still hand back one path.
    if limit is not None and limit <= 0:
        return []

    base = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"
    paths: list[str] = []
    seen_ids: set[str] = set()
    for page_num in range(1, _MAX_SERP_PAGES + 1):
        url = base if page_num == 1 else _url_with_page(base, page_num)
        html = fs.get(url)
        found_new = False
        for path in _DETAIL_PATH_RE.findall(html):
            # The same listing can be linked several times on a page (and
            # across pages); key on the numeric id, falling back to the path.
            id_match = _LISTING_ID_RE.search(path)
            listing_id = id_match.group(1) if id_match else path
            if listing_id in seen_ids:
                continue
            seen_ids.add(listing_id)
            paths.append(path)
            found_new = True
            if limit is not None and len(paths) >= limit:
                return paths
        if not found_new:
            # A page with nothing unseen is treated as the end of the results.
            break
        time.sleep(DELAY_BETWEEN_PAGES)
    return paths
|
|
|
|
|
|
def search_outcode(
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    fs: FlareSolverrSession,
    max_properties: int | None = None,
    detail_cap: int = 0,
    detail_budget_seconds: float | None = None,
) -> tuple[list[dict], str | None]:
    """Scrape one outcode via FlareSolverr. Returns (properties, search_url).

    Every listing's detail page is fetched (that is where the postcode lives),
    so the effective listing count is bounded by both ``max_properties`` and
    ``detail_cap``; ``detail_budget_seconds`` caps wall-clock time on details.

    Args:
        outcode: Outcode to scrape, e.g. ``"SW9"``.
        pc_index: Passed through to ``transform_property``.
        pc_coords: Passed through to ``transform_property``.
        fs: Session whose ``get(url)`` returns page HTML.
        max_properties: Upper bound on listings; None means unbounded. Zero
            or a negative value returns no listings and makes no requests.
        detail_cap: Upper bound on detail fetches; zero or less means no cap.
        detail_budget_seconds: Time budget for the detail loop; None or 0
            disables the budget.

    Returns:
        The transformed properties and the outcode's search URL. Listings
        whose fetch or transform fails, or whose transform result is falsy,
        are counted as dropped and omitted.

    Exceptions raised while enumerating the search pages are not caught here.
    """
    base = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"

    # Effective cap = the tighter of detail_cap (only when positive) and
    # max_properties (whenever given).
    limit = detail_cap if detail_cap and detail_cap > 0 else None
    if max_properties is not None:
        limit = max_properties if limit is None else min(limit, max_properties)
    if limit is not None and limit <= 0:
        # Only reachable via max_properties <= 0: the caller asked for nothing,
        # so skip the search and detail requests entirely.
        return [], base

    paths = _enumerate_detail_paths(fs, outcode, limit)
    if not paths:
        return [], base

    deadline = (time.monotonic() + detail_budget_seconds) if detail_budget_seconds else None
    properties: list[dict] = []
    dropped = 0
    for path in paths:
        # The budget is checked before each fetch, so a fetch already under
        # way is never interrupted.
        if deadline is not None and time.monotonic() >= deadline:
            log.info("Zoopla %s: detail-fetch budget reached after %d", outcode, len(properties))
            break
        id_match = _LISTING_ID_RE.search(path)
        listing_id = id_match.group(1) if id_match else path
        try:
            html = fs.get(ZOOPLA_BASE + path)
            geo = parse_detail_geo(html, search_outcode=outcode)
            raw = {"id": listing_id, "url": path, **parse_detail_listing(html)}
            prop = transform_property(
                raw, pc_index, pc_coords, search_outcode=outcode, detail=geo
            )
        except FlareSolverrError as exc:
            log.warning("Zoopla %s detail %s fetch failed: %s", outcode, listing_id, exc)
            prop = None
        except Exception as exc:  # noqa: BLE001 - never let one listing kill the outcode
            log.warning("Zoopla %s detail %s transform failed: %s", outcode, listing_id, exc)
            prop = None
        if prop:
            properties.append(prop)
        else:
            dropped += 1
        # Pace requests after every listing, including failed ones.
        time.sleep(DELAY_BETWEEN_PAGES)

    log.info("Zoopla %s: %d listings (%d dropped)", outcode, len(properties), dropped)
    return properties, base
|