scraping and data

This commit is contained in:
Andras Schmelczer 2026-05-31 15:36:33 +01:00
parent d98819b569
commit 8688b7475e
43 changed files with 4920 additions and 531 deletions

25
finder/Dockerfile Normal file
View file

@ -0,0 +1,25 @@
# Finder scraper image. Runs via docker-compose sharing the media_gluetun VPN
# network namespace; the source tree is bind-mounted at runtime, so this image
# only needs the Python deps. The venv lives OUTSIDE the bind-mount target
# (/opt/venv) so the mount doesn't shadow it.
FROM python:3.12-slim
ENV UV_PROJECT_ENVIRONMENT=/opt/venv \
UV_COMPILE_BYTECODE=1 \
UV_LINK_MODE=copy \
PYTHONUNBUFFERED=1
RUN apt-get update \
&& apt-get install -y --no-install-recommends ca-certificates curl \
&& rm -rf /var/lib/apt/lists/*
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
WORKDIR /app/finder
# Install dependencies into /opt/venv (cached layer; project code is mounted at runtime).
COPY pyproject.toml uv.lock ./
RUN uv sync --no-install-project --frozen
# Source is bind-mounted over /app/finder by compose. `uv run` uses /opt/venv.
CMD ["sleep", "infinity"]

View file

@ -6,7 +6,9 @@ REPO_DIR = FINDER_DIR.parent
DATA_DIR = Path(os.environ.get("DATA_DIR", str(FINDER_DIR / "data")))
ARCGIS_PATH = Path(
os.environ.get("ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet"))
os.environ.get(
"ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet")
)
)
PAGE_SIZE = 24
DELAY_BETWEEN_PAGES = 0.3
@ -19,6 +21,19 @@ MAX_BEDROOMS = 20 # sanity cap — values above this are almost certainly parsi
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
# Detail page (plain HTTPS GET, no Cloudflare). Its window.__PAGE_MODEL embeds
# propertyData.address.{outcode,incode}, which together form the property's TRUE
# full postcode — the search API only exposes the outcode. {id} is the numeric
# listing id from the search response.
RIGHTMOVE_DETAIL_URL = "https://www.rightmove.co.uk/properties/{id}"
# The Rightmove search API gives only an outcode-level display address, so the
# true full postcode is recovered from each listing's detail page (see
# finder/rightmove.py::parse_detail_postcode). One extra GET per listing is a
# big increase in request volume over the ~1000-result-per-outcode search, so
# detail fetching is gated and capped per outcode (mirrors ZOOPLA_* below).
# Default ON.
RIGHTMOVE_FETCH_DETAILS = True # fetch detail pages for true per-listing postcodes
RIGHTMOVE_MAX_DETAILS_PER_OUTCODE = 4000 # max detail-page fetches per outcode
# OnTheMarket
ONTHEMARKET_BASE = "https://www.onthemarket.com"
@ -26,6 +41,41 @@ ONTHEMARKET_BASE = "https://www.onthemarket.com"
# Zoopla
ZOOPLA_BASE = "https://www.zoopla.co.uk"
# Zoopla search cards only carry an outcode-level address, so the full postcode
# and precise coordinates are scraped from each listing's detail page. These
# bound that extra work (see finder/zoopla.py and finder/scraper.py).
ZOOPLA_FETCH_DETAILS = True # fetch detail pages for precise per-listing postcodes
ZOOPLA_MAX_DETAILS_PER_OUTCODE = 4000 # max detail-page fetches per outcode
# NOTE(review): 1500000 ms is 25 minutes per detail-page navigation — confirm
# this is intended and not a typo (e.g. for 15000 or 150000).
ZOOPLA_DETAIL_GOTO_TIMEOUT_MS = 1500000 # per detail-page navigation timeout
# Fraction of a single outcode's wall-clock budget (ZOOPLA_OUTCODE_TIMEOUT_SECONDS)
# spent fetching details; the remainder is reserved for search pagination so
# detail fetches can never trip the timeout and discard collected listings.
ZOOPLA_DETAIL_BUDGET_FRACTION = 0.6
# Gluetun VPN. Network endpoints are env-overridable because they are
# deployment-specific: when finder runs in a SEPARATE container they use the
# `gluetun` hostname (defaults below); when finder SHARES gluetun's network
# namespace (docker-compose.yml, network_mode container:media_gluetun) they
# become localhost and GLUETUN_PROXY is empty (the shared netns already tunnels
# all traffic, so no HTTP proxy is needed).
# GLUETUN_PROXY="" (empty) => direct connection (no proxy); used in shared-netns.
GLUETUN_PROXY = os.environ.get("GLUETUN_PROXY", "http://gluetun:8888") or None
GLUETUN_CONTROL_URL = os.environ.get("GLUETUN_CONTROL_URL", "http://gluetun:8000")
# Gluetun control-server API key. Env-overridable like the other GLUETUN_*
# settings so deployments can inject their own key without editing source.
# SECURITY NOTE(review): the fallback below is a credential committed to the
# repository; it is kept only for backward compatibility. Rotate it and drop
# the default once every deployment sets GLUETUN_API_KEY.
GLUETUN_API_KEY = os.environ.get(
    "GLUETUN_API_KEY", "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K"
)
# Egress-IP rotations to try per Cloudflare challenge. Keep at 0 for Zoopla:
# rotating among Gluetun's datacenter IPs doesn't clear Cloudflare and would
# rotate away from the IP a cleared Cloudflare session was bound to, voiding it.
# Raise only with residential IPs where rotation helps.
GLUETUN_MAX_ROTATIONS = 0 # max egress-IP rotations per Cloudflare challenge
# Zoopla fetcher: "flaresolverr" (default) solves Cloudflare via the FlareSolverr
# sidecar (docker-compose.yml) and needs no display/VNC — verified to return the
# RSC flight stream with postcode + coordinates; "camoufox" drives a local
# anti-fingerprint browser (needs an interactive solve on datacenter IPs).
ZOOPLA_FETCHER = os.environ.get("ZOOPLA_FETCHER", "flaresolverr")
FLARESOLVERR_URL = os.environ.get("FLARESOLVERR_URL", "http://gluetun:8191/v1")
FLARESOLVERR_MAX_TIMEOUT_MS = 120000 # per-request solve budget; first solve is slow
# Greater London-ish postcode areas. This intentionally uses broad area
# prefixes so a manual scrape can include central/inner London plus common
# outer-London and near-London outcodes without maintaining a long borough list.

57
finder/docker-compose.yml Normal file
View file

@ -0,0 +1,57 @@
# Finder scraper + FlareSolverr, both sharing the EXISTING media_gluetun VPN
# container's network namespace. Everything egresses through the VPN, and
# FlareSolverr solves Zoopla's Cloudflare automatically (no VNC needed).
#
# Prerequisites:
# - The `media_gluetun` container (qmcgaw/gluetun) is running on this host.
# It is managed by a different compose; it is referenced here as external
# via network_mode "container:media_gluetun".
# - Because these services share gluetun's netns, they reach each other and
# gluetun on localhost (flaresolverr :8191, gluetun control :8000) and need
# NO published ports (which is exactly why this avoids the dev-container
# port-forwarding pain).
#
# Usage:
# cd finder
# docker compose up -d --build flaresolverr finder # start the sidecars
# docker compose exec finder uv run python main.py --source zoopla --outcodes SW9 --test
# docker compose exec finder uv run python main.py --source all # full run
# docker compose down
#
# NOTE: a manually-started `finder_flaresolverr` container from testing must be
# removed first (`docker rm -f finder_flaresolverr`) to avoid a name clash.
services:
flaresolverr:
image: ghcr.io/flaresolverr/flaresolverr:latest
container_name: finder_flaresolverr
network_mode: "container:media_gluetun"
environment:
LOG_LEVEL: info
TZ: Europe/London
restart: unless-stopped
finder:
build:
context: .
dockerfile: Dockerfile
image: finder-scraper:latest
container_name: finder_scraper
network_mode: "container:media_gluetun"
depends_on:
- flaresolverr
volumes:
- .:/app/finder # live-mounted finder source
- ../property-data:/app/property-data:ro # ARCGIS postcode data
working_dir: /app/finder
environment:
# Shared netns: sidecars are on localhost, and the netns already tunnels
# all traffic through the VPN, so no HTTP proxy is used.
ZOOPLA_FETCHER: flaresolverr
FLARESOLVERR_URL: http://localhost:8191/v1
GLUETUN_CONTROL_URL: http://localhost:8000
GLUETUN_PROXY: "" # empty => direct (shared netns already tunnels)
DATA_DIR: /app/finder/data
ARCGIS_PATH: /app/property-data/arcgis_data.parquet
restart: "no"
command: ["sleep", "infinity"] # stays up; run scrapes via `docker compose exec`

91
finder/flaresolverr.py Normal file
View file

@ -0,0 +1,91 @@
"""FlareSolverr client — fetch Cloudflare-protected pages as rendered HTML.
FlareSolverr (https://github.com/FlareSolverr/FlareSolverr) drives an
undetected browser to pass Cloudflare's challenge and returns the fully
rendered HTML. It runs as a sidecar service (see docker-compose.yml) sharing
the Gluetun VPN network namespace, so its browser egresses through the VPN.
Verified working against Zoopla's managed Turnstile on a datacenter VPN IP,
provided a reused session and a generous maxTimeout (~120s) are used: the first
challenge solve is slow, subsequent requests on the warm session are fast.
"""
import logging
import httpx
from constants import FLARESOLVERR_MAX_TIMEOUT_MS, FLARESOLVERR_URL
log = logging.getLogger("flaresolverr")
class FlareSolverrError(Exception):
    """Signals that FlareSolverr could not fetch or solve a requested URL."""
class FlareSolverrSession:
    """A reusable FlareSolverr browser session, used as a context manager.

    Entering the context (re)creates the named session on the FlareSolverr
    server; leaving it destroys the session and closes the HTTP client. Every
    ``get`` call is issued against that one named session, so requests after
    the first reuse whatever state FlareSolverr keeps for it.

    Args:
        url: FlareSolverr API endpoint (the ``/v1`` URL).
        session: Name of the FlareSolverr session to create and reuse.
        max_timeout_ms: ``maxTimeout`` sent with each ``request.get``, in
            milliseconds.
    """

    def __init__(
        self,
        url: str = FLARESOLVERR_URL,
        session: str = "finder",
        max_timeout_ms: int = FLARESOLVERR_MAX_TIMEOUT_MS,
    ) -> None:
        self._url = url
        self._session = session
        self._max_timeout = max_timeout_ms
        # The HTTP timeout must comfortably exceed maxTimeout: FlareSolverr can
        # block for up to maxTimeout while solving before it responds.
        self._client = httpx.Client(
            timeout=httpx.Timeout(self._max_timeout / 1000 + 30)
        )
        self._active = False

    def _post(self, payload: dict) -> dict:
        """POST one command to FlareSolverr and return the decoded JSON object.

        Raises:
            FlareSolverrError: on transport/HTTP errors, a non-JSON or
                non-object body, or a response whose ``status`` is not "ok".
        """
        try:
            resp = self._client.post(self._url, json=payload)
            resp.raise_for_status()
            data = resp.json()
        except (httpx.HTTPError, ValueError) as exc:
            raise FlareSolverrError(
                f"FlareSolverr request to {self._url} failed: {exc}"
            ) from exc
        # A JSON array/scalar has no .get(); report it as a FlareSolverr
        # failure rather than leaking an AttributeError to callers.
        if not isinstance(data, dict):
            raise FlareSolverrError(
                f"FlareSolverr {payload.get('cmd')} returned a non-object response"
            )
        if data.get("status") != "ok":
            raise FlareSolverrError(
                f"FlareSolverr {payload.get('cmd')} failed: {data.get('message')}"
            )
        return data

    def __enter__(self) -> "FlareSolverrSession":
        """Create the session, replacing any existing one with the same name.

        Raises:
            FlareSolverrError: if the session cannot be created.
        """
        try:
            # Start from a clean session; a destroy error is expected when no
            # session with this name exists yet, so it is ignored.
            try:
                self._post({"cmd": "sessions.destroy", "session": self._session})
            except FlareSolverrError:
                pass
            self._post({"cmd": "sessions.create", "session": self._session})
        except BaseException:
            # __exit__ is not invoked when __enter__ raises, so release the
            # HTTP client here instead of leaking it.
            self._client.close()
            raise
        self._active = True
        log.info("FlareSolverr session %r ready at %s", self._session, self._url)
        return self

    def get(self, url: str) -> str:
        """Fetch ``url`` through the session and return the response body.

        Returns:
            The ``solution.response`` string from FlareSolverr, or "" when the
            reply carries no solution/response.

        Raises:
            FlareSolverrError: if the request fails (see ``_post``).
        """
        data = self._post(
            {
                "cmd": "request.get",
                "session": self._session,
                "url": url,
                "maxTimeout": self._max_timeout,
            }
        )
        solution = data.get("solution") or {}
        return solution.get("response", "") or ""

    def __exit__(self, *exc_info) -> None:
        """Destroy the session (best effort) and always close the HTTP client."""
        try:
            if self._active:
                try:
                    self._post(
                        {"cmd": "sessions.destroy", "session": self._session}
                    )
                except FlareSolverrError as exc:
                    log.debug("FlareSolverr session destroy failed: %s", exc)
        finally:
            self._active = False
            self._client.close()

View file

@ -0,0 +1,53 @@
# GDAL with ECW (read) support, for decoding Environment Agency Vertical Aerial
# Photography in the satellite-highres pipeline (pipeline/download/satellite_highres.py).
#
# EA VAP ships as ECW **v2** rasters, which are readable by the open-source
# libecwj2 3.3 SDK -- the same library the official OSGeo image uses when built
# with WITH_ECW=yes. We therefore avoid the proprietary, login-gated Hexagon
# ERDAS ECW/JP2 SDK (which is only needed for ECW v3) and its licensing
# restrictions entirely.
#
# We build only the ECW driver as a GDAL *plugin* on top of the official runtime
# image (no full GDAL rebuild). The plugin's GDAL sources are pinned to the exact
# commit reported by the base image so libgdal and the plugin stay ABI-compatible.
#
# Build: docker build -t perfect-postcode/gdal-ecw:latest docker/gdal-ecw
# Verify: docker run --rm perfect-postcode/gdal-ecw:latest gdalinfo --formats | grep -i ECW
FROM ghcr.io/osgeo/gdal:ubuntu-full-latest
ARG LIBECWJ2_URL=https://github.com/rouault/libecwj2-3.3-builds/releases/download/v1/install-libecwj2-3.3-ubuntu-20.04.tar.gz
RUN apt-get update && apt-get install -y --no-install-recommends \
cmake g++ make git curl ca-certificates \
&& rm -rf /var/lib/apt/lists/*
# Open-source ECW v2 SDK (extracts to /opt/libecwj2-3.3) + make its libs loadable.
RUN curl --retry 3 --retry-all-errors --retry-delay 3 -fsSL -o /tmp/libecwj2.tar.gz "$LIBECWJ2_URL" \
&& tar -C / -xzf /tmp/libecwj2.tar.gz \
&& rm -f /tmp/libecwj2.tar.gz \
&& (cd /opt/libecwj2-3.3/lib && for so in *.so*; do \
ln -sf "/opt/libecwj2-3.3/lib/$so" "/usr/lib/x86_64-linux-gnu/$so"; \
done) \
&& ldconfig
# Build the ECW driver plugin against the base image's exact GDAL sources.
RUN set -eux; \
GDAL_COMMIT="$(gdalinfo --version | sed -nE 's/.*-([0-9a-f]{8,}).*/\1/p')"; \
test -n "$GDAL_COMMIT"; \
echo "Building ECW plugin for GDAL commit ${GDAL_COMMIT}"; \
mkdir -p /tmp/gdal && cd /tmp/gdal && git init -q; \
git fetch --depth 1 -q https://github.com/OSGeo/gdal.git "$GDAL_COMMIT"; \
git checkout -q FETCH_HEAD; \
cmake -S frmts/ecw -B /tmp/ecw-build \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_PREFIX_PATH=/usr \
-DECW_ROOT=/opt/libecwj2-3.3; \
cmake --build /tmp/ecw-build -j"$(nproc)"; \
PLUGIN_DIR=/usr/lib/x86_64-linux-gnu/gdalplugins; \
mkdir -p "$PLUGIN_DIR"; \
find /tmp/ecw-build -name 'gdal_ECW*.so' -exec cp {} "$PLUGIN_DIR/" \; ; \
rm -rf /tmp/gdal /tmp/ecw-build
# Fail the build if the driver is not actually available.
RUN gdalinfo --formats | grep -iq 'ECW.*rw' && echo "ECW driver OK"

View file

@ -5,7 +5,7 @@ import time
import httpx
from fake_useragent import UserAgent
from constants import MAX_RETRIES, RETRY_BASE_DELAY
from constants import GLUETUN_PROXY, MAX_RETRIES, RETRY_BASE_DELAY
log = logging.getLogger("rightmove")
@ -15,10 +15,12 @@ _ua = UserAgent(
def make_client() -> httpx.Client:
    """Build an httpx client with a random User-Agent and a 30s timeout.

    Requests follow redirects and ask for JSON by default. Traffic goes via
    the Gluetun HTTP proxy (VPN egress) when ``GLUETUN_PROXY`` is set; a falsy
    value means no proxy is configured on the client.
    """
    default_headers = {"User-Agent": _ua.random, "Accept": "application/json"}
    proxy_url = GLUETUN_PROXY or None
    return httpx.Client(
        timeout=30,
        headers=default_headers,
        follow_redirects=True,
        proxy=proxy_url,
    )

View file

@ -57,6 +57,16 @@ def parse_args() -> argparse.Namespace:
default=DATA_DIR,
help=f"Directory for parquet output. Defaults to {DATA_DIR}.",
)
parser.add_argument(
"--outcodes",
type=str,
default=None,
help=(
"Comma-separated outcodes to scrape (e.g. 'SW9' or 'SW9,E14,BR1') "
"instead of the full Greater London set. Must fall within the "
"London-ish areas; takes precedence over --test/--limit-outcodes."
),
)
parser.add_argument(
"--limit-outcodes",
type=int,
@ -116,17 +126,32 @@ def main() -> int:
from scraper import (
build_postcode_coords,
build_postcode_index,
filter_londonish_outcodes,
load_outcodes,
run_scrape,
)
outcodes = load_outcodes()
if args.test and args.limit_outcodes is None:
preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)]
if preferred:
outcodes = preferred
if args.limit_outcodes is not None:
outcodes = outcodes[: args.limit_outcodes]
if args.outcodes is not None:
requested = [code.strip().upper() for code in args.outcodes.split(",") if code.strip()]
if not requested:
raise SystemExit("--outcodes was empty")
outcodes = filter_londonish_outcodes(requested)
dropped = sorted(set(requested) - set(outcodes))
if dropped:
log.warning("Ignoring outcodes outside the Greater London-ish areas: %s", ", ".join(dropped))
if not outcodes:
raise SystemExit(
"None of the requested outcodes are within the Greater London-ish areas "
f"({', '.join(requested)})."
)
else:
outcodes = load_outcodes()
if args.test and args.limit_outcodes is None:
preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)]
if preferred:
outcodes = preferred
if args.limit_outcodes is not None:
outcodes = outcodes[: args.limit_outcodes]
if not outcodes:
raise SystemExit("No Greater London-ish outcodes loaded; nothing to scrape.")

View file

@ -10,6 +10,30 @@ Each rendered page contains 30 listings under
`humanised-property-type`, `features` (a list where the first element is
typically `"Tenure: <value>"`), and `details-url`. Pagination is via
`?page=N`; the loop terminates when `paginationControls.next` is null.
Postcodes
---------
The search card exposes only an *outcode*-level address (e.g. "Padfield Road,
London, SE5") and a map pin, so the old behaviour derived the postcode from the
nearest postcode to that pin — a guess that frequently lands on a neighbouring
unit (the pin can sit on the wrong side of a street boundary).
Each *detail* page (`/details/{id}/`) is a plain HTTPS GET whose `__NEXT_DATA__`
embeds the property's analytics dataLayer at
`props.initialReduxState.metadata.dataLayer`, which carries the property's own
`postcode` (full unit postcode, e.g. "SE5 9AA") keyed to this listing by
`property-id`. Crucially this is NOT the agent's office postcode — that lives
separately at `property.agent.postcode` ("SE5 8RS" for the same listing) and
is the classic trap when blindly scanning the page for a postcode. We read the
dataLayer postcode, verify `property-id` matches the listing, and accept it only
when its outcode agrees with the coordinate-nearest postcode (via
``resolve_listing_postcode``) — exactly the trust rule the other scrapers use.
Measured over a sample of real listings this yields a trustworthy, usually
exact-unit postcode for ~11/12 listings; the rest safely fall back to the
coordinate-nearest postcode.
Detail fetching costs one extra HTTPS GET per listing, so it is gated behind
``OTM_FETCH_DETAILS`` and capped at ``OTM_MAX_DETAILS_PER_OUTCODE`` per outcode.
"""
import json
@ -31,14 +55,26 @@ from spatial import PostcodeSpatialIndex
from transform import (
clean_listing_address,
extract_full_postcode,
extract_outcode,
fix_coords,
map_property_type,
normalize_sub_type,
parse_display_size,
resolve_listing_postcode,
)
log = logging.getLogger("rightmove")
# Detail-page postcode recovery (see module docstring). When enabled, each
# listing's detail page is fetched so its analytics dataLayer postcode — the
# property's own full unit postcode — can replace the coordinate-nearest guess.
# Bounded per outcode so a large outcode can't balloon into unbounded extra
# HTTPS GETs. Intended to be at parity with the Rightmove/Zoopla detail caps so
# a typical outcode's listings all get their real postcode rather than a
# coordinate-nearest guess.
# NOTE(review): RIGHTMOVE_MAX_DETAILS_PER_OUTCODE and
# ZOOPLA_MAX_DETAILS_PER_OUTCODE are 4000 in constants.py, while this cap is
# 400 — confirm which value is intended.
OTM_FETCH_DETAILS = True
OTM_MAX_DETAILS_PER_OUTCODE = 400
_NEXT_DATA_RE = re.compile(
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
re.DOTALL,
@ -51,6 +87,11 @@ _HTML_HEADERS = {
"Accept-Language": "en-GB,en;q=0.9",
}
# listingId -> recovered full postcode (or None). Failures are cached too so a
# broken or postcode-less detail page is not re-fetched within a run (the same
# listing can reappear across overlapping outcode searches).
_detail_postcode_cache: dict[str, str | None] = {}
def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict | None:
"""GET one search-results page and return the embedded __NEXT_DATA__ JSON.
@ -119,6 +160,116 @@ def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict
return None
def parse_detail_postcode(html: str, listing_id: str | None = None) -> str | None:
"""Extract the property's own full postcode from an OnTheMarket detail page.
Pure and network-free so it is unit-testable: callers pass `page.content()`
/ the GET body and this does the parsing.
The postcode lives in the analytics dataLayer embedded in `__NEXT_DATA__` at
``props.initialReduxState.metadata.dataLayer.postcode`` and is the
property's own unit postcode (e.g. "SE5 9AA"). It is deliberately NOT the
agent's office postcode, which sits separately at
``property.agent.postcode`` the trap when scanning a detail page for "a"
postcode. When ``listing_id`` is given, the dataLayer's ``property-id`` must
match it, guaranteeing we read this listing's postcode and not a stray one.
Returns a normalized full postcode (e.g. "SE5 9AA") or ``None`` when the
page has no usable property postcode. Trust (outcode-vs-coordinates
agreement) is enforced later in ``transform_property``.
"""
if not html:
return None
match = _NEXT_DATA_RE.search(html)
if not match:
return None
try:
data = json.loads(match.group(1))
except json.JSONDecodeError:
return None
try:
data_layer = data["props"]["initialReduxState"]["metadata"]["dataLayer"]
except (KeyError, TypeError):
return None
if not isinstance(data_layer, dict):
return None
# Guard against reading a different listing's postcode: the dataLayer is the
# property's own analytics payload, so its property-id must match.
if listing_id is not None:
page_id = data_layer.get("property-id")
if page_id is not None and str(page_id) != str(listing_id):
return None
raw_postcode = data_layer.get("postcode")
if not isinstance(raw_postcode, str):
return None
return extract_full_postcode(raw_postcode)
def _fetch_detail_postcode(
    client: httpx.Client, details_url: str, listing_id: str
) -> str | None:
    """GET one listing's detail page and return its dataLayer postcode (or None).

    Results, including failures, are cached in ``_detail_postcode_cache`` by
    listing id, so a listing that reappears across overlapping outcode searches
    is fetched at most once per run.

    Transport-level failures (timeouts, connection/read errors, protocol
    errors) and HTTP 429/5xx responses are retried up to ``MAX_RETRIES`` times
    with exponential backoff plus jitter. Any other httpx error or HTTP status
    ends the attempt immediately. In every failure case the function returns
    ``None`` rather than raising, so the caller can fall back to the
    coordinate-nearest postcode.

    Args:
        client: httpx client used for the GET.
        details_url: Absolute URL, or a path that is appended to
            ``ONTHEMARKET_BASE``. Empty means nothing is fetched.
        listing_id: Cache key; also passed to ``parse_detail_postcode``.
    """
    if listing_id in _detail_postcode_cache:
        return _detail_postcode_cache[listing_id]

    full_url = (
        ONTHEMARKET_BASE + details_url
        if details_url and not details_url.startswith("http")
        else details_url
    )

    result: str | None = None
    if full_url:
        for attempt in range(MAX_RETRIES):
            try:
                resp = client.get(
                    full_url, headers=_HTML_HEADERS, follow_redirects=True
                )
            except httpx.TransportError as exc:
                # Covers every timeout/network/protocol failure, not just the
                # four types previously listed (e.g. ConnectTimeout, ReadError).
                failure = type(exc).__name__
            except httpx.HTTPError as exc:
                # Non-transient request errors (e.g. too many redirects):
                # retrying will not help, so degrade to None.
                log.debug(
                    "OnTheMarket detail %s failed permanently: %s",
                    listing_id, exc,
                )
                break
            else:
                if resp.status_code == 200:
                    result = parse_detail_postcode(resp.text, listing_id)
                    break
                if resp.status_code not in (429, 500, 502, 503, 504):
                    log.debug(
                        "OnTheMarket detail %s returned HTTP %d (no postcode)",
                        listing_id, resp.status_code,
                    )
                    break
                failure = f"HTTP {resp.status_code}"

            # Don't sleep out a backoff that no further attempt will follow.
            if attempt + 1 >= MAX_RETRIES:
                log.warning(
                    "%s from %s, giving up after %d attempts",
                    failure, full_url, MAX_RETRIES,
                )
                break
            delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
            log.warning(
                "%s from %s, retry %d/%d in %.1fs",
                failure, full_url, attempt + 1, MAX_RETRIES, delay,
            )
            time.sleep(delay)

    _detail_postcode_cache[listing_id] = result
    return result
def _parse_price(price_value) -> int:
"""Parse a formatted price string like '£450,000' into an integer.
Returns 0 for POA/auction/null values."""
@ -166,9 +317,19 @@ def _extract_floor_area(features: list) -> float | None:
def transform_property(
raw: dict, pc_index: PostcodeSpatialIndex
raw: dict,
pc_index: PostcodeSpatialIndex,
detail_postcode: str | None = None,
) -> dict | None:
"""Transform a raw OnTheMarket listing dict into our output schema."""
"""Transform a raw OnTheMarket listing dict into our output schema.
``detail_postcode`` is the property's own full postcode recovered from its
detail page (see ``parse_detail_postcode`` / ``_fetch_detail_postcode``),
or ``None`` when no detail fetch was done / no postcode was found. When
present and trustworthy (its outcode agrees with the coordinate-nearest
postcode) it supersedes the coordinate guess and is labelled
``"detail_address"``.
"""
loc = raw.get("location") or {}
raw_lat = loc.get("lat")
raw_lng = loc.get("lon")
@ -184,8 +345,29 @@ def transform_property(
return None
raw_address = raw.get("address", "") or ""
extracted_postcode = extract_full_postcode(raw_address)
postcode = extracted_postcode or inferred_postcode
postcode_source = "address" if extracted_postcode else "coordinates"
# Prefer the property's own detail-page postcode when we have one and it is
# trustworthy. The detail postcode is a full unit postcode (better than the
# coordinate-nearest guess and than the usually outcode-only card address),
# but a stale/mislabelled value would silently override the spatially
# correct one, so apply the same outcode-agreement trust rule the address
# postcode uses: keep it only when its outcode matches the
# coordinate-nearest postcode's outcode.
detail_postcode = extract_full_postcode(detail_postcode)
if detail_postcode and extract_outcode(detail_postcode) == extract_outcode(
inferred_postcode
):
postcode, postcode_source = detail_postcode, "detail_address"
else:
if detail_postcode:
log.debug(
"OnTheMarket %s: rejecting detail postcode %s "
"(outcode mismatch with inferred %s)",
raw.get("id", "?"), detail_postcode, inferred_postcode,
)
postcode, postcode_source = resolve_listing_postcode(
extracted_postcode, inferred_postcode
)
raw_beds = raw.get("bedrooms") or 0
raw_baths = raw.get("bathrooms") or 0
@ -223,6 +405,10 @@ def transform_property(
"Inferred postcode": inferred_postcode,
"Listing raw address": raw_address,
"Address per Property Register": clean_listing_address(raw_address),
# OnTheMarket search JSON exposes only a street-level address; no UPRN
# or house number/name is available without a detail-page fetch.
"UPRN": None,
"Property number or name": None,
"Leasehold/Freehold": _extract_tenure(features),
"Property type": map_property_type(sub_type),
"Property sub-type": normalize_sub_type(sub_type),
@ -242,10 +428,17 @@ def search_outcode(
pc_index: PostcodeSpatialIndex,
max_properties: int | None = None,
) -> list[dict]:
"""Paginate through OnTheMarket sale results for one outcode."""
"""Paginate through OnTheMarket sale results for one outcode.
When ``OTM_FETCH_DETAILS`` is enabled, up to
``OTM_MAX_DETAILS_PER_OUTCODE`` listings per outcode have their detail page
fetched for the property's own postcode (see ``_fetch_detail_postcode``);
the rest fall back to the coordinate-nearest postcode.
"""
properties: list[dict] = []
seen_ids: set[str] = set()
page_num = 1
details_fetched = 0
while True:
data = _fetch_page_json(client, outcode, page_num)
@ -269,8 +462,22 @@ def search_outcode(
if listing_id and listing_id in seen_ids:
continue
seen_ids.add(listing_id)
detail_postcode = None
if OTM_FETCH_DETAILS and listing_id:
# Cached lookups are free; only fresh GETs count toward the cap
# and incur the inter-request delay.
cached = listing_id in _detail_postcode_cache
if cached or details_fetched < OTM_MAX_DETAILS_PER_OUTCODE:
detail_postcode = _fetch_detail_postcode(
client, raw.get("details-url") or "", listing_id
)
if not cached:
details_fetched += 1
time.sleep(DELAY_BETWEEN_PAGES)
try:
transformed = transform_property(raw, pc_index)
transformed = transform_property(raw, pc_index, detail_postcode)
except Exception as exc:
log.warning(
"OnTheMarket %s property %s failed to transform: %s",

View file

@ -1,4 +1,6 @@
import json
import logging
import re
import time
import httpx
@ -6,12 +8,15 @@ import httpx
from constants import (
PAGE_SIZE,
DELAY_BETWEEN_PAGES,
RIGHTMOVE_DETAIL_URL,
RIGHTMOVE_FETCH_DETAILS,
RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
SEARCH_URL,
TYPEAHEAD_URL,
)
from http_client import fetch_with_retry
from spatial import PostcodeSpatialIndex
from transform import transform_property
from transform import extract_full_postcode, normalize_postcode, transform_property
log = logging.getLogger("rightmove")
@ -23,6 +28,176 @@ outcode_cache: dict[str, str] = {}
_MAX_INDEX = 1008
# ---------------------------------------------------------------------------
# Detail-page postcode extraction
# ---------------------------------------------------------------------------
#
# The search API (_paginate) only returns an outcode-level `displayAddress`
# (e.g. "Akerman Road, Brixton, London, SW9") — never the full postcode. Each
# listing's detail page, however, embeds the property's OWN full postcode in a
# `window.__PAGE_MODEL` script as `propertyData.address.{outcode, incode}`
# (e.g. outcode "SW9" + incode "0HD" → "SW9 0HD"), independently corroborated by
# `propertyData.propertyUrls.similarPropertiesUrl` ("/property-for-sale/SW9-0HD.html").
# This is the property's own postcode, NOT a nearest station/school: the
# `nearestStations`/`nearestAirports` arrays carry only names + distances, no
# postcodes, and the address outcode always matches the searched outcode.
# Recon over 24 live listings across SW9/E1/M1/LS6/E20 (incl. APPROXIMATE_POINT
# new-builds) found the full postcode present 100% of the time. There is no
# UPRN or house-number field anywhere in propertyData, so those stay None.
#
# __PAGE_MODEL is a "devalue"-style flattened object graph: its `data` field is
# a JSON STRING holding a flat array where every integer inside a container is
# an index reference into that same array (so the graph can dedupe). We
# brace-match the (large, deeply-nested) object literal — a non-greedy regex
# cannot — then rehydrate the reference graph before reading the address.
_PAGE_MODEL_RE = re.compile(r"window\.__PAGE_MODEL\s*=\s*")
def _extract_page_model_literal(html: str) -> str | None:
"""Return the `{...}` object literal assigned to window.__PAGE_MODEL.
Brace-matches with string/escape awareness so embedded braces and quotes in
string values don't end the match early. Returns None when absent."""
marker = _PAGE_MODEL_RE.search(html)
if not marker:
return None
start = marker.end()
if start >= len(html) or html[start] != "{":
return None
depth = 0
in_str = False
esc = False
for j in range(start, len(html)):
ch = html[j]
if in_str:
if esc:
esc = False
elif ch == "\\":
esc = True
elif ch == '"':
in_str = False
elif ch == '"':
in_str = True
elif ch == "{":
depth += 1
elif ch == "}":
depth -= 1
if depth == 0:
return html[start : j + 1]
return None
def _rehydrate(flat: list) -> object:
"""Resolve a devalue-style flattened reference array into a nested object.
Index 0 is the root; every int inside a dict/list is an index back into
``flat``. Memoised so shared/cyclic references resolve once."""
cache: dict[int, object] = {}
def resolve(idx: int) -> object:
if not isinstance(idx, int) or idx < 0 or idx >= len(flat):
return None
if idx in cache:
return cache[idx]
node = flat[idx]
if isinstance(node, dict):
out: dict = {}
cache[idx] = out
for key, value in node.items():
out[key] = resolve(value) if isinstance(value, int) else value
return out
if isinstance(node, list):
arr: list = []
cache[idx] = arr
for value in node:
arr.append(resolve(value) if isinstance(value, int) else value)
return arr
cache[idx] = node
return node
return resolve(0)
def parse_detail_postcode(html: str) -> str | None:
"""Extract a Rightmove property's TRUE full postcode from its detail HTML.
Pure and network-free so it is unit-testable: callers pass the page HTML.
Reads ``propertyData.address.outcode`` + ``.incode`` from window.__PAGE_MODEL
and returns a normalised full postcode (e.g. "SW9 0HD"), or None when the
page has no parseable address (the property location wrapper can be empty
the caller then keeps the coordinate fallback). The returned outcode is
re-validated against the joined postcode so a malformed incode is dropped.
"""
if not html:
return None
literal = _extract_page_model_literal(html)
if not literal:
return None
try:
outer = json.loads(literal)
flat = json.loads(outer["data"])
except (ValueError, KeyError, TypeError):
return None
if not isinstance(flat, list) or not flat:
return None
root = _rehydrate(flat)
if not isinstance(root, dict):
return None
property_data = root.get("propertyData")
if not isinstance(property_data, dict):
return None
address = property_data.get("address")
if not isinstance(address, dict):
return None
outcode = address.get("outcode")
incode = address.get("incode")
if not isinstance(outcode, str) or not isinstance(incode, str):
return None
outcode, incode = outcode.strip(), incode.strip()
if not outcode or not incode:
return None
# Round-trip through the shared postcode validator/normaliser: this both
# canonicalises spacing and rejects an outcode/incode pair that doesn't form
# a structurally-valid UK postcode.
return extract_full_postcode(normalize_postcode(f"{outcode} {incode}"))
# listingId -> true full postcode (or None when unavailable). Failures are
# cached too, so a broken/duplicate listing is fetched at most once per run (the
# same listing can reappear across overlapping outcode searches).
_detail_postcode_cache: dict[str, str | None] = {}
def _fetch_detail_postcode(client: httpx.Client, property_id: str) -> str | None:
    """Fetch one listing's detail page and return its true full postcode.

    The outcome is memoised in ``_detail_postcode_cache`` keyed by listing id,
    and that includes failures (stored as None), so a listing is requested at
    most once. The detail page is a plain HTML GET (no Cloudflare, unlike
    Zoopla), so a single httpx call suffices. A non-200 status or an
    ``httpx.HTTPError`` is logged at debug level and yields None, leaving the
    caller on its coordinate fallback.
    """
    if not property_id:
        return None
    try:
        return _detail_postcode_cache[property_id]
    except KeyError:
        pass  # not looked up yet: fall through to the network fetch

    detail_url = RIGHTMOVE_DETAIL_URL.format(id=property_id)
    result: str | None = None
    try:
        response = client.get(detail_url, headers={"Accept": "text/html"})
        if response.status_code != 200:
            log.debug(
                "Rightmove detail %s returned HTTP %d",
                detail_url,
                response.status_code,
            )
        else:
            result = parse_detail_postcode(response.text)
    except httpx.HTTPError as exc:
        log.debug("Rightmove detail fetch failed %s: %s", detail_url, exc)

    # Store the outcome even when it is None so failures are not retried.
    _detail_postcode_cache[property_id] = result
    return result
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
"""Look up Rightmove's internal ID for an outcode via typeahead API."""
if outcode in outcode_cache:
@ -44,6 +219,31 @@ def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
return None
def _detail_postcode_for(
    client: httpx.Client,
    prop: dict,
    fetch_details: bool,
    detail_budget: dict,
) -> str | None:
    """Return a listing's detail-page postcode, respecting the per-outcode cap.

    ``detail_budget`` is a mutable ``{"remaining": int}`` counter shared across
    one pagination run. A listing already present in the cache is answered
    from it and consumes neither a budget slot nor a request. Otherwise a
    fetch happens only while ``detail_budget["remaining"] > 0``; each fresh
    fetch decrements the counter and is followed by a ``DELAY_BETWEEN_PAGES``
    pause. Returns None when detail fetching is off, the listing has no id,
    or the budget is spent.
    """
    listing_id = str(prop.get("id") or "") if fetch_details else ""
    if not listing_id:
        return None

    if listing_id in _detail_postcode_cache:
        return _detail_postcode_cache[listing_id]

    if detail_budget["remaining"] > 0:
        # Spend the slot before fetching so a failed fetch still counts.
        detail_budget["remaining"] -= 1
        fetched = _fetch_detail_postcode(client, listing_id)
        time.sleep(DELAY_BETWEEN_PAGES)
        return fetched
    return None
def _paginate(
client: httpx.Client,
outcode_id: str,
@ -51,11 +251,19 @@ def _paginate(
channel_cfg: dict,
pc_index: PostcodeSpatialIndex,
max_properties: int | None = None,
fetch_details: bool = False,
detail_cap: int = 0,
) -> tuple[list[dict], int]:
"""Paginate through search results. Returns (properties, result_count)."""
"""Paginate through search results. Returns (properties, result_count).
When ``fetch_details`` is set, up to ``detail_cap`` listings per outcode have
their detail page fetched for the property's TRUE full postcode (see
``parse_detail_postcode``); the rest fall back to coordinate-derived
postcodes."""
properties = []
index = 0
result_count = 0
detail_budget = {"remaining": detail_cap}
while True:
params = {
@ -82,7 +290,12 @@ def _paginate(
for prop in raw_props:
try:
transformed = transform_property(prop, outcode, pc_index)
detail_postcode = _detail_postcode_for(
client, prop, fetch_details, detail_budget
)
transformed = transform_property(
prop, outcode, pc_index, detail_postcode=detail_postcode
)
except Exception as exc:
log.warning(
"Rightmove %s/%s property %s failed to transform: %s",
@ -127,7 +340,12 @@ def search_outcode(
pc_index: PostcodeSpatialIndex,
max_properties: int | None = None,
) -> list[dict]:
"""Paginate through unfiltered sale results for one outcode+channel."""
"""Paginate through unfiltered sale results for one outcode+channel.
Each listing's detail page is fetched for the property's TRUE full postcode
(gated by ``RIGHTMOVE_FETCH_DETAILS`` and capped per outcode by
``RIGHTMOVE_MAX_DETAILS_PER_OUTCODE``); listings beyond the cap keep the
coordinate-derived postcode."""
properties, _ = _paginate(
client,
outcode_id,
@ -135,6 +353,8 @@ def search_outcode(
channel_cfg,
pc_index,
max_properties=max_properties,
fetch_details=RIGHTMOVE_FETCH_DETAILS,
detail_cap=RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
)
if max_properties is not None and len(properties) >= max_properties:

View file

@ -15,6 +15,10 @@ from constants import (
DATA_DIR,
DELAY_BETWEEN_OUTCODES,
LONDON_OUTCODE_PREFIXES,
ZOOPLA_DETAIL_BUDGET_FRACTION,
ZOOPLA_FETCH_DETAILS,
ZOOPLA_FETCHER,
ZOOPLA_MAX_DETAILS_PER_OUTCODE,
)
from http_client import make_client
@ -371,6 +375,36 @@ def _zoopla_outcode_timeout_seconds() -> int:
return timeout
def _zoopla_detail_cap() -> int:
    """Return the per-outcode limit on detail-page fetches (0 = disabled).

    Zoopla search cards only expose an outcode-level address, so the full
    postcode/coordinates come from each listing's detail page. The cap bounds
    those extra page loads so an outcode stays within
    ZOOPLA_OUTCODE_TIMEOUT_SECONDS (the per-outcode SIGALRM budget covers the
    detail fetches too). Both knobs, ZOOPLA_FETCH_DETAILS and
    ZOOPLA_MAX_DETAILS_PER_OUTCODE, live in constants.py.
    """
    if not ZOOPLA_FETCH_DETAILS:
        return 0
    return ZOOPLA_MAX_DETAILS_PER_OUTCODE
def _open_zoopla_detail_tab(page, detail_cap: int):
"""Open a second tab on the same context for detail-page fetches.
Sharing the persistent context means the detail tab inherits the search
tab's Cloudflare clearance cookies. Returns None when detail fetching is
disabled or the tab cannot be created (the scrape then degrades to
outcode-level postcodes rather than failing)."""
if detail_cap <= 0:
return None
try:
return page.context.new_page()
except Exception as exc:
log.warning(
"Zoopla detail tab unavailable (%s); using outcode-level postcodes",
_exception_detail(exc),
)
return None
@contextmanager
def _wall_clock_timeout(seconds: int, label: str):
"""SIGALRM-based wall-clock guard (POSIX). Raises OutcodeTimeout on expiry.
@ -438,6 +472,50 @@ def _close_zoopla_browser(browser, label: str) -> None:
log.warning("%s browser force-close failed: %s", label, _exception_detail(exc))
def _scrape_zoopla_flaresolverr(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    results: dict[str, list[dict]],
    errors: list[str],
    max_properties_per_source: int | None,
) -> None:
    """Scrape Zoopla via the FlareSolverr sidecar (no browser/VNC).

    Walks ``outcodes`` in order and stores each outcode's listings under the
    "zoopla" source in ``results`` until the per-source cap is reached.
    Nothing is raised to the caller: an unavailable FlareSolverr is appended
    to ``errors`` and the source is skipped, and a failure inside one outcode
    is recorded via ``_record_error`` before moving on to the next.
    """
    from flaresolverr import FlareSolverrError, FlareSolverrSession
    from zoopla_flaresolverr import search_outcode as fs_search_outcode

    # Go through the shared helper so ZOOPLA_FETCH_DETAILS=False disables
    # detail fetches here too, exactly as it does for the browser fetcher.
    detail_cap = _zoopla_detail_cap()

    # Entered by hand (not ``with``) so that only a failure to *start* the
    # session is reported as "FlareSolverr unavailable".
    try:
        session = FlareSolverrSession(session="zoopla")
        session.__enter__()
    except FlareSolverrError as exc:
        errors.append(f"zoopla: FlareSolverr unavailable: {exc}")
        log.warning("Zoopla skipped: FlareSolverr unavailable: %s", exc)
        return

    try:
        for outcode in outcodes:
            remaining = _source_remaining(results, "zoopla", max_properties_per_source)
            if remaining == 0:
                log.info("Zoopla cap reached")
                return
            try:
                props, _ = fs_search_outcode(
                    outcode,
                    pc_index,
                    pc_coords,
                    session,
                    max_properties=remaining,
                    detail_cap=detail_cap,
                )
                added = _store_properties(results, "zoopla", props, max_properties_per_source)
                log.info("Zoopla %s: +%d", outcode, added)
            except Exception as exc:  # noqa: BLE001 - one outcode must not kill the run
                _record_error(errors, "zoopla", outcode, exc)
            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
        # Always release the sidecar session, including on the early
        # "cap reached" return above.
        session.__exit__(None, None, None)
def _scrape_zoopla(
outcodes: list[str],
pc_index: PostcodeSpatialIndex,
@ -446,6 +524,12 @@ def _scrape_zoopla(
errors: list[str],
max_properties_per_source: int | None,
) -> None:
if ZOOPLA_FETCHER == "flaresolverr":
_scrape_zoopla_flaresolverr(
outcodes, pc_index, pc_coords, results, errors, max_properties_per_source
)
return
try:
browser, page = _launch_zoopla_with_retries()
except Exception as exc:
@ -454,6 +538,12 @@ def _scrape_zoopla(
return
outcode_timeout = _zoopla_outcode_timeout_seconds()
detail_cap = _zoopla_detail_cap()
detail_page = _open_zoopla_detail_tab(page, detail_cap)
# Spend at most a fraction of each outcode's budget on detail fetches so the
# SIGALRM guard never trips mid-outcode and discards already-collected
# search listings; the rest is left for search pagination and transform.
detail_budget_seconds = max(10.0, outcode_timeout * ZOOPLA_DETAIL_BUDGET_FRACTION)
try:
for outcode in outcodes:
@ -470,6 +560,9 @@ def _scrape_zoopla(
pc_index,
pc_coords,
max_properties=None,
detail_page=detail_page,
detail_cap=detail_cap,
detail_budget_seconds=detail_budget_seconds,
)
added = _store_properties(
results,
@ -496,6 +589,8 @@ def _scrape_zoopla(
_close_zoopla_browser(browser, f"zoopla {outcode}")
try:
browser, page = _launch_zoopla_with_retries()
# The old context (and its detail tab) is gone; reopen one.
detail_page = _open_zoopla_detail_tab(page, detail_cap)
log.info("Zoopla %s retrying with fresh browser", outcode)
except Exception as relaunch_exc:
_record_error(errors, "zoopla", outcode, relaunch_exc)
@ -503,6 +598,11 @@ def _scrape_zoopla(
time.sleep(DELAY_BETWEEN_OUTCODES)
finally:
if detail_page is not None:
try:
detail_page.close()
except Exception:
pass
_close_zoopla_browser(browser, "zoopla final")

View file

@ -126,6 +126,14 @@ def write_parquet(properties: list[dict], path: Path) -> None:
"Address per Property Register": [
p["Address per Property Register"] for p in properties
],
# UPRN (when the scraper recovered it) keys an exact listing->EPC
# join; Property number or name is the house identifier for the
# Price-Paid address join. Both are None for sources/listings without
# a detail-page fetch.
"UPRN": [p.get("UPRN") for p in properties],
"Property number or name": [
p.get("Property number or name") for p in properties
],
"Leasehold/Freehold": [p["Leasehold/Freehold"] for p in properties],
"Property type": [p["Property type"] for p in properties],
"Property sub-type": [p["Property sub-type"] for p in properties],
@ -149,6 +157,8 @@ def write_parquet(properties: list[dict], path: Path) -> None:
"Inferred postcode": pl.Utf8,
"Listing raw address": pl.Utf8,
"Address per Property Register": pl.Utf8,
"UPRN": pl.Utf8,
"Property number or name": pl.Utf8,
"Leasehold/Freehold": pl.Utf8,
"Property type": pl.Utf8,
"Property sub-type": pl.Utf8,

206
finder/test_onthemarket.py Normal file
View file

@ -0,0 +1,206 @@
"""Tests for the OnTheMarket scraper's detail-page postcode recovery.
`parse_detail_postcode` is pure (takes the detail-page HTML, returns a postcode
or None), so these tests use a trimmed but faithful copy of a real OnTheMarket
detail page's `__NEXT_DATA__` payload. The fixture mirrors the live structure:
the property's own postcode lives in the analytics dataLayer
(`props.initialReduxState.metadata.dataLayer.postcode`) while the agent's office
postcode sits separately under `property.agent.postcode` — the trap we must not
fall into.
"""
import json
import onthemarket
from onthemarket import parse_detail_postcode, transform_property
class _StubIndex:
"""Minimal stand-in for PostcodeSpatialIndex returning a fixed postcode."""
def __init__(self, postcode: str | None):
self._postcode = postcode
def nearest(self, lat: float, lng: float) -> str | None:
return self._postcode
def _detail_html(
*,
property_id: int = 19522441,
datalayer_postcode: str = "SE5 9AA",
agent_postcode: str = "SE5 8RS",
) -> str:
"""Build detail-page HTML with a real-shaped __NEXT_DATA__ payload."""
next_data = {
"props": {
"initialReduxState": {
"metadata": {
"dataLayer": {
"page-type": "details-section",
"property-type": "homes",
# The property's own unit postcode.
"postcode": datalayer_postcode,
"property-id": property_id,
"price": "275,000",
"addressline_2": "Padfield Road",
}
},
"property": {
"displayAddress": "Padfield Road, London, SE5",
"location": {"lon": -0.100233, "lat": 51.466129},
# The agent block carries the AGENT'S office postcode — the
# trap. parse_detail_postcode must not return this.
"agent": {
"address": "29 Denmark Hill, Camberwell\nLondon\nSE5 8RS",
"postcode": agent_postcode,
},
},
}
}
}
payload = json.dumps(next_data)
return (
"<html><body>"
'<script id="__NEXT_DATA__" type="application/json">'
f"{payload}"
"</script></body></html>"
)
# ---------------------------------------------------------------------------
# parse_detail_postcode
# ---------------------------------------------------------------------------
def test_parse_returns_property_postcode_not_agent():
html = _detail_html(datalayer_postcode="SE5 9AA", agent_postcode="SE5 8RS")
assert parse_detail_postcode(html, "19522441") == "SE5 9AA"
def test_parse_normalizes_spacing():
html = _detail_html(datalayer_postcode="se59aa")
assert parse_detail_postcode(html, "19522441") == "SE5 9AA"
def test_parse_ignores_mismatched_property_id():
# dataLayer postcode belongs to property 19522441; asking for a different
# listing id must refuse to return it.
html = _detail_html(property_id=19522441)
assert parse_detail_postcode(html, "99999999") is None
def test_parse_accepts_when_no_listing_id_given():
html = _detail_html(datalayer_postcode="SE5 9AA")
assert parse_detail_postcode(html, None) == "SE5 9AA"
def test_parse_handles_missing_postcode():
html = _detail_html(datalayer_postcode="")
assert parse_detail_postcode(html, "19522441") is None
def test_parse_handles_no_next_data():
assert parse_detail_postcode("<html><body>no script here</body></html>", "1") is None
def test_parse_handles_empty_html():
assert parse_detail_postcode("", "1") is None
def test_parse_handles_malformed_json():
html = (
'<script id="__NEXT_DATA__" type="application/json">{not json}</script>'
)
assert parse_detail_postcode(html, "1") is None
def test_parse_handles_missing_datalayer():
next_data = {"props": {"initialReduxState": {"metadata": {}}}}
html = (
'<script id="__NEXT_DATA__" type="application/json">'
f"{json.dumps(next_data)}</script>"
)
assert parse_detail_postcode(html, "1") is None
# ---------------------------------------------------------------------------
# transform_property — detail postcode wiring + trust rule
# ---------------------------------------------------------------------------
_RAW_LISTING = {
"id": "19522441",
"address": "Padfield Road, London, SE5",
"location": {"lon": -0.100233, "lat": 51.466129},
"bedrooms": 2,
"bathrooms": 1,
"price": "£275,000",
"humanised-property-type": "Apartment",
"features": ["Tenure: Leasehold (99 years remaining)"],
"details-url": "/details/19522441/",
}
def test_transform_uses_trusted_detail_postcode():
# Detail postcode SE5 9AA, coordinate-nearest SE5 1AA: same outcode -> trust
# the (more precise) detail postcode and label it detail_address.
index = _StubIndex("SE5 1AA")
out = transform_property(_RAW_LISTING, index, detail_postcode="SE5 9AA")
assert out is not None
assert out["Postcode"] == "SE5 9AA"
assert out["Postcode source"] == "detail_address"
def test_transform_rejects_detail_postcode_on_outcode_mismatch():
# Detail postcode SW9 6BZ but coordinate-nearest is SE5 1AA: different
# outcode -> reject the detail postcode, fall back to coordinate logic.
index = _StubIndex("SE5 1AA")
out = transform_property(_RAW_LISTING, index, detail_postcode="SW9 6BZ")
assert out is not None
assert out["Postcode"] == "SE5 1AA"
assert out["Postcode source"] == "coordinates"
def test_transform_without_detail_postcode_uses_coordinates():
index = _StubIndex("SE5 1AA")
out = transform_property(_RAW_LISTING, index, detail_postcode=None)
assert out is not None
assert out["Postcode"] == "SE5 1AA"
assert out["Postcode source"] == "coordinates"
# No UPRN / house number is recoverable from OnTheMarket.
assert out["UPRN"] is None
assert out["Property number or name"] is None
def test_transform_detail_postcode_via_search_address_outcode():
    """The card-address postcode path is unaffected when no detail postcode exists."""
    # When the card address already carries a full postcode that agrees with the
    # coordinates, the existing "address" source still wins absent a detail
    # postcode — detail recovery never regresses that path.
    raw = dict(_RAW_LISTING, address="Padfield Road, London, SE5 1AA")
    index = _StubIndex("SE5 1AA")
    out = transform_property(raw, index, detail_postcode=None)
    # Guard like the sibling tests do, so a dropped listing fails with a clear
    # assertion rather than a TypeError from subscripting None.
    assert out is not None
    assert out["Postcode"] == "SE5 1AA"
    assert out["Postcode source"] == "address"
# ---------------------------------------------------------------------------
# _fetch_detail_postcode caching (no real network)
# ---------------------------------------------------------------------------
def test_fetch_detail_postcode_is_cached(monkeypatch):
    """A listing id already in the cache is served without any HTTP call."""
    # setitem is rolled back by monkeypatch at teardown even if an assertion
    # below fails, so the module-level cache never leaks into other tests.
    monkeypatch.setitem(onthemarket._detail_postcode_cache, "19522441", "SE5 9AA")

    def _boom(*args, **kwargs):  # pragma: no cover - must never be called
        raise AssertionError("network was hit despite a cached value")

    # Any client use would explode; the cache hit must short-circuit first.
    client = type("C", (), {"get": _boom})()
    result = onthemarket._fetch_detail_postcode(
        client=client,
        details_url="/details/19522441/",
        listing_id="19522441",
    )
    assert result == "SE5 9AA"

113
finder/test_rightmove.py Normal file
View file

@ -0,0 +1,113 @@
"""Tests for the Rightmove detail-page postcode extractor.
The search API only returns an outcode-level ``displayAddress``; the property's
TRUE full postcode lives on its detail page inside ``window.__PAGE_MODEL`` as
``propertyData.address.{outcode, incode}``. ``parse_detail_postcode`` recovers
it. These tests build a faithful __PAGE_MODEL: a devalue-style flattened object
graph whose ``data`` field is a JSON STRING of a flat array where every integer
inside a container is an index reference into that same array.
"""
import json
from rightmove import _extract_page_model_literal, parse_detail_postcode
def _page_model_html(flat: list, *, encoding: str = "json") -> str:
"""Wrap a flattened object-graph array in a realistic detail-page <script>.
Mirrors the live page: ``window.__PAGE_MODEL = {"data": "<json array>"}``
where the array is itself JSON-encoded (so its quotes arrive escaped)."""
outer = {"data": json.dumps(flat, separators=(",", ":")), "encoding": encoding}
return (
"<html><head></head><body>\n"
"<script>\n"
" window.__PAGE_MODEL = " + json.dumps(outer, separators=(",", ":")) + ";\n"
"</script>\n"
"</body></html>"
)
# A faithful slice of a real listing: root -> propertyData -> address, with a
# decoy nearestStations array (which carries NO postcodes on the live page) to
# prove the parser anchors on the property's own address, not a nearby POI.
_FLAT_SW9 = [
{"propertyData": 1}, # 0: root
{
"id": "89089584",
"address": 2,
"location": 4,
"nearestStations": 6,
}, # 1: propertyData
{
"displayAddress": "Caldwell Street, Stockwell",
"countryCode": "GB",
"ukCountry": "England",
"outcode": "SW9",
"incode": "0HD",
}, # 2: address
None, # 3: filler
{
"latitude": 51.477238,
"longitude": -0.116819,
"pinType": "ACCURATE_POINT",
}, # 4: location
None, # 5: filler
[7, 8], # 6: nearestStations (references)
{"name": "Oval Station", "distance": 0.36}, # 7: station, no postcode
{"name": "Stockwell Station", "distance": 0.41}, # 8: station, no postcode
]
def test_parses_full_postcode_from_outcode_and_incode() -> None:
html = _page_model_html(_FLAT_SW9)
assert parse_detail_postcode(html) == "SW9 0HD"
def test_extract_page_model_literal_brace_matches_nested_object() -> None:
# The literal must include the whole nested object, not stop at the first
# closing brace inside the escaped data string.
html = _page_model_html(_FLAT_SW9)
literal = _extract_page_model_literal(html)
assert literal is not None
assert literal.startswith("{") and literal.endswith("}")
# Round-trips back to a dict with the expected top-level keys.
assert set(json.loads(literal)) == {"data", "encoding"}
def test_normalises_unspaced_incode() -> None:
flat = [dict(node) if isinstance(node, dict) else node for node in _FLAT_SW9]
flat[2] = {**_FLAT_SW9[2], "outcode": "e20", "incode": "1fh"}
assert parse_detail_postcode(_page_model_html(flat)) == "E20 1FH"
def test_returns_none_when_address_missing() -> None:
# The location wrapper can be empty/absent on some listings; the caller then
# keeps the coordinate fallback, so we must return None (not raise).
flat = [
{"propertyData": 1},
{"id": "1", "location": 2},
{"latitude": 51.5, "longitude": -0.1},
]
assert parse_detail_postcode(_page_model_html(flat)) is None
def test_returns_none_when_incode_blank() -> None:
flat = [dict(node) if isinstance(node, dict) else node for node in _FLAT_SW9]
flat[2] = {**_FLAT_SW9[2], "incode": ""}
assert parse_detail_postcode(_page_model_html(flat)) is None
def test_returns_none_for_non_postcode_pair() -> None:
# A structurally-invalid outcode/incode pair is rejected by the validator.
flat = [dict(node) if isinstance(node, dict) else node for node in _FLAT_SW9]
flat[2] = {**_FLAT_SW9[2], "outcode": "NOTAPC", "incode": "ZZ"}
assert parse_detail_postcode(_page_model_html(flat)) is None
def test_returns_none_without_page_model() -> None:
assert parse_detail_postcode("") is None
assert parse_detail_postcode("<html><body>no model</body></html>") is None
# Malformed JSON in the data field degrades gracefully.
broken = '<script>window.__PAGE_MODEL = {"data":"[not json"};</script>'
assert parse_detail_postcode(broken) is None

View file

@ -1,13 +1,19 @@
from transform import (
build_register_address,
clean_listing_address,
extract_full_postcode,
extract_outcode,
resolve_listing_postcode,
transform_property,
)
class StubPostcodeIndex:
    """Spatial-index stub whose nearest-lookup returns a configurable postcode."""

    def __init__(self, postcode: str = "SW1A 9ZZ") -> None:
        self._postcode = postcode

    def nearest(self, lat: float, lng: float) -> str:
        """Ignore the coordinates and return the postcode given at construction."""
        # The stale hard-coded `return "SW1A 9ZZ"` is gone: it shadowed this
        # line, so the constructor argument was silently ignored.
        return self._postcode
def test_extract_full_postcode_normalizes_spacing() -> None:
@ -24,6 +30,46 @@ def test_clean_listing_address_removes_postcode_and_outcode_suffixes() -> None:
assert clean_listing_address("Kings Avenue, Bromley") == "Kings Avenue, Bromley"
def test_build_register_address_prepends_house_number_or_name() -> None:
# House number/name prepended, with the trailing outcode/postcode stripped.
assert (
build_register_address("South Street, Bromley BR1", "12")
== "12, South Street, Bromley"
)
assert (
build_register_address("Riverside, Martham NR29", "Martham Mill")
== "Martham Mill, Riverside, Martham"
)
# No number/name -> identical to the plain cleaned address.
assert build_register_address("Kings Avenue, Bromley", None) == "Kings Avenue, Bromley"
# Already starts with the number/name -> no duplication.
assert (
build_register_address("12 South Street, Bromley", "12")
== "12 South Street, Bromley"
)
# Empty/whitespace number/name is ignored.
assert build_register_address("Kings Avenue, Bromley", " ") == "Kings Avenue, Bromley"
def test_extract_outcode() -> None:
assert extract_outcode("SW1A 2AA") == "SW1A"
assert extract_outcode("n4 2ha") == "N4"
assert extract_outcode("SW1A2AA") == "SW1A"
assert extract_outcode(None) is None
assert extract_outcode("") is None
def test_resolve_listing_postcode() -> None:
# Outcode matches -> trust the more precise extracted postcode.
assert resolve_listing_postcode("SW1A 2AA", "SW1A 9ZZ") == ("SW1A 2AA", "address")
# Outcode mismatch -> fall back to the spatially-correct inferred postcode.
assert resolve_listing_postcode("E14 9SS", "SW1A 9ZZ") == ("SW1A 9ZZ", "coordinates")
# Well-formed but fabricated postcode in a different outcode is rejected.
assert resolve_listing_postcode("ZZ9 9ZZ", "SW1A 9ZZ") == ("SW1A 9ZZ", "coordinates")
# No extracted postcode -> inferred is authoritative.
assert resolve_listing_postcode(None, "SW1A 9ZZ") == ("SW1A 9ZZ", "coordinates")
def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
prop = {
"id": "123",
@ -46,3 +92,84 @@ def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
assert result["Inferred postcode"] == "SW1A 9ZZ"
assert result["Listing raw address"] == "Flat 2, 10 Downing Street, SW1A 2AA"
assert result["Address per Property Register"] == "Flat 2, 10 Downing Street"
def test_rightmove_transform_rejects_postcode_from_wrong_outcode() -> None:
prop = {
"id": "124",
"location": {"latitude": 51.5, "longitude": -0.1},
"price": {"amount": 750000, "displayPrices": []},
"propertySubType": "Terraced",
"bedrooms": 3,
"bathrooms": 1,
"keyFeatures": [],
"propertyUrl": "/properties/124",
# Address postcode is in a different outcode than the coordinate-nearest one.
"displayAddress": "10 Downing Street, E14 9SS",
}
result = transform_property(prop, "SW1A", StubPostcodeIndex())
assert result is not None
# The spatially-correct inferred postcode wins over the mismatching extracted one.
assert result["Postcode"] == "SW1A 9ZZ"
assert result["Postcode source"] == "coordinates"
assert result["Extracted postcode"] == "E14 9SS"
def _rightmove_prop() -> dict:
return {
"id": "200",
"location": {"latitude": 51.5, "longitude": -0.1},
"price": {"amount": 750000, "displayPrices": []},
"propertySubType": "Terraced",
"bedrooms": 3,
"bathrooms": 1,
"keyFeatures": [],
"propertyUrl": "/properties/200",
# Search API only ever exposes the outcode in the display address.
"displayAddress": "Caldwell Street, Stockwell, SW9",
}
def test_rightmove_transform_prefers_detail_postcode() -> None:
# The detail page's true full postcode (same outcode as the location) is
# preferred over the coordinate-nearest guess.
result = transform_property(
_rightmove_prop(),
"SW9",
StubPostcodeIndex("SW9 7AA"),
detail_postcode="SW9 0HD",
)
assert result is not None
assert result["Postcode"] == "SW9 0HD"
assert result["Postcode source"] == "detail_address"
# The coordinate inference is still surfaced separately.
assert result["Inferred postcode"] == "SW9 7AA"
def test_rightmove_transform_rejects_detail_postcode_from_wrong_outcode() -> None:
# A detail postcode whose outcode disagrees with the location must not
# relocate the listing; the coordinate postcode wins instead.
result = transform_property(
_rightmove_prop(),
"SW9",
StubPostcodeIndex("SW9 7AA"),
detail_postcode="E14 9SS",
)
assert result is not None
assert result["Postcode"] == "SW9 7AA"
assert result["Postcode source"] == "coordinates"
def test_rightmove_transform_without_detail_keeps_coordinate_logic() -> None:
# No detail postcode -> behaviour is unchanged (coordinate-nearest).
result = transform_property(
_rightmove_prop(), "SW9", StubPostcodeIndex("SW9 7AA")
)
assert result is not None
assert result["Postcode"] == "SW9 7AA"
assert result["Postcode source"] == "coordinates"

288
finder/test_zoopla.py Normal file
View file

@ -0,0 +1,288 @@
from zoopla import _detail_cache_key, parse_detail_geo, transform_property
def test_detail_cache_key_uses_listing_id() -> None:
assert _detail_cache_key("/for-sale/details/59888978/") == "59888978"
assert _detail_cache_key("https://www.zoopla.co.uk/for-sale/details/59888978/") == "59888978"
# No id in the URL -> fall back to the URL itself as the key.
assert _detail_cache_key("/for-sale/property/br1/") == "/for-sale/property/br1/"
class StubPostcodeIndex:
    """Stand-in spatial index: every nearest() lookup yields one fixed postcode."""

    def __init__(self, postcode: str = "BR1 2AB") -> None:
        # The canned answer returned regardless of the queried coordinates.
        self._fixed = postcode

    def nearest(self, lat: float, lng: float) -> str:
        """Return the fixed postcode; ``lat``/``lng`` are accepted but unused."""
        return self._fixed
# London-ish postcodes with coordinates, plus the Norfolk sample used by the
# verified detail-page snippet (well inside the England bounds check).
PC_COORDS = {
"BR1 2AB": (51.40, 0.01),
"SW1A 1AA": (51.50, -0.14),
"NR29 4RG": (52.716014, 1.614495),
}
# Verified RSC `location` object (listing 59888978), as it appears escaped inside
# a self.__next_f flight chunk in page.content().
_LOCATION_ESCAPED = (
'<script>self.__next_f.push([1,"...'
'\\"location\\":{\\"outcode\\":\\"NR29\\",'
'\\"coordinates\\":{\\"latitude\\":52.716014,\\"longitude\\":1.614495},'
'\\"uprn\\":\\"10023461458\\",\\"postalCode\\":\\"NR29 4RG\\",'
'\\"propertyNumberOrName\\":\\"Martham Mill\\"}'
'..."])</script>'
)
def test_parse_detail_geo_location_object_escaped() -> None:
geo = parse_detail_geo(_LOCATION_ESCAPED, search_outcode="NR29")
assert geo == {
"lat": 52.716014,
"lng": 1.614495,
"postcode": "NR29 4RG",
"outcode": "NR29",
"source": "detail_location",
"uprn": "10023461458",
"number_or_name": "Martham Mill",
# No `address` twin in this snippet, so there is no full street address.
"full_address": None,
}
def test_parse_detail_geo_location_object_unescaped() -> None:
html = (
'"location":{"outcode":"NR29",'
'"coordinates":{"latitude":52.716014,"longitude":1.614495},'
'"uprn":"10023461458","postalCode":"NR29 4RG"}'
)
geo = parse_detail_geo(html)
assert geo is not None
assert geo["source"] == "detail_location"
assert geo["postcode"] == "NR29 4RG"
def test_parse_detail_geo_address_twin() -> None:
html = (
'"address":{"fullAddress":"Riverside, Martham NR29",'
'"latitude":52.716014,"longitude":1.614495,'
'"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}'
)
geo = parse_detail_geo(html)
assert geo is not None
assert geo["source"] == "detail_address_obj"
assert (geo["lat"], geo["lng"], geo["postcode"]) == (52.716014, 1.614495, "NR29 4RG")
assert geo["uprn"] == "10023461458"
assert geo["full_address"] == "Riverside, Martham NR29"
def test_parse_detail_geo_merges_location_uprn_with_address_full_address() -> None:
# Real detail pages carry both wrappers: the `location` object holds the
# uprn + house number/name, the `address` twin holds the full street
# address. They share a uprn, so the twin's fullAddress is attached.
html = (
'"location":{"outcode":"NR29",'
'"coordinates":{"latitude":52.716014,"longitude":1.614495},'
'"uprn":"10023461458","postalCode":"NR29 4RG",'
'"propertyNumberOrName":"Martham Mill"}'
'"address":{"fullAddress":"Riverside, Martham NR29",'
'"latitude":52.716014,"longitude":1.614495,'
'"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}'
)
geo = parse_detail_geo(html)
assert geo is not None
assert geo["source"] == "detail_location"
assert geo["uprn"] == "10023461458"
assert geo["number_or_name"] == "Martham Mill"
assert geo["full_address"] == "Riverside, Martham NR29"
def test_parse_detail_geo_does_not_borrow_comparable_full_address() -> None:
# The only `address` twin on the page belongs to a different uprn (a
# comparable listing). With a uprn to match on, an unrelated twin is never
# borrowed — full_address stays None rather than grabbing the wrong street.
html = (
'"location":{"outcode":"NR29",'
'"coordinates":{"latitude":52.716014,"longitude":1.614495},'
'"uprn":"10023461458","postalCode":"NR29 4RG"}'
'"address":{"fullAddress":"Some Comparable, Elsewhere EN2",'
'"latitude":51.65,"longitude":-0.08,"uprn":"99999999"}'
)
geo = parse_detail_geo(html)
assert geo is not None
assert geo["uprn"] == "10023461458"
assert geo["full_address"] is None
def test_parse_detail_geo_ignores_poi_coordinates() -> None:
# A charger POI (its coordinates NOT wrapped in a "location" object) followed
# by the property's own "location" wrapper. Anchoring on the wrapper means
# the POI's coordinates are ignored and the property's are returned.
poi = (
'"name":"Martham Community Centre","numberOfConnectors":2,'
'"postcode":"NR29 4SN","coordinates":{"latitude":52.699379,"longitude":1.62921}'
)
prop = (
'"location":{"outcode":"NR29",'
'"coordinates":{"latitude":52.716014,"longitude":1.614495},'
'"uprn":"10023461458","postalCode":"NR29 4RG"}'
)
geo = parse_detail_geo(poi + prop)
assert geo is not None
assert geo["source"] == "detail_location"
# The property's coords win, not the community centre's.
assert (geo["lat"], geo["lng"]) == (52.716014, 1.614495)
assert geo["postcode"] == "NR29 4RG"
def test_parse_detail_geo_prefers_location_matching_search_outcode() -> None:
# Page embeds two location objects (e.g. a comparable then the property).
# With a search outcode, the one in that outcode is preferred; without one,
# the first (document order = primary listing) is returned.
comparable = (
'"location":{"outcode":"EN2",'
'"coordinates":{"latitude":51.65,"longitude":-0.08},'
'"postalCode":"EN2 6SN"}'
)
target = (
'"location":{"outcode":"NR29",'
'"coordinates":{"latitude":52.716014,"longitude":1.614495},'
'"postalCode":"NR29 4RG"}'
)
geo = parse_detail_geo(comparable + target, search_outcode="NR29")
assert geo is not None and geo["postcode"] == "NR29 4RG"
geo_first = parse_detail_geo(comparable + target)
assert geo_first is not None and geo_first["postcode"] == "EN2 6SN"
def test_parse_detail_geo_rejects_out_of_england() -> None:
html = (
'"location":{"outcode":"NR29",'
'"coordinates":{"latitude":10.0,"longitude":10.0},'
'"uprn":"1","postalCode":"NR29 4RG"}'
)
assert parse_detail_geo(html) is None
def test_parse_detail_geo_drops_inconsistent_postcode() -> None:
    """A postalCode that contradicts its own object's outcode is discarded."""
    # The postalCode's outcode (AB12) disagrees with the object's outcode
    # (NR29): the coordinates are still usable, the postcode is not.
    contradictory_location = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"1","postalCode":"AB12 3CD"}'
    )

    parsed = parse_detail_geo(contradictory_location)

    assert parsed is not None
    assert parsed["lat"] == 52.716014
    assert parsed["postcode"] is None
def test_parse_detail_geo_returns_none_for_garbage() -> None:
    """Inputs without a property location/address wrapper parse to None."""
    unusable_inputs = (
        "<html><body>no data here</body></html>",
        "",
        # Coordinates that are not inside a property location/address wrapper
        # (e.g. only an unwrapped POI) yield nothing, so the caller degrades
        # safely to the outcode.
        '"name":"X","coordinates":{"latitude":51.5,"longitude":-0.1}',
    )
    for junk in unusable_inputs:
        assert parse_detail_geo(junk) is None
def _raw(**overrides) -> dict:
    """Build a baseline raw listing dict; keyword arguments override or extend its fields."""
    return {
        "id": "123",
        "url": "/for-sale/details/123/",
        "address": "South Street, Bromley BR1",
        "price": 500000,
        "beds": 2,
        "baths": 1,
        "property_type": "Flat",
        **overrides,
    }
def test_transform_uses_detail_coordinates_with_agreeing_postcode() -> None:
    """A detail postcode in the same outcode as the nearest postcode is trusted."""
    detail_geo = {"lat": 51.401, "lng": 0.011, "postcode": "BR1 3CD", "outcode": "BR1"}
    nearest_index = StubPostcodeIndex("BR1 2AB")

    row = transform_property(
        _raw(), nearest_index, PC_COORDS, search_outcode="BR1", detail=detail_geo
    )

    assert row is not None
    # The extracted detail postcode agrees with the coordinate-nearest outcode,
    # so it becomes the listing postcode; the nearest one is kept as inferred.
    assert row["Postcode"] == "BR1 3CD"
    assert row["Postcode source"] == "detail_address"
    assert row["Inferred postcode"] == "BR1 2AB"
    assert row["lat"] == 51.401
    assert row["lon"] == 0.011
def test_transform_uses_nearest_when_detail_postcode_mismatches() -> None:
    """A detail postcode in a different outcode loses to the coordinate-nearest one."""
    detail_geo = {"lat": 51.401, "lng": 0.011, "postcode": "E14 9SS", "outcode": "E14"}
    nearest_index = StubPostcodeIndex("BR1 2AB")

    row = transform_property(
        _raw(), nearest_index, PC_COORDS, search_outcode="BR1", detail=detail_geo
    )

    assert row is not None
    # The mismatching detail postcode is rejected in favour of the spatial value.
    assert row["Postcode"] == "BR1 2AB"
    assert row["Postcode source"] == "detail_coordinates"
def test_transform_geocodes_detail_postcode_without_coordinates() -> None:
    """Without detail coordinates, the detail postcode itself is geocoded."""
    detail_geo = {"lat": None, "lng": None, "postcode": "SW1A 1AA", "outcode": "SW1A"}

    row = transform_property(
        _raw(), StubPostcodeIndex(), PC_COORDS, search_outcode="BR1", detail=detail_geo
    )

    assert row is not None
    assert row["Postcode"] == "SW1A 1AA"
    assert row["Postcode source"] == "detail_address"
    # Coordinates come from the postcode lookup table.
    assert (row["lat"], row["lon"]) == PC_COORDS["SW1A 1AA"]
def test_transform_without_detail_falls_back_to_search_outcode() -> None:
    """With no detail and no outcode in the address, the searched outcode is used."""
    listing = _raw(address="A street with no postcode")

    row = transform_property(
        listing,
        StubPostcodeIndex(),
        PC_COORDS,
        search_outcode="BR1",
        detail=None,
    )

    assert row is not None
    # Coarse resolution: the search outcode's representative postcode.
    assert row["Postcode"] == "BR1 2AB"
    assert row["Postcode source"] == "search_outcode"
    # No detail page means no UPRN and no house number/name.
    assert row["UPRN"] is None
    assert row["Property number or name"] is None
def test_transform_emits_uprn_and_house_numbered_address_from_detail() -> None:
    """UPRN, house number and full address from the detail page reach the output row."""
    detail_geo = {
        "lat": 51.401,
        "lng": 0.011,
        "postcode": "BR1 3CD",
        "outcode": "BR1",
        "uprn": "100023461458",
        "number_or_name": "12",
        "full_address": "South Street, Bromley BR1",
    }
    nearest_index = StubPostcodeIndex("BR1 2AB")

    row = transform_property(
        _raw(), nearest_index, PC_COORDS, search_outcode="BR1", detail=detail_geo
    )

    assert row is not None
    assert row["UPRN"] == "100023461458"
    assert row["Property number or name"] == "12"
    # The detail full address replaces the outcode-level card address, and the
    # house number is prepended for a near-exact Property Register match.
    assert row["Listing raw address"] == "South Street, Bromley BR1"
    assert row["Address per Property Register"] == "12, South Street, Bromley"
def test_transform_ignores_out_of_england_detail_coords() -> None:
    """Detail coordinates outside England are discarded, not used to place the listing."""
    bogus_detail = {"lat": 10.0, "lng": 10.0, "postcode": "ZZ9 9ZZ", "outcode": "ZZ9"}
    nearest_index = StubPostcodeIndex("BR1 2AB")

    row = transform_property(
        _raw(), nearest_index, PC_COORDS, search_outcode="BR1", detail=bogus_detail
    )

    assert row is not None
    # The bad detail coordinates are dropped, so resolution falls through to the
    # outcode found in the card address (BR1).
    assert row["Postcode source"] == "address_outcode"
    assert 49 <= row["lat"] <= 56

View file

@ -205,6 +205,41 @@ def extract_full_postcode(text: str | None) -> str | None:
return normalize_postcode(match.group(1))
def extract_outcode(postcode: str | None) -> str | None:
    """Return the outward code (district) of a UK postcode, e.g. 'SW1A 1AA' -> 'SW1A'.

    The postcode is normalised first and everything before the first space is
    returned. ``None`` is returned for falsy input or when that leading part
    is empty.
    """
    if not postcode:
        return None
    outward, _, _ = normalize_postcode(postcode).partition(" ")
    return outward if outward else None
def resolve_listing_postcode(
    extracted_postcode: str | None, inferred_postcode: str
) -> tuple[str, str]:
    """Choose a listing's authoritative postcode and report where it came from.

    An address-extracted postcode is finer-grained than the coordinate-nearest
    one, but it can be stale, mistyped or well-formed yet fabricated (e.g.
    'ZZ9 9ZZ'), and trusting it blindly would silently override the
    spatially-correct value. The spatial index offers only nearest-lookup, so
    agreement is judged at outcode level: the extracted postcode is accepted
    only when its outcode equals the inferred postcode's outcode.

    Returns ``(extracted_postcode, "address")`` on agreement, otherwise
    ``(inferred_postcode, "coordinates")``.
    """
    fallback = (inferred_postcode, "coordinates")

    # Nothing was extracted, so there is nothing to validate.
    if not extracted_postcode:
        return fallback

    same_district = extract_outcode(extracted_postcode) == extract_outcode(
        inferred_postcode
    )
    if same_district:
        return extracted_postcode, "address"

    log.debug(
        "Rejecting extracted postcode %s (outcode mismatch with inferred %s)",
        extracted_postcode,
        inferred_postcode,
    )
    return fallback
def clean_listing_address(address: str | None) -> str:
"""Remove postcode/outcode suffixes from listing display addresses.
@ -222,10 +257,48 @@ def clean_listing_address(address: str | None) -> str:
return cleaned.strip(" ,")
def build_register_address(
    raw_address: str | None, number_or_name: str | None = None
) -> str:
    """Build a Property Register-style address, prepending the house number/name.

    Listing display addresses are usually street-level ("South Street, Bromley")
    because the portals hide the exact unit. When a scraper can recover the
    property's own number or name (e.g. Zoopla detail pages expose
    ``propertyNumberOrName`` = "12" or "Martham Mill"), it is prepended so the
    address carries the house identifier that the EPC/Price-Paid register
    addresses also use, turning a fuzzy street match into a near-exact one.

    Args:
        raw_address: The listing's display address; it is passed through
            ``clean_listing_address`` before use.
        number_or_name: The house number or name, if known. ``None``, empty
            and whitespace-only values are treated as absent.

    Returns:
        ``"<number_or_name>, <cleaned address>"`` when an identifier is known
        and the cleaned address does not already start with it; the cleaned
        address alone when no identifier is available or it is already
        present; the identifier alone when the cleaned address is empty.
    """
    cleaned = clean_listing_address(raw_address)
    identifier = (number_or_name or "").strip()
    if not identifier:
        return cleaned
    if not cleaned:
        return identifier

    # Avoid duplicating a number/name the display address already starts with.
    # The match must end on a token boundary: a bare prefix test would treat
    # "12" as already present in "120 High Street" (or "Rose" in "Rosemary
    # Lane") and silently drop the real house identifier.
    lowered_address = cleaned.lower()
    lowered_identifier = identifier.lower()
    if lowered_address.startswith(lowered_identifier):
        next_char = lowered_address[len(lowered_identifier):len(lowered_identifier) + 1]
        if not next_char.isalnum():  # "" (end of string) also counts as a boundary
            return cleaned

    return f"{identifier}, {cleaned}"
def transform_property(
prop: dict, outcode: str, pc_index: PostcodeSpatialIndex
prop: dict,
outcode: str,
pc_index: PostcodeSpatialIndex,
detail_postcode: str | None = None,
) -> dict | None:
"""Transform a raw Rightmove property dict into our output schema."""
"""Transform a raw Rightmove property dict into our output schema.
``detail_postcode`` is the property's TRUE full postcode recovered from its
detail page (see ``rightmove.parse_detail_postcode``); the search API itself
only exposes the outcode-level ``displayAddress``. When supplied and it
agrees with the coordinate-nearest postcode's outcode, it is preferred over
the coordinate guess and recorded with source ``"detail_address"``. A
detail postcode whose outcode disagrees with the location is discarded in
favour of the spatially-correct coordinate postcode, so a stale or wrong
detail value can never silently relocate a listing.
"""
loc = prop.get("location")
if not loc:
return None
@ -268,8 +341,25 @@ def transform_property(
return None
raw_address = prop.get("displayAddress", "") or ""
extracted_postcode = extract_full_postcode(raw_address)
postcode = extracted_postcode or inferred_postcode
postcode_source = "address" if extracted_postcode else "coordinates"
# Prefer the detail page's true full postcode when it agrees with the
# location; otherwise fall back to the (display-address-or-coordinate) logic.
detail_full = extract_full_postcode(detail_postcode)
if detail_full and extract_outcode(detail_full) == extract_outcode(
inferred_postcode
):
postcode, postcode_source = detail_full, "detail_address"
else:
if detail_full:
log.debug(
"Rejecting Rightmove detail postcode %s (outcode mismatch with "
"inferred %s)",
detail_full,
inferred_postcode,
)
postcode, postcode_source = resolve_listing_postcode(
extracted_postcode, inferred_postcode
)
property_url = prop.get("propertyUrl") or ""
if not isinstance(property_url, str):
@ -291,6 +381,9 @@ def transform_property(
"Inferred postcode": inferred_postcode,
"Listing raw address": raw_address,
"Address per Property Register": clean_listing_address(raw_address),
# Rightmove's displayAddress is street-level; no UPRN/house number.
"UPRN": None,
"Property number or name": None,
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
"Property type": map_property_type(sub_type),
"Property sub-type": normalize_sub_type(sub_type),

View file

@ -32,16 +32,24 @@ import httpx
from constants import (
DATA_DIR,
DELAY_BETWEEN_PAGES,
GLUETUN_API_KEY,
GLUETUN_CONTROL_URL,
GLUETUN_MAX_ROTATIONS,
GLUETUN_PROXY,
MAX_BEDROOMS,
PROPERTY_TYPE_MAP,
ZOOPLA_BASE,
ZOOPLA_DETAIL_GOTO_TIMEOUT_MS,
)
from spatial import PostcodeSpatialIndex
from transform import (
clean_listing_address,
build_register_address,
extract_full_postcode,
extract_outcode,
fix_coords,
normalize_sub_type,
parse_int_value,
resolve_listing_postcode,
validate_floor_area,
)
@ -468,27 +476,20 @@ def _challenge_timeout_seconds() -> int:
# cookies (bound to the previous IP), then reload and re-check the challenge.
# SECURITY(review): hard-coded control-server API key committed to source
# control. Credentials belong in configuration (this module imports
# GLUETUN_API_KEY from constants); treat this value as leaked and rotate it.
_GLUETUN_API_KEY = "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K"
def _gluetun_base_url() -> str:
return os.environ.get("GLUETUN_URL", "http://gluetun:8000").rstrip("/")
return GLUETUN_CONTROL_URL.rstrip("/")
def _gluetun_api_key() -> str | None:
return _GLUETUN_API_KEY
return GLUETUN_API_KEY
def _gluetun_max_rotations() -> int:
raw = os.environ.get("GLUETUN_MAX_ROTATIONS", "3")
try:
value = int(raw)
except ValueError as exc:
raise ValueError("GLUETUN_MAX_ROTATIONS must be an integer") from exc
return max(value, 0)
return max(GLUETUN_MAX_ROTATIONS, 0)
def _gluetun_client() -> httpx.Client:
# Talks to the control server directly (not through the VPN proxy).
headers = {}
api_key = _gluetun_api_key()
if api_key:
@ -694,10 +695,19 @@ def launch_browser():
profile_dir.mkdir(parents=True, exist_ok=True)
_remove_stale_profile_locks(profile_dir)
# Route the browser through the Gluetun VPN proxy when configured. (geoip
# fingerprint alignment is intentionally not enabled: it needs the optional
# camoufox[geoip] extra and would spoof to the VPN exit's country, which
# fights the en-GB locale unless the exit is in the UK.)
proxy_options: dict = {}
if GLUETUN_PROXY:
proxy_options = {"proxy": {"server": GLUETUN_PROXY}}
log.info(
"Launching Camoufox browser for Zoopla (headless=%s, profile=%s)...",
"Launching Camoufox browser for Zoopla (headless=%s, profile=%s, proxy=%s)...",
headless_mode,
profile_dir,
GLUETUN_PROXY or "direct",
)
camoufox = Camoufox(
headless=headless_mode,
@ -705,6 +715,7 @@ def launch_browser():
user_data_dir=str(profile_dir),
locale=["en-GB", "en"],
enable_cache=True,
**proxy_options,
)
raw_browser = camoufox.__enter__()
browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
@ -926,13 +937,47 @@ def _paginate(
page,
total_results: int,
max_properties: int | None = None,
fetch_detail=None,
detail_cap: int = 0,
detail_state: dict | None = None,
detail_deadline: float | None = None,
) -> list[dict]:
"""Extract listings from all pages of search results.
Page 1 is already loaded. For subsequent pages, follow Zoopla's rendered
next link when present, otherwise advance via the pn=N URL parameter while
the advertised result count says more listings remain."""
the advertised result count says more listings remain.
When ``fetch_detail`` is supplied, each listing has its detail page fetched
(up to ``detail_cap`` fresh loads per outcode, counted in the shared
``detail_state`` dict, and only until ``detail_deadline``) and the parsed
geo stored under ``listing['_detail']`` for ``transform_property``. The
detail page is the only source of the listing's UPRN, full street address
and precise postcode, so it is fetched even when the search card already
pins a full postcode. Cached detail results are always attached but cost
neither a cap slot nor a delay."""
def _maybe_fetch(listing: dict) -> None:
if fetch_detail is None or detail_state is None:
return
url = listing.get("url", "")
cached = _detail_cache_key(url) in _detail_cache
if not cached:
# Fresh loads are bounded by the per-outcode cap and the wall-clock
# deadline so detail fetching never starves the SIGALRM budget that
# also guards the search pagination for this outcode.
if detail_state["fetched"] >= detail_cap:
return
if detail_deadline is not None and time.monotonic() >= detail_deadline:
return
listing["_detail"] = fetch_detail(url)
if not cached:
detail_state["fetched"] += 1
time.sleep(DELAY_BETWEEN_PAGES)
all_listings = _extract_listings(page)
for listing in all_listings:
_maybe_fetch(listing)
if max_properties is not None and len(all_listings) >= max_properties:
return all_listings[:max_properties]
@ -984,6 +1029,7 @@ def _paginate(
if listing["id"] not in seen_ids:
seen_ids.add(listing["id"])
all_listings.append(listing)
_maybe_fetch(listing)
new_count += 1
if max_properties is not None and len(all_listings) >= max_properties:
return all_listings[:max_properties]
@ -1053,6 +1099,214 @@ def _extract_outcode(text: str) -> str | None:
return None
# ---------------------------------------------------------------------------
# Detail-page geocoding
# ---------------------------------------------------------------------------
#
# Zoopla search result cards only expose an outcode-level display address (e.g.
# "South Street, Bromley BR1"); the full postcode and precise coordinates exist
# only on each listing's detail page (/for-sale/details/{id}/). The detail page
# is a Next.js App Router route whose React Server Components flight stream
# embeds the property's own location object, e.g.
# "location":{"outcode":"NR29","coordinates":{"latitude":52.716,"longitude":1.614},
# "uprn":"10023461458","postalCode":"NR29 4RG",...}
# plus a twin "address":{"fullAddress":...,"latitude":...,"longitude":...,
# "outcode":...,"postcode":...,"uprn":...} feeding the map widgets.
# Nearby points of interest (stations, schools, EV chargers) and comparable
# listings carry their own "coordinates" too, but never inside the property's
# own "location" / "address":{"fullAddress" wrapper — so the wrapper, not a
# loose coordinates object, is what we anchor on (see parse_detail_geo).
# listingId -> parsed detail dict (or None). Failures are cached too, so a
# broken listing is not re-fetched within a run (the same listing reappears
# across overlapping outcode searches).
_detail_cache: dict[str, dict | None] = {}
# Captures the numeric listing id from a ".../details/<id>" path; the trailing
# slash is optional.
_LISTING_ID_RE = re.compile(r"/details/(\d+)/?")
# The property's own location is carried by a `"location":{...}` wrapper and a
# twin `"address":{"fullAddress":...}` widget object. We anchor on those
# wrappers (and capture their full object body, which contains exactly one
# nested object — `coordinates`) rather than scanning for loose coordinate
# objects: nearby points of interest (stations/schools/EV chargers) and
# comparable/"similar" listings also embed coordinates, but never inside the
# property's own `"location"` / `"address":{"fullAddress"` wrapper, so the
# wrapper is the discriminator. Field order and an optional `uprn` are tolerated.
# NOTE: both wrapper patterns allow only ONE level of nested braces inside the
# captured body; a deeper-nested object would end the match early.
_DETAIL_LOCATION_RE = re.compile(r'"location":\{((?:[^{}]|\{[^{}]*\})*)\}')
# Group 1 is the fullAddress string, group 2 the remainder of the object body.
_DETAIL_ADDRESS_RE = re.compile(r'"address":\{"fullAddress":"([^"]*)"((?:[^{}]|\{[^{}]*\})*)\}')
# NOTE: the two coordinate patterns below are strict — latitude must directly
# precede longitude and both must be written with a decimal point, so an
# integer-valued coordinate (e.g. "longitude":0) does not match.
_DETAIL_COORDS_IN_BODY_RE = re.compile(
    r'"coordinates":\{"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)\}'
)
_DETAIL_LATLNG_IN_BODY_RE = re.compile(
    r'"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)'
)
# Only upper-case letters and digits are accepted for the outcode value.
_DETAIL_OUTCODE_IN_BODY_RE = re.compile(r'"outcode":"([A-Z0-9]+)"')
# The location object spells it "postalCode"; the address twin uses "postcode".
_DETAIL_POSTCODE_IN_BODY_RE = re.compile(r'"(?:postalCode|postcode)":"([A-Z0-9 ]+)"')
# The UPRN (Unique Property Reference Number) appears in both the location and
# address objects and is the linchpin for an exact listing->EPC join (EPC open
# data is ~99% UPRN-keyed). propertyNumberOrName carries the house number/name
# (e.g. "12", "Martham Mill") only in the location object.
_DETAIL_UPRN_IN_BODY_RE = re.compile(r'"uprn":"(\d+)"')
# The captured value may be an empty string.
_DETAIL_NUMBER_OR_NAME_IN_BODY_RE = re.compile(r'"propertyNumberOrName":"([^"]*)"')
def parse_detail_geo(html: str, search_outcode: str | None = None) -> dict | None:
    """Pull the property's own coordinates, postcode and address ids out of detail-page HTML.

    This is a pure string parser with no browser dependency: the caller
    supplies the page HTML and gets back a plain dict, which keeps the logic
    unit-testable.

    Args:
        html: Raw detail-page HTML (including the embedded flight stream).
        search_outcode: Optional outcode being searched. It is used only to
            choose between several ``location`` objects on the same page (for
            example when comparable listings are embedded too).

    Returns:
        ``None`` when the input is empty or no usable property wrapper is
        found. Otherwise a dict with keys ``lat``, ``lng``, ``postcode``,
        ``outcode``, ``source``, ``uprn``, ``number_or_name`` and
        ``full_address``; every value except the coordinates may be ``None``.
        ``source`` is ``"detail_location"`` when the ``location`` wrapper was
        used and ``"detail_address_obj"`` when the ``address`` twin was used.

    Coordinates must fall inside the England bounding box, and a postcode is
    kept only when its outcode agrees with the outcode of the same object.
    """
    if not html:
        return None

    # The flight stream is embedded as escaped JS string literals, so quotes
    # and slashes arrive escaped; undo that so the patterns can match.
    text = html.replace('\\"', '"').replace("\\u002F", "/").replace("\\/", "/")

    def _first_group(pattern, haystack: str) -> str | None:
        # Capture group 1 of the first match, or None when nothing matches.
        found = pattern.search(haystack)
        return found.group(1) if found else None

    def _english_coords(lat: float, lng: float) -> tuple[float, float] | None:
        # Normalise through fix_coords, then apply the England bounding box.
        lat, lng = fix_coords(lat, lng)
        inside = 49 <= lat <= 56 and -7 <= lng <= 2
        return (lat, lng) if inside else None

    def _candidate(
        body: str, coords, source: str, full_address: str | None = None
    ) -> dict:
        # Outcode and postcode are read from the SAME object body as the
        # coordinates, so they should agree; the postcode is dropped if not.
        outcode = _first_group(_DETAIL_OUTCODE_IN_BODY_RE, body)
        raw_postcode = _first_group(_DETAIL_POSTCODE_IN_BODY_RE, body)
        postcode = (
            extract_full_postcode(raw_postcode) if raw_postcode is not None else None
        )
        if postcode and outcode and extract_outcode(postcode) != outcode.upper():
            postcode = None

        raw_number = _first_group(_DETAIL_NUMBER_OR_NAME_IN_BODY_RE, body)
        number_or_name = (raw_number or "").strip() or None

        return {
            "lat": coords[0],
            "lng": coords[1],
            "postcode": postcode,
            "outcode": outcode,
            "source": source,
            "uprn": _first_group(_DETAIL_UPRN_IN_BODY_RE, body),
            "number_or_name": number_or_name,
            "full_address": full_address,
        }

    def _with_full_address(result: dict | None) -> dict | None:
        # The house-numbered street address lives in the `address` twin, not in
        # the `location` wrapper. Use the twin whose uprn matches this result.
        # With no uprn to compare, take the first twin in document order. When
        # a uprn exists but no twin matches it, leave the address unset rather
        # than risk borrowing a comparable listing's address.
        if result is None or result.get("full_address"):
            return result

        wanted_uprn = result.get("uprn")
        fallback_address = None
        for twin in _DETAIL_ADDRESS_RE.finditer(text):
            twin_address = twin.group(1)
            if not twin_address:
                continue
            if fallback_address is None:
                fallback_address = twin_address
            twin_uprn = _first_group(_DETAIL_UPRN_IN_BODY_RE, twin.group(2))
            if wanted_uprn and twin_uprn == wanted_uprn:
                result["full_address"] = twin_address
                return result

        if wanted_uprn is None:
            result["full_address"] = fallback_address
        return result

    # Strategy 1: the property's own `location` wrapper. The first usable one
    # is remembered as the default, but a later one whose outcode equals the
    # searched outcode is returned immediately.
    default_location = None
    for wrapper in _DETAIL_LOCATION_RE.finditer(text):
        body = wrapper.group(1)
        coords_found = _DETAIL_COORDS_IN_BODY_RE.search(body)
        if not coords_found:
            continue
        coords = _english_coords(
            float(coords_found.group(1)), float(coords_found.group(2))
        )
        if not coords:
            continue

        location = _candidate(body, coords, "detail_location")
        if default_location is None:
            default_location = location

        location_outcode = location["outcode"]
        if (
            search_outcode
            and location_outcode
            and location_outcode.upper() == search_outcode.upper()
        ):
            return _with_full_address(location)

    if default_location is not None:
        return _with_full_address(default_location)

    # Strategy 2: fall back to the `address` map-widget twin.
    for twin in _DETAIL_ADDRESS_RE.finditer(text):
        twin_address = twin.group(1) or None
        body = twin.group(2)
        latlng_found = _DETAIL_LATLNG_IN_BODY_RE.search(body)
        if not latlng_found:
            continue
        coords = _english_coords(
            float(latlng_found.group(1)), float(latlng_found.group(2))
        )
        if coords:
            return _candidate(
                body, coords, "detail_address_obj", full_address=twin_address
            )

    return None
def _detail_cache_key(listing_url: str) -> str:
    """Return the detail-cache key for a listing URL.

    The key is the numeric listing id when the URL contains a
    ``/details/<id>`` segment, otherwise the URL itself.
    """
    found = _LISTING_ID_RE.search(listing_url)
    if found is None:
        return listing_url
    return found.group(1)
def _fetch_listing_detail(
    detail_page,
    listing_url: str,
    search_outcode: str | None = None,
) -> dict | None:
    """Navigate to a listing's detail page and return its parsed geo dict, or None.

    Every outcome, including a failure (``None``), is stored in the module
    cache under the listing's cache key, so a listing is loaded at most once
    per run. Navigation and parsing errors are logged at debug level and turned
    into ``None`` so the caller can fall back to outcode-level resolution.
    ``TurnstileError`` is deliberately re-raised (and nothing is cached for
    that listing). The navigation uses the short
    ``ZOOPLA_DETAIL_GOTO_TIMEOUT_MS`` timeout so one slow page cannot consume
    the per-outcode budget.
    """
    key = _detail_cache_key(listing_url)
    if key in _detail_cache:
        return _detail_cache[key]

    # Relative listing paths are resolved against the site base URL.
    if listing_url.startswith("http"):
        target_url = listing_url
    else:
        target_url = ZOOPLA_BASE + listing_url

    try:
        detail_page.goto(
            target_url,
            wait_until="domcontentloaded",
            timeout=ZOOPLA_DETAIL_GOTO_TIMEOUT_MS,
        )
        _ensure_not_challenged(detail_page)
        parsed = parse_detail_geo(detail_page.content(), search_outcode=search_outcode)
    except TurnstileError:
        raise
    except Exception as exc:
        log.debug("Zoopla detail fetch failed %s: %s", target_url, _exception_detail(exc))
        parsed = None

    _detail_cache[key] = parsed
    return parsed
def _map_property_type(raw_type: str | None) -> str:
"""Map Zoopla property type text to canonical type."""
if not raw_type:
@ -1109,28 +1363,64 @@ def transform_property(
pc_index: PostcodeSpatialIndex,
pc_coords: dict[str, tuple[float, float]],
search_outcode: str | None = None,
detail: dict | None = None,
) -> dict | None:
"""Transform a raw Zoopla listing dict into the standard output schema.
Zoopla search cards do not include coordinates, so we resolve lat/lng
from postcodes extracted from the address text."""
Zoopla search cards only expose an outcode-level address, so precise
location comes from the listing's detail page (see ``parse_detail_geo`` /
``_fetch_listing_detail``), passed in as ``detail``. When detail-page
coordinates are available we resolve the nearest postcode via the spatial
index — mirroring rightmove/onthemarket — and only fall back to the coarse
outcode centroid when no detail location could be obtained."""
price = parse_int_value(raw.get("price")) or 0
address = raw.get("address", "") or ""
# Resolve postcode and coordinates from address
extracted_postcode = extract_full_postcode(address)
postcode = extracted_postcode
postcode_source = "address" if extracted_postcode else None
detail = detail or {}
detail_postcode = extract_full_postcode(detail.get("postcode"))
# Detail-page address fields: the UPRN keys an exact EPC join, and the
# full street address / house number-or-name beat the outcode-level card
# address for the Price-Paid join. All three are absent unless the detail
# page was fetched, so every consumer must tolerate None.
detail_uprn = detail.get("uprn") or None
detail_full_address = detail.get("full_address") or None
detail_number_or_name = detail.get("number_or_name") or None
postcode = postcode_source = inferred_postcode = None
lat = lng = None
if postcode:
coords = pc_coords.get(postcode)
if coords:
lat, lng = coords
# (A) Best: detail-page coordinates -> nearest postcode (authoritative).
detail_lat, detail_lng = detail.get("lat"), detail.get("lng")
if detail_lat is not None and detail_lng is not None:
fixed_lat, fixed_lng = fix_coords(detail_lat, detail_lng)
if 49 <= fixed_lat <= 56 and -7 <= fixed_lng <= 2:
nearest = pc_index.nearest(fixed_lat, fixed_lng)
if nearest:
lat, lng, inferred_postcode = fixed_lat, fixed_lng, nearest
candidate = detail_postcode or extracted_postcode
postcode, resolved_source = resolve_listing_postcode(candidate, nearest)
postcode_source = (
"detail_address"
if resolved_source == "address"
else "detail_coordinates"
)
# (B) Detail-page postcode without usable coordinates -> geocode it.
if lat is None and detail_postcode and detail_postcode in pc_coords:
lat, lng = pc_coords[detail_postcode]
postcode = inferred_postcode = detail_postcode
postcode_source = "detail_address"
# (C) Full postcode in the search-card address -> geocode it.
if lat is None and extracted_postcode and extracted_postcode in pc_coords:
lat, lng = pc_coords[extracted_postcode]
postcode = extracted_postcode
postcode_source = "address"
# (D) Last resort: coarse outcode-level centroid (loses per-listing precision).
if lat is None:
# Try outcode-level fallback from address text
addr_outcode = _extract_outcode(address)
if addr_outcode:
result = _resolve_outcode_coords(addr_outcode, pc_coords)
@ -1138,7 +1428,6 @@ def transform_property(
postcode, lat, lng = result
postcode_source = "address_outcode"
# Final fallback: use the outcode we know we're searching
if lat is None and search_outcode:
result = _resolve_outcode_coords(search_outcode, pc_coords)
if result:
@ -1188,9 +1477,17 @@ def transform_property(
"Postcode": postcode,
"Postcode source": postcode_source or "unknown",
"Extracted postcode": extracted_postcode,
"Inferred postcode": postcode if postcode_source != "address" else None,
"Listing raw address": address,
"Address per Property Register": clean_listing_address(address),
"Inferred postcode": (
inferred_postcode
if inferred_postcode is not None
else (postcode if postcode_source != "address" else None)
),
"Listing raw address": detail_full_address or address,
"Address per Property Register": build_register_address(
detail_full_address or address, detail_number_or_name
),
"UPRN": detail_uprn,
"Property number or name": detail_number_or_name,
"Leasehold/Freehold": raw.get("tenure") or None,
"Property type": _map_property_type(raw.get("property_type")),
"Property sub-type": normalize_sub_type(raw.get("property_type")),
@ -1215,6 +1512,9 @@ def search_outcode(
pc_index: PostcodeSpatialIndex,
pc_coords: dict[str, tuple[float, float]],
max_properties: int | None = None,
detail_page=None,
detail_cap: int = 0,
detail_budget_seconds: float | None = None,
) -> tuple[list[dict], str | None]:
"""Search Zoopla for properties in one outcode.
@ -1222,6 +1522,12 @@ def search_outcode(
search flow, extracts listings from rendered DOM, and transforms to the
standard output schema.
When ``detail_page`` (a second browser tab) and a positive ``detail_cap``
are supplied, up to ``detail_cap`` listings per outcode have their detail
page fetched for a precise postcode (see ``_fetch_listing_detail``).
``detail_budget_seconds`` caps the wall-clock time spent fetching details so
the per-outcode timeout that also guards search pagination is never starved.
Returns (properties, search_url).
Raises TurnstileError if Cloudflare blocks us mid-session.
@ -1231,12 +1537,25 @@ def search_outcode(
total_results = _get_result_count(page)
fetch_detail = None
detail_deadline = None
if detail_page is not None and detail_cap > 0:
fetch_detail = lambda url: _fetch_listing_detail( # noqa: E731
detail_page, url, search_outcode=outcode
)
if detail_budget_seconds is not None:
detail_deadline = time.monotonic() + detail_budget_seconds
# Always try extraction even if result count is 0 — the count regex may
# not match Zoopla's current text format, but listings may still be in DOM
raw_listings = _paginate(
page,
total_results,
max_properties=max_properties,
fetch_detail=fetch_detail,
detail_cap=detail_cap,
detail_state={"fetched": 0},
detail_deadline=detail_deadline,
)
if not raw_listings:
if total_results > 0:
@ -1252,7 +1571,11 @@ def search_outcode(
for raw in raw_listings:
try:
transformed = transform_property(
raw, pc_index, pc_coords, search_outcode=outcode
raw,
pc_index,
pc_coords,
search_outcode=outcode,
detail=raw.get("_detail"),
)
except Exception as exc:
log.warning(

View file

@ -0,0 +1,164 @@
"""Zoopla scraping via FlareSolverr (no browser/VNC needed).
FlareSolverr solves Zoopla's Cloudflare challenge and returns the rendered HTML, which
still contains the React Server Components flight stream so the existing pure
parsers work unchanged:
- the search page yields the outcode's listing detail URLs, and
- each detail page's flight stream carries the property's location object
(postcode + coordinates) that ``parse_detail_geo`` extracts, plus the
listing fields (price/beds/baths/tenure/floor area) parsed here.
Verified live (2026-05-30) against Zoopla through the Gluetun VPN: a warm
FlareSolverr session solves the SW9 search + detail pages and the flight data
is present (e.g. detail 73326946 -> SW9 0HD @ 51.477238,-0.116819).
This is selected by constants.ZOOPLA_FETCHER == "flaresolverr"; the Camoufox
path in zoopla.py remains for ZOOPLA_FETCHER == "camoufox".
"""
import logging
import re
import time
from constants import DELAY_BETWEEN_PAGES, ZOOPLA_BASE
from flaresolverr import FlareSolverrError, FlareSolverrSession
from spatial import PostcodeSpatialIndex
from zoopla import _url_with_page, parse_detail_geo, transform_property
log = logging.getLogger("zoopla")
# Safety bound on how many search-result pages to walk per outcode.
_MAX_SERP_PAGES = 60
# Matches a listing detail path under /for-sale/ or /new-homes/, including the
# trailing slash after the numeric id.
_DETAIL_PATH_RE = re.compile(r"/(?:for-sale|new-homes)/details/\d+/")
# Captures the numeric listing id from a detail path; the trailing slash is
# required here.
_LISTING_ID_RE = re.compile(r"/details/(\d+)/")
def _int(pattern: str, buf: str) -> int | None:
match = re.search(pattern, buf)
return int(match.group(1)) if match else None
def parse_detail_listing(html: str) -> dict:
    """Extract the non-location listing fields from a Zoopla detail page.

    The fields mirror what the Camoufox SERP-card extractor produced, but are
    read from the detail page's flight stream instead. Extraction is
    best-effort: every field that cannot be found is ``None``, so a listing
    whose location is known can still be emitted.

    Returns a dict with the keys ``price``, ``beds``, ``baths``,
    ``receptions``, ``floor_area_sqft``, ``tenure``, ``property_type`` and
    ``address``.
    """
    # Drop the backslashes in front of quotes and slashes so the plain
    # '"key":value' patterns below can match the embedded data.
    text = html.replace('\\"', '"').replace("\\/", "/")

    # Prefer "internalValue"; fall back to "priceUnformatted" only if absent.
    asking_price = _int(r'"internalValue":(\d+)', text)
    if asking_price is None:
        asking_price = _int(r'"priceUnformatted":(\d+)', text)

    tenure = None
    tenure_hit = re.search(r'"tenure":"([a-zA-Z]+)"', text)
    if tenure_hit is not None:
        tenure = tenure_hit.group(1).title()

    # Address + property type are read out of the page <title>, e.g.
    # "Caldwell Street, Stockwell SW9, 4 bed property for sale, £995,000 - Zoopla"
    address = None
    property_type = None
    headline_hit = re.search(r'"children":"([^"]*? for sale[^"]*?)"', text)
    if headline_hit is not None:
        headline = headline_hit.group(1)
        # Everything before ", <n> bed" is the address.
        address_hit = re.match(r"(.+?),\s*\d+\s*bed", headline)
        if address_hit is not None:
            address = address_hit.group(1).strip()
        # The words between "<n> bed" and "for sale" name the property type.
        kind_hit = re.search(r"\d+\s*bed\s+([\w\s-]+?)\s+for sale", headline)
        if kind_hit is not None:
            property_type = kind_hit.group(1).strip()

    # An explicit "propertyType" value overrides the one guessed from the title.
    declared_kind = re.search(r'"propertyType":"([^"]+)"', text)
    if declared_kind is not None:
        property_type = declared_kind.group(1)

    numeric_fields = (
        ("beds", r'"numBedrooms":(\d+)'),
        ("baths", r'"numBaths":(\d+)'),
        ("receptions", r'"numLivingRooms":(\d+)'),
        ("floor_area_sqft", r'"sizeSqft":(\d+)'),
    )
    listing: dict = {"price": asking_price}
    for field, pattern in numeric_fields:
        listing[field] = _int(pattern, text)
    listing["tenure"] = tenure
    listing["property_type"] = property_type
    listing["address"] = address
    return listing
def _enumerate_detail_paths(
    fs: FlareSolverrSession, outcode: str, limit: int | None
) -> list[str]:
    """Walk the outcode's search-result pages and collect listing detail paths.

    Pages are requested through ``fs`` one at a time, for at most
    ``_MAX_SERP_PAGES`` pages. The walk stops early as soon as a page
    contributes no unseen listing, or once ``limit`` paths have been
    collected. A listing is identified by the numeric id in its detail path,
    so several links to the same listing yield a single entry.

    Args:
        fs: Session whose ``get(url)`` returns the page HTML. Exceptions it
            raises are not caught here and propagate to the caller.
        outcode: Outcode to search, e.g. ``"SW9"``.
        limit: Maximum number of paths to return; ``None`` means no limit.
            Zero or a negative value yields an empty list without any request.

    Returns:
        Detail paths in first-seen order, never more than ``limit`` of them.
    """
    # The limit is only checked after a path has been appended, so without
    # this guard a limit of 0 would still fetch page 1 and return one path.
    if limit is not None and limit <= 0:
        return []

    base = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"
    seen: list[str] = []
    seen_ids: set[str] = set()
    for page_num in range(1, _MAX_SERP_PAGES + 1):
        url = base if page_num == 1 else _url_with_page(base, page_num)
        html = fs.get(url)
        new = 0
        for path in _DETAIL_PATH_RE.findall(html):
            id_match = _LISTING_ID_RE.search(path)
            # Fall back to the whole path as the de-dup key if no id is found.
            listing_id = id_match.group(1) if id_match else path
            if listing_id in seen_ids:
                continue
            seen_ids.add(listing_id)
            seen.append(path)
            new += 1
            if limit is not None and len(seen) >= limit:
                return seen
        if new == 0:
            # Nothing unseen on this page: treat it as the end of the results.
            break
        time.sleep(DELAY_BETWEEN_PAGES)
    return seen
def search_outcode(
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    fs: FlareSolverrSession,
    max_properties: int | None = None,
    detail_cap: int = 0,
    detail_budget_seconds: float | None = None,
) -> tuple[list[dict], str | None]:
    """Scrape one outcode via FlareSolverr. Returns (properties, search_url).

    The search pages only supply detail paths; every listing's detail page is
    then fetched, because that is where the postcode lives. The number of
    listings is therefore bounded by both ``max_properties`` and
    ``detail_cap`` (a ``detail_cap`` of 0 or less means "no cap"), and
    ``detail_budget_seconds`` caps the wall-clock time spent on detail pages
    (a falsy value means "no budget").

    A listing whose detail fetch or transform fails, or whose transform
    returns nothing, is logged and counted as dropped rather than aborting
    the outcode.
    """
    # The tighter of the two caps wins; with neither set there is no limit.
    caps: list[int] = []
    if detail_cap and detail_cap > 0:
        caps.append(detail_cap)
    if max_properties is not None:
        caps.append(max_properties)
    limit = min(caps) if caps else None

    search_url = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"
    detail_paths = _enumerate_detail_paths(fs, outcode, limit)
    if not detail_paths:
        return [], search_url

    deadline = None
    if detail_budget_seconds:
        deadline = time.monotonic() + detail_budget_seconds

    kept: list[dict] = []
    dropped = 0
    for detail_path in detail_paths:
        if deadline is not None and time.monotonic() >= deadline:
            log.info("Zoopla %s: detail-fetch budget reached after %d", outcode, len(kept))
            break

        id_hit = _LISTING_ID_RE.search(detail_path)
        # Use the whole path as the identifier when it carries no numeric id.
        listing_id = detail_path if id_hit is None else id_hit.group(1)

        prop = None
        try:
            page = fs.get(ZOOPLA_BASE + detail_path)
            geo = parse_detail_geo(page, search_outcode=outcode)
            raw = {"id": listing_id, "url": detail_path}
            raw.update(parse_detail_listing(page))
            prop = transform_property(
                raw, pc_index, pc_coords, search_outcode=outcode, detail=geo
            )
        except FlareSolverrError as exc:
            log.warning("Zoopla %s detail %s fetch failed: %s", outcode, listing_id, exc)
        except Exception as exc:  # noqa: BLE001 - never let one listing kill the outcode
            log.warning("Zoopla %s detail %s transform failed: %s", outcode, listing_id, exc)

        if prop:
            kept.append(prop)
        else:
            dropped += 1
        # Pause between detail requests, whether or not this one succeeded.
        time.sleep(DELAY_BETWEEN_PAGES)

    log.info("Zoopla %s: %d listings (%d dropped)", outcode, len(kept), dropped)
    return kept, search_url