scraping and data
This commit is contained in:
parent
d98819b569
commit
8688b7475e
43 changed files with 4920 additions and 531 deletions
25
finder/Dockerfile
Normal file
25
finder/Dockerfile
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
# Finder scraper image. Runs via docker-compose sharing the media_gluetun VPN
|
||||
# network namespace; the source tree is bind-mounted at runtime, so this image
|
||||
# only needs the Python deps. The venv lives OUTSIDE the bind-mount target
|
||||
# (/opt/venv) so the mount doesn't shadow it.
|
||||
FROM python:3.12-slim
|
||||
|
||||
ENV UV_PROJECT_ENVIRONMENT=/opt/venv \
|
||||
UV_COMPILE_BYTECODE=1 \
|
||||
UV_LINK_MODE=copy \
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends ca-certificates curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
||||
|
||||
WORKDIR /app/finder
|
||||
|
||||
# Install dependencies into /opt/venv (cached layer; project code is mounted at runtime).
|
||||
COPY pyproject.toml uv.lock ./
|
||||
RUN uv sync --no-install-project --frozen
|
||||
|
||||
# Source is bind-mounted over /app/finder by compose. `uv run` uses /opt/venv.
|
||||
CMD ["sleep", "infinity"]
|
||||
|
|
@ -6,7 +6,9 @@ REPO_DIR = FINDER_DIR.parent
|
|||
|
||||
DATA_DIR = Path(os.environ.get("DATA_DIR", str(FINDER_DIR / "data")))
|
||||
ARCGIS_PATH = Path(
|
||||
os.environ.get("ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet"))
|
||||
os.environ.get(
|
||||
"ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet")
|
||||
)
|
||||
)
|
||||
PAGE_SIZE = 24
|
||||
DELAY_BETWEEN_PAGES = 0.3
|
||||
|
|
@ -19,6 +21,19 @@ MAX_BEDROOMS = 20 # sanity cap — values above this are almost certainly parsi
|
|||
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
|
||||
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
|
||||
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
|
||||
# Detail page (plain HTTPS GET, no Cloudflare). Its window.__PAGE_MODEL embeds
|
||||
# propertyData.address.{outcode,incode}, which together form the property's TRUE
|
||||
# full postcode — the search API only exposes the outcode. {id} is the numeric
|
||||
# listing id from the search response.
|
||||
RIGHTMOVE_DETAIL_URL = "https://www.rightmove.co.uk/properties/{id}"
|
||||
|
||||
# The Rightmove search API gives only an outcode-level display address, so the
|
||||
# true full postcode is recovered from each listing's detail page (see
|
||||
# finder/rightmove.py::parse_detail_postcode). One extra GET per listing is a
|
||||
# big request-volume increase over the ~1000-result-per-outcode search, so detail
|
||||
# fetching is gated and capped per outcode (mirrors ZOOPLA_* below). Default ON.
|
||||
RIGHTMOVE_FETCH_DETAILS = True # fetch detail pages for true per-listing postcodes
|
||||
RIGHTMOVE_MAX_DETAILS_PER_OUTCODE = 4000 # max detail-page fetches per outcode
|
||||
|
||||
# OnTheMarket
|
||||
ONTHEMARKET_BASE = "https://www.onthemarket.com"
|
||||
|
|
@ -26,6 +41,41 @@ ONTHEMARKET_BASE = "https://www.onthemarket.com"
|
|||
# Zoopla
|
||||
ZOOPLA_BASE = "https://www.zoopla.co.uk"
|
||||
|
||||
# Zoopla search cards only carry an outcode-level address, so the full postcode
|
||||
# and precise coordinates are scraped from each listing's detail page. These
|
||||
# bound that extra work (see finder/zoopla.py and finder/scraper.py).
|
||||
ZOOPLA_FETCH_DETAILS = True # fetch detail pages for precise per-listing postcodes
|
||||
ZOOPLA_MAX_DETAILS_PER_OUTCODE = 4000 # max detail-page fetches per outcode
|
||||
ZOOPLA_DETAIL_GOTO_TIMEOUT_MS = 1500000 # per detail-page navigation timeout, in milliseconds. NOTE(review): 1,500,000 ms is 25 minutes — confirm this is intended and not a typo.
|
||||
# Fraction of a single outcode's wall-clock budget (ZOOPLA_OUTCODE_TIMEOUT_SECONDS)
|
||||
# spent fetching details; the remainder is reserved for search pagination so
|
||||
# detail fetches can never trip the timeout and discard collected listings.
|
||||
ZOOPLA_DETAIL_BUDGET_FRACTION = 0.6
|
||||
|
||||
# Gluetun VPN. Network endpoints are env-overridable because they are
|
||||
# deployment-specific: when finder runs in a SEPARATE container they use the
|
||||
# `gluetun` hostname (defaults below); when finder SHARES gluetun's network
|
||||
# namespace (docker-compose.yml, network_mode container:media_gluetun) they
|
||||
# become localhost and GLUETUN_PROXY is empty (the shared netns already tunnels
|
||||
# all traffic, so no HTTP proxy is needed).
|
||||
# GLUETUN_PROXY="" (empty) => direct connection (no proxy); used in shared-netns.
|
||||
GLUETUN_PROXY = os.environ.get("GLUETUN_PROXY", "http://gluetun:8888") or None
|
||||
GLUETUN_CONTROL_URL = os.environ.get("GLUETUN_CONTROL_URL", "http://gluetun:8000")
|
||||
# SECURITY: this key is committed to the repository, so treat it as leaked:
# rotate it and supply the real value through the GLUETUN_API_KEY environment
# variable (like the other GLUETUN_* settings). The literal below is kept only
# as a backward-compatible fallback for deployments that do not set the variable.
GLUETUN_API_KEY = os.environ.get("GLUETUN_API_KEY", "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K")
|
||||
# Egress-IP rotations to try per Cloudflare challenge. Keep at 0 for Zoopla:
|
||||
# rotating among Gluetun's datacenter IPs doesn't clear Cloudflare and would
|
||||
# rotate away from the IP a cleared Cloudflare session was bound to, voiding it.
|
||||
# Raise only with residential IPs where rotation helps.
|
||||
GLUETUN_MAX_ROTATIONS = 0 # max egress-IP rotations per Cloudflare challenge
|
||||
|
||||
# Zoopla fetcher: "flaresolverr" (default) solves Cloudflare via the FlareSolverr
|
||||
# sidecar (docker-compose.yml) and needs no display/VNC — verified to return the
|
||||
# RSC flight stream with postcode + coordinates; "camoufox" drives a local
|
||||
# anti-fingerprint browser (needs an interactive solve on datacenter IPs).
|
||||
ZOOPLA_FETCHER = os.environ.get("ZOOPLA_FETCHER", "flaresolverr")
|
||||
FLARESOLVERR_URL = os.environ.get("FLARESOLVERR_URL", "http://gluetun:8191/v1")
|
||||
FLARESOLVERR_MAX_TIMEOUT_MS = 120000 # per-request solve budget; first solve is slow
|
||||
|
||||
# Greater London-ish postcode areas. This intentionally uses broad area
|
||||
# prefixes so a manual scrape can include central/inner London plus common
|
||||
# outer-London and near-London outcodes without maintaining a long borough list.
|
||||
|
|
|
|||
57
finder/docker-compose.yml
Normal file
57
finder/docker-compose.yml
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
# Finder scraper + FlareSolverr, both sharing the EXISTING media_gluetun VPN
|
||||
# container's network namespace. Everything egresses through the VPN, and
|
||||
# FlareSolverr solves Zoopla's Cloudflare automatically (no VNC needed).
|
||||
#
|
||||
# Prerequisites:
|
||||
# - The `media_gluetun` container (qmcgaw/gluetun) is running on this host.
|
||||
# It is managed by a different compose; it is referenced here as external
|
||||
# via network_mode "container:media_gluetun".
|
||||
# - Because these services share gluetun's netns, they reach each other and
|
||||
# gluetun on localhost (flaresolverr :8191, gluetun control :8000) and need
|
||||
# NO published ports (which is exactly why this avoids the dev-container
|
||||
# port-forwarding pain).
|
||||
#
|
||||
# Usage:
|
||||
# cd finder
|
||||
# docker compose up -d --build flaresolverr finder # start the sidecars
|
||||
# docker compose exec finder uv run python main.py --source zoopla --outcodes SW9 --test
|
||||
# docker compose exec finder uv run python main.py --source all # full run
|
||||
# docker compose down
|
||||
#
|
||||
# NOTE: a manually-started `finder_flaresolverr` container from testing must be
|
||||
# removed first (`docker rm -f finder_flaresolverr`) to avoid a name clash.
|
||||
|
||||
services:
|
||||
flaresolverr:
|
||||
image: ghcr.io/flaresolverr/flaresolverr:latest
|
||||
container_name: finder_flaresolverr
|
||||
network_mode: "container:media_gluetun"
|
||||
environment:
|
||||
LOG_LEVEL: info
|
||||
TZ: Europe/London
|
||||
restart: unless-stopped
|
||||
|
||||
finder:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
image: finder-scraper:latest
|
||||
container_name: finder_scraper
|
||||
network_mode: "container:media_gluetun"
|
||||
depends_on:
|
||||
- flaresolverr
|
||||
volumes:
|
||||
- .:/app/finder # live-mounted finder source
|
||||
- ../property-data:/app/property-data:ro # ARCGIS postcode data
|
||||
working_dir: /app/finder
|
||||
environment:
|
||||
# Shared netns: sidecars are on localhost, and the netns already tunnels
|
||||
# all traffic through the VPN, so no HTTP proxy is used.
|
||||
ZOOPLA_FETCHER: flaresolverr
|
||||
FLARESOLVERR_URL: http://localhost:8191/v1
|
||||
GLUETUN_CONTROL_URL: http://localhost:8000
|
||||
GLUETUN_PROXY: "" # empty => direct (shared netns already tunnels)
|
||||
DATA_DIR: /app/finder/data
|
||||
ARCGIS_PATH: /app/property-data/arcgis_data.parquet
|
||||
restart: "no"
|
||||
command: ["sleep", "infinity"] # stays up; run scrapes via `docker compose exec`
|
||||
91
finder/flaresolverr.py
Normal file
91
finder/flaresolverr.py
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
"""FlareSolverr client — fetch Cloudflare-protected pages as rendered HTML.
|
||||
|
||||
FlareSolverr (https://github.com/FlareSolverr/FlareSolverr) drives an
|
||||
undetected browser to pass Cloudflare's challenge and returns the fully
|
||||
rendered HTML. It runs as a sidecar service (see docker-compose.yml) sharing
|
||||
the Gluetun VPN network namespace, so its browser egresses through the VPN.
|
||||
|
||||
Verified working against Zoopla's managed Turnstile on a datacenter VPN IP,
|
||||
provided a reused session and a generous maxTimeout (~120s) — the first
|
||||
challenge solve is slow, subsequent requests on the warm session are fast.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
import httpx
|
||||
|
||||
from constants import FLARESOLVERR_MAX_TIMEOUT_MS, FLARESOLVERR_URL
|
||||
|
||||
log = logging.getLogger("flaresolverr")
|
||||
|
||||
|
||||
class FlareSolverrError(Exception):
|
||||
"""Raised when FlareSolverr cannot fetch/solve a URL."""
|
||||
|
||||
|
||||
class FlareSolverrSession:
    """A reusable FlareSolverr browser session (context manager).

    Reusing one session keeps the cleared Cloudflare cookies warm across
    requests, so only the first fetch pays the full challenge-solve cost.

    The instance owns an ``httpx.Client`` created in ``__init__``. It is closed
    when the ``with`` block exits, and also when entering the block fails, so
    an instance is effectively single-use.
    """

    def __init__(
        self,
        url: str = FLARESOLVERR_URL,
        session: str = "finder",
        max_timeout_ms: int = FLARESOLVERR_MAX_TIMEOUT_MS,
    ) -> None:
        """Store the endpoint/session settings and open the HTTP client.

        Args:
            url: FlareSolverr API endpoint that commands are POSTed to.
            session: FlareSolverr session name to create and reuse.
            max_timeout_ms: ``maxTimeout`` (milliseconds) sent with each
                ``request.get`` command.
        """
        self._url = url
        self._session = session
        self._max_timeout = max_timeout_ms
        # Read timeout must comfortably exceed maxTimeout (FlareSolverr blocks
        # for up to maxTimeout while solving before responding).
        self._client = httpx.Client(timeout=httpx.Timeout(self._max_timeout / 1000 + 30))
        self._active = False

    def _post(self, payload: dict) -> dict:
        """POST one command to FlareSolverr and return the decoded JSON reply.

        Raises:
            FlareSolverrError: on a transport/HTTP error, an undecodable body,
                or a reply whose ``status`` is not ``"ok"``.
        """
        try:
            resp = self._client.post(self._url, json=payload)
            resp.raise_for_status()
            data = resp.json()
        except (httpx.HTTPError, ValueError) as exc:
            raise FlareSolverrError(
                f"FlareSolverr request to {self._url} failed: {exc}"
            ) from exc
        # A reply that is valid JSON but not an object (e.g. a list or string)
        # has no ``status`` to inspect; report it as a failure instead of
        # letting an AttributeError escape.
        if not isinstance(data, dict):
            raise FlareSolverrError(
                f"FlareSolverr {payload.get('cmd')} failed: {data}"
            )
        if data.get("status") != "ok":
            raise FlareSolverrError(
                f"FlareSolverr {payload.get('cmd')} failed: {data.get('message')}"
            )
        return data

    def __enter__(self) -> "FlareSolverrSession":
        """Create a fresh FlareSolverr session and return ``self``.

        Raises:
            FlareSolverrError: when the session cannot be created.
        """
        try:
            # Start from a clean session (ignore destroy errors for a fresh name).
            try:
                self._post({"cmd": "sessions.destroy", "session": self._session})
            except FlareSolverrError:
                pass
            self._post({"cmd": "sessions.create", "session": self._session})
        except BaseException:
            # __exit__ is not invoked when __enter__ raises, so the HTTP client
            # must be released here or it would leak.
            self._client.close()
            raise
        self._active = True
        log.info("FlareSolverr session %r ready at %s", self._session, self._url)
        return self

    def get(self, url: str) -> str:
        """Fetch a URL through FlareSolverr; return the solved HTML.

        Returns an empty string when the reply carries no ``solution.response``.
        """
        data = self._post(
            {
                "cmd": "request.get",
                "session": self._session,
                "url": url,
                "maxTimeout": self._max_timeout,
            }
        )
        solution = data.get("solution") or {}
        return solution.get("response", "") or ""

    def __exit__(self, *exc_info) -> None:
        """Destroy the remote session (best effort) and close the HTTP client."""
        try:
            if self._active:
                self._active = False
                try:
                    self._post({"cmd": "sessions.destroy", "session": self._session})
                except FlareSolverrError as exc:
                    log.debug("FlareSolverr session destroy failed: %s", exc)
        finally:
            # Always release the client, even if the destroy call raised
            # something other than FlareSolverrError.
            self._client.close()
|
||||
53
finder/gdal-ecw/Dockerfile
Normal file
53
finder/gdal-ecw/Dockerfile
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
# GDAL with ECW (read) support, for decoding Environment Agency Vertical Aerial
|
||||
# Photography in the satellite-highres pipeline (pipeline/download/satellite_highres.py).
|
||||
#
|
||||
# EA VAP ships as ECW **v2** rasters, which are readable by the open-source
|
||||
# libecwj2 3.3 SDK -- the same library the official OSGeo image uses when built
|
||||
# with WITH_ECW=yes. We therefore avoid the proprietary, login-gated Hexagon
|
||||
# ERDAS ECW/JP2 SDK (which is only needed for ECW v3) and its licensing
|
||||
# restrictions entirely.
|
||||
#
|
||||
# We build only the ECW driver as a GDAL *plugin* on top of the official runtime
|
||||
# image (no full GDAL rebuild). The plugin's GDAL sources are pinned to the exact
|
||||
# commit reported by the base image so libgdal and the plugin stay ABI-compatible.
|
||||
#
|
||||
# Build: docker build -t perfect-postcode/gdal-ecw:latest docker/gdal-ecw
|
||||
# Verify: docker run --rm perfect-postcode/gdal-ecw:latest gdalinfo --formats | grep -i ECW
|
||||
|
||||
FROM ghcr.io/osgeo/gdal:ubuntu-full-latest
|
||||
|
||||
ARG LIBECWJ2_URL=https://github.com/rouault/libecwj2-3.3-builds/releases/download/v1/install-libecwj2-3.3-ubuntu-20.04.tar.gz
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
cmake g++ make git curl ca-certificates \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Open-source ECW v2 SDK (extracts to /opt/libecwj2-3.3) + make its libs loadable.
|
||||
RUN curl --retry 3 --retry-all-errors --retry-delay 3 -fsSL -o /tmp/libecwj2.tar.gz "$LIBECWJ2_URL" \
|
||||
&& tar -C / -xzf /tmp/libecwj2.tar.gz \
|
||||
&& rm -f /tmp/libecwj2.tar.gz \
|
||||
&& (cd /opt/libecwj2-3.3/lib && for so in *.so*; do \
|
||||
ln -sf "/opt/libecwj2-3.3/lib/$so" "/usr/lib/x86_64-linux-gnu/$so"; \
|
||||
done) \
|
||||
&& ldconfig
|
||||
|
||||
# Build the ECW driver plugin against the base image's exact GDAL sources.
|
||||
RUN set -eux; \
|
||||
GDAL_COMMIT="$(gdalinfo --version | sed -nE 's/.*-([0-9a-f]{8,}).*/\1/p')"; \
|
||||
test -n "$GDAL_COMMIT"; \
|
||||
echo "Building ECW plugin for GDAL commit ${GDAL_COMMIT}"; \
|
||||
mkdir -p /tmp/gdal && cd /tmp/gdal && git init -q; \
|
||||
git fetch --depth 1 -q https://github.com/OSGeo/gdal.git "$GDAL_COMMIT"; \
|
||||
git checkout -q FETCH_HEAD; \
|
||||
cmake -S frmts/ecw -B /tmp/ecw-build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_PREFIX_PATH=/usr \
|
||||
-DECW_ROOT=/opt/libecwj2-3.3; \
|
||||
cmake --build /tmp/ecw-build -j"$(nproc)"; \
|
||||
PLUGIN_DIR=/usr/lib/x86_64-linux-gnu/gdalplugins; \
|
||||
mkdir -p "$PLUGIN_DIR"; \
|
||||
find /tmp/ecw-build -name 'gdal_ECW*.so' -exec cp {} "$PLUGIN_DIR/" \; ; \
|
||||
rm -rf /tmp/gdal /tmp/ecw-build
|
||||
|
||||
# Fail the build if the driver is not actually available.
|
||||
RUN gdalinfo --formats | grep -iq 'ECW.*rw' && echo "ECW driver OK"
|
||||
|
|
@ -5,7 +5,7 @@ import time
|
|||
import httpx
|
||||
from fake_useragent import UserAgent
|
||||
|
||||
from constants import MAX_RETRIES, RETRY_BASE_DELAY
|
||||
from constants import GLUETUN_PROXY, MAX_RETRIES, RETRY_BASE_DELAY
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
|
@ -15,10 +15,12 @@ _ua = UserAgent(
|
|||
|
||||
|
||||
def make_client() -> httpx.Client:
|
||||
# Route through the Gluetun HTTP proxy (VPN egress) when configured.
|
||||
return httpx.Client(
|
||||
timeout=30,
|
||||
headers={"User-Agent": _ua.random, "Accept": "application/json"},
|
||||
follow_redirects=True,
|
||||
proxy=GLUETUN_PROXY or None,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -57,6 +57,16 @@ def parse_args() -> argparse.Namespace:
|
|||
default=DATA_DIR,
|
||||
help=f"Directory for parquet output. Defaults to {DATA_DIR}.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outcodes",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"Comma-separated outcodes to scrape (e.g. 'SW9' or 'SW9,E14,BR1') "
|
||||
"instead of the full Greater London set. Must fall within the "
|
||||
"London-ish areas; takes precedence over --test/--limit-outcodes."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--limit-outcodes",
|
||||
type=int,
|
||||
|
|
@ -116,17 +126,32 @@ def main() -> int:
|
|||
from scraper import (
|
||||
build_postcode_coords,
|
||||
build_postcode_index,
|
||||
filter_londonish_outcodes,
|
||||
load_outcodes,
|
||||
run_scrape,
|
||||
)
|
||||
|
||||
outcodes = load_outcodes()
|
||||
if args.test and args.limit_outcodes is None:
|
||||
preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)]
|
||||
if preferred:
|
||||
outcodes = preferred
|
||||
if args.limit_outcodes is not None:
|
||||
outcodes = outcodes[: args.limit_outcodes]
|
||||
if args.outcodes is not None:
|
||||
requested = [code.strip().upper() for code in args.outcodes.split(",") if code.strip()]
|
||||
if not requested:
|
||||
raise SystemExit("--outcodes was empty")
|
||||
outcodes = filter_londonish_outcodes(requested)
|
||||
dropped = sorted(set(requested) - set(outcodes))
|
||||
if dropped:
|
||||
log.warning("Ignoring outcodes outside the Greater London-ish areas: %s", ", ".join(dropped))
|
||||
if not outcodes:
|
||||
raise SystemExit(
|
||||
"None of the requested outcodes are within the Greater London-ish areas "
|
||||
f"({', '.join(requested)})."
|
||||
)
|
||||
else:
|
||||
outcodes = load_outcodes()
|
||||
if args.test and args.limit_outcodes is None:
|
||||
preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)]
|
||||
if preferred:
|
||||
outcodes = preferred
|
||||
if args.limit_outcodes is not None:
|
||||
outcodes = outcodes[: args.limit_outcodes]
|
||||
|
||||
if not outcodes:
|
||||
raise SystemExit("No Greater London-ish outcodes loaded; nothing to scrape.")
|
||||
|
|
|
|||
|
|
@ -10,6 +10,30 @@ Each rendered page contains 30 listings under
|
|||
`humanised-property-type`, `features` (a list where the first element is
|
||||
typically `"Tenure: <value>"`), and `details-url`. Pagination is via
|
||||
`?page=N`; the loop terminates when `paginationControls.next` is null.
|
||||
|
||||
Postcodes
|
||||
---------
|
||||
The search card exposes only an *outcode*-level address (e.g. "Padfield Road,
|
||||
London, SE5") and a map pin, so the old behaviour derived the postcode from the
|
||||
nearest postcode to that pin — a guess that frequently lands on a neighbouring
|
||||
unit (the pin can sit on the wrong side of a street boundary).
|
||||
|
||||
Each *detail* page (`/details/{id}/`) is a plain HTTPS GET whose `__NEXT_DATA__`
|
||||
embeds the property's analytics dataLayer at
|
||||
`props.initialReduxState.metadata.dataLayer`, which carries the property's own
|
||||
`postcode` (full unit postcode, e.g. "SE5 9AA") keyed to this listing by
|
||||
`property-id`. Crucially this is NOT the agent's office postcode — that lives
|
||||
separately at `…property.agent.postcode` ("SE5 8RS" for the same listing) and
|
||||
is the classic trap when blindly scanning the page for a postcode. We read the
|
||||
dataLayer postcode, verify `property-id` matches the listing, and accept it only
|
||||
when its outcode agrees with the coordinate-nearest postcode (via
|
||||
``resolve_listing_postcode``) — exactly the trust rule the other scrapers use.
|
||||
Measured over a sample of real listings this yields a trustworthy, usually
|
||||
exact-unit postcode for ~11/12 listings; the rest safely fall back to the
|
||||
coordinate-nearest postcode.
|
||||
|
||||
Detail fetching costs one extra HTTPS GET per listing, so it is gated behind
|
||||
``OTM_FETCH_DETAILS`` and capped at ``OTM_MAX_DETAILS_PER_OUTCODE`` per outcode.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
|
@ -31,14 +55,26 @@ from spatial import PostcodeSpatialIndex
|
|||
from transform import (
|
||||
clean_listing_address,
|
||||
extract_full_postcode,
|
||||
extract_outcode,
|
||||
fix_coords,
|
||||
map_property_type,
|
||||
normalize_sub_type,
|
||||
parse_display_size,
|
||||
resolve_listing_postcode,
|
||||
)
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
# Detail-page postcode recovery (see module docstring). When enabled, each
|
||||
# listing's detail page is fetched so its analytics dataLayer postcode — the
|
||||
# property's own full unit postcode — can replace the coordinate-nearest guess.
|
||||
# Bounded per outcode so a large outcode can't balloon into unbounded extra
|
||||
# HTTPS GETs. Lower than the Rightmove/Zoopla detail caps (4000 in constants.py; NOTE(review): confirm which value is intended), yet high enough that a
|
||||
# typical outcode's listings all get their real postcode rather than a
|
||||
# coordinate-nearest guess.
|
||||
OTM_FETCH_DETAILS = True
|
||||
OTM_MAX_DETAILS_PER_OUTCODE = 400
|
||||
|
||||
_NEXT_DATA_RE = re.compile(
|
||||
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
|
||||
re.DOTALL,
|
||||
|
|
@ -51,6 +87,11 @@ _HTML_HEADERS = {
|
|||
"Accept-Language": "en-GB,en;q=0.9",
|
||||
}
|
||||
|
||||
# listingId -> recovered full postcode (or None). Failures are cached too so a
|
||||
# broken or postcode-less detail page is not re-fetched within a run (the same
|
||||
# listing can reappear across overlapping outcode searches).
|
||||
_detail_postcode_cache: dict[str, str | None] = {}
|
||||
|
||||
|
||||
def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict | None:
|
||||
"""GET one search-results page and return the embedded __NEXT_DATA__ JSON.
|
||||
|
|
@ -119,6 +160,116 @@ def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict
|
|||
return None
|
||||
|
||||
|
||||
def parse_detail_postcode(html: str, listing_id: str | None = None) -> str | None:
|
||||
"""Extract the property's own full postcode from an OnTheMarket detail page.
|
||||
|
||||
Pure and network-free so it is unit-testable: callers pass `page.content()`
|
||||
/ the GET body and this does the parsing.
|
||||
|
||||
The postcode lives in the analytics dataLayer embedded in `__NEXT_DATA__` at
|
||||
``props.initialReduxState.metadata.dataLayer.postcode`` and is the
|
||||
property's own unit postcode (e.g. "SE5 9AA"). It is deliberately NOT the
|
||||
agent's office postcode, which sits separately at
|
||||
``…property.agent.postcode`` — the trap when scanning a detail page for "a"
|
||||
postcode. When ``listing_id`` is given, the dataLayer's ``property-id`` must
|
||||
match it, guaranteeing we read this listing's postcode and not a stray one.
|
||||
|
||||
Returns a normalized full postcode (e.g. "SE5 9AA") or ``None`` when the
|
||||
page has no usable property postcode. Trust (outcode-vs-coordinates
|
||||
agreement) is enforced later in ``transform_property``.
|
||||
"""
|
||||
if not html:
|
||||
return None
|
||||
|
||||
match = _NEXT_DATA_RE.search(html)
|
||||
if not match:
|
||||
return None
|
||||
try:
|
||||
data = json.loads(match.group(1))
|
||||
except json.JSONDecodeError:
|
||||
return None
|
||||
|
||||
try:
|
||||
data_layer = data["props"]["initialReduxState"]["metadata"]["dataLayer"]
|
||||
except (KeyError, TypeError):
|
||||
return None
|
||||
if not isinstance(data_layer, dict):
|
||||
return None
|
||||
|
||||
# Guard against reading a different listing's postcode: the dataLayer is the
|
||||
# property's own analytics payload, so its property-id must match.
|
||||
if listing_id is not None:
|
||||
page_id = data_layer.get("property-id")
|
||||
if page_id is not None and str(page_id) != str(listing_id):
|
||||
return None
|
||||
|
||||
raw_postcode = data_layer.get("postcode")
|
||||
if not isinstance(raw_postcode, str):
|
||||
return None
|
||||
return extract_full_postcode(raw_postcode)
|
||||
|
||||
|
||||
def _fetch_detail_postcode(
    client: httpx.Client, details_url: str, listing_id: str
) -> str | None:
    """GET one listing's detail page and return its dataLayer postcode (or None).

    Results (including failures) are cached by listing id so a listing that
    reappears across overlapping outcode searches is fetched at most once.

    Args:
        client: HTTP client used for the GET.
        details_url: The listing's detail URL; a value not starting with
            "http" is prefixed with ``ONTHEMARKET_BASE``. An empty value skips
            the fetch.
        listing_id: Listing id, used as the cache key and passed to
            ``parse_detail_postcode`` for the property-id check.

    Returns:
        The parsed postcode, or ``None`` when the page could not be fetched or
        carries no usable postcode. Every httpx request failure degrades to
        ``None`` so the caller falls back to the coordinate-nearest postcode.
    """
    if listing_id in _detail_postcode_cache:
        return _detail_postcode_cache[listing_id]

    full_url = (
        ONTHEMARKET_BASE + details_url
        if details_url and not details_url.startswith("http")
        else details_url
    )
    retryable_statuses = (429, 500, 502, 503, 504)
    result: str | None = None
    if full_url:
        for attempt in range(MAX_RETRIES):
            try:
                resp = client.get(
                    full_url, headers=_HTML_HEADERS, follow_redirects=True
                )
            except httpx.TransportError as exc:
                # Transient transport failures. TransportError is the common
                # base of the connect/read/write/pool timeouts and of the
                # network/protocol errors, so ConnectTimeout, ReadError and
                # RemoteProtocolError are retried too instead of escaping.
                retry_reason = type(exc).__name__
            except httpx.HTTPError as exc:
                # Remaining request failures (e.g. too many redirects) are not
                # worth retrying; give up on this listing without raising.
                log.debug(
                    "OnTheMarket detail %s failed: %s: %s",
                    listing_id, type(exc).__name__, exc,
                )
                break
            else:
                if resp.status_code == 200:
                    result = parse_detail_postcode(resp.text, listing_id)
                    break
                if resp.status_code not in retryable_statuses:
                    log.debug(
                        "OnTheMarket detail %s returned HTTP %d (no postcode)",
                        listing_id, resp.status_code,
                    )
                    break
                retry_reason = f"HTTP {resp.status_code}"

            # Exponential backoff with jitter before the next attempt.
            delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
            log.warning(
                "%s from %s, retry %d/%d in %.1fs",
                retry_reason, full_url, attempt + 1, MAX_RETRIES, delay,
            )
            time.sleep(delay)

    # Cache failures as well as successes so a broken page is not re-fetched.
    _detail_postcode_cache[listing_id] = result
    return result
|
||||
|
||||
|
||||
def _parse_price(price_value) -> int:
|
||||
"""Parse a formatted price string like '£450,000' into an integer.
|
||||
Returns 0 for POA/auction/null values."""
|
||||
|
|
@ -166,9 +317,19 @@ def _extract_floor_area(features: list) -> float | None:
|
|||
|
||||
|
||||
def transform_property(
|
||||
raw: dict, pc_index: PostcodeSpatialIndex
|
||||
raw: dict,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
detail_postcode: str | None = None,
|
||||
) -> dict | None:
|
||||
"""Transform a raw OnTheMarket listing dict into our output schema."""
|
||||
"""Transform a raw OnTheMarket listing dict into our output schema.
|
||||
|
||||
``detail_postcode`` is the property's own full postcode recovered from its
|
||||
detail page (see ``parse_detail_postcode`` / ``_fetch_detail_postcode``),
|
||||
or ``None`` when no detail fetch was done / no postcode was found. When
|
||||
present and trustworthy (its outcode agrees with the coordinate-nearest
|
||||
postcode) it supersedes the coordinate guess and is labelled
|
||||
``"detail_address"``.
|
||||
"""
|
||||
loc = raw.get("location") or {}
|
||||
raw_lat = loc.get("lat")
|
||||
raw_lng = loc.get("lon")
|
||||
|
|
@ -184,8 +345,29 @@ def transform_property(
|
|||
return None
|
||||
raw_address = raw.get("address", "") or ""
|
||||
extracted_postcode = extract_full_postcode(raw_address)
|
||||
postcode = extracted_postcode or inferred_postcode
|
||||
postcode_source = "address" if extracted_postcode else "coordinates"
|
||||
|
||||
# Prefer the property's own detail-page postcode when we have one and it is
|
||||
# trustworthy. The detail postcode is a full unit postcode (better than the
|
||||
# coordinate-nearest guess and than the usually outcode-only card address),
|
||||
# but a stale/mislabelled value would silently override the spatially
|
||||
# correct one, so apply the same outcode-agreement trust rule the address
|
||||
# postcode uses: keep it only when its outcode matches the
|
||||
# coordinate-nearest postcode's outcode.
|
||||
detail_postcode = extract_full_postcode(detail_postcode)
|
||||
if detail_postcode and extract_outcode(detail_postcode) == extract_outcode(
|
||||
inferred_postcode
|
||||
):
|
||||
postcode, postcode_source = detail_postcode, "detail_address"
|
||||
else:
|
||||
if detail_postcode:
|
||||
log.debug(
|
||||
"OnTheMarket %s: rejecting detail postcode %s "
|
||||
"(outcode mismatch with inferred %s)",
|
||||
raw.get("id", "?"), detail_postcode, inferred_postcode,
|
||||
)
|
||||
postcode, postcode_source = resolve_listing_postcode(
|
||||
extracted_postcode, inferred_postcode
|
||||
)
|
||||
|
||||
raw_beds = raw.get("bedrooms") or 0
|
||||
raw_baths = raw.get("bathrooms") or 0
|
||||
|
|
@ -223,6 +405,10 @@ def transform_property(
|
|||
"Inferred postcode": inferred_postcode,
|
||||
"Listing raw address": raw_address,
|
||||
"Address per Property Register": clean_listing_address(raw_address),
|
||||
# OnTheMarket search JSON exposes only a street-level address; no UPRN
|
||||
# or house number/name is available without a detail-page fetch.
|
||||
"UPRN": None,
|
||||
"Property number or name": None,
|
||||
"Leasehold/Freehold": _extract_tenure(features),
|
||||
"Property type": map_property_type(sub_type),
|
||||
"Property sub-type": normalize_sub_type(sub_type),
|
||||
|
|
@ -242,10 +428,17 @@ def search_outcode(
|
|||
pc_index: PostcodeSpatialIndex,
|
||||
max_properties: int | None = None,
|
||||
) -> list[dict]:
|
||||
"""Paginate through OnTheMarket sale results for one outcode."""
|
||||
"""Paginate through OnTheMarket sale results for one outcode.
|
||||
|
||||
When ``OTM_FETCH_DETAILS`` is enabled, up to
|
||||
``OTM_MAX_DETAILS_PER_OUTCODE`` listings per outcode have their detail page
|
||||
fetched for the property's own postcode (see ``_fetch_detail_postcode``);
|
||||
the rest fall back to the coordinate-nearest postcode.
|
||||
"""
|
||||
properties: list[dict] = []
|
||||
seen_ids: set[str] = set()
|
||||
page_num = 1
|
||||
details_fetched = 0
|
||||
|
||||
while True:
|
||||
data = _fetch_page_json(client, outcode, page_num)
|
||||
|
|
@ -269,8 +462,22 @@ def search_outcode(
|
|||
if listing_id and listing_id in seen_ids:
|
||||
continue
|
||||
seen_ids.add(listing_id)
|
||||
|
||||
detail_postcode = None
|
||||
if OTM_FETCH_DETAILS and listing_id:
|
||||
# Cached lookups are free; only fresh GETs count toward the cap
|
||||
# and incur the inter-request delay.
|
||||
cached = listing_id in _detail_postcode_cache
|
||||
if cached or details_fetched < OTM_MAX_DETAILS_PER_OUTCODE:
|
||||
detail_postcode = _fetch_detail_postcode(
|
||||
client, raw.get("details-url") or "", listing_id
|
||||
)
|
||||
if not cached:
|
||||
details_fetched += 1
|
||||
time.sleep(DELAY_BETWEEN_PAGES)
|
||||
|
||||
try:
|
||||
transformed = transform_property(raw, pc_index)
|
||||
transformed = transform_property(raw, pc_index, detail_postcode)
|
||||
except Exception as exc:
|
||||
log.warning(
|
||||
"OnTheMarket %s property %s failed to transform: %s",
|
||||
|
|
|
|||
|
|
@ -1,4 +1,6 @@
|
|||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
|
||||
import httpx
|
||||
|
|
@ -6,12 +8,15 @@ import httpx
|
|||
from constants import (
|
||||
PAGE_SIZE,
|
||||
DELAY_BETWEEN_PAGES,
|
||||
RIGHTMOVE_DETAIL_URL,
|
||||
RIGHTMOVE_FETCH_DETAILS,
|
||||
RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
|
||||
SEARCH_URL,
|
||||
TYPEAHEAD_URL,
|
||||
)
|
||||
from http_client import fetch_with_retry
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import transform_property
|
||||
from transform import extract_full_postcode, normalize_postcode, transform_property
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
|
@ -23,6 +28,176 @@ outcode_cache: dict[str, str] = {}
|
|||
_MAX_INDEX = 1008
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Detail-page postcode extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# The search API (_paginate) only returns an outcode-level `displayAddress`
|
||||
# (e.g. "Akerman Road, Brixton, London, SW9") — never the full postcode. Each
|
||||
# listing's detail page, however, embeds the property's OWN full postcode in a
|
||||
# `window.__PAGE_MODEL` script as `propertyData.address.{outcode, incode}`
|
||||
# (e.g. outcode "SW9" + incode "0HD" → "SW9 0HD"), independently corroborated by
|
||||
# `propertyData.propertyUrls.similarPropertiesUrl` ("/property-for-sale/SW9-0HD.html").
|
||||
# This is the property's own postcode, NOT a nearest station/school: the
|
||||
# `nearestStations`/`nearestAirports` arrays carry only names + distances, no
|
||||
# postcodes, and the address outcode always matches the searched outcode.
|
||||
# Recon over 24 live listings across SW9/E1/M1/LS6/E20 (incl. APPROXIMATE_POINT
|
||||
# new-builds) found the full postcode present 100% of the time. There is no
|
||||
# UPRN or house-number field anywhere in propertyData, so those stay None.
|
||||
#
|
||||
# __PAGE_MODEL is a "devalue"-style flattened object graph: its `data` field is
|
||||
# a JSON STRING holding a flat array where every integer inside a container is
|
||||
# an index reference into that same array (so the graph can dedupe). We
|
||||
# brace-match the (large, deeply-nested) object literal — a non-greedy regex
|
||||
# cannot — then rehydrate the reference graph before reading the address.
|
||||
|
||||
_PAGE_MODEL_RE = re.compile(r"window\.__PAGE_MODEL\s*=\s*")
|
||||
|
||||
|
||||
def _extract_page_model_literal(html: str) -> str | None:
|
||||
"""Return the `{...}` object literal assigned to window.__PAGE_MODEL.
|
||||
|
||||
Brace-matches with string/escape awareness so embedded braces and quotes in
|
||||
string values don't end the match early. Returns None when absent."""
|
||||
marker = _PAGE_MODEL_RE.search(html)
|
||||
if not marker:
|
||||
return None
|
||||
start = marker.end()
|
||||
if start >= len(html) or html[start] != "{":
|
||||
return None
|
||||
depth = 0
|
||||
in_str = False
|
||||
esc = False
|
||||
for j in range(start, len(html)):
|
||||
ch = html[j]
|
||||
if in_str:
|
||||
if esc:
|
||||
esc = False
|
||||
elif ch == "\\":
|
||||
esc = True
|
||||
elif ch == '"':
|
||||
in_str = False
|
||||
elif ch == '"':
|
||||
in_str = True
|
||||
elif ch == "{":
|
||||
depth += 1
|
||||
elif ch == "}":
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
return html[start : j + 1]
|
||||
return None
|
||||
|
||||
|
||||
def _rehydrate(flat: list) -> object:
|
||||
"""Resolve a devalue-style flattened reference array into a nested object.
|
||||
|
||||
Index 0 is the root; every int inside a dict/list is an index back into
|
||||
``flat``. Memoised so shared/cyclic references resolve once."""
|
||||
cache: dict[int, object] = {}
|
||||
|
||||
def resolve(idx: int) -> object:
|
||||
if not isinstance(idx, int) or idx < 0 or idx >= len(flat):
|
||||
return None
|
||||
if idx in cache:
|
||||
return cache[idx]
|
||||
node = flat[idx]
|
||||
if isinstance(node, dict):
|
||||
out: dict = {}
|
||||
cache[idx] = out
|
||||
for key, value in node.items():
|
||||
out[key] = resolve(value) if isinstance(value, int) else value
|
||||
return out
|
||||
if isinstance(node, list):
|
||||
arr: list = []
|
||||
cache[idx] = arr
|
||||
for value in node:
|
||||
arr.append(resolve(value) if isinstance(value, int) else value)
|
||||
return arr
|
||||
cache[idx] = node
|
||||
return node
|
||||
|
||||
return resolve(0)
|
||||
|
||||
|
||||
def parse_detail_postcode(html: str) -> str | None:
    """Recover a Rightmove listing's own full postcode from detail-page HTML.

    Network-free: the caller supplies the HTML. The function locates the
    window.__PAGE_MODEL literal, decodes its ``data`` JSON string into the flat
    reference array, rehydrates it, and reads
    ``propertyData.address.outcode`` / ``.incode``. The pair is joined and
    passed through ``normalize_postcode`` and ``extract_full_postcode``; the
    result of that call is returned. None is returned whenever any step finds
    nothing usable (no HTML, no literal, undecodable JSON, missing or non-dict
    nodes, non-string or blank outcode/incode).
    """
    if not html:
        return None
    literal = _extract_page_model_literal(html)
    if not literal:
        return None
    try:
        flat = json.loads(json.loads(literal)["data"])
    except (ValueError, KeyError, TypeError):
        return None
    if not (isinstance(flat, list) and flat):
        return None

    # Walk root -> propertyData -> address, bailing out on any non-dict hop.
    node = _rehydrate(flat)
    for key in ("propertyData", "address"):
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    if not isinstance(node, dict):
        return None

    parts = [node.get("outcode"), node.get("incode")]
    if not all(isinstance(part, str) for part in parts):
        return None
    outcode, incode = (part.strip() for part in parts)
    if not (outcode and incode):
        return None

    # The shared normaliser/validator canonicalises spacing and decides whether
    # the joined pair is acceptable as a full postcode.
    return extract_full_postcode(normalize_postcode(f"{outcode} {incode}"))
|
||||
|
||||
|
||||
# listingId -> true full postcode (or None when unavailable). Failures are
|
||||
# cached too, so a broken/duplicate listing is fetched at most once per run (the
|
||||
# same listing can reappear across overlapping outcode searches).
|
||||
_detail_postcode_cache: dict[str, str | None] = {}
|
||||
|
||||
|
||||
def _fetch_detail_postcode(client: httpx.Client, property_id: str) -> str | None:
    """Fetch one listing's detail page and return its parsed postcode, or None.

    An empty ``property_id`` returns None without any lookup. Otherwise the
    module-level cache is consulted first; on a miss the detail URL is built
    from ``RIGHTMOVE_DETAIL_URL`` and requested once with ``client.get``. A 200
    response is handed to ``parse_detail_postcode``; any other status or an
    ``httpx.HTTPError`` is logged at debug level and leaves the result as None.
    The outcome, None included, is stored in the cache before returning."""
    if not property_id:
        return None
    try:
        return _detail_postcode_cache[property_id]
    except KeyError:
        pass  # not fetched yet in this run

    url = RIGHTMOVE_DETAIL_URL.format(id=property_id)
    found: str | None = None
    try:
        response = client.get(url, headers={"Accept": "text/html"})
        if response.status_code != 200:
            log.debug("Rightmove detail %s returned HTTP %d", url, response.status_code)
        else:
            found = parse_detail_postcode(response.text)
    except httpx.HTTPError as exc:
        log.debug("Rightmove detail fetch failed %s: %s", url, exc)

    _detail_postcode_cache[property_id] = found
    return found
|
||||
|
||||
|
||||
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
|
||||
"""Look up Rightmove's internal ID for an outcode via typeahead API."""
|
||||
if outcode in outcode_cache:
|
||||
|
|
@ -44,6 +219,31 @@ def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
|
|||
return None
|
||||
|
||||
|
||||
def _detail_postcode_for(
    client: httpx.Client,
    prop: dict,
    fetch_details: bool,
    detail_budget: dict,
) -> str | None:
    """Return the detail-page postcode for ``prop``, subject to the fetch budget.

    Returns None when ``fetch_details`` is false or ``prop`` has no ``id``. A
    listing already present in the module cache is answered from it without
    touching the budget. Otherwise a fetch happens only while
    ``detail_budget['remaining']`` is positive; the counter is decremented
    before the fetch and ``DELAY_BETWEEN_PAGES`` is slept after it."""
    listing_id = str(prop.get("id") or "") if fetch_details else ""
    if not listing_id:
        return None
    if listing_id in _detail_postcode_cache:
        return _detail_postcode_cache[listing_id]

    budget_left = detail_budget["remaining"]
    if budget_left <= 0:
        return None
    detail_budget["remaining"] = budget_left - 1

    result = _fetch_detail_postcode(client, listing_id)
    time.sleep(DELAY_BETWEEN_PAGES)
    return result
|
||||
|
||||
|
||||
def _paginate(
|
||||
client: httpx.Client,
|
||||
outcode_id: str,
|
||||
|
|
@ -51,11 +251,19 @@ def _paginate(
|
|||
channel_cfg: dict,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
max_properties: int | None = None,
|
||||
fetch_details: bool = False,
|
||||
detail_cap: int = 0,
|
||||
) -> tuple[list[dict], int]:
|
||||
"""Paginate through search results. Returns (properties, result_count)."""
|
||||
"""Paginate through search results. Returns (properties, result_count).
|
||||
|
||||
When ``fetch_details`` is set, up to ``detail_cap`` listings per outcode have
|
||||
their detail page fetched for the property's TRUE full postcode (see
|
||||
``parse_detail_postcode``); the rest fall back to coordinate-derived
|
||||
postcodes."""
|
||||
properties = []
|
||||
index = 0
|
||||
result_count = 0
|
||||
detail_budget = {"remaining": detail_cap}
|
||||
|
||||
while True:
|
||||
params = {
|
||||
|
|
@ -82,7 +290,12 @@ def _paginate(
|
|||
|
||||
for prop in raw_props:
|
||||
try:
|
||||
transformed = transform_property(prop, outcode, pc_index)
|
||||
detail_postcode = _detail_postcode_for(
|
||||
client, prop, fetch_details, detail_budget
|
||||
)
|
||||
transformed = transform_property(
|
||||
prop, outcode, pc_index, detail_postcode=detail_postcode
|
||||
)
|
||||
except Exception as exc:
|
||||
log.warning(
|
||||
"Rightmove %s/%s property %s failed to transform: %s",
|
||||
|
|
@ -127,7 +340,12 @@ def search_outcode(
|
|||
pc_index: PostcodeSpatialIndex,
|
||||
max_properties: int | None = None,
|
||||
) -> list[dict]:
|
||||
"""Paginate through unfiltered sale results for one outcode+channel."""
|
||||
"""Paginate through unfiltered sale results for one outcode+channel.
|
||||
|
||||
Each listing's detail page is fetched for the property's TRUE full postcode
|
||||
(gated by ``RIGHTMOVE_FETCH_DETAILS`` and capped per outcode by
|
||||
``RIGHTMOVE_MAX_DETAILS_PER_OUTCODE``); listings beyond the cap keep the
|
||||
coordinate-derived postcode."""
|
||||
properties, _ = _paginate(
|
||||
client,
|
||||
outcode_id,
|
||||
|
|
@ -135,6 +353,8 @@ def search_outcode(
|
|||
channel_cfg,
|
||||
pc_index,
|
||||
max_properties=max_properties,
|
||||
fetch_details=RIGHTMOVE_FETCH_DETAILS,
|
||||
detail_cap=RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
|
||||
)
|
||||
|
||||
if max_properties is not None and len(properties) >= max_properties:
|
||||
|
|
|
|||
|
|
@ -15,6 +15,10 @@ from constants import (
|
|||
DATA_DIR,
|
||||
DELAY_BETWEEN_OUTCODES,
|
||||
LONDON_OUTCODE_PREFIXES,
|
||||
ZOOPLA_DETAIL_BUDGET_FRACTION,
|
||||
ZOOPLA_FETCH_DETAILS,
|
||||
ZOOPLA_FETCHER,
|
||||
ZOOPLA_MAX_DETAILS_PER_OUTCODE,
|
||||
)
|
||||
|
||||
from http_client import make_client
|
||||
|
|
@ -371,6 +375,36 @@ def _zoopla_outcode_timeout_seconds() -> int:
|
|||
return timeout
|
||||
|
||||
|
||||
def _zoopla_detail_cap() -> int:
    """Return how many Zoopla detail pages may be fetched per outcode.

    Yields ``ZOOPLA_MAX_DETAILS_PER_OUTCODE`` when ``ZOOPLA_FETCH_DETAILS`` is
    truthy and 0 otherwise; callers treat a non-positive cap as "detail
    fetching disabled". Both settings are imported from constants.py."""
    if not ZOOPLA_FETCH_DETAILS:
        return 0
    return ZOOPLA_MAX_DETAILS_PER_OUTCODE
|
||||
|
||||
|
||||
def _open_zoopla_detail_tab(page, detail_cap: int):
|
||||
"""Open a second tab on the same context for detail-page fetches.
|
||||
|
||||
Sharing the persistent context means the detail tab inherits the search
|
||||
tab's Cloudflare clearance cookies. Returns None when detail fetching is
|
||||
disabled or the tab cannot be created (the scrape then degrades to
|
||||
outcode-level postcodes rather than failing)."""
|
||||
if detail_cap <= 0:
|
||||
return None
|
||||
try:
|
||||
return page.context.new_page()
|
||||
except Exception as exc:
|
||||
log.warning(
|
||||
"Zoopla detail tab unavailable (%s); using outcode-level postcodes",
|
||||
_exception_detail(exc),
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
@contextmanager
|
||||
def _wall_clock_timeout(seconds: int, label: str):
|
||||
"""SIGALRM-based wall-clock guard (POSIX). Raises OutcodeTimeout on expiry.
|
||||
|
|
@ -438,6 +472,50 @@ def _close_zoopla_browser(browser, label: str) -> None:
|
|||
log.warning("%s browser force-close failed: %s", label, _exception_detail(exc))
|
||||
|
||||
|
||||
def _scrape_zoopla_flaresolverr(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    results: dict[str, list[dict]],
    errors: list[str],
    max_properties_per_source: int | None,
) -> None:
    """Scrape Zoopla via the FlareSolverr sidecar (no browser/VNC).

    Opens one FlareSolverr session and searches each outcode in turn, handing
    the listings to ``_store_properties`` under the "zoopla" source. If the
    session cannot be started, the failure is appended to ``errors`` and the
    function returns without scraping. A failure on a single outcode is
    recorded via ``_record_error`` and the loop continues. The loop stops early
    once ``_source_remaining`` reports 0. The session is always exited.

    The per-outcode detail cap comes from ``_zoopla_detail_cap()`` so that
    ``ZOOPLA_FETCH_DETAILS`` switches detail fetching off here exactly as it
    does for the browser-based path.
    """
    from flaresolverr import FlareSolverrError, FlareSolverrSession
    from zoopla_flaresolverr import search_outcode as fs_search_outcode

    # 0 when ZOOPLA_FETCH_DETAILS is off; assumes fs_search_outcode treats a
    # zero cap as "no detail fetches", like the browser path — TODO confirm.
    detail_cap = _zoopla_detail_cap()

    try:
        session = FlareSolverrSession(session="zoopla")
        session.__enter__()
    except FlareSolverrError as exc:
        errors.append(f"zoopla: FlareSolverr unavailable: {exc}")
        log.warning("Zoopla skipped: FlareSolverr unavailable: %s", exc)
        return

    try:
        for outcode in outcodes:
            remaining = _source_remaining(results, "zoopla", max_properties_per_source)
            if remaining == 0:
                log.info("Zoopla cap reached")
                return
            try:
                props, _ = fs_search_outcode(
                    outcode,
                    pc_index,
                    pc_coords,
                    session,
                    max_properties=remaining,
                    detail_cap=detail_cap,
                )
                added = _store_properties(results, "zoopla", props, max_properties_per_source)
                log.info("Zoopla %s: +%d", outcode, added)
            except Exception as exc:  # noqa: BLE001 - one outcode must not kill the run
                _record_error(errors, "zoopla", outcode, exc)
            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
        session.__exit__(None, None, None)
|
||||
|
||||
|
||||
def _scrape_zoopla(
|
||||
outcodes: list[str],
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
|
|
@ -446,6 +524,12 @@ def _scrape_zoopla(
|
|||
errors: list[str],
|
||||
max_properties_per_source: int | None,
|
||||
) -> None:
|
||||
if ZOOPLA_FETCHER == "flaresolverr":
|
||||
_scrape_zoopla_flaresolverr(
|
||||
outcodes, pc_index, pc_coords, results, errors, max_properties_per_source
|
||||
)
|
||||
return
|
||||
|
||||
try:
|
||||
browser, page = _launch_zoopla_with_retries()
|
||||
except Exception as exc:
|
||||
|
|
@ -454,6 +538,12 @@ def _scrape_zoopla(
|
|||
return
|
||||
|
||||
outcode_timeout = _zoopla_outcode_timeout_seconds()
|
||||
detail_cap = _zoopla_detail_cap()
|
||||
detail_page = _open_zoopla_detail_tab(page, detail_cap)
|
||||
# Spend at most a fraction of each outcode's budget on detail fetches so the
|
||||
# SIGALRM guard never trips mid-outcode and discards already-collected
|
||||
# search listings; the rest is left for search pagination and transform.
|
||||
detail_budget_seconds = max(10.0, outcode_timeout * ZOOPLA_DETAIL_BUDGET_FRACTION)
|
||||
|
||||
try:
|
||||
for outcode in outcodes:
|
||||
|
|
@ -470,6 +560,9 @@ def _scrape_zoopla(
|
|||
pc_index,
|
||||
pc_coords,
|
||||
max_properties=None,
|
||||
detail_page=detail_page,
|
||||
detail_cap=detail_cap,
|
||||
detail_budget_seconds=detail_budget_seconds,
|
||||
)
|
||||
added = _store_properties(
|
||||
results,
|
||||
|
|
@ -496,6 +589,8 @@ def _scrape_zoopla(
|
|||
_close_zoopla_browser(browser, f"zoopla {outcode}")
|
||||
try:
|
||||
browser, page = _launch_zoopla_with_retries()
|
||||
# The old context (and its detail tab) is gone; reopen one.
|
||||
detail_page = _open_zoopla_detail_tab(page, detail_cap)
|
||||
log.info("Zoopla %s retrying with fresh browser", outcode)
|
||||
except Exception as relaunch_exc:
|
||||
_record_error(errors, "zoopla", outcode, relaunch_exc)
|
||||
|
|
@ -503,6 +598,11 @@ def _scrape_zoopla(
|
|||
|
||||
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||
finally:
|
||||
if detail_page is not None:
|
||||
try:
|
||||
detail_page.close()
|
||||
except Exception:
|
||||
pass
|
||||
_close_zoopla_browser(browser, "zoopla final")
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -126,6 +126,14 @@ def write_parquet(properties: list[dict], path: Path) -> None:
|
|||
"Address per Property Register": [
|
||||
p["Address per Property Register"] for p in properties
|
||||
],
|
||||
# UPRN (when the scraper recovered it) keys an exact listing->EPC
|
||||
# join; Property number or name is the house identifier for the
|
||||
# Price-Paid address join. Both are None for sources/listings without
|
||||
# a detail-page fetch.
|
||||
"UPRN": [p.get("UPRN") for p in properties],
|
||||
"Property number or name": [
|
||||
p.get("Property number or name") for p in properties
|
||||
],
|
||||
"Leasehold/Freehold": [p["Leasehold/Freehold"] for p in properties],
|
||||
"Property type": [p["Property type"] for p in properties],
|
||||
"Property sub-type": [p["Property sub-type"] for p in properties],
|
||||
|
|
@ -149,6 +157,8 @@ def write_parquet(properties: list[dict], path: Path) -> None:
|
|||
"Inferred postcode": pl.Utf8,
|
||||
"Listing raw address": pl.Utf8,
|
||||
"Address per Property Register": pl.Utf8,
|
||||
"UPRN": pl.Utf8,
|
||||
"Property number or name": pl.Utf8,
|
||||
"Leasehold/Freehold": pl.Utf8,
|
||||
"Property type": pl.Utf8,
|
||||
"Property sub-type": pl.Utf8,
|
||||
|
|
|
|||
206
finder/test_onthemarket.py
Normal file
206
finder/test_onthemarket.py
Normal file
|
|
@ -0,0 +1,206 @@
|
|||
"""Tests for the OnTheMarket scraper's detail-page postcode recovery.
|
||||
|
||||
`parse_detail_postcode` is pure (takes the detail-page HTML, returns a postcode
|
||||
or None), so these tests use a trimmed but faithful copy of a real OnTheMarket
|
||||
detail page's `__NEXT_DATA__` payload. The fixture mirrors the live structure:
|
||||
the property's own postcode lives in the analytics dataLayer
|
||||
(`props.initialReduxState.metadata.dataLayer.postcode`) while the agent's office
|
||||
postcode sits separately under `…property.agent.postcode` — the trap we must not
|
||||
fall into.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
import onthemarket
|
||||
from onthemarket import parse_detail_postcode, transform_property
|
||||
|
||||
|
||||
class _StubIndex:
|
||||
"""Minimal stand-in for PostcodeSpatialIndex returning a fixed postcode."""
|
||||
|
||||
def __init__(self, postcode: str | None):
|
||||
self._postcode = postcode
|
||||
|
||||
def nearest(self, lat: float, lng: float) -> str | None:
|
||||
return self._postcode
|
||||
|
||||
|
||||
def _detail_html(
|
||||
*,
|
||||
property_id: int = 19522441,
|
||||
datalayer_postcode: str = "SE5 9AA",
|
||||
agent_postcode: str = "SE5 8RS",
|
||||
) -> str:
|
||||
"""Build detail-page HTML with a real-shaped __NEXT_DATA__ payload."""
|
||||
next_data = {
|
||||
"props": {
|
||||
"initialReduxState": {
|
||||
"metadata": {
|
||||
"dataLayer": {
|
||||
"page-type": "details-section",
|
||||
"property-type": "homes",
|
||||
# The property's own unit postcode.
|
||||
"postcode": datalayer_postcode,
|
||||
"property-id": property_id,
|
||||
"price": "275,000",
|
||||
"addressline_2": "Padfield Road",
|
||||
}
|
||||
},
|
||||
"property": {
|
||||
"displayAddress": "Padfield Road, London, SE5",
|
||||
"location": {"lon": -0.100233, "lat": 51.466129},
|
||||
# The agent block carries the AGENT'S office postcode — the
|
||||
# trap. parse_detail_postcode must not return this.
|
||||
"agent": {
|
||||
"address": "29 Denmark Hill, Camberwell\nLondon\nSE5 8RS",
|
||||
"postcode": agent_postcode,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
payload = json.dumps(next_data)
|
||||
return (
|
||||
"<html><body>"
|
||||
'<script id="__NEXT_DATA__" type="application/json">'
|
||||
f"{payload}"
|
||||
"</script></body></html>"
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# parse_detail_postcode
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_parse_returns_property_postcode_not_agent():
    """The dataLayer postcode is returned, never the agent's office postcode."""
    page = _detail_html(datalayer_postcode="SE5 9AA", agent_postcode="SE5 8RS")
    postcode = parse_detail_postcode(page, "19522441")
    assert postcode == "SE5 9AA"
|
||||
|
||||
|
||||
def test_parse_normalizes_spacing():
    """An unspaced lower-case postcode comes back upper-cased and spaced."""
    page = _detail_html(datalayer_postcode="se59aa")
    postcode = parse_detail_postcode(page, "19522441")
    assert postcode == "SE5 9AA"
|
||||
|
||||
|
||||
def test_parse_ignores_mismatched_property_id():
    """A page describing listing 19522441 yields nothing for another listing id."""
    page = _detail_html(property_id=19522441)
    # The requested id differs from the page's own property-id.
    postcode = parse_detail_postcode(page, "99999999")
    assert postcode is None
|
||||
|
||||
|
||||
def test_parse_accepts_when_no_listing_id_given():
    """Passing None as the listing id still returns the page's postcode."""
    page = _detail_html(datalayer_postcode="SE5 9AA")
    postcode = parse_detail_postcode(page, None)
    assert postcode == "SE5 9AA"
|
||||
|
||||
|
||||
def test_parse_handles_missing_postcode():
    """An empty dataLayer postcode produces None."""
    page = _detail_html(datalayer_postcode="")
    postcode = parse_detail_postcode(page, "19522441")
    assert postcode is None
|
||||
|
||||
|
||||
def test_parse_handles_no_next_data():
    """HTML without a __NEXT_DATA__ script produces None."""
    page = "<html><body>no script here</body></html>"
    assert parse_detail_postcode(page, "1") is None
|
||||
|
||||
|
||||
def test_parse_handles_empty_html():
    """An empty document produces None."""
    empty_page = ""
    assert parse_detail_postcode(empty_page, "1") is None
|
||||
|
||||
|
||||
def test_parse_handles_malformed_json():
    """A __NEXT_DATA__ script whose body is not JSON produces None."""
    page = '<script id="__NEXT_DATA__" type="application/json">{not json}</script>'
    postcode = parse_detail_postcode(page, "1")
    assert postcode is None
|
||||
|
||||
|
||||
def test_parse_handles_missing_datalayer():
    """Valid JSON whose metadata has no dataLayer produces None."""
    payload = json.dumps({"props": {"initialReduxState": {"metadata": {}}}})
    page = "".join(
        [
            '<script id="__NEXT_DATA__" type="application/json">',
            payload,
            "</script>",
        ]
    )
    assert parse_detail_postcode(page, "1") is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# transform_property — detail postcode wiring + trust rule
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
_RAW_LISTING = {
|
||||
"id": "19522441",
|
||||
"address": "Padfield Road, London, SE5",
|
||||
"location": {"lon": -0.100233, "lat": 51.466129},
|
||||
"bedrooms": 2,
|
||||
"bathrooms": 1,
|
||||
"price": "£275,000",
|
||||
"humanised-property-type": "Apartment",
|
||||
"features": ["Tenure: Leasehold (99 years remaining)"],
|
||||
"details-url": "/details/19522441/",
|
||||
}
|
||||
|
||||
|
||||
def test_transform_uses_trusted_detail_postcode():
    """A detail postcode sharing the coordinate outcode wins and is labelled so."""
    # Detail SE5 9AA vs coordinate-nearest SE5 1AA: both are in outcode SE5.
    listing = transform_property(
        _RAW_LISTING, _StubIndex("SE5 1AA"), detail_postcode="SE5 9AA"
    )
    assert listing is not None
    observed = (listing["Postcode"], listing["Postcode source"])
    assert observed == ("SE5 9AA", "detail_address")
|
||||
|
||||
|
||||
def test_transform_rejects_detail_postcode_on_outcode_mismatch():
    """A detail postcode in a different outcode is discarded for coordinates."""
    # Detail SW9 6BZ vs coordinate-nearest SE5 1AA: outcodes disagree.
    listing = transform_property(
        _RAW_LISTING, _StubIndex("SE5 1AA"), detail_postcode="SW9 6BZ"
    )
    assert listing is not None
    observed = (listing["Postcode"], listing["Postcode source"])
    assert observed == ("SE5 1AA", "coordinates")
|
||||
|
||||
|
||||
def test_transform_without_detail_postcode_uses_coordinates():
    """Without a detail postcode the coordinate-nearest postcode is used."""
    listing = transform_property(
        _RAW_LISTING, _StubIndex("SE5 1AA"), detail_postcode=None
    )
    assert listing is not None
    observed = (listing["Postcode"], listing["Postcode source"])
    assert observed == ("SE5 1AA", "coordinates")
    # Neither identifier is populated for an OnTheMarket listing.
    for column in ("UPRN", "Property number or name"):
        assert listing[column] is None
|
||||
|
||||
|
||||
def test_transform_detail_postcode_via_search_address_outcode():
    """A full postcode in the card address still yields the "address" source."""
    # The card address carries SE5 1AA, which matches the coordinate-nearest
    # postcode, and no detail postcode is supplied.
    raw = {**_RAW_LISTING, "address": "Padfield Road, London, SE5 1AA"}
    listing = transform_property(raw, _StubIndex("SE5 1AA"), detail_postcode=None)
    assert listing["Postcode"] == "SE5 1AA"
    assert listing["Postcode source"] == "address"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _fetch_detail_postcode caching (no real network)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_fetch_detail_postcode_is_cached(monkeypatch):
    """A cached listing id is answered without touching the HTTP client.

    The cache entry is installed with ``monkeypatch.setitem`` so it is undone
    on teardown even when the assertion fails, and entries belonging to other
    tests are left alone."""
    monkeypatch.setitem(onthemarket._detail_postcode_cache, "19522441", "SE5 9AA")

    def _boom(*args, **kwargs):  # pragma: no cover - must never be called
        raise AssertionError("network was hit despite a cached value")

    # Any client.get() call would explode; the cache hit must short-circuit first.
    exploding_client = type("C", (), {"get": _boom})()
    result = onthemarket._fetch_detail_postcode(
        client=exploding_client,
        details_url="/details/19522441/",
        listing_id="19522441",
    )
    assert result == "SE5 9AA"
|
||||
113
finder/test_rightmove.py
Normal file
113
finder/test_rightmove.py
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
"""Tests for the Rightmove detail-page postcode extractor.
|
||||
|
||||
The search API only returns an outcode-level ``displayAddress``; the property's
|
||||
TRUE full postcode lives on its detail page inside ``window.__PAGE_MODEL`` as
|
||||
``propertyData.address.{outcode, incode}``. ``parse_detail_postcode`` recovers
|
||||
it. These tests build a faithful __PAGE_MODEL: a devalue-style flattened object
|
||||
graph whose ``data`` field is a JSON STRING of a flat array where every integer
|
||||
inside a container is an index reference into that same array.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
from rightmove import _extract_page_model_literal, parse_detail_postcode
|
||||
|
||||
|
||||
def _page_model_html(flat: list, *, encoding: str = "json") -> str:
|
||||
"""Wrap a flattened object-graph array in a realistic detail-page <script>.
|
||||
|
||||
Mirrors the live page: ``window.__PAGE_MODEL = {"data": "<json array>"}``
|
||||
where the array is itself JSON-encoded (so its quotes arrive escaped)."""
|
||||
outer = {"data": json.dumps(flat, separators=(",", ":")), "encoding": encoding}
|
||||
return (
|
||||
"<html><head></head><body>\n"
|
||||
"<script>\n"
|
||||
" window.__PAGE_MODEL = " + json.dumps(outer, separators=(",", ":")) + ";\n"
|
||||
"</script>\n"
|
||||
"</body></html>"
|
||||
)
|
||||
|
||||
|
||||
# A faithful slice of a real listing: root -> propertyData -> address, with a
|
||||
# decoy nearestStations array (which carries NO postcodes on the live page) to
|
||||
# prove the parser anchors on the property's own address, not a nearby POI.
|
||||
_FLAT_SW9 = [
|
||||
{"propertyData": 1}, # 0: root
|
||||
{
|
||||
"id": "89089584",
|
||||
"address": 2,
|
||||
"location": 4,
|
||||
"nearestStations": 6,
|
||||
}, # 1: propertyData
|
||||
{
|
||||
"displayAddress": "Caldwell Street, Stockwell",
|
||||
"countryCode": "GB",
|
||||
"ukCountry": "England",
|
||||
"outcode": "SW9",
|
||||
"incode": "0HD",
|
||||
}, # 2: address
|
||||
None, # 3: filler
|
||||
{
|
||||
"latitude": 51.477238,
|
||||
"longitude": -0.116819,
|
||||
"pinType": "ACCURATE_POINT",
|
||||
}, # 4: location
|
||||
None, # 5: filler
|
||||
[7, 8], # 6: nearestStations (references)
|
||||
{"name": "Oval Station", "distance": 0.36}, # 7: station, no postcode
|
||||
{"name": "Stockwell Station", "distance": 0.41}, # 8: station, no postcode
|
||||
]
|
||||
|
||||
|
||||
def test_parses_full_postcode_from_outcode_and_incode() -> None:
    """Outcode "SW9" and incode "0HD" are joined into "SW9 0HD"."""
    page = _page_model_html(_FLAT_SW9)
    postcode = parse_detail_postcode(page)
    assert postcode == "SW9 0HD"
|
||||
|
||||
|
||||
def test_extract_page_model_literal_brace_matches_nested_object() -> None:
    """The extracted literal spans the whole object, not just up to the first
    closing brace found inside the escaped data string."""
    literal = _extract_page_model_literal(_page_model_html(_FLAT_SW9))
    assert literal is not None
    assert literal[:1] == "{" and literal[-1:] == "}"
    # Decodes back to a dict with exactly the expected top-level keys.
    decoded = json.loads(literal)
    assert set(decoded) == {"data", "encoding"}
|
||||
|
||||
|
||||
def test_normalises_unspaced_incode() -> None:
    """Lower-case outcode/incode parts come back as an upper-case postcode."""
    graph = [dict(entry) if isinstance(entry, dict) else entry for entry in _FLAT_SW9]
    graph[2] = {**_FLAT_SW9[2], "outcode": "e20", "incode": "1fh"}
    page = _page_model_html(graph)
    assert parse_detail_postcode(page) == "E20 1FH"
|
||||
|
||||
|
||||
def test_returns_none_when_address_missing() -> None:
    """propertyData without an address node yields None rather than raising."""
    root = {"propertyData": 1}
    property_data = {"id": "1", "location": 2}
    location = {"latitude": 51.5, "longitude": -0.1}
    page = _page_model_html([root, property_data, location])
    assert parse_detail_postcode(page) is None
|
||||
|
||||
|
||||
def test_returns_none_when_incode_blank() -> None:
    """An empty incode yields None."""
    graph = [dict(entry) if isinstance(entry, dict) else entry for entry in _FLAT_SW9]
    graph[2] = {**_FLAT_SW9[2], "incode": ""}
    page = _page_model_html(graph)
    assert parse_detail_postcode(page) is None
|
||||
|
||||
|
||||
def test_returns_none_for_non_postcode_pair() -> None:
    """An outcode/incode pair that is not postcode-shaped yields None."""
    graph = [dict(entry) if isinstance(entry, dict) else entry for entry in _FLAT_SW9]
    graph[2] = {**_FLAT_SW9[2], "outcode": "NOTAPC", "incode": "ZZ"}
    page = _page_model_html(graph)
    assert parse_detail_postcode(page) is None
|
||||
|
||||
|
||||
def test_returns_none_without_page_model() -> None:
    """Empty HTML, HTML without the model, and a model whose data string is
    not valid JSON all yield None."""
    unusable_pages = (
        "",
        "<html><body>no model</body></html>",
        # The data field holds text that does not decode as JSON.
        '<script>window.__PAGE_MODEL = {"data":"[not json"};</script>',
    )
    for page in unusable_pages:
        assert parse_detail_postcode(page) is None
|
||||
|
|
@ -1,13 +1,19 @@
|
|||
from transform import (
|
||||
build_register_address,
|
||||
clean_listing_address,
|
||||
extract_full_postcode,
|
||||
extract_outcode,
|
||||
resolve_listing_postcode,
|
||||
transform_property,
|
||||
)
|
||||
|
||||
|
||||
class StubPostcodeIndex:
    """Test double for the postcode spatial index with a configurable answer."""

    def __init__(self, postcode: str = "SW1A 9ZZ") -> None:
        # The postcode every nearest() call reports; defaults to "SW1A 9ZZ".
        self._postcode = postcode

    def nearest(self, lat: float, lng: float) -> str:
        """Ignore the coordinates and return the configured postcode."""
        return self._postcode
|
||||
|
||||
|
||||
def test_extract_full_postcode_normalizes_spacing() -> None:
|
||||
|
|
@ -24,6 +30,46 @@ def test_clean_listing_address_removes_postcode_and_outcode_suffixes() -> None:
|
|||
assert clean_listing_address("Kings Avenue, Bromley") == "Kings Avenue, Bromley"
|
||||
|
||||
|
||||
def test_build_register_address_prepends_house_number_or_name() -> None:
    """Table-driven check of house number/name prepending."""
    cases = [
        # House number/name prepended, with the trailing outcode/postcode stripped.
        ("South Street, Bromley BR1", "12", "12, South Street, Bromley"),
        ("Riverside, Martham NR29", "Martham Mill", "Martham Mill, Riverside, Martham"),
        # No number/name -> identical to the plain cleaned address.
        ("Kings Avenue, Bromley", None, "Kings Avenue, Bromley"),
        # Already starts with the number/name -> no duplication.
        ("12 South Street, Bromley", "12", "12 South Street, Bromley"),
        # Empty/whitespace number/name is ignored.
        ("Kings Avenue, Bromley", "  ", "Kings Avenue, Bromley"),
    ]
    for address, number_or_name, expected in cases:
        assert build_register_address(address, number_or_name) == expected
|
||||
|
||||
|
||||
def test_extract_outcode() -> None:
    """Outcodes are extracted regardless of case/spacing; blanks give ``None``."""
    expectations = (
        ("SW1A 2AA", "SW1A"),
        ("n4 2ha", "N4"),
        ("SW1A2AA", "SW1A"),
    )
    for postcode, outcode in expectations:
        assert extract_outcode(postcode) == outcode
    for blank in (None, ""):
        assert extract_outcode(blank) is None
|
||||
|
||||
|
||||
def test_resolve_listing_postcode() -> None:
    """The extracted postcode wins only when its outcode matches the inferred one."""
    inferred = "SW1A 9ZZ"
    # Outcode matches -> trust the more precise extracted postcode.
    assert resolve_listing_postcode("SW1A 2AA", inferred) == ("SW1A 2AA", "address")

    fallback = (inferred, "coordinates")
    rejected = (
        "E14 9SS",  # outcode mismatch
        "ZZ9 9ZZ",  # well-formed but fabricated, different outcode
        None,  # nothing extracted -> inferred is authoritative
    )
    for extracted in rejected:
        assert resolve_listing_postcode(extracted, inferred) == fallback
|
||||
|
||||
|
||||
def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
|
||||
prop = {
|
||||
"id": "123",
|
||||
|
|
@ -46,3 +92,84 @@ def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
|
|||
assert result["Inferred postcode"] == "SW1A 9ZZ"
|
||||
assert result["Listing raw address"] == "Flat 2, 10 Downing Street, SW1A 2AA"
|
||||
assert result["Address per Property Register"] == "Flat 2, 10 Downing Street"
|
||||
|
||||
|
||||
def test_rightmove_transform_rejects_postcode_from_wrong_outcode() -> None:
    """An address postcode in a different outcode loses to the inferred one."""
    listing = {
        "id": "124",
        "location": {"latitude": 51.5, "longitude": -0.1},
        "price": {"amount": 750000, "displayPrices": []},
        "propertySubType": "Terraced",
        "bedrooms": 3,
        "bathrooms": 1,
        "keyFeatures": [],
        "propertyUrl": "/properties/124",
        # Address postcode is in a different outcode than the coordinate-nearest one.
        "displayAddress": "10 Downing Street, E14 9SS",
    }
    index = StubPostcodeIndex()

    result = transform_property(listing, "SW1A", index)

    assert result is not None
    # The inferred postcode wins; the mismatching extracted one is still reported.
    expected = {
        "Postcode": "SW1A 9ZZ",
        "Postcode source": "coordinates",
        "Extracted postcode": "E14 9SS",
    }
    for column, value in expected.items():
        assert result[column] == value
|
||||
|
||||
|
||||
def _rightmove_prop() -> dict:
|
||||
return {
|
||||
"id": "200",
|
||||
"location": {"latitude": 51.5, "longitude": -0.1},
|
||||
"price": {"amount": 750000, "displayPrices": []},
|
||||
"propertySubType": "Terraced",
|
||||
"bedrooms": 3,
|
||||
"bathrooms": 1,
|
||||
"keyFeatures": [],
|
||||
"propertyUrl": "/properties/200",
|
||||
# Search API only ever exposes the outcode in the display address.
|
||||
"displayAddress": "Caldwell Street, Stockwell, SW9",
|
||||
}
|
||||
|
||||
|
||||
def test_rightmove_transform_prefers_detail_postcode() -> None:
    """A detail postcode in the inferred outcode beats the coordinate guess."""
    index = StubPostcodeIndex("SW9 7AA")

    result = transform_property(
        _rightmove_prop(), "SW9", index, detail_postcode="SW9 0HD"
    )

    assert result is not None
    chosen = (result["Postcode"], result["Postcode source"])
    assert chosen == ("SW9 0HD", "detail_address")
    # The coordinate inference is still surfaced separately.
    assert result["Inferred postcode"] == "SW9 7AA"
|
||||
|
||||
|
||||
def test_rightmove_transform_rejects_detail_postcode_from_wrong_outcode() -> None:
    """A detail postcode in another outcode must not relocate the listing."""
    index = StubPostcodeIndex("SW9 7AA")

    result = transform_property(
        _rightmove_prop(), "SW9", index, detail_postcode="E14 9SS"
    )

    assert result is not None
    # The coordinate postcode wins over the disagreeing detail value.
    chosen = (result["Postcode"], result["Postcode source"])
    assert chosen == ("SW9 7AA", "coordinates")
|
||||
|
||||
|
||||
def test_rightmove_transform_without_detail_keeps_coordinate_logic() -> None:
    """Without a detail postcode the coordinate-nearest postcode is used."""
    index = StubPostcodeIndex("SW9 7AA")

    result = transform_property(_rightmove_prop(), "SW9", index)

    assert result is not None
    chosen = (result["Postcode"], result["Postcode source"])
    assert chosen == ("SW9 7AA", "coordinates")
|
||||
|
|
|
|||
288
finder/test_zoopla.py
Normal file
288
finder/test_zoopla.py
Normal file
|
|
@ -0,0 +1,288 @@
|
|||
from zoopla import _detail_cache_key, parse_detail_geo, transform_property
|
||||
|
||||
|
||||
def test_detail_cache_key_uses_listing_id() -> None:
    """The cache key is the listing id when present, else the URL itself."""
    cases = (
        ("/for-sale/details/59888978/", "59888978"),
        ("https://www.zoopla.co.uk/for-sale/details/59888978/", "59888978"),
        # No id in the URL -> fall back to the URL itself as the key.
        ("/for-sale/property/br1/", "/for-sale/property/br1/"),
    )
    for url, key in cases:
        assert _detail_cache_key(url) == key
|
||||
|
||||
|
||||
class StubPostcodeIndex:
    """Stand-in spatial index: every nearest lookup yields the same postcode."""

    def __init__(self, postcode: str = "BR1 2AB") -> None:
        self._fixed_postcode = postcode

    def nearest(self, lat: float, lng: float) -> str:
        """Return the configured postcode, ignoring the coordinates."""
        return self._fixed_postcode
|
||||
|
||||
|
||||
# London-ish postcodes with coordinates, plus the Norfolk sample used by the
|
||||
# verified detail-page snippet (well inside the England bounds check).
|
||||
PC_COORDS = {
|
||||
"BR1 2AB": (51.40, 0.01),
|
||||
"SW1A 1AA": (51.50, -0.14),
|
||||
"NR29 4RG": (52.716014, 1.614495),
|
||||
}
|
||||
|
||||
# Verified RSC `location` object (listing 59888978), as it appears escaped inside
|
||||
# a self.__next_f flight chunk in page.content().
|
||||
_LOCATION_ESCAPED = (
|
||||
'<script>self.__next_f.push([1,"...'
|
||||
'\\"location\\":{\\"outcode\\":\\"NR29\\",'
|
||||
'\\"coordinates\\":{\\"latitude\\":52.716014,\\"longitude\\":1.614495},'
|
||||
'\\"uprn\\":\\"10023461458\\",\\"postalCode\\":\\"NR29 4RG\\",'
|
||||
'\\"propertyNumberOrName\\":\\"Martham Mill\\"}'
|
||||
'..."])</script>'
|
||||
)
|
||||
|
||||
|
||||
def test_parse_detail_geo_location_object_escaped() -> None:
    """The escaped flight-chunk snippet parses into the full geo dict."""
    expected = {
        "lat": 52.716014,
        "lng": 1.614495,
        "postcode": "NR29 4RG",
        "outcode": "NR29",
        "source": "detail_location",
        "uprn": "10023461458",
        "number_or_name": "Martham Mill",
        # No `address` twin in this snippet, so there is no full street address.
        "full_address": None,
    }
    assert parse_detail_geo(_LOCATION_ESCAPED, search_outcode="NR29") == expected
|
||||
|
||||
|
||||
def test_parse_detail_geo_location_object_unescaped() -> None:
    """A plain (unescaped) `location` object is parsed as well."""
    location_obj = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    geo = parse_detail_geo(location_obj)
    assert geo is not None
    assert (geo["source"], geo["postcode"]) == ("detail_location", "NR29 4RG")
|
||||
|
||||
|
||||
def test_parse_detail_geo_address_twin() -> None:
    """With only the `address` twin present, its fields populate the result."""
    address_twin = (
        '"address":{"fullAddress":"Riverside, Martham NR29",'
        '"latitude":52.716014,"longitude":1.614495,'
        '"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}'
    )
    geo = parse_detail_geo(address_twin)
    assert geo is not None
    assert geo["source"] == "detail_address_obj"
    located = (geo["lat"], geo["lng"], geo["postcode"])
    assert located == (52.716014, 1.614495, "NR29 4RG")
    assert geo["uprn"] == "10023461458"
    assert geo["full_address"] == "Riverside, Martham NR29"
|
||||
|
||||
|
||||
def test_parse_detail_geo_merges_location_uprn_with_address_full_address() -> None:
    """A `location` object and an `address` twin sharing a uprn are merged."""
    # The `location` object holds the uprn and house number/name ...
    location_obj = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG",'
        '"propertyNumberOrName":"Martham Mill"}'
    )
    # ... while the `address` twin (same uprn) holds the full street address.
    address_twin = (
        '"address":{"fullAddress":"Riverside, Martham NR29",'
        '"latitude":52.716014,"longitude":1.614495,'
        '"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}'
    )
    geo = parse_detail_geo(location_obj + address_twin)
    assert geo is not None
    expected = {
        "source": "detail_location",
        "uprn": "10023461458",
        "number_or_name": "Martham Mill",
        "full_address": "Riverside, Martham NR29",
    }
    for field, value in expected.items():
        assert geo[field] == value
|
||||
|
||||
|
||||
def test_parse_detail_geo_does_not_borrow_comparable_full_address() -> None:
    """An `address` twin with a different uprn never supplies full_address."""
    location_obj = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    # The only twin on the page carries another uprn (a comparable listing).
    comparable_twin = (
        '"address":{"fullAddress":"Some Comparable, Elsewhere EN2",'
        '"latitude":51.65,"longitude":-0.08,"uprn":"99999999"}'
    )
    geo = parse_detail_geo(location_obj + comparable_twin)
    assert geo is not None
    assert geo["uprn"] == "10023461458"
    # full_address stays None rather than grabbing the wrong street.
    assert geo["full_address"] is None
|
||||
|
||||
|
||||
def test_parse_detail_geo_ignores_poi_coordinates() -> None:
    """Unwrapped POI coordinates ahead of the property's wrapper are ignored."""
    # A charger POI: its coordinates are NOT inside a "location" object.
    charger_poi = (
        '"name":"Martham Community Centre","numberOfConnectors":2,'
        '"postcode":"NR29 4SN","coordinates":{"latitude":52.699379,"longitude":1.62921}'
    )
    # The property's own "location" wrapper follows it in the document.
    property_location = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    geo = parse_detail_geo(charger_poi + property_location)
    assert geo is not None
    assert geo["source"] == "detail_location"
    # The property's coords win, not the community centre's.
    position = (geo["lat"], geo["lng"])
    assert position == (52.716014, 1.614495)
    assert geo["postcode"] == "NR29 4RG"
|
||||
|
||||
|
||||
def test_parse_detail_geo_prefers_location_matching_search_outcode() -> None:
    """Of two location objects, the search outcode selects; else the first wins."""
    comparable = (
        '"location":{"outcode":"EN2",'
        '"coordinates":{"latitude":51.65,"longitude":-0.08},'
        '"postalCode":"EN2 6SN"}'
    )
    target = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"postalCode":"NR29 4RG"}'
    )
    page = comparable + target

    # With a search outcode, the object in that outcode is preferred.
    with_outcode = parse_detail_geo(page, search_outcode="NR29")
    assert with_outcode is not None
    assert with_outcode["postcode"] == "NR29 4RG"

    # Without one, the first object in document order is returned.
    without_outcode = parse_detail_geo(page)
    assert without_outcode is not None
    assert without_outcode["postcode"] == "EN2 6SN"
|
||||
|
||||
|
||||
def test_parse_detail_geo_rejects_out_of_england() -> None:
    """A location whose coordinates are (10.0, 10.0) is rejected outright."""
    far_away = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":10.0,"longitude":10.0},'
        '"uprn":"1","postalCode":"NR29 4RG"}'
    )
    geo = parse_detail_geo(far_away)
    assert geo is None
|
||||
|
||||
|
||||
def test_parse_detail_geo_drops_inconsistent_postcode() -> None:
    """Coordinates are kept but a postcode contradicting the outcode is dropped."""
    # postalCode outcode (AB12) disagrees with the object's own outcode (NR29).
    contradictory = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"1","postalCode":"AB12 3CD"}'
    )
    geo = parse_detail_geo(contradictory)
    assert geo is not None
    assert geo["lat"] == 52.716014
    assert geo["postcode"] is None
|
||||
|
||||
|
||||
def test_parse_detail_geo_returns_none_for_garbage() -> None:
    """Pages without a property location/address wrapper parse to ``None``."""
    pages = (
        "<html><body>no data here</body></html>",
        "",
        # Coordinates outside any location/address wrapper (an unwrapped POI)
        # yield nothing, so callers degrade to the outcode.
        '"name":"X","coordinates":{"latitude":51.5,"longitude":-0.1}',
    )
    for html in pages:
        assert parse_detail_geo(html) is None
|
||||
|
||||
|
||||
def _raw(**overrides) -> dict:
|
||||
raw = {
|
||||
"id": "123",
|
||||
"url": "/for-sale/details/123/",
|
||||
"address": "South Street, Bromley BR1",
|
||||
"price": 500000,
|
||||
"beds": 2,
|
||||
"baths": 1,
|
||||
"property_type": "Flat",
|
||||
}
|
||||
raw.update(overrides)
|
||||
return raw
|
||||
|
||||
|
||||
def test_transform_uses_detail_coordinates_with_agreeing_postcode() -> None:
    """A detail postcode in the coordinate-nearest outcode is trusted."""
    detail = {"lat": 51.401, "lng": 0.011, "postcode": "BR1 3CD", "outcode": "BR1"}
    index = StubPostcodeIndex("BR1 2AB")

    result = transform_property(
        _raw(), index, PC_COORDS, search_outcode="BR1", detail=detail
    )

    assert result is not None
    chosen = (result["Postcode"], result["Postcode source"])
    assert chosen == ("BR1 3CD", "detail_address")
    assert result["Inferred postcode"] == "BR1 2AB"
    position = (result["lat"], result["lon"])
    assert position == (51.401, 0.011)
|
||||
|
||||
|
||||
def test_transform_uses_nearest_when_detail_postcode_mismatches() -> None:
    """A detail postcode in another outcode is replaced by the nearest one."""
    detail = {"lat": 51.401, "lng": 0.011, "postcode": "E14 9SS", "outcode": "E14"}
    index = StubPostcodeIndex("BR1 2AB")

    result = transform_property(
        _raw(), index, PC_COORDS, search_outcode="BR1", detail=detail
    )

    assert result is not None
    # Mismatching detail postcode is rejected in favour of the spatial value.
    chosen = (result["Postcode"], result["Postcode source"])
    assert chosen == ("BR1 2AB", "detail_coordinates")
|
||||
|
||||
|
||||
def test_transform_geocodes_detail_postcode_without_coordinates() -> None:
    """A detail postcode without coordinates is located via ``PC_COORDS``."""
    detail = {"lat": None, "lng": None, "postcode": "SW1A 1AA", "outcode": "SW1A"}
    index = StubPostcodeIndex()

    result = transform_property(
        _raw(), index, PC_COORDS, search_outcode="BR1", detail=detail
    )

    assert result is not None
    chosen = (result["Postcode"], result["Postcode source"])
    assert chosen == ("SW1A 1AA", "detail_address")
    position = (result["lat"], result["lon"])
    assert position == PC_COORDS["SW1A 1AA"]
|
||||
|
||||
|
||||
def test_transform_without_detail_falls_back_to_search_outcode() -> None:
    """No detail and no outcode in the address -> the search outcode is used."""
    listing = _raw(address="A street with no postcode")
    index = StubPostcodeIndex()

    result = transform_property(
        listing, index, PC_COORDS, search_outcode="BR1", detail=None
    )

    assert result is not None
    chosen = (result["Postcode"], result["Postcode source"])
    assert chosen == ("BR1 2AB", "search_outcode")
    # No detail page -> no UPRN / house number recovered.
    for column in ("UPRN", "Property number or name"):
        assert result[column] is None
|
||||
|
||||
|
||||
def test_transform_emits_uprn_and_house_numbered_address_from_detail() -> None:
    """Detail uprn, house number and full address flow into the output row."""
    detail = {
        "lat": 51.401,
        "lng": 0.011,
        "postcode": "BR1 3CD",
        "outcode": "BR1",
        "uprn": "100023461458",
        "number_or_name": "12",
        "full_address": "South Street, Bromley BR1",
    }
    index = StubPostcodeIndex("BR1 2AB")

    result = transform_property(
        _raw(), index, PC_COORDS, search_outcode="BR1", detail=detail
    )

    assert result is not None
    expected = {
        "UPRN": "100023461458",
        "Property number or name": "12",
        # The detail full address replaces the outcode-level card address ...
        "Listing raw address": "South Street, Bromley BR1",
        # ... and the house number is prepended for the register-style address.
        "Address per Property Register": "12, South Street, Bromley",
    }
    for column, value in expected.items():
        assert result[column] == value
|
||||
|
||||
|
||||
def test_transform_ignores_out_of_england_detail_coords() -> None:
    """Implausible detail coordinates are discarded by the transform."""
    detail = {"lat": 10.0, "lng": 10.0, "postcode": "ZZ9 9ZZ", "outcode": "ZZ9"}
    index = StubPostcodeIndex("BR1 2AB")

    result = transform_property(
        _raw(), index, PC_COORDS, search_outcode="BR1", detail=detail
    )

    assert result is not None
    # Bad detail coords are discarded; falls through to the address outcode (BR1).
    assert result["Postcode source"] == "address_outcode"
    latitude = result["lat"]
    assert 49 <= latitude <= 56
|
||||
|
|
@ -205,6 +205,41 @@ def extract_full_postcode(text: str | None) -> str | None:
|
|||
return normalize_postcode(match.group(1))
|
||||
|
||||
|
||||
def extract_outcode(postcode: str | None) -> str | None:
|
||||
"""Return the outward code (district) of a UK postcode, e.g. 'SW1A 1AA' → 'SW1A'."""
|
||||
if not postcode:
|
||||
return None
|
||||
normalized = normalize_postcode(postcode)
|
||||
outcode = normalized.split(" ", 1)[0]
|
||||
return outcode or None
|
||||
|
||||
|
||||
def resolve_listing_postcode(
|
||||
extracted_postcode: str | None, inferred_postcode: str
|
||||
) -> tuple[str, str]:
|
||||
"""Pick the authoritative postcode for a listing, returning (postcode, source).
|
||||
|
||||
The address-extracted postcode is more precise than the coordinate-nearest one,
|
||||
but it is only trustworthy when it agrees with the location: a stale, mistyped or
|
||||
well-formed-but-fabricated postcode (e.g. 'ZZ9 9ZZ') would otherwise silently
|
||||
override the spatially-correct value. Since the spatial index only supports
|
||||
nearest-lookup, accept the extracted postcode only when its outcode matches the
|
||||
inferred (coordinate-nearest) postcode's outcode; otherwise fall back to the
|
||||
inferred one, which is always a real, plausibly-correct postcode.
|
||||
"""
|
||||
if extracted_postcode and extract_outcode(extracted_postcode) == extract_outcode(
|
||||
inferred_postcode
|
||||
):
|
||||
return extracted_postcode, "address"
|
||||
if extracted_postcode:
|
||||
log.debug(
|
||||
"Rejecting extracted postcode %s (outcode mismatch with inferred %s)",
|
||||
extracted_postcode,
|
||||
inferred_postcode,
|
||||
)
|
||||
return inferred_postcode, "coordinates"
|
||||
|
||||
|
||||
def clean_listing_address(address: str | None) -> str:
|
||||
"""Remove postcode/outcode suffixes from listing display addresses.
|
||||
|
||||
|
|
@ -222,10 +257,48 @@ def clean_listing_address(address: str | None) -> str:
|
|||
return cleaned.strip(" ,")
|
||||
|
||||
|
||||
def build_register_address(
    raw_address: str | None, number_or_name: str | None = None
) -> str:
    """Build a Property Register-style address with the house number/name in front.

    Listing display addresses are usually street-level ("South Street, Bromley").
    When a scraper recovers the property's own number or name (e.g. Zoopla's
    ``propertyNumberOrName`` = "12" or "Martham Mill"), it is prepended to the
    cleaned address as ``"<number_or_name>, <address>"`` so the result carries
    the same house identifier the register addresses use.

    The cleaned address is returned unchanged when ``number_or_name`` is
    missing or blank, or when the cleaned address already starts with it
    (case-insensitively). When the cleaned address is empty, the number/name
    alone is returned.
    """
    base = clean_listing_address(raw_address)
    prefix = (number_or_name or "").strip()
    # Nothing to prepend, or the address already leads with the number/name.
    if not prefix or base.lower().startswith(prefix.lower()):
        return base
    if not base:
        return prefix
    return f"{prefix}, {base}"
|
||||
|
||||
|
||||
def transform_property(
|
||||
prop: dict, outcode: str, pc_index: PostcodeSpatialIndex
|
||||
prop: dict,
|
||||
outcode: str,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
detail_postcode: str | None = None,
|
||||
) -> dict | None:
|
||||
"""Transform a raw Rightmove property dict into our output schema."""
|
||||
"""Transform a raw Rightmove property dict into our output schema.
|
||||
|
||||
``detail_postcode`` is the property's TRUE full postcode recovered from its
|
||||
detail page (see ``rightmove.parse_detail_postcode``); the search API itself
|
||||
only exposes the outcode-level ``displayAddress``. When supplied and it
|
||||
agrees with the coordinate-nearest postcode's outcode, it is preferred over
|
||||
the coordinate guess and recorded with source ``"detail_address"``. A
|
||||
detail postcode whose outcode disagrees with the location is discarded in
|
||||
favour of the spatially-correct coordinate postcode, so a stale or wrong
|
||||
detail value can never silently relocate a listing.
|
||||
"""
|
||||
loc = prop.get("location")
|
||||
if not loc:
|
||||
return None
|
||||
|
|
@ -268,8 +341,25 @@ def transform_property(
|
|||
return None
|
||||
raw_address = prop.get("displayAddress", "") or ""
|
||||
extracted_postcode = extract_full_postcode(raw_address)
|
||||
postcode = extracted_postcode or inferred_postcode
|
||||
postcode_source = "address" if extracted_postcode else "coordinates"
|
||||
|
||||
# Prefer the detail page's true full postcode when it agrees with the
|
||||
# location; otherwise fall back to the (display-address-or-coordinate) logic.
|
||||
detail_full = extract_full_postcode(detail_postcode)
|
||||
if detail_full and extract_outcode(detail_full) == extract_outcode(
|
||||
inferred_postcode
|
||||
):
|
||||
postcode, postcode_source = detail_full, "detail_address"
|
||||
else:
|
||||
if detail_full:
|
||||
log.debug(
|
||||
"Rejecting Rightmove detail postcode %s (outcode mismatch with "
|
||||
"inferred %s)",
|
||||
detail_full,
|
||||
inferred_postcode,
|
||||
)
|
||||
postcode, postcode_source = resolve_listing_postcode(
|
||||
extracted_postcode, inferred_postcode
|
||||
)
|
||||
|
||||
property_url = prop.get("propertyUrl") or ""
|
||||
if not isinstance(property_url, str):
|
||||
|
|
@ -291,6 +381,9 @@ def transform_property(
|
|||
"Inferred postcode": inferred_postcode,
|
||||
"Listing raw address": raw_address,
|
||||
"Address per Property Register": clean_listing_address(raw_address),
|
||||
# Rightmove's displayAddress is street-level; no UPRN/house number.
|
||||
"UPRN": None,
|
||||
"Property number or name": None,
|
||||
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
|
||||
"Property type": map_property_type(sub_type),
|
||||
"Property sub-type": normalize_sub_type(sub_type),
|
||||
|
|
|
|||
381
finder/zoopla.py
381
finder/zoopla.py
|
|
@ -32,16 +32,24 @@ import httpx
|
|||
from constants import (
|
||||
DATA_DIR,
|
||||
DELAY_BETWEEN_PAGES,
|
||||
GLUETUN_API_KEY,
|
||||
GLUETUN_CONTROL_URL,
|
||||
GLUETUN_MAX_ROTATIONS,
|
||||
GLUETUN_PROXY,
|
||||
MAX_BEDROOMS,
|
||||
PROPERTY_TYPE_MAP,
|
||||
ZOOPLA_BASE,
|
||||
ZOOPLA_DETAIL_GOTO_TIMEOUT_MS,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import (
|
||||
clean_listing_address,
|
||||
build_register_address,
|
||||
extract_full_postcode,
|
||||
extract_outcode,
|
||||
fix_coords,
|
||||
normalize_sub_type,
|
||||
parse_int_value,
|
||||
resolve_listing_postcode,
|
||||
validate_floor_area,
|
||||
)
|
||||
|
||||
|
|
@ -468,27 +476,20 @@ def _challenge_timeout_seconds() -> int:
|
|||
# cookies (bound to the previous IP), then reload and re-check the challenge.
|
||||
|
||||
|
||||
# NOTE: the Gluetun control-server URL and API key come from ``constants``
# (GLUETUN_CONTROL_URL / GLUETUN_API_KEY). No credential may be hard-coded in
# this module; a key that was ever committed should be treated as leaked and
# rotated.


def _gluetun_base_url() -> str:
    """Return the Gluetun control-server base URL without a trailing slash."""
    return GLUETUN_CONTROL_URL.rstrip("/")


def _gluetun_api_key() -> str | None:
    """Return the configured Gluetun control-server API key.

    The value is passed through from ``constants.GLUETUN_API_KEY`` unchanged;
    callers must handle a falsy value (no key configured).
    """
    return GLUETUN_API_KEY
|
||||
|
||||
|
||||
def _gluetun_max_rotations() -> int:
    """Return the maximum number of VPN rotations, never less than zero.

    The limit is taken from ``constants.GLUETUN_MAX_ROTATIONS``; a negative
    configured value is clamped to 0.
    """
    return max(GLUETUN_MAX_ROTATIONS, 0)
|
||||
|
||||
|
||||
def _gluetun_client() -> httpx.Client:
|
||||
# Talks to the control server directly (not through the VPN proxy).
|
||||
headers = {}
|
||||
api_key = _gluetun_api_key()
|
||||
if api_key:
|
||||
|
|
@ -694,10 +695,19 @@ def launch_browser():
|
|||
profile_dir.mkdir(parents=True, exist_ok=True)
|
||||
_remove_stale_profile_locks(profile_dir)
|
||||
|
||||
# Route the browser through the Gluetun VPN proxy when configured. (geoip
|
||||
# fingerprint alignment is intentionally not enabled: it needs the optional
|
||||
# camoufox[geoip] extra and would spoof to the VPN exit's country, which
|
||||
# fights the en-GB locale unless the exit is in the UK.)
|
||||
proxy_options: dict = {}
|
||||
if GLUETUN_PROXY:
|
||||
proxy_options = {"proxy": {"server": GLUETUN_PROXY}}
|
||||
|
||||
log.info(
|
||||
"Launching Camoufox browser for Zoopla (headless=%s, profile=%s)...",
|
||||
"Launching Camoufox browser for Zoopla (headless=%s, profile=%s, proxy=%s)...",
|
||||
headless_mode,
|
||||
profile_dir,
|
||||
GLUETUN_PROXY or "direct",
|
||||
)
|
||||
camoufox = Camoufox(
|
||||
headless=headless_mode,
|
||||
|
|
@ -705,6 +715,7 @@ def launch_browser():
|
|||
user_data_dir=str(profile_dir),
|
||||
locale=["en-GB", "en"],
|
||||
enable_cache=True,
|
||||
**proxy_options,
|
||||
)
|
||||
raw_browser = camoufox.__enter__()
|
||||
browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
|
||||
|
|
@ -926,13 +937,47 @@ def _paginate(
|
|||
page,
|
||||
total_results: int,
|
||||
max_properties: int | None = None,
|
||||
fetch_detail=None,
|
||||
detail_cap: int = 0,
|
||||
detail_state: dict | None = None,
|
||||
detail_deadline: float | None = None,
|
||||
) -> list[dict]:
|
||||
"""Extract listings from all pages of search results.
|
||||
|
||||
Page 1 is already loaded. For subsequent pages, follow Zoopla's rendered
|
||||
next link when present, otherwise advance via the pn=N URL parameter while
|
||||
the advertised result count says more listings remain."""
|
||||
the advertised result count says more listings remain.
|
||||
|
||||
When ``fetch_detail`` is supplied, each listing has its detail page fetched
|
||||
(up to ``detail_cap`` fresh loads per outcode, counted in the shared
|
||||
``detail_state`` dict, and only until ``detail_deadline``) and the parsed
|
||||
geo stored under ``listing['_detail']`` for ``transform_property``. The
|
||||
detail page is the only source of the listing's UPRN, full street address
|
||||
and precise postcode, so it is fetched even when the search card already
|
||||
pins a full postcode. Cached detail results are always attached but cost
|
||||
neither a cap slot nor a delay."""
|
||||
|
||||
def _maybe_fetch(listing: dict) -> None:
|
||||
if fetch_detail is None or detail_state is None:
|
||||
return
|
||||
url = listing.get("url", "")
|
||||
cached = _detail_cache_key(url) in _detail_cache
|
||||
if not cached:
|
||||
# Fresh loads are bounded by the per-outcode cap and the wall-clock
|
||||
# deadline so detail fetching never starves the SIGALRM budget that
|
||||
# also guards the search pagination for this outcode.
|
||||
if detail_state["fetched"] >= detail_cap:
|
||||
return
|
||||
if detail_deadline is not None and time.monotonic() >= detail_deadline:
|
||||
return
|
||||
listing["_detail"] = fetch_detail(url)
|
||||
if not cached:
|
||||
detail_state["fetched"] += 1
|
||||
time.sleep(DELAY_BETWEEN_PAGES)
|
||||
|
||||
all_listings = _extract_listings(page)
|
||||
for listing in all_listings:
|
||||
_maybe_fetch(listing)
|
||||
if max_properties is not None and len(all_listings) >= max_properties:
|
||||
return all_listings[:max_properties]
|
||||
|
||||
|
|
@ -984,6 +1029,7 @@ def _paginate(
|
|||
if listing["id"] not in seen_ids:
|
||||
seen_ids.add(listing["id"])
|
||||
all_listings.append(listing)
|
||||
_maybe_fetch(listing)
|
||||
new_count += 1
|
||||
if max_properties is not None and len(all_listings) >= max_properties:
|
||||
return all_listings[:max_properties]
|
||||
|
|
@ -1053,6 +1099,214 @@ def _extract_outcode(text: str) -> str | None:
|
|||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Detail-page geocoding
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# Zoopla search result cards only expose an outcode-level display address (e.g.
|
||||
# "South Street, Bromley BR1"); the full postcode and precise coordinates exist
|
||||
# only on each listing's detail page (/for-sale/details/{id}/). The detail page
|
||||
# is a Next.js App Router route whose React Server Components flight stream
|
||||
# embeds the property's own location object, e.g.
|
||||
# "location":{"outcode":"NR29","coordinates":{"latitude":52.716,"longitude":1.614},
|
||||
# "uprn":"10023461458","postalCode":"NR29 4RG",...}
|
||||
# plus a twin "address":{"fullAddress":...,"latitude":...,"longitude":...,
|
||||
# "outcode":...,"postcode":...,"uprn":...} feeding the map widgets.
|
||||
# Nearby points of interest (stations, schools, EV chargers) and comparable
|
||||
# listings carry their own "coordinates" too, but never inside the property's
|
||||
# own "location" / "address":{"fullAddress" wrapper — so the wrapper, not a
|
||||
# loose coordinates object, is what we anchor on (see parse_detail_geo).
|
||||
|
||||
# listingId -> parsed detail dict (or None). Failures are cached too, so a
|
||||
# broken listing is not re-fetched within a run (the same listing reappears
|
||||
# across overlapping outcode searches).
|
||||
_detail_cache: dict[str, dict | None] = {}
|
||||
|
||||
_LISTING_ID_RE = re.compile(r"/details/(\d+)/?")
|
||||
|
||||
# The property's own location is carried by a `"location":{...}` wrapper and a
|
||||
# twin `"address":{"fullAddress":...}` widget object. We anchor on those
|
||||
# wrappers (and capture their full object body, which contains exactly one
|
||||
# nested object — `coordinates`) rather than scanning for loose coordinate
|
||||
# objects: nearby points of interest (stations/schools/EV chargers) and
|
||||
# comparable/"similar" listings also embed coordinates, but never inside the
|
||||
# property's own `"location"` / `"address":{"fullAddress"` wrapper, so the
|
||||
# wrapper is the discriminator. Field order and an optional `uprn` are tolerated.
|
||||
_DETAIL_LOCATION_RE = re.compile(r'"location":\{((?:[^{}]|\{[^{}]*\})*)\}')
|
||||
_DETAIL_ADDRESS_RE = re.compile(r'"address":\{"fullAddress":"([^"]*)"((?:[^{}]|\{[^{}]*\})*)\}')
|
||||
_DETAIL_COORDS_IN_BODY_RE = re.compile(
|
||||
r'"coordinates":\{"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)\}'
|
||||
)
|
||||
_DETAIL_LATLNG_IN_BODY_RE = re.compile(
|
||||
r'"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)'
|
||||
)
|
||||
_DETAIL_OUTCODE_IN_BODY_RE = re.compile(r'"outcode":"([A-Z0-9]+)"')
|
||||
# The location object spells it "postalCode"; the address twin uses "postcode".
|
||||
_DETAIL_POSTCODE_IN_BODY_RE = re.compile(r'"(?:postalCode|postcode)":"([A-Z0-9 ]+)"')
|
||||
# The UPRN (Unique Property Reference Number) appears in both the location and
|
||||
# address objects and is the linchpin for an exact listing->EPC join (EPC open
|
||||
# data is ~99% UPRN-keyed). propertyNumberOrName carries the house number/name
|
||||
# (e.g. "12", "Martham Mill") only in the location object.
|
||||
_DETAIL_UPRN_IN_BODY_RE = re.compile(r'"uprn":"(\d+)"')
|
||||
_DETAIL_NUMBER_OR_NAME_IN_BODY_RE = re.compile(r'"propertyNumberOrName":"([^"]*)"')
|
||||
|
||||
|
||||
def parse_detail_geo(html: str, search_outcode: str | None = None) -> dict | None:
    """Pull the listing's own location and address fields out of detail-page HTML.

    This is a pure string parser with no browser dependency: the caller hands
    in the page source (e.g. ``page.content()``) and gets back either ``None``
    (empty input, or no usable property location wrapper) or a dict with keys
    ``lat``, ``lng``, ``postcode``, ``outcode``, ``source``, ``uprn``,
    ``number_or_name`` and ``full_address``. Only the coordinates are
    guaranteed; every other value may be ``None``.

    * ``uprn`` is what allows an exact listing->EPC join.
    * ``number_or_name`` and ``full_address`` supply a register-style address
      for the Price Paid join.
    * Coordinates must fall inside the England bounding box, and a postcode is
      kept only if it agrees with the outcode found in the same object.
    * ``search_outcode`` is purely a tie-break: among several ``location``
      objects (pages also embed comparable listings) the one whose outcode
      equals it wins; otherwise the first valid one is used.

    See module docstring for the data model."""
    if not html:
        return None

    # The RSC flight payload is embedded as escaped JS string literals, so
    # quotes and slashes arrive backslash-escaped; undo that before matching.
    text = html.replace('\\"', '"').replace("\\u002F", "/").replace("\\/", "/")

    def _bounded(lat: float, lng: float) -> tuple[float, float] | None:
        # Normalise via fix_coords, then accept only points in the England box.
        lat, lng = fix_coords(lat, lng)
        inside = 49 <= lat <= 56 and -7 <= lng <= 2
        return (lat, lng) if inside else None

    def _assemble(body: str, coords, source: str, full_address: str | None = None) -> dict:
        def first_group(pattern):
            found = pattern.search(body)
            return found.group(1) if found else None

        # Outcode and postcode come from the same object body as the
        # coordinates, so they should agree; discard the postcode if not.
        outcode = first_group(_DETAIL_OUTCODE_IN_BODY_RE)
        raw_postcode = first_group(_DETAIL_POSTCODE_IN_BODY_RE)
        postcode = None
        if raw_postcode is not None:
            postcode = extract_full_postcode(raw_postcode)
        if postcode and outcode and extract_outcode(postcode) != outcode.upper():
            postcode = None

        uprn = first_group(_DETAIL_UPRN_IN_BODY_RE)
        raw_name = first_group(_DETAIL_NUMBER_OR_NAME_IN_BODY_RE)
        number_or_name = raw_name.strip() if raw_name is not None else None

        lat, lng = coords[0], coords[1]
        return {
            "lat": lat,
            "lng": lng,
            "postcode": postcode,
            "outcode": outcode,
            "source": source,
            "uprn": uprn,
            "number_or_name": number_or_name or None,
            "full_address": full_address,
        }

    def _with_address(result: dict | None) -> dict | None:
        # The house-numbered street address sits in the `address` map-widget
        # twin rather than in the `location` wrapper the coordinates came
        # from. Use the twin carrying the same uprn. With no uprn to compare,
        # take the first twin (document order puts the primary listing first).
        # If a uprn exists but no twin matches it, leave the address unset —
        # guessing could pick up a comparable listing's address.
        if result is None or result.get("full_address"):
            return result

        target = result.get("uprn")
        fallback = None
        for twin in _DETAIL_ADDRESS_RE.finditer(text):
            street = twin.group(1)
            if not street:
                continue
            if fallback is None:
                fallback = street
            twin_uprn = _DETAIL_UPRN_IN_BODY_RE.search(twin.group(2))
            if target and twin_uprn and twin_uprn.group(1) == target:
                result["full_address"] = street
                return result

        if target is None:
            result["full_address"] = fallback
        return result

    # Strategy 1 — the property's own `location` wrapper (authoritative).
    # Remember the first valid wrapper (the primary listing precedes any
    # comparables in the flight stream) but return early for one whose outcode
    # equals the searched outcode.
    primary = None
    for wrapper in _DETAIL_LOCATION_RE.finditer(text):
        body = wrapper.group(1)
        pair = _DETAIL_COORDS_IN_BODY_RE.search(body)
        if pair is None:
            continue
        coords = _bounded(float(pair.group(1)), float(pair.group(2)))
        if coords is None:
            continue

        candidate = _assemble(body, coords, "detail_location")
        if primary is None:
            primary = candidate

        found_outcode = candidate["outcode"]
        if (
            search_outcode
            and found_outcode
            and found_outcode.upper() == search_outcode.upper()
        ):
            return _with_address(candidate)

    if primary is not None:
        return _with_address(primary)

    # Strategy 2 — the `address` map-widget twin (same coordinates, backup).
    for twin in _DETAIL_ADDRESS_RE.finditer(text):
        body = twin.group(2)
        pair = _DETAIL_LATLNG_IN_BODY_RE.search(body)
        if pair is None:
            continue
        coords = _bounded(float(pair.group(1)), float(pair.group(2)))
        if coords is not None:
            return _assemble(
                body,
                coords,
                "detail_address_obj",
                full_address=twin.group(1) or None,
            )

    return None
|
||||
|
||||
|
||||
def _detail_cache_key(listing_url: str) -> str:
    """Return the key under which a listing's detail result is cached.

    The numeric listing id is used when the URL contains one; otherwise the
    URL itself is the key."""
    found = _LISTING_ID_RE.search(listing_url)
    if found is None:
        return listing_url
    return found.group(1)
|
||||
|
||||
|
||||
def _fetch_listing_detail(
    detail_page,
    listing_url: str,
    search_outcode: str | None = None,
) -> dict | None:
    """Navigate ``detail_page`` to a listing and return its parsed geo dict.

    The outcome — including ``None`` for a failed fetch/parse — is memoised in
    ``_detail_cache`` under the listing's cache key, so a listing is only ever
    loaded once. Navigation and extraction errors are logged at debug level
    and turned into ``None`` so the caller can fall back to outcode-level
    resolution. ``TurnstileError`` is deliberately re-raised (and not cached)
    to keep the scraper's "Cloudflare ends the run" contract. The goto timeout
    is short so a single slow detail page cannot consume the per-outcode
    budget."""
    key = _detail_cache_key(listing_url)
    try:
        return _detail_cache[key]
    except KeyError:
        pass

    # Relative listing paths are resolved against the Zoopla site root.
    if listing_url.startswith("http"):
        target = listing_url
    else:
        target = ZOOPLA_BASE + listing_url

    geo: dict | None = None
    try:
        detail_page.goto(
            target,
            wait_until="domcontentloaded",
            timeout=ZOOPLA_DETAIL_GOTO_TIMEOUT_MS,
        )
        _ensure_not_challenged(detail_page)
        geo = parse_detail_geo(detail_page.content(), search_outcode=search_outcode)
    except TurnstileError:
        raise
    except Exception as exc:
        log.debug("Zoopla detail fetch failed %s: %s", target, _exception_detail(exc))
        geo = None

    _detail_cache[key] = geo
    return geo
|
||||
|
||||
|
||||
def _map_property_type(raw_type: str | None) -> str:
|
||||
"""Map Zoopla property type text to canonical type."""
|
||||
if not raw_type:
|
||||
|
|
@ -1109,28 +1363,64 @@ def transform_property(
|
|||
pc_index: PostcodeSpatialIndex,
|
||||
pc_coords: dict[str, tuple[float, float]],
|
||||
search_outcode: str | None = None,
|
||||
detail: dict | None = None,
|
||||
) -> dict | None:
|
||||
"""Transform a raw Zoopla listing dict into the standard output schema.
|
||||
|
||||
Zoopla search cards do not include coordinates, so we resolve lat/lng
|
||||
from postcodes extracted from the address text."""
|
||||
Zoopla search cards only expose an outcode-level address, so precise
|
||||
location comes from the listing's detail page (see ``parse_detail_geo`` /
|
||||
``_fetch_listing_detail``), passed in as ``detail``. When detail-page
|
||||
coordinates are available we resolve the nearest postcode via the spatial
|
||||
index — mirroring rightmove/onthemarket — and only fall back to the coarse
|
||||
outcode centroid when no detail location could be obtained."""
|
||||
price = parse_int_value(raw.get("price")) or 0
|
||||
|
||||
address = raw.get("address", "") or ""
|
||||
|
||||
# Resolve postcode and coordinates from address
|
||||
extracted_postcode = extract_full_postcode(address)
|
||||
postcode = extracted_postcode
|
||||
postcode_source = "address" if extracted_postcode else None
|
||||
detail = detail or {}
|
||||
detail_postcode = extract_full_postcode(detail.get("postcode"))
|
||||
# Detail-page address fields: the UPRN keys an exact EPC join, and the
|
||||
# full street address / house number-or-name beat the outcode-level card
|
||||
# address for the Price-Paid join. All three are absent unless the detail
|
||||
# page was fetched, so every consumer must tolerate None.
|
||||
detail_uprn = detail.get("uprn") or None
|
||||
detail_full_address = detail.get("full_address") or None
|
||||
detail_number_or_name = detail.get("number_or_name") or None
|
||||
|
||||
postcode = postcode_source = inferred_postcode = None
|
||||
lat = lng = None
|
||||
|
||||
if postcode:
|
||||
coords = pc_coords.get(postcode)
|
||||
if coords:
|
||||
lat, lng = coords
|
||||
# (A) Best: detail-page coordinates -> nearest postcode (authoritative).
|
||||
detail_lat, detail_lng = detail.get("lat"), detail.get("lng")
|
||||
if detail_lat is not None and detail_lng is not None:
|
||||
fixed_lat, fixed_lng = fix_coords(detail_lat, detail_lng)
|
||||
if 49 <= fixed_lat <= 56 and -7 <= fixed_lng <= 2:
|
||||
nearest = pc_index.nearest(fixed_lat, fixed_lng)
|
||||
if nearest:
|
||||
lat, lng, inferred_postcode = fixed_lat, fixed_lng, nearest
|
||||
candidate = detail_postcode or extracted_postcode
|
||||
postcode, resolved_source = resolve_listing_postcode(candidate, nearest)
|
||||
postcode_source = (
|
||||
"detail_address"
|
||||
if resolved_source == "address"
|
||||
else "detail_coordinates"
|
||||
)
|
||||
|
||||
# (B) Detail-page postcode without usable coordinates -> geocode it.
|
||||
if lat is None and detail_postcode and detail_postcode in pc_coords:
|
||||
lat, lng = pc_coords[detail_postcode]
|
||||
postcode = inferred_postcode = detail_postcode
|
||||
postcode_source = "detail_address"
|
||||
|
||||
# (C) Full postcode in the search-card address -> geocode it.
|
||||
if lat is None and extracted_postcode and extracted_postcode in pc_coords:
|
||||
lat, lng = pc_coords[extracted_postcode]
|
||||
postcode = extracted_postcode
|
||||
postcode_source = "address"
|
||||
|
||||
# (D) Last resort: coarse outcode-level centroid (loses per-listing precision).
|
||||
if lat is None:
|
||||
# Try outcode-level fallback from address text
|
||||
addr_outcode = _extract_outcode(address)
|
||||
if addr_outcode:
|
||||
result = _resolve_outcode_coords(addr_outcode, pc_coords)
|
||||
|
|
@ -1138,7 +1428,6 @@ def transform_property(
|
|||
postcode, lat, lng = result
|
||||
postcode_source = "address_outcode"
|
||||
|
||||
# Final fallback: use the outcode we know we're searching
|
||||
if lat is None and search_outcode:
|
||||
result = _resolve_outcode_coords(search_outcode, pc_coords)
|
||||
if result:
|
||||
|
|
@ -1188,9 +1477,17 @@ def transform_property(
|
|||
"Postcode": postcode,
|
||||
"Postcode source": postcode_source or "unknown",
|
||||
"Extracted postcode": extracted_postcode,
|
||||
"Inferred postcode": postcode if postcode_source != "address" else None,
|
||||
"Listing raw address": address,
|
||||
"Address per Property Register": clean_listing_address(address),
|
||||
"Inferred postcode": (
|
||||
inferred_postcode
|
||||
if inferred_postcode is not None
|
||||
else (postcode if postcode_source != "address" else None)
|
||||
),
|
||||
"Listing raw address": detail_full_address or address,
|
||||
"Address per Property Register": build_register_address(
|
||||
detail_full_address or address, detail_number_or_name
|
||||
),
|
||||
"UPRN": detail_uprn,
|
||||
"Property number or name": detail_number_or_name,
|
||||
"Leasehold/Freehold": raw.get("tenure") or None,
|
||||
"Property type": _map_property_type(raw.get("property_type")),
|
||||
"Property sub-type": normalize_sub_type(raw.get("property_type")),
|
||||
|
|
@ -1215,6 +1512,9 @@ def search_outcode(
|
|||
pc_index: PostcodeSpatialIndex,
|
||||
pc_coords: dict[str, tuple[float, float]],
|
||||
max_properties: int | None = None,
|
||||
detail_page=None,
|
||||
detail_cap: int = 0,
|
||||
detail_budget_seconds: float | None = None,
|
||||
) -> tuple[list[dict], str | None]:
|
||||
"""Search Zoopla for properties in one outcode.
|
||||
|
||||
|
|
@ -1222,6 +1522,12 @@ def search_outcode(
|
|||
search flow, extracts listings from rendered DOM, and transforms to the
|
||||
standard output schema.
|
||||
|
||||
When ``detail_page`` (a second browser tab) and a positive ``detail_cap``
|
||||
are supplied, up to ``detail_cap`` listings per outcode have their detail
|
||||
page fetched for a precise postcode (see ``_fetch_listing_detail``).
|
||||
``detail_budget_seconds`` caps the wall-clock time spent fetching details so
|
||||
the per-outcode timeout that also guards search pagination is never starved.
|
||||
|
||||
Returns (properties, search_url).
|
||||
|
||||
Raises TurnstileError if Cloudflare blocks us mid-session.
|
||||
|
|
@ -1231,12 +1537,25 @@ def search_outcode(
|
|||
|
||||
total_results = _get_result_count(page)
|
||||
|
||||
fetch_detail = None
|
||||
detail_deadline = None
|
||||
if detail_page is not None and detail_cap > 0:
|
||||
fetch_detail = lambda url: _fetch_listing_detail( # noqa: E731
|
||||
detail_page, url, search_outcode=outcode
|
||||
)
|
||||
if detail_budget_seconds is not None:
|
||||
detail_deadline = time.monotonic() + detail_budget_seconds
|
||||
|
||||
# Always try extraction even if result count is 0 — the count regex may
|
||||
# not match Zoopla's current text format, but listings may still be in DOM
|
||||
raw_listings = _paginate(
|
||||
page,
|
||||
total_results,
|
||||
max_properties=max_properties,
|
||||
fetch_detail=fetch_detail,
|
||||
detail_cap=detail_cap,
|
||||
detail_state={"fetched": 0},
|
||||
detail_deadline=detail_deadline,
|
||||
)
|
||||
if not raw_listings:
|
||||
if total_results > 0:
|
||||
|
|
@ -1252,7 +1571,11 @@ def search_outcode(
|
|||
for raw in raw_listings:
|
||||
try:
|
||||
transformed = transform_property(
|
||||
raw, pc_index, pc_coords, search_outcode=outcode
|
||||
raw,
|
||||
pc_index,
|
||||
pc_coords,
|
||||
search_outcode=outcode,
|
||||
detail=raw.get("_detail"),
|
||||
)
|
||||
except Exception as exc:
|
||||
log.warning(
|
||||
|
|
|
|||
164
finder/zoopla_flaresolverr.py
Normal file
164
finder/zoopla_flaresolverr.py
Normal file
|
|
@ -0,0 +1,164 @@
|
|||
"""Zoopla scraping via FlareSolverr (no browser/VNC needed).
|
||||
|
||||
FlareSolverr solves Zoopla's Cloudflare and returns the rendered HTML, which
|
||||
still contains the React Server Components flight stream — so the existing pure
|
||||
parsers work unchanged:
|
||||
- the search page yields the outcode's listing detail URLs, and
|
||||
- each detail page's flight stream carries the property's location object
|
||||
(postcode + coordinates) that ``parse_detail_geo`` extracts, plus the
|
||||
listing fields (price/beds/baths/tenure/floor area) parsed here.
|
||||
|
||||
Verified live (2026-05-30) against Zoopla through the Gluetun VPN: a warm
|
||||
FlareSolverr session solves the SW9 search + detail pages and the flight data
|
||||
is present (e.g. detail 73326946 -> SW9 0HD @ 51.477238,-0.116819).
|
||||
|
||||
This is selected by constants.ZOOPLA_FETCHER == "flaresolverr"; the Camoufox
|
||||
path in zoopla.py remains for ZOOPLA_FETCHER == "camoufox".
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
|
||||
from constants import DELAY_BETWEEN_PAGES, ZOOPLA_BASE
|
||||
from flaresolverr import FlareSolverrError, FlareSolverrSession
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from zoopla import _url_with_page, parse_detail_geo, transform_property
|
||||
|
||||
# Shares the "zoopla" logger name so both fetcher paths log under one name.
log = logging.getLogger("zoopla")

# Hard ceiling on search-result pages walked for a single outcode.
_MAX_SERP_PAGES = 60

# Listing detail paths as they appear in search-result HTML (for-sale and
# new-homes), always including the trailing slash.
_DETAIL_PATH_RE = re.compile(
    r"/(?:for-sale|new-homes)/details/\d+/"
)
# Numeric listing id inside such a detail path.
_LISTING_ID_RE = re.compile(
    r"/details/(\d+)/"
)
|
||||
|
||||
|
||||
def _int(pattern: str, buf: str) -> int | None:
|
||||
match = re.search(pattern, buf)
|
||||
return int(match.group(1)) if match else None
|
||||
|
||||
|
||||
def parse_detail_listing(html: str) -> dict:
    """Read the non-location listing fields from Zoopla detail-page HTML.

    Produces the same field set the Camoufox SERP-card extractor emitted
    (``price``, ``beds``, ``baths``, ``receptions``, ``floor_area_sqft``,
    ``tenure``, ``property_type``, ``address``), taken from the detail page's
    flight stream. Extraction is best-effort: any field that cannot be found
    is ``None``, so a listing with a known location is still emitted."""
    # Flight strings are escaped JS literals; unescape quotes and slashes.
    text = html.replace('\\"', '"').replace("\\/", "/")

    # Prefer "internalValue"; fall back to "priceUnformatted" when absent.
    price = _int(r'"internalValue":(\d+)', text)
    if price is None:
        price = _int(r'"priceUnformatted":(\d+)', text)

    tenure = None
    tenure_hit = re.search(r'"tenure":"([a-zA-Z]+)"', text)
    if tenure_hit:
        tenure = tenure_hit.group(1).title()

    # Address and property type are carved out of the page <title>, e.g.
    # "Caldwell Street, Stockwell SW9, 4 bed property for sale, £995,000 - Zoopla"
    address = property_type = None
    title_hit = re.search(r'"children":"([^"]*? for sale[^"]*?)"', text)
    if title_hit:
        title = title_hit.group(1)
        if addr_hit := re.match(r"(.+?),\s*\d+\s*bed", title):
            address = addr_hit.group(1).strip()
        if type_hit := re.search(r"\d+\s*bed\s+([\w\s-]+?)\s+for sale", title):
            property_type = type_hit.group(1).strip()

    # An explicit "propertyType" field overrides whatever the title implied.
    explicit_hit = re.search(r'"propertyType":"([^"]+)"', text)
    if explicit_hit:
        property_type = explicit_hit.group(1)

    beds = _int(r'"numBedrooms":(\d+)', text)
    baths = _int(r'"numBaths":(\d+)', text)
    receptions = _int(r'"numLivingRooms":(\d+)', text)
    floor_area_sqft = _int(r'"sizeSqft":(\d+)', text)

    return {
        "price": price,
        "beds": beds,
        "baths": baths,
        "receptions": receptions,
        "floor_area_sqft": floor_area_sqft,
        "tenure": tenure,
        "property_type": property_type,
        "address": address,
    }
|
||||
|
||||
|
||||
def _enumerate_detail_paths(fs: FlareSolverrSession, outcode: str, limit: int | None) -> list[str]:
    """Walk the outcode's search-result pages and collect listing detail paths.

    Pages are fetched through ``fs.get`` starting at page 1, up to
    ``_MAX_SERP_PAGES``. Paths are de-duplicated by numeric listing id and
    returned in first-seen order. Walking stops when a page contributes no new
    listing, or as soon as ``limit`` paths have been collected.

    ``limit`` of ``None`` means no cap. A ``limit`` of zero or less returns an
    empty list without issuing any request.
    """
    # The in-loop cap check only runs after a path has been appended, so a
    # non-positive limit would otherwise still cost one fetch and yield one
    # path. Short-circuit it here.
    if limit is not None and limit <= 0:
        return []

    base = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"
    paths: list[str] = []
    seen_ids: set[str] = set()
    for page_num in range(1, _MAX_SERP_PAGES + 1):
        url = base if page_num == 1 else _url_with_page(base, page_num)
        html = fs.get(url)
        added = 0
        for path in _DETAIL_PATH_RE.findall(html):
            id_match = _LISTING_ID_RE.search(path)
            listing_id = id_match.group(1) if id_match else path
            if listing_id in seen_ids:
                continue
            seen_ids.add(listing_id)
            paths.append(path)
            added += 1
            if limit is not None and len(paths) >= limit:
                return paths
        # A page with nothing new means we have run past the last result page
        # (or are being served repeats); stop rather than walk to the ceiling.
        if added == 0:
            break
        time.sleep(DELAY_BETWEEN_PAGES)
    return paths
|
||||
|
||||
|
||||
def search_outcode(
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    fs: FlareSolverrSession,
    max_properties: int | None = None,
    detail_cap: int = 0,
    detail_budget_seconds: float | None = None,
) -> tuple[list[dict], str | None]:
    """Scrape one outcode via FlareSolverr. Returns (properties, search_url).

    Every listing's detail page is fetched (that is where the postcode lives),
    so the effective listing count is bounded by both ``max_properties`` and
    ``detail_cap`` (a non-positive ``detail_cap`` means "no cap").

    ``detail_budget_seconds`` caps wall-clock time spent on detail pages.
    ``None`` means no cap; any number, including ``0``, is honoured as a real
    budget. A listing whose fetch or transform fails is logged at warning
    level and dropped, so one bad listing never aborts the outcode."""
    limit = detail_cap if detail_cap and detail_cap > 0 else None
    if max_properties is not None:
        limit = max_properties if limit is None else min(limit, max_properties)

    base = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"
    paths = _enumerate_detail_paths(fs, outcode, limit)
    if not paths:
        return [], base

    # Compare against None rather than relying on truthiness: a budget of 0 is
    # a legitimate (already exhausted) budget, not "unlimited". This matches
    # how the Camoufox search path derives its detail deadline.
    deadline = None
    if detail_budget_seconds is not None:
        deadline = time.monotonic() + detail_budget_seconds

    properties: list[dict] = []
    dropped = 0
    for path in paths:
        if deadline is not None and time.monotonic() >= deadline:
            log.info("Zoopla %s: detail-fetch budget reached after %d", outcode, len(properties))
            break
        id_match = _LISTING_ID_RE.search(path)
        listing_id = id_match.group(1) if id_match else path
        try:
            html = fs.get(ZOOPLA_BASE + path)
            geo = parse_detail_geo(html, search_outcode=outcode)
            raw = {"id": listing_id, "url": path, **parse_detail_listing(html)}
            prop = transform_property(
                raw, pc_index, pc_coords, search_outcode=outcode, detail=geo
            )
        except FlareSolverrError as exc:
            log.warning("Zoopla %s detail %s fetch failed: %s", outcode, listing_id, exc)
            prop = None
        except Exception as exc:  # noqa: BLE001 - never let one listing kill the outcode
            log.warning("Zoopla %s detail %s transform failed: %s", outcode, listing_id, exc)
            prop = None
        if prop:
            properties.append(prop)
        else:
            dropped += 1
        # Pace requests between detail pages.
        time.sleep(DELAY_BETWEEN_PAGES)

    log.info("Zoopla %s: %d listings (%d dropped)", outcode, len(properties), dropped)
    return properties, base
|
||||
Loading…
Add table
Add a link
Reference in a new issue