scraping and data

2026-05-31 15:36:33 +01:00 · 2026-05-31 15:36:33 +01:00 · 8688b7475e
commit 8688b7475e
parent d98819b569
43 changed files with 4920 additions and 531 deletions
--- a/finder/Dockerfile
+++ b/finder/Dockerfile
@ -0,0 +1,25 @@
+# Finder scraper image. Runs via docker-compose sharing the media_gluetun VPN
+# network namespace; the source tree is bind-mounted at runtime, so this image
+# only needs the Python deps. The venv lives OUTSIDE the bind-mount target
+# (/opt/venv) so the mount doesn't shadow it.
+FROM python:3.12-slim
+
+# UV_PROJECT_ENVIRONMENT points uv at /opt/venv instead of a .venv inside the
+# project directory, which is what keeps the venv out from under the bind mount.
+ENV UV_PROJECT_ENVIRONMENT=/opt/venv \
+    UV_COMPILE_BYTECODE=1 \
+    UV_LINK_MODE=copy \
+    PYTHONUNBUFFERED=1
+
+RUN apt-get update \
+  && apt-get install -y --no-install-recommends ca-certificates curl \
+  && rm -rf /var/lib/apt/lists/*
+
+# NOTE(review): `uv:latest` is a floating tag, so a rebuild may pick up a
+# different uv release; pin a version if reproducible images matter.
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
+
+WORKDIR /app/finder
+
+# Install dependencies into /opt/venv (cached layer; project code is mounted at runtime).
+# --frozen installs from uv.lock as-is (no re-resolution); --no-install-project
+# installs only the dependencies, not the finder project itself.
+COPY pyproject.toml uv.lock ./
+RUN uv sync --no-install-project --frozen
+
+# Source is bind-mounted over /app/finder by compose. `uv run` uses /opt/venv.
+# The container just idles; scrapes are started with `docker compose exec`.
+CMD ["sleep", "infinity"]
--- a/finder/constants.py
+++ b/finder/constants.py
@ -6,7 +6,9 @@ REPO_DIR = FINDER_DIR.parent

 DATA_DIR = Path(os.environ.get("DATA_DIR", str(FINDER_DIR / "data")))
 ARCGIS_PATH = Path(
-    os.environ.get("ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet"))
+    os.environ.get(
+        "ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet")
+    )
 )
 PAGE_SIZE = 24
 DELAY_BETWEEN_PAGES = 0.3
@ -19,6 +21,19 @@ MAX_BEDROOMS = 20  # sanity cap — values above this are almost certainly parsi
 TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
 SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
 RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
+# Detail page (plain HTTPS GET, no Cloudflare). Its window.__PAGE_MODEL embeds
+# propertyData.address.{outcode,incode}, which together form the property's TRUE
+# full postcode — the search API only exposes the outcode. {id} is the numeric
+# listing id from the search response.
+RIGHTMOVE_DETAIL_URL = "https://www.rightmove.co.uk/properties/{id}"
+
+# The Rightmove search API gives only an outcode-level display address, so the
+# true full postcode is recovered from each listing's detail page (see
+# finder/rightmove.py::parse_detail_postcode). One extra GET per listing is a
+# big increase in request volume over the paginated search alone (which tops
+# out at ~1000 results per outcode), so detail fetching is gated and capped per
+# outcode (mirrors ZOOPLA_* below). Default ON.
+RIGHTMOVE_FETCH_DETAILS = True  # fetch detail pages for true per-listing postcodes
+RIGHTMOVE_MAX_DETAILS_PER_OUTCODE = 4000  # max detail-page fetches per outcode

 # OnTheMarket
 ONTHEMARKET_BASE = "https://www.onthemarket.com"
@ -26,6 +41,41 @@ ONTHEMARKET_BASE = "https://www.onthemarket.com"
 # Zoopla
 ZOOPLA_BASE = "https://www.zoopla.co.uk"

+# Zoopla search cards only carry an outcode-level address, so the full postcode
+# and precise coordinates are scraped from each listing's detail page. These
+# bound that extra work (see finder/zoopla.py and finder/scraper.py).
+ZOOPLA_FETCH_DETAILS = True  # fetch detail pages for precise per-listing postcodes
+ZOOPLA_MAX_DETAILS_PER_OUTCODE = 4000  # max detail-page fetches per outcode
+# NOTE(review): 1500000 ms is 25 minutes per navigation, far above the 120 s
+# FlareSolverr solve budget (FLARESOLVERR_MAX_TIMEOUT_MS) — possibly a typo for
+# 15000 or 150000; confirm the intended value.
+ZOOPLA_DETAIL_GOTO_TIMEOUT_MS = 1500000  # per detail-page navigation timeout
+# Fraction of a single outcode's wall-clock budget (ZOOPLA_OUTCODE_TIMEOUT_SECONDS)
+# spent fetching details; the remainder is reserved for search pagination so
+# detail fetches can never trip the timeout and discard collected listings.
+ZOOPLA_DETAIL_BUDGET_FRACTION = 0.6
+
+# Gluetun VPN. Network endpoints are env-overridable because they are
+# deployment-specific: when finder runs in a SEPARATE container they use the
+# `gluetun` hostname (defaults below); when finder SHARES gluetun's network
+# namespace (docker-compose.yml, network_mode container:media_gluetun) they
+# become localhost and GLUETUN_PROXY is empty (the shared netns already tunnels
+# all traffic, so no HTTP proxy is needed).
+#   GLUETUN_PROXY="" (empty) => direct connection (no proxy); used in shared-netns.
+GLUETUN_PROXY = os.environ.get("GLUETUN_PROXY", "http://gluetun:8888") or None
+GLUETUN_CONTROL_URL = os.environ.get("GLUETUN_CONTROL_URL", "http://gluetun:8000")
+# Control-server API key. Env-overridable like the endpoints above so the key
+# can be supplied (and rotated) per deployment without a code change.
+# SECURITY(review): the fallback below is a credential committed to the repo.
+# It is kept only so existing deployments keep working; rotate it, supply the
+# new key via the GLUETUN_API_KEY environment variable, and drop this default.
+GLUETUN_API_KEY = os.environ.get(
+    "GLUETUN_API_KEY", "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K"
+)
+# Egress-IP rotations to try per Cloudflare challenge. Keep at 0 for Zoopla:
+# rotating among Gluetun's datacenter IPs doesn't clear Cloudflare and would
+# rotate away from the IP a cleared Cloudflare session was bound to, voiding it.
+# Raise only with residential IPs where rotation helps.
+GLUETUN_MAX_ROTATIONS = 0  # max egress-IP rotations per Cloudflare challenge
+
+# Zoopla fetcher: "flaresolverr" (default) solves Cloudflare via the FlareSolverr
+# sidecar (docker-compose.yml) and needs no display/VNC — verified to return the
+# RSC flight stream with postcode + coordinates; "camoufox" drives a local
+# anti-fingerprint browser (needs an interactive solve on datacenter IPs).
+# The value is not validated here; presumably the consumer rejects anything
+# other than these two names — verify against the caller.
+ZOOPLA_FETCHER = os.environ.get("ZOOPLA_FETCHER", "flaresolverr")
+# Default targets the `gluetun` hostname (finder in a separate container);
+# docker-compose.yml overrides it to http://localhost:8191/v1 for the
+# shared-network-namespace deployment.
+FLARESOLVERR_URL = os.environ.get("FLARESOLVERR_URL", "http://gluetun:8191/v1")
+FLARESOLVERR_MAX_TIMEOUT_MS = 120000  # per-request solve budget; first solve is slow
+
 # Greater London-ish postcode areas. This intentionally uses broad area
 # prefixes so a manual scrape can include central/inner London plus common
 # outer-London and near-London outcodes without maintaining a long borough list.
--- a/finder/docker-compose.yml
+++ b/finder/docker-compose.yml
@ -0,0 +1,57 @@
+# Finder scraper + FlareSolverr, both sharing the EXISTING media_gluetun VPN
+# container's network namespace. Everything egresses through the VPN, and
+# FlareSolverr solves Zoopla's Cloudflare automatically (no VNC needed).
+#
+# Prerequisites:
+#   - The `media_gluetun` container (qmcgaw/gluetun) is running on this host.
+#     It is managed by a different compose; it is referenced here as external
+#     via network_mode "container:media_gluetun".
+#   - Because these services share gluetun's netns, they reach each other and
+#     gluetun on localhost (flaresolverr :8191, gluetun control :8000) and need
+#     NO published ports (which is exactly why this avoids the dev-container
+#     port-forwarding pain).
+#
+# Usage:
+#   cd finder
+#   docker compose up -d --build flaresolverr finder     # start the sidecars
+#   docker compose exec finder uv run python main.py --source zoopla --outcodes SW9 --test
+#   docker compose exec finder uv run python main.py --source all       # full run
+#   docker compose down
+#
+# NOTE: a manually-started `finder_flaresolverr` container from testing must be
+# removed first (`docker rm -f finder_flaresolverr`) to avoid a name clash.
+
+services:
+  flaresolverr:
+    # NOTE(review): `latest` is a floating tag; pin a version if solver
+    # behaviour needs to be reproducible across pulls.
+    image: ghcr.io/flaresolverr/flaresolverr:latest
+    container_name: finder_flaresolverr
+    network_mode: "container:media_gluetun"
+    environment:
+      LOG_LEVEL: info
+      TZ: Europe/London
+    restart: unless-stopped
+
+  finder:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    image: finder-scraper:latest
+    container_name: finder_scraper
+    network_mode: "container:media_gluetun"
+    # Short-form depends_on only orders container start-up; it does not wait
+    # for FlareSolverr to be ready to accept requests.
+    depends_on:
+      - flaresolverr
+    volumes:
+      - .:/app/finder                          # live-mounted finder source
+      - ../property-data:/app/property-data:ro  # ARCGIS postcode data
+    working_dir: /app/finder
+    environment:
+      # Shared netns: sidecars are on localhost, and the netns already tunnels
+      # all traffic through the VPN, so no HTTP proxy is used.
+      ZOOPLA_FETCHER: flaresolverr
+      FLARESOLVERR_URL: http://localhost:8191/v1
+      GLUETUN_CONTROL_URL: http://localhost:8000
+      GLUETUN_PROXY: ""            # empty => direct (shared netns already tunnels)
+      DATA_DIR: /app/finder/data
+      ARCGIS_PATH: /app/property-data/arcgis_data.parquet
+    restart: "no"
+    # Same idle command as the image's default CMD in finder/Dockerfile.
+    command: ["sleep", "infinity"]   # stays up; run scrapes via `docker compose exec`
--- a/finder/flaresolverr.py
+++ b/finder/flaresolverr.py
@ -0,0 +1,91 @@
+"""FlareSolverr client — fetch Cloudflare-protected pages as rendered HTML.
+
+FlareSolverr (https://github.com/FlareSolverr/FlareSolverr) drives an
+undetected browser to pass Cloudflare's challenge and returns the fully
+rendered HTML. It runs as a sidecar service (see docker-compose.yml) sharing
+the Gluetun VPN network namespace, so its browser egresses through the VPN.
+
+Verified working against Zoopla's managed Turnstile on a datacenter VPN IP,
+provided a reused session and a generous maxTimeout (~120s) — the first
+challenge solve is slow, subsequent requests on the warm session are fast.
+"""
+
+import logging
+
+import httpx
+
+from constants import FLARESOLVERR_MAX_TIMEOUT_MS, FLARESOLVERR_URL
+
+log = logging.getLogger("flaresolverr")
+
+
+class FlareSolverrError(Exception):
+    """Raised when a FlareSolverr command fails.
+
+    Covers transport/HTTP errors, undecodable responses, and replies whose
+    ``status`` is not ``"ok"`` — for session commands as well as page fetches.
+    """
+
+
+class FlareSolverrSession:
+    """A reusable FlareSolverr browser session (context manager).
+
+    Reusing one session keeps the cleared Cloudflare cookies warm across
+    requests, so only the first fetch pays the full challenge-solve cost.
+
+    Usage::
+
+        with FlareSolverrSession() as fs:
+            html = fs.get("https://example.com/")
+
+    The HTTP client is created in ``__init__`` and closed by ``__exit__`` (or
+    by ``__enter__`` itself when session creation fails), so an instance is
+    single-use: create a new one for each ``with`` block.
+    """
+
+    def __init__(
+        self,
+        url: str = FLARESOLVERR_URL,
+        session: str = "finder",
+        max_timeout_ms: int = FLARESOLVERR_MAX_TIMEOUT_MS,
+    ) -> None:
+        """Store the endpoint/session name and open the HTTP client.
+
+        Args:
+            url: FlareSolverr ``/v1`` endpoint.
+            session: Name of the FlareSolverr browser session to (re)create.
+            max_timeout_ms: Per-request solve budget sent as ``maxTimeout``.
+        """
+        self._url = url
+        self._session = session
+        self._max_timeout = max_timeout_ms
+        # Read timeout must comfortably exceed maxTimeout (FlareSolverr blocks
+        # for up to maxTimeout while solving before responding).
+        self._client = httpx.Client(timeout=httpx.Timeout(self._max_timeout / 1000 + 30))
+        self._active = False
+
+    def _post(self, payload: dict) -> dict:
+        """POST one command to FlareSolverr and return the decoded JSON object.
+
+        Raises:
+            FlareSolverrError: on transport/HTTP failure, a body that is not
+                valid JSON, a JSON body that is not an object, or a reply whose
+                ``status`` is not ``"ok"``.
+        """
+        try:
+            resp = self._client.post(self._url, json=payload)
+            resp.raise_for_status()
+            data = resp.json()
+        except (httpx.HTTPError, ValueError) as exc:
+            raise FlareSolverrError(
+                f"FlareSolverr request to {self._url} failed: {exc}"
+            ) from exc
+        # A JSON array/scalar has no "status" to inspect; report it as a
+        # FlareSolverrError instead of letting AttributeError escape.
+        if not isinstance(data, dict):
+            raise FlareSolverrError(
+                f"FlareSolverr {payload.get('cmd')} failed: unexpected response {data!r}"
+            )
+        if data.get("status") != "ok":
+            raise FlareSolverrError(
+                f"FlareSolverr {payload.get('cmd')} failed: {data.get('message')}"
+            )
+        return data
+
+    def __enter__(self) -> "FlareSolverrSession":
+        """Recreate the named session from scratch and return ``self``.
+
+        Raises:
+            FlareSolverrError: if the session cannot be created.
+        """
+        try:
+            # Start from a clean session (ignore destroy errors for a fresh name).
+            try:
+                self._post({"cmd": "sessions.destroy", "session": self._session})
+            except FlareSolverrError:
+                pass
+            self._post({"cmd": "sessions.create", "session": self._session})
+        except BaseException:
+            # __exit__ is not invoked when __enter__ raises, so the client
+            # would otherwise never be closed.
+            self._client.close()
+            raise
+        self._active = True
+        log.info("FlareSolverr session %r ready at %s", self._session, self._url)
+        return self
+
+    def get(self, url: str) -> str:
+        """Fetch a URL through FlareSolverr; return the solved HTML.
+
+        Returns an empty string when the reply carries no ``solution.response``.
+
+        Raises:
+            FlareSolverrError: if the request fails or is not solved.
+        """
+        data = self._post(
+            {
+                "cmd": "request.get",
+                "session": self._session,
+                "url": url,
+                "maxTimeout": self._max_timeout,
+            }
+        )
+        solution = data.get("solution") or {}
+        return solution.get("response", "") or ""
+
+    def __exit__(self, *exc_info) -> None:
+        """Destroy the remote session (best effort) and close the HTTP client."""
+        try:
+            if self._active:
+                self._active = False
+                try:
+                    self._post({"cmd": "sessions.destroy", "session": self._session})
+                except FlareSolverrError as exc:
+                    log.debug("FlareSolverr session destroy failed: %s", exc)
+        finally:
+            # Always release the connection pool, even if destroy blows up.
+            self._client.close()
--- a/finder/gdal-ecw/Dockerfile
+++ b/finder/gdal-ecw/Dockerfile
@ -0,0 +1,53 @@
+# GDAL with ECW (read) support, for decoding Environment Agency Vertical Aerial
+# Photography in the satellite-highres pipeline (pipeline/download/satellite_highres.py).
+#
+# EA VAP ships as ECW **v2** rasters, which are readable by the open-source
+# libecwj2 3.3 SDK -- the same library the official OSGeo image uses when built
+# with WITH_ECW=yes. We therefore avoid the proprietary, login-gated Hexagon
+# ERDAS ECW/JP2 SDK (which is only needed for ECW v3) and its licensing
+# restrictions entirely.
+#
+# We build only the ECW driver as a GDAL *plugin* on top of the official runtime
+# image (no full GDAL rebuild). The plugin's GDAL sources are pinned to the exact
+# commit reported by the base image so libgdal and the plugin stay ABI-compatible.
+#
+# Build:  docker build -t perfect-postcode/gdal-ecw:latest finder/gdal-ecw
+# Verify: docker run --rm perfect-postcode/gdal-ecw:latest gdalinfo --formats | grep -i ECW
+
+# The base tag floats; the plugin build below copes by reading the GDAL commit
+# from `gdalinfo --version` rather than hard-coding one.
+FROM ghcr.io/osgeo/gdal:ubuntu-full-latest
+
+# NOTE(review): this prebuilt SDK tarball is labelled ubuntu-20.04; confirm it
+# is binary-compatible with the Ubuntu release of the base image.
+ARG LIBECWJ2_URL=https://github.com/rouault/libecwj2-3.3-builds/releases/download/v1/install-libecwj2-3.3-ubuntu-20.04.tar.gz
+
+# Toolchain for the plugin build, plus git/curl to fetch the GDAL sources and SDK.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        cmake g++ make git curl ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# Open-source ECW v2 SDK (extracts to /opt/libecwj2-3.3) + make its libs loadable.
+# Library paths are hard-coded to x86_64-linux-gnu, so this presumably targets
+# amd64 hosts only.
+RUN curl --retry 3 --retry-all-errors --retry-delay 3 -fsSL -o /tmp/libecwj2.tar.gz "$LIBECWJ2_URL" \
+    && tar -C / -xzf /tmp/libecwj2.tar.gz \
+    && rm -f /tmp/libecwj2.tar.gz \
+    && (cd /opt/libecwj2-3.3/lib && for so in *.so*; do \
+            ln -sf "/opt/libecwj2-3.3/lib/$so" "/usr/lib/x86_64-linux-gnu/$so"; \
+        done) \
+    && ldconfig
+
+# Build the ECW driver plugin against the base image's exact GDAL sources.
+# The commit hash is parsed out of `gdalinfo --version`; `test -n` aborts the
+# build if no hash could be extracted.
+RUN set -eux; \
+    GDAL_COMMIT="$(gdalinfo --version | sed -nE 's/.*-([0-9a-f]{8,}).*/\1/p')"; \
+    test -n "$GDAL_COMMIT"; \
+    echo "Building ECW plugin for GDAL commit ${GDAL_COMMIT}"; \
+    mkdir -p /tmp/gdal && cd /tmp/gdal && git init -q; \
+    git fetch --depth 1 -q https://github.com/OSGeo/gdal.git "$GDAL_COMMIT"; \
+    git checkout -q FETCH_HEAD; \
+    cmake -S frmts/ecw -B /tmp/ecw-build \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_PREFIX_PATH=/usr \
+        -DECW_ROOT=/opt/libecwj2-3.3; \
+    cmake --build /tmp/ecw-build -j"$(nproc)"; \
+    PLUGIN_DIR=/usr/lib/x86_64-linux-gnu/gdalplugins; \
+    mkdir -p "$PLUGIN_DIR"; \
+    find /tmp/ecw-build -name 'gdal_ECW*.so' -exec cp {} "$PLUGIN_DIR/" \; ; \
+    rm -rf /tmp/gdal /tmp/ecw-build
+
+# Fail the build if the driver is not actually available.
+# NOTE(review): the pattern requires the `rw` capability flag although the
+# header above only promises read support — confirm the plugin reports `rw`,
+# otherwise relax the pattern.
+RUN gdalinfo --formats | grep -iq 'ECW.*rw' && echo "ECW driver OK"
--- a/finder/http_client.py
+++ b/finder/http_client.py
@ -5,7 +5,7 @@ import time
 import httpx
 from fake_useragent import UserAgent

-from constants import MAX_RETRIES, RETRY_BASE_DELAY
+from constants import GLUETUN_PROXY, MAX_RETRIES, RETRY_BASE_DELAY

 log = logging.getLogger("rightmove")

@ -15,10 +15,12 @@ _ua = UserAgent(


 def make_client() -> httpx.Client:
+    # Route through the Gluetun HTTP proxy (VPN egress) when configured.
+    # constants.GLUETUN_PROXY is already None when the env var is empty, so the
+    # `or None` below is only a defensive guard.
    return httpx.Client(
        timeout=30,
        headers={"User-Agent": _ua.random, "Accept": "application/json"},
        follow_redirects=True,
+        proxy=GLUETUN_PROXY or None,
    )


--- a/finder/main.py
+++ b/finder/main.py
@ -57,6 +57,16 @@ def parse_args() -> argparse.Namespace:
        default=DATA_DIR,
        help=f"Directory for parquet output. Defaults to {DATA_DIR}.",
    )
+    parser.add_argument(
+        "--outcodes",
+        type=str,
+        default=None,
+        help=(
+            "Comma-separated outcodes to scrape (e.g. 'SW9' or 'SW9,E14,BR1') "
+            "instead of the full Greater London set. Must fall within the "
+            "London-ish areas; takes precedence over --test/--limit-outcodes."
+        ),
+    )
    parser.add_argument(
        "--limit-outcodes",
        type=int,
@ -116,17 +126,32 @@ def main() -> int:
    from scraper import (
        build_postcode_coords,
        build_postcode_index,
+        filter_londonish_outcodes,
        load_outcodes,
        run_scrape,
    )

-    outcodes = load_outcodes()
-    if args.test and args.limit_outcodes is None:
-        preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)]
-        if preferred:
-            outcodes = preferred
-    if args.limit_outcodes is not None:
-        outcodes = outcodes[: args.limit_outcodes]
+    if args.outcodes is not None:
+        requested = [code.strip().upper() for code in args.outcodes.split(",") if code.strip()]
+        if not requested:
+            raise SystemExit("--outcodes was empty")
+        outcodes = filter_londonish_outcodes(requested)
+        dropped = sorted(set(requested) - set(outcodes))
+        if dropped:
+            log.warning("Ignoring outcodes outside the Greater London-ish areas: %s", ", ".join(dropped))
+        if not outcodes:
+            raise SystemExit(
+                "None of the requested outcodes are within the Greater London-ish areas "
+                f"({', '.join(requested)})."
+            )
+    else:
+        outcodes = load_outcodes()
+        if args.test and args.limit_outcodes is None:
+            preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)]
+            if preferred:
+                outcodes = preferred
+        if args.limit_outcodes is not None:
+            outcodes = outcodes[: args.limit_outcodes]

    if not outcodes:
        raise SystemExit("No Greater London-ish outcodes loaded; nothing to scrape.")
--- a/finder/onthemarket.py
+++ b/finder/onthemarket.py
@ -10,6 +10,30 @@ Each rendered page contains 30 listings under
 `humanised-property-type`, `features` (a list where the first element is
 typically `"Tenure: <value>"`), and `details-url`. Pagination is via
 `?page=N`; the loop terminates when `paginationControls.next` is null.
+
+Postcodes
+---------
+The search card exposes only an *outcode*-level address (e.g. "Padfield Road,
+London, SE5") and a map pin, so the old behaviour derived the postcode from the
+nearest postcode to that pin — a guess that frequently lands on a neighbouring
+unit (the pin can sit on the wrong side of a street boundary).
+
+Each *detail* page (`/details/{id}/`) is a plain HTTPS GET whose `__NEXT_DATA__`
+embeds the property's analytics dataLayer at
+`props.initialReduxState.metadata.dataLayer`, which carries the property's own
+`postcode` (full unit postcode, e.g. "SE5 9AA") keyed to this listing by
+`property-id`. Crucially this is NOT the agent's office postcode — that lives
+separately at `…property.agent.postcode` ("SE5 8RS" for the same listing) and
+is the classic trap when blindly scanning the page for a postcode. We read the
+dataLayer postcode, verify `property-id` matches the listing, and accept it only
+when its outcode agrees with the coordinate-nearest postcode (via
+``resolve_listing_postcode``) — exactly the trust rule the other scrapers use.
+Measured over a sample of real listings this yields a trustworthy, usually
+exact-unit postcode for ~11/12 listings; the rest safely fall back to the
+coordinate-nearest postcode.
+
+Detail fetching costs one extra HTTPS GET per listing, so it is gated behind
+``OTM_FETCH_DETAILS`` and capped at ``OTM_MAX_DETAILS_PER_OUTCODE`` per outcode.
 """

 import json
@ -31,14 +55,26 @@ from spatial import PostcodeSpatialIndex
 from transform import (
    clean_listing_address,
    extract_full_postcode,
+    extract_outcode,
    fix_coords,
    map_property_type,
    normalize_sub_type,
    parse_display_size,
+    resolve_listing_postcode,
 )

 log = logging.getLogger("rightmove")

+# Detail-page postcode recovery (see module docstring). When enabled, each
+# listing's detail page is fetched so its analytics dataLayer postcode — the
+# property's own full unit postcode — can replace the coordinate-nearest guess.
+# Bounded per outcode so a large outcode can't balloon into unbounded extra
+# HTTPS GETs.
+# NOTE(review): constants.py sets RIGHTMOVE_MAX_DETAILS_PER_OUTCODE and
+# ZOOPLA_MAX_DETAILS_PER_OUTCODE to 4000, so this cap of 400 is 10x lower than
+# those, not at parity with them — confirm which value is intended.
+OTM_FETCH_DETAILS = True
+OTM_MAX_DETAILS_PER_OUTCODE = 400
+
 _NEXT_DATA_RE = re.compile(
    r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
    re.DOTALL,
@ -51,6 +87,11 @@ _HTML_HEADERS = {
    "Accept-Language": "en-GB,en;q=0.9",
 }

+# listingId -> recovered full postcode (or None). Failures are cached too so a
+# broken or postcode-less detail page is not re-fetched within a run (the same
+# listing can reappear across overlapping outcode searches).
+# No eviction is visible in this change, so entries presumably live for the
+# whole process — verify if long-running use is planned.
+_detail_postcode_cache: dict[str, str | None] = {}
+

 def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict | None:
    """GET one search-results page and return the embedded __NEXT_DATA__ JSON.
@ -119,6 +160,116 @@ def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict
    return None


+def parse_detail_postcode(html: str, listing_id: str | None = None) -> str | None:
+    """Pull the listing's own full postcode out of an OnTheMarket detail page.
+
+    Network-free: the caller supplies the page body, which keeps this helper
+    straightforward to unit test.
+
+    The value is read from the ``__NEXT_DATA__`` script at
+    ``props.initialReduxState.metadata.dataLayer.postcode``. The agent's office
+    postcode (``…property.agent.postcode``) is never consulted, so it cannot
+    be mistaken for the property's.
+
+    If ``listing_id`` is supplied and the dataLayer carries a ``property-id``,
+    the two must agree (compared as strings) or the page is rejected; a
+    dataLayer without a ``property-id`` is accepted as-is.
+
+    Returns the result of ``extract_full_postcode`` on the dataLayer postcode,
+    or ``None`` when the page is empty, has no parseable ``__NEXT_DATA__``, has
+    no dataLayer dict, fails the id check, or has a non-string postcode.
+    Outcode-vs-coordinates trust is applied later, in ``transform_property``.
+    """
+    found = _NEXT_DATA_RE.search(html) if html else None
+    if found is None:
+        return None
+
+    try:
+        node = json.loads(found.group(1))
+    except json.JSONDecodeError:
+        return None
+
+    # Walk props -> initialReduxState -> metadata -> dataLayer one key at a
+    # time; a missing key or a non-subscriptable level means no postcode.
+    for key in ("props", "initialReduxState", "metadata", "dataLayer"):
+        try:
+            node = node[key]
+        except (KeyError, TypeError):
+            return None
+    if not isinstance(node, dict):
+        return None
+
+    # Only an explicit, different property-id disqualifies the page.
+    if listing_id is not None:
+        page_id = node.get("property-id")
+        id_conflicts = page_id is not None and str(page_id) != str(listing_id)
+        if id_conflicts:
+            return None
+
+    candidate = node.get("postcode")
+    return extract_full_postcode(candidate) if isinstance(candidate, str) else None
+
+
+def _fetch_detail_postcode(
+    client: httpx.Client, details_url: str, listing_id: str
+) -> str | None:
+    """GET one listing's detail page and return its dataLayer postcode (or None).
+
+    Results (including failures) are cached by listing id so a listing that
+    reappears across overlapping outcode searches is fetched at most once. Plain
+    HTTPS GET — OnTheMarket detail pages have no Cloudflare challenge.
+
+    Transport failures (any ``httpx.TransportError``: connect/read timeouts,
+    connection resets, proxy errors, …) and HTTP 429/5xx responses are retried
+    up to ``MAX_RETRIES`` times with exponential backoff plus jitter. Once the
+    attempts are exhausted, or on any other non-200 status, the result is
+    ``None`` so the caller falls back to the coordinate-nearest postcode.
+
+    Args:
+        client: Shared HTTP client used for the GET.
+        details_url: Absolute URL, or a path relative to ``ONTHEMARKET_BASE``;
+            an empty value skips the fetch.
+        listing_id: Listing id used as the cache key and for the page-id check.
+    """
+    if listing_id in _detail_postcode_cache:
+        return _detail_postcode_cache[listing_id]
+
+    if not details_url:
+        full_url = ""
+    elif details_url.startswith("http"):
+        full_url = details_url
+    else:
+        # Tolerate a relative path with or without a leading slash.
+        full_url = f"{ONTHEMARKET_BASE}/{details_url.lstrip('/')}"
+
+    result: str | None = None
+    if full_url:
+        for attempt in range(MAX_RETRIES):
+            try:
+                resp = client.get(
+                    full_url, headers=_HTML_HEADERS, follow_redirects=True
+                )
+            except httpx.TransportError as exc:
+                # TransportError also covers ConnectTimeout/ReadError, which
+                # would otherwise escape and abort the whole outcode.
+                failure = type(exc).__name__
+            else:
+                if resp.status_code == 200:
+                    result = parse_detail_postcode(resp.text, listing_id)
+                    break
+                if resp.status_code not in (429, 500, 502, 503, 504):
+                    log.debug(
+                        "OnTheMarket detail %s returned HTTP %d (no postcode)",
+                        listing_id, resp.status_code,
+                    )
+                    break
+                failure = f"HTTP {resp.status_code}"
+
+            if attempt + 1 >= MAX_RETRIES:
+                # No retry follows, so don't sleep or promise one.
+                log.warning(
+                    "%s from %s, giving up after %d attempts",
+                    failure, full_url, MAX_RETRIES,
+                )
+                break
+            delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
+            log.warning(
+                "%s from %s, retry %d/%d in %.1fs",
+                failure, full_url, attempt + 1, MAX_RETRIES, delay,
+            )
+            time.sleep(delay)
+
+    _detail_postcode_cache[listing_id] = result
+    return result
+
+
 def _parse_price(price_value) -> int:
    """Parse a formatted price string like '£450,000' into an integer.
    Returns 0 for POA/auction/null values."""
@ -166,9 +317,19 @@ def _extract_floor_area(features: list) -> float | None:


 def transform_property(
-    raw: dict, pc_index: PostcodeSpatialIndex
+    raw: dict,
+    pc_index: PostcodeSpatialIndex,
+    detail_postcode: str | None = None,
 ) -> dict | None:
-    """Transform a raw OnTheMarket listing dict into our output schema."""
+    """Transform a raw OnTheMarket listing dict into our output schema.
+
+    ``detail_postcode`` is the property's own full postcode recovered from its
+    detail page (see ``parse_detail_postcode`` / ``_fetch_detail_postcode``),
+    or ``None`` when no detail fetch was done / no postcode was found. When
+    present and trustworthy (its outcode agrees with the coordinate-nearest
+    postcode) it supersedes the coordinate guess and is labelled
+    ``"detail_address"``.
+    """
    loc = raw.get("location") or {}
    raw_lat = loc.get("lat")
    raw_lng = loc.get("lon")
@ -184,8 +345,29 @@ def transform_property(
        return None
    raw_address = raw.get("address", "") or ""
    extracted_postcode = extract_full_postcode(raw_address)
-    postcode = extracted_postcode or inferred_postcode
-    postcode_source = "address" if extracted_postcode else "coordinates"
+
+    # Prefer the property's own detail-page postcode when we have one and it is
+    # trustworthy. The detail postcode is a full unit postcode (better than the
+    # coordinate-nearest guess and than the usually outcode-only card address),
+    # but a stale/mislabelled value would silently override the spatially
+    # correct one, so apply the same outcode-agreement trust rule the address
+    # postcode uses: keep it only when its outcode matches the
+    # coordinate-nearest postcode's outcode.
+    detail_postcode = extract_full_postcode(detail_postcode)
+    if detail_postcode and extract_outcode(detail_postcode) == extract_outcode(
+        inferred_postcode
+    ):
+        postcode, postcode_source = detail_postcode, "detail_address"
+    else:
+        if detail_postcode:
+            log.debug(
+                "OnTheMarket %s: rejecting detail postcode %s "
+                "(outcode mismatch with inferred %s)",
+                raw.get("id", "?"), detail_postcode, inferred_postcode,
+            )
+        postcode, postcode_source = resolve_listing_postcode(
+            extracted_postcode, inferred_postcode
+        )

    raw_beds = raw.get("bedrooms") or 0
    raw_baths = raw.get("bathrooms") or 0
@ -223,6 +405,10 @@ def transform_property(
        "Inferred postcode": inferred_postcode,
        "Listing raw address": raw_address,
        "Address per Property Register": clean_listing_address(raw_address),
+        # OnTheMarket search JSON exposes only a street-level address; no UPRN
+        # or house number/name is available without a detail-page fetch.
+        "UPRN": None,
+        "Property number or name": None,
        "Leasehold/Freehold": _extract_tenure(features),
        "Property type": map_property_type(sub_type),
        "Property sub-type": normalize_sub_type(sub_type),
@ -242,10 +428,17 @@ def search_outcode(
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
 ) -> list[dict]:
-    """Paginate through OnTheMarket sale results for one outcode."""
+    """Paginate through OnTheMarket sale results for one outcode.
+
+    When ``OTM_FETCH_DETAILS`` is enabled, up to
+    ``OTM_MAX_DETAILS_PER_OUTCODE`` listings per outcode have their detail page
+    fetched for the property's own postcode (see ``_fetch_detail_postcode``);
+    the rest fall back to the coordinate-nearest postcode.
+    """
    properties: list[dict] = []
    seen_ids: set[str] = set()
    page_num = 1
+    details_fetched = 0

    while True:
        data = _fetch_page_json(client, outcode, page_num)
@ -269,8 +462,22 @@ def search_outcode(
            if listing_id and listing_id in seen_ids:
                continue
            seen_ids.add(listing_id)
+
+            detail_postcode = None
+            if OTM_FETCH_DETAILS and listing_id:
+                # Cached lookups are free; only fresh GETs count toward the cap
+                # and incur the inter-request delay.
+                cached = listing_id in _detail_postcode_cache
+                if cached or details_fetched < OTM_MAX_DETAILS_PER_OUTCODE:
+                    detail_postcode = _fetch_detail_postcode(
+                        client, raw.get("details-url") or "", listing_id
+                    )
+                    if not cached:
+                        details_fetched += 1
+                        time.sleep(DELAY_BETWEEN_PAGES)
+
            try:
-                transformed = transform_property(raw, pc_index)
+                transformed = transform_property(raw, pc_index, detail_postcode)
            except Exception as exc:
                log.warning(
                    "OnTheMarket %s property %s failed to transform: %s",
--- a/finder/rightmove.py
+++ b/finder/rightmove.py
@ -1,4 +1,6 @@
+import json
 import logging
+import re
 import time

 import httpx
@ -6,12 +8,15 @@ import httpx
 from constants import (
    PAGE_SIZE,
    DELAY_BETWEEN_PAGES,
+    RIGHTMOVE_DETAIL_URL,
+    RIGHTMOVE_FETCH_DETAILS,
+    RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
    SEARCH_URL,
    TYPEAHEAD_URL,
 )
 from http_client import fetch_with_retry
 from spatial import PostcodeSpatialIndex
-from transform import transform_property
+from transform import extract_full_postcode, normalize_postcode, transform_property

 log = logging.getLogger("rightmove")

@ -23,6 +28,176 @@ outcode_cache: dict[str, str] = {}
 _MAX_INDEX = 1008


+# ---------------------------------------------------------------------------
+# Detail-page postcode extraction
+# ---------------------------------------------------------------------------
+#
+# The search API (_paginate) only returns an outcode-level `displayAddress`
+# (e.g. "Akerman Road, Brixton, London, SW9") — never the full postcode. Each
+# listing's detail page, however, embeds the property's OWN full postcode in a
+# `window.__PAGE_MODEL` script as `propertyData.address.{outcode, incode}`
+# (e.g. outcode "SW9" + incode "0HD" → "SW9 0HD"), independently corroborated by
+# `propertyData.propertyUrls.similarPropertiesUrl` ("/property-for-sale/SW9-0HD.html").
+# This is the property's own postcode, NOT a nearest station/school: the
+# `nearestStations`/`nearestAirports` arrays carry only names + distances, no
+# postcodes, and the address outcode always matches the searched outcode.
+# Recon over 24 live listings across SW9/E1/M1/LS6/E20 (incl. APPROXIMATE_POINT
+# new-builds) found the full postcode present 100% of the time. There is no
+# UPRN or house-number field anywhere in propertyData, so those stay None.
+#
+# __PAGE_MODEL is a "devalue"-style flattened object graph: its `data` field is
+# a JSON STRING holding a flat array where every integer inside a container is
+# an index reference into that same array (so the graph can dedupe). We
+# brace-match the (large, deeply-nested) object literal — a non-greedy regex
+# cannot — then rehydrate the reference graph before reading the address.
+
+# Matches only the `window.__PAGE_MODEL = ` assignment prefix. The object
+# literal that follows is cut out by brace matching in
+# _extract_page_model_literal, starting at the end of this match.
+_PAGE_MODEL_RE = re.compile(r"window\.__PAGE_MODEL\s*=\s*")
+
+
+def _extract_page_model_literal(html: str) -> str | None:
+    """Slice the `{...}` literal assigned to window.__PAGE_MODEL out of *html*.
+
+    The scan tracks brace depth while skipping over double-quoted strings
+    (honouring backslash escapes), so braces and quotes inside string values
+    never close the literal early. Returns None when the assignment is absent,
+    is not followed directly by `{`, or the braces never balance.
+    """
+    found = _PAGE_MODEL_RE.search(html)
+    if found is None:
+        return None
+    begin = found.end()
+    # An empty slice (assignment at end of input) fails this check as well.
+    if html[begin : begin + 1] != "{":
+        return None
+
+    nesting = 0
+    inside_string = False
+    escaped = False
+    pos = begin
+    limit = len(html)
+    while pos < limit:
+        char = html[pos]
+        pos += 1  # from here on `pos` is one past the character just read
+        if inside_string:
+            if escaped:
+                escaped = False
+            elif char == "\\":
+                escaped = True
+            elif char == '"':
+                inside_string = False
+            continue
+        if char == '"':
+            inside_string = True
+        elif char == "{":
+            nesting += 1
+        elif char == "}":
+            nesting -= 1
+            if nesting == 0:
+                return html[begin:pos]
+    return None
+
+
+def _rehydrate(flat: list) -> object:
+    """Resolve a devalue-style flattened reference array into a nested object.
+
+    Index 0 is the root. Every ``int`` found directly inside a dict or list is
+    an index back into ``flat`` and is replaced by the node it points to; any
+    other inline value is copied through unchanged. ``bool`` values are never
+    treated as references even though ``bool`` subclasses ``int``. Indices
+    outside ``flat`` (including negative ones) resolve to None.
+
+    Containers are memoised *before* their children are resolved, so shared
+    and cyclic references resolve once and to the same object.
+
+    Returns the rehydrated root, or None when ``flat`` is empty.
+    """
+    cache: dict[int, object] = {}
+
+    def is_ref(value: object) -> bool:
+        # isinstance(True, int) holds in Python, so exclude bools explicitly;
+        # otherwise an inline true/false would be read as index 1/0.
+        return isinstance(value, int) and not isinstance(value, bool)
+
+    def resolve(idx: int) -> object:
+        if not is_ref(idx) or not 0 <= idx < len(flat):
+            return None
+        if idx in cache:
+            return cache[idx]
+        node = flat[idx]
+        if isinstance(node, dict):
+            out: dict = {}
+            cache[idx] = out  # register first so cycles terminate
+            for key, value in node.items():
+                out[key] = resolve(value) if is_ref(value) else value
+            return out
+        if isinstance(node, list):
+            arr: list = []
+            cache[idx] = arr  # register first so cycles terminate
+            for value in node:
+                arr.append(resolve(value) if is_ref(value) else value)
+            return arr
+        cache[idx] = node
+        return node
+
+    return resolve(0)
+
+
+def parse_detail_postcode(html: str) -> str | None:
+    """Extract a Rightmove property's TRUE full postcode from its detail HTML.
+
+    Pure and network-free so it is unit-testable: callers pass the page HTML.
+    Reads ``propertyData.address.outcode`` + ``.incode`` from window.__PAGE_MODEL
+    and returns a normalised full postcode (e.g. "SW9 0HD"), or None when the
+    page has no parseable address (the property location wrapper can be empty —
+    the caller then keeps the coordinate fallback). The joined outcode/incode
+    pair is passed through the shared postcode validator, so a pair that does
+    not form a structurally-valid postcode yields None.
+
+    Never raises on malformed page data: undecodable or pathologically nested
+    JSON also yields None.
+    """
+    if not html:
+        return None
+    literal = _extract_page_model_literal(html)
+    if not literal:
+        return None
+    try:
+        outer = json.loads(literal)
+        flat = json.loads(outer["data"])
+    except (ValueError, KeyError, TypeError, RecursionError):
+        # RecursionError: json.loads raises it on extremely deep nesting, and
+        # it is not a ValueError. Scraped HTML is untrusted input, so treat it
+        # like any other unparseable page.
+        return None
+    if not isinstance(flat, list) or not flat:
+        return None
+
+    try:
+        root = _rehydrate(flat)
+    except RecursionError:
+        # The rehydration walk is recursive; an absurdly deep reference chain
+        # must degrade to "no postcode" rather than escape to the caller.
+        return None
+    if not isinstance(root, dict):
+        return None
+    property_data = root.get("propertyData")
+    if not isinstance(property_data, dict):
+        return None
+    address = property_data.get("address")
+    if not isinstance(address, dict):
+        return None
+
+    outcode = address.get("outcode")
+    incode = address.get("incode")
+    if not isinstance(outcode, str) or not isinstance(incode, str):
+        return None
+    outcode, incode = outcode.strip(), incode.strip()
+    if not outcode or not incode:
+        return None
+
+    # Round-trip through the shared postcode validator/normaliser: this both
+    # canonicalises spacing and rejects an outcode/incode pair that doesn't form
+    # a structurally-valid UK postcode.
+    return extract_full_postcode(normalize_postcode(f"{outcode} {incode}"))
+
+
+# listingId -> true full postcode (or None when unavailable). Failures are
+# cached too, so a broken/duplicate listing is fetched at most once per run (the
+# same listing can reappear across overlapping outcode searches).
+_detail_postcode_cache: dict[str, str | None] = {}
+
+
+def _fetch_detail_postcode(client: httpx.Client, property_id: str) -> str | None:
+    """Return the true full postcode for one listing, fetching its detail page.
+
+    The outcome is memoised per listing id in ``_detail_postcode_cache`` —
+    None included — so each id triggers at most one GET. A non-200 response or
+    an ``httpx.HTTPError`` is logged at debug level and recorded as None, which
+    leaves the caller on the coordinate fallback. An empty id returns None
+    without touching the cache.
+    """
+    if not property_id:
+        return None
+    try:
+        return _detail_postcode_cache[property_id]
+    except KeyError:
+        pass  # not seen yet: fall through to the network fetch
+
+    detail_url = RIGHTMOVE_DETAIL_URL.format(id=property_id)
+    result: str | None = None
+    try:
+        response = client.get(detail_url, headers={"Accept": "text/html"})
+        if response.status_code != 200:
+            log.debug(
+                "Rightmove detail %s returned HTTP %d",
+                detail_url,
+                response.status_code,
+            )
+        else:
+            result = parse_detail_postcode(response.text)
+    except httpx.HTTPError as exc:
+        log.debug("Rightmove detail fetch failed %s: %s", detail_url, exc)
+
+    _detail_postcode_cache[property_id] = result
+    return result
+
+
 def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    """Look up Rightmove's internal ID for an outcode via typeahead API."""
    if outcode in outcode_cache:
@ -44,6 +219,31 @@ def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    return None


+def _detail_postcode_for(
+    client: httpx.Client,
+    prop: dict,
+    fetch_details: bool,
+    detail_budget: dict,
+) -> str | None:
+    """Return a listing's true postcode while respecting the per-outcode cap.
+
+    A listing already in ``_detail_postcode_cache`` is answered from the cache
+    and costs neither a budget slot nor a request. Otherwise one slot is taken
+    from ``detail_budget['remaining']`` (mutated in place), the detail page is
+    fetched, and the inter-request delay is applied. Returns None when detail
+    fetching is off, the listing has no id, or the budget is exhausted.
+    """
+    listing_id = str(prop.get("id") or "") if fetch_details else ""
+    if not listing_id:
+        return None
+
+    try:
+        return _detail_postcode_cache[listing_id]
+    except KeyError:
+        pass  # fresh listing: needs a budget slot and a GET
+
+    slots_left = detail_budget["remaining"]
+    if slots_left <= 0:
+        return None
+    detail_budget["remaining"] = slots_left - 1
+
+    fetched = _fetch_detail_postcode(client, listing_id)
+    time.sleep(DELAY_BETWEEN_PAGES)
+    return fetched
+
+
 def _paginate(
    client: httpx.Client,
    outcode_id: str,
@ -51,11 +251,19 @@ def _paginate(
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
+    fetch_details: bool = False,
+    detail_cap: int = 0,
 ) -> tuple[list[dict], int]:
-    """Paginate through search results. Returns (properties, result_count)."""
+    """Paginate through search results. Returns (properties, result_count).
+
+    When ``fetch_details`` is set, up to ``detail_cap`` listings per outcode have
+    their detail page fetched for the property's TRUE full postcode (see
+    ``parse_detail_postcode``); the rest fall back to coordinate-derived
+    postcodes."""
    properties = []
    index = 0
    result_count = 0
+    detail_budget = {"remaining": detail_cap}

    while True:
        params = {
@ -82,7 +290,12 @@ def _paginate(

        for prop in raw_props:
            try:
-                transformed = transform_property(prop, outcode, pc_index)
+                detail_postcode = _detail_postcode_for(
+                    client, prop, fetch_details, detail_budget
+                )
+                transformed = transform_property(
+                    prop, outcode, pc_index, detail_postcode=detail_postcode
+                )
            except Exception as exc:
                log.warning(
                    "Rightmove %s/%s property %s failed to transform: %s",
@ -127,7 +340,12 @@ def search_outcode(
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
 ) -> list[dict]:
-    """Paginate through unfiltered sale results for one outcode+channel."""
+    """Paginate through unfiltered sale results for one outcode+channel.
+
+    Each listing's detail page is fetched for the property's TRUE full postcode
+    (gated by ``RIGHTMOVE_FETCH_DETAILS`` and capped per outcode by
+    ``RIGHTMOVE_MAX_DETAILS_PER_OUTCODE``); listings beyond the cap keep the
+    coordinate-derived postcode."""
    properties, _ = _paginate(
        client,
        outcode_id,
@ -135,6 +353,8 @@ def search_outcode(
        channel_cfg,
        pc_index,
        max_properties=max_properties,
+        fetch_details=RIGHTMOVE_FETCH_DETAILS,
+        detail_cap=RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
    )

    if max_properties is not None and len(properties) >= max_properties:
--- a/finder/scraper.py
+++ b/finder/scraper.py
@ -15,6 +15,10 @@ from constants import (
    DATA_DIR,
    DELAY_BETWEEN_OUTCODES,
    LONDON_OUTCODE_PREFIXES,
+    ZOOPLA_DETAIL_BUDGET_FRACTION,
+    ZOOPLA_FETCH_DETAILS,
+    ZOOPLA_FETCHER,
+    ZOOPLA_MAX_DETAILS_PER_OUTCODE,
 )

 from http_client import make_client
@ -371,6 +375,36 @@ def _zoopla_outcode_timeout_seconds() -> int:
    return timeout


+def _zoopla_detail_cap() -> int:
+    """Return the per-outcode limit on Zoopla detail-page fetches.
+
+    Search cards carry only an outcode-level address, so the full
+    postcode/coordinates have to come from each listing's detail page. Capping
+    those extra page loads keeps an outcode inside
+    ZOOPLA_OUTCODE_TIMEOUT_SECONDS (the per-outcode SIGALRM budget covers the
+    detail fetches too). The result is ZOOPLA_MAX_DETAILS_PER_OUTCODE when
+    ZOOPLA_FETCH_DETAILS is enabled and 0 — meaning "no detail fetching" —
+    otherwise; both are set in constants.py."""
+    if not ZOOPLA_FETCH_DETAILS:
+        return 0
+    return ZOOPLA_MAX_DETAILS_PER_OUTCODE
+
+
+def _open_zoopla_detail_tab(page, detail_cap: int):
+    """Create a second tab in the search tab's context for detail fetches.
+
+    The new tab lives in the same persistent context as ``page``, so it picks
+    up the search tab's Cloudflare clearance cookies. None is returned when
+    ``detail_cap`` is not positive, or when the tab cannot be created — in
+    which case a warning is logged and the scrape carries on with
+    outcode-level postcodes instead of failing."""
+    if detail_cap > 0:
+        try:
+            return page.context.new_page()
+        except Exception as exc:
+            log.warning(
+                "Zoopla detail tab unavailable (%s); using outcode-level postcodes",
+                _exception_detail(exc),
+            )
+    return None
+
+
@contextmanager
 def _wall_clock_timeout(seconds: int, label: str):
    """SIGALRM-based wall-clock guard (POSIX). Raises OutcodeTimeout on expiry.
@ -438,6 +472,50 @@ def _close_zoopla_browser(browser, label: str) -> None:
        log.warning("%s browser force-close failed: %s", label, _exception_detail(exc))


+def _scrape_zoopla_flaresolverr(
+    outcodes: list[str],
+    pc_index: PostcodeSpatialIndex,
+    pc_coords: dict[str, tuple[float, float]],
+    results: dict[str, list[dict]],
+    errors: list[str],
+    max_properties_per_source: int | None,
+) -> None:
+    """Scrape Zoopla via the FlareSolverr sidecar (no browser/VNC).
+
+    One FlareSolverr session is opened for the whole run and each outcode is
+    searched in turn; listings are stored under the "zoopla" source in
+    ``results``. The loop stops as soon as the per-source cap is reached. A
+    failure on a single outcode is appended to ``errors`` and the run
+    continues; if the session itself cannot be opened, Zoopla is skipped with
+    one error entry.
+
+    The per-outcode detail cap comes from ``_zoopla_detail_cap()``, so the
+    ZOOPLA_FETCH_DETAILS switch applies here exactly as on the browser path.
+    """
+    from flaresolverr import FlareSolverrError, FlareSolverrSession
+    from zoopla_flaresolverr import search_outcode as fs_search_outcode
+
+    try:
+        session = FlareSolverrSession(session="zoopla")
+        session.__enter__()
+    except FlareSolverrError as exc:
+        errors.append(f"zoopla: FlareSolverr unavailable: {exc}")
+        log.warning("Zoopla skipped: FlareSolverr unavailable: %s", exc)
+        return
+
+    # Honour ZOOPLA_FETCH_DETAILS: the helper yields 0 when detail fetching is
+    # switched off, instead of always passing the raw maximum.
+    # NOTE(review): assumes fs_search_outcode treats detail_cap=0 as "no detail
+    # fetches" — confirm against zoopla_flaresolverr.search_outcode.
+    detail_cap = _zoopla_detail_cap()
+
+    try:
+        for outcode in outcodes:
+            remaining = _source_remaining(results, "zoopla", max_properties_per_source)
+            if remaining == 0:
+                log.info("Zoopla cap reached")
+                return
+            try:
+                props, _ = fs_search_outcode(
+                    outcode,
+                    pc_index,
+                    pc_coords,
+                    session,
+                    max_properties=remaining,
+                    detail_cap=detail_cap,
+                )
+                added = _store_properties(results, "zoopla", props, max_properties_per_source)
+                log.info("Zoopla %s: +%d", outcode, added)
+            except Exception as exc:  # noqa: BLE001 - one outcode must not kill the run
+                _record_error(errors, "zoopla", outcode, exc)
+            time.sleep(DELAY_BETWEEN_OUTCODES)
+    finally:
+        # Paired with the explicit __enter__ above; runs on early return too.
+        session.__exit__(None, None, None)
+
+
 def _scrape_zoopla(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
@ -446,6 +524,12 @@ def _scrape_zoopla(
    errors: list[str],
    max_properties_per_source: int | None,
 ) -> None:
+    if ZOOPLA_FETCHER == "flaresolverr":
+        _scrape_zoopla_flaresolverr(
+            outcodes, pc_index, pc_coords, results, errors, max_properties_per_source
+        )
+        return
+
    try:
        browser, page = _launch_zoopla_with_retries()
    except Exception as exc:
@ -454,6 +538,12 @@ def _scrape_zoopla(
        return

    outcode_timeout = _zoopla_outcode_timeout_seconds()
+    detail_cap = _zoopla_detail_cap()
+    detail_page = _open_zoopla_detail_tab(page, detail_cap)
+    # Spend at most a fraction of each outcode's budget on detail fetches so the
+    # SIGALRM guard never trips mid-outcode and discards already-collected
+    # search listings; the rest is left for search pagination and transform.
+    detail_budget_seconds = max(10.0, outcode_timeout * ZOOPLA_DETAIL_BUDGET_FRACTION)

    try:
        for outcode in outcodes:
@ -470,6 +560,9 @@ def _scrape_zoopla(
                            pc_index,
                            pc_coords,
                            max_properties=None,
+                            detail_page=detail_page,
+                            detail_cap=detail_cap,
+                            detail_budget_seconds=detail_budget_seconds,
                        )
                    added = _store_properties(
                        results,
@ -496,6 +589,8 @@ def _scrape_zoopla(
                    _close_zoopla_browser(browser, f"zoopla {outcode}")
                    try:
                        browser, page = _launch_zoopla_with_retries()
+                        # The old context (and its detail tab) is gone; reopen one.
+                        detail_page = _open_zoopla_detail_tab(page, detail_cap)
                        log.info("Zoopla %s retrying with fresh browser", outcode)
                    except Exception as relaunch_exc:
                        _record_error(errors, "zoopla", outcode, relaunch_exc)
@ -503,6 +598,11 @@ def _scrape_zoopla(

            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
+        if detail_page is not None:
+            try:
+                detail_page.close()
+            except Exception:
+                pass
        _close_zoopla_browser(browser, "zoopla final")


--- a/finder/storage.py
+++ b/finder/storage.py
@ -126,6 +126,14 @@ def write_parquet(properties: list[dict], path: Path) -> None:
            "Address per Property Register": [
                p["Address per Property Register"] for p in properties
            ],
+            # UPRN (when the scraper recovered it) keys an exact listing->EPC
+            # join; Property number or name is the house identifier for the
+            # Price-Paid address join. Both are None for sources/listings without
+            # a detail-page fetch.
+            "UPRN": [p.get("UPRN") for p in properties],
+            "Property number or name": [
+                p.get("Property number or name") for p in properties
+            ],
            "Leasehold/Freehold": [p["Leasehold/Freehold"] for p in properties],
            "Property type": [p["Property type"] for p in properties],
            "Property sub-type": [p["Property sub-type"] for p in properties],
@ -149,6 +157,8 @@ def write_parquet(properties: list[dict], path: Path) -> None:
            "Inferred postcode": pl.Utf8,
            "Listing raw address": pl.Utf8,
            "Address per Property Register": pl.Utf8,
+            "UPRN": pl.Utf8,
+            "Property number or name": pl.Utf8,
            "Leasehold/Freehold": pl.Utf8,
            "Property type": pl.Utf8,
            "Property sub-type": pl.Utf8,
--- a/finder/test_onthemarket.py
+++ b/finder/test_onthemarket.py
@ -0,0 +1,206 @@
+"""Tests for the OnTheMarket scraper's detail-page postcode recovery.
+
+`parse_detail_postcode` is pure (takes the detail-page HTML, returns a postcode
+or None), so these tests use a trimmed but faithful copy of a real OnTheMarket
+detail page's `__NEXT_DATA__` payload. The fixture mirrors the live structure:
+the property's own postcode lives in the analytics dataLayer
+(`props.initialReduxState.metadata.dataLayer.postcode`) while the agent's office
+postcode sits separately under `…property.agent.postcode` — the trap we must not
+fall into.
+"""
+
+import json
+
+import onthemarket
+from onthemarket import parse_detail_postcode, transform_property
+
+
+class _StubIndex:
+    """Test double for PostcodeSpatialIndex: every lookup yields one postcode."""
+
+    def __init__(self, postcode: str | None):
+        # The value handed back by every nearest() call, whatever the point.
+        self._fixed = postcode
+
+    def nearest(self, lat: float, lng: float) -> str | None:
+        """Ignore the coordinates and return the configured postcode."""
+        return self._fixed
+
+
+def _detail_html(
+    *,
+    property_id: int = 19522441,
+    datalayer_postcode: str = "SE5 9AA",
+    agent_postcode: str = "SE5 8RS",
+) -> str:
+    """Render a minimal detail page with a real-shaped __NEXT_DATA__ script.
+
+    The three keyword arguments set the dataLayer property id, the dataLayer
+    postcode (the property's own) and the agent block's postcode (the decoy)."""
+    # Analytics dataLayer: carries the property's own unit postcode.
+    data_layer = {
+        "page-type": "details-section",
+        "property-type": "homes",
+        "postcode": datalayer_postcode,
+        "property-id": property_id,
+        "price": "275,000",
+        "addressline_2": "Padfield Road",
+    }
+    # Agent block: holds the AGENT'S office postcode — the trap that
+    # parse_detail_postcode must not return.
+    agent = {
+        "address": "29 Denmark Hill, Camberwell\nLondon\nSE5 8RS",
+        "postcode": agent_postcode,
+    }
+    listing = {
+        "displayAddress": "Padfield Road, London, SE5",
+        "location": {"lon": -0.100233, "lat": 51.466129},
+        "agent": agent,
+    }
+    redux_state = {"metadata": {"dataLayer": data_layer}, "property": listing}
+    next_data = {"props": {"initialReduxState": redux_state}}
+
+    script_open = '<script id="__NEXT_DATA__" type="application/json">'
+    pieces = (
+        "<html><body>",
+        script_open,
+        json.dumps(next_data),
+        "</script></body></html>",
+    )
+    return "".join(pieces)
+
+
+# ---------------------------------------------------------------------------
+# parse_detail_postcode
+# ---------------------------------------------------------------------------
+
+
+def test_parse_returns_property_postcode_not_agent():
+    """The dataLayer postcode is returned, never the agent's office postcode."""
+    page = _detail_html(datalayer_postcode="SE5 9AA", agent_postcode="SE5 8RS")
+    recovered = parse_detail_postcode(page, "19522441")
+    assert recovered == "SE5 9AA"
+
+
+def test_parse_normalizes_spacing():
+    """A lower-case, unspaced postcode comes back upper-cased and spaced."""
+    page = _detail_html(datalayer_postcode="se59aa")
+    recovered = parse_detail_postcode(page, "19522441")
+    assert recovered == "SE5 9AA"
+
+
+def test_parse_ignores_mismatched_property_id():
+    """A dataLayer belonging to another listing id must not be trusted."""
+    # The page describes property 19522441, but the caller asks about
+    # 99999999, so the postcode has to be refused.
+    page = _detail_html(property_id=19522441)
+    recovered = parse_detail_postcode(page, "99999999")
+    assert recovered is None
+
+
+def test_parse_accepts_when_no_listing_id_given():
+    """With no listing id to check against, the dataLayer postcode is used."""
+    page = _detail_html(datalayer_postcode="SE5 9AA")
+    recovered = parse_detail_postcode(page, None)
+    assert recovered == "SE5 9AA"
+
+
+def test_parse_handles_missing_postcode():
+    """An empty dataLayer postcode yields None."""
+    page = _detail_html(datalayer_postcode="")
+    recovered = parse_detail_postcode(page, "19522441")
+    assert recovered is None
+
+
+def test_parse_handles_no_next_data():
+    """HTML without a __NEXT_DATA__ script yields None."""
+    page = "<html><body>no script here</body></html>"
+    assert parse_detail_postcode(page, "1") is None
+
+
+def test_parse_handles_empty_html():
+    """An empty document yields None."""
+    recovered = parse_detail_postcode("", "1")
+    assert recovered is None
+
+
+def test_parse_handles_malformed_json():
+    """A __NEXT_DATA__ script whose body is not valid JSON yields None."""
+    page = '<script id="__NEXT_DATA__" type="application/json">{not json}</script>'
+    recovered = parse_detail_postcode(page, "1")
+    assert recovered is None
+
+
+def test_parse_handles_missing_datalayer():
+    """Valid __NEXT_DATA__ whose metadata has no dataLayer yields None."""
+    payload = json.dumps({"props": {"initialReduxState": {"metadata": {}}}})
+    page = (
+        '<script id="__NEXT_DATA__" type="application/json">'
+        + payload
+        + "</script>"
+    )
+    assert parse_detail_postcode(page, "1") is None
+
+
+# ---------------------------------------------------------------------------
+# transform_property — detail postcode wiring + trust rule
+# ---------------------------------------------------------------------------
+
+
+# Raw search-card listing shared by the transform_property tests below. Its
+# address ends in the outcode only ("SE5"), so any full postcode in the result
+# must come from the detail postcode or from the stub spatial index.
+_RAW_LISTING = {
+    "id": "19522441",
+    "address": "Padfield Road, London, SE5",
+    "location": {"lon": -0.100233, "lat": 51.466129},
+    "bedrooms": 2,
+    "bathrooms": 1,
+    "price": "£275,000",
+    "humanised-property-type": "Apartment",
+    "features": ["Tenure: Leasehold (99 years remaining)"],
+    "details-url": "/details/19522441/",
+}
+
+
+def test_transform_uses_trusted_detail_postcode():
+    """A detail postcode in the same outcode as the coordinates is trusted."""
+    # Detail says SE5 9AA, the spatial index says SE5 1AA: the outcodes agree,
+    # so the more precise detail postcode wins and is labelled detail_address.
+    spatial = _StubIndex("SE5 1AA")
+    row = transform_property(_RAW_LISTING, spatial, detail_postcode="SE5 9AA")
+    assert row is not None
+    assert row["Postcode"] == "SE5 9AA"
+    assert row["Postcode source"] == "detail_address"
+
+
+def test_transform_rejects_detail_postcode_on_outcode_mismatch():
+    """A detail postcode in a different outcode is discarded."""
+    # Detail says SW9 6BZ while the spatial index says SE5 1AA: the outcodes
+    # differ, so the coordinate-derived postcode is kept instead.
+    spatial = _StubIndex("SE5 1AA")
+    row = transform_property(_RAW_LISTING, spatial, detail_postcode="SW9 6BZ")
+    assert row is not None
+    assert row["Postcode"] == "SE5 1AA"
+    assert row["Postcode source"] == "coordinates"
+
+
+def test_transform_without_detail_postcode_uses_coordinates():
+    """With no detail postcode the coordinate-nearest postcode is used."""
+    spatial = _StubIndex("SE5 1AA")
+    row = transform_property(_RAW_LISTING, spatial, detail_postcode=None)
+    assert row is not None
+    assert row["Postcode"] == "SE5 1AA"
+    assert row["Postcode source"] == "coordinates"
+    # OnTheMarket exposes neither a UPRN nor a house number, so both stay None.
+    assert row["UPRN"] is None
+    assert row["Property number or name"] is None
+
+
+def test_transform_detail_postcode_via_search_address_outcode():
+    """A full postcode in the card address keeps the "address" source."""
+    # The card address already ends in a full postcode that agrees with the
+    # coordinates; with no detail postcode supplied that existing path must
+    # still win, i.e. detail recovery does not regress it.
+    listing = {**_RAW_LISTING, "address": "Padfield Road, London, SE5 1AA"}
+    spatial = _StubIndex("SE5 1AA")
+    row = transform_property(listing, spatial, detail_postcode=None)
+    assert row["Postcode"] == "SE5 1AA"
+    assert row["Postcode source"] == "address"
+
+
+# ---------------------------------------------------------------------------
+# _fetch_detail_postcode caching (no real network)
+# ---------------------------------------------------------------------------
+
+
+def test_fetch_detail_postcode_is_cached(monkeypatch):
+    """A cached listing id is served without touching the HTTP client."""
+    # monkeypatch.setitem is undone by pytest at teardown, so the module-level
+    # cache is restored even if the assertion below fails, and entries that
+    # belong to other tests are left untouched.
+    monkeypatch.setitem(onthemarket._detail_postcode_cache, "19522441", "SE5 9AA")
+
+    def _boom(*args, **kwargs):  # pragma: no cover - must never be called
+        raise AssertionError("network was hit despite a cached value")
+
+    # Any httpx use would explode; the cache hit must short-circuit first.
+    result = onthemarket._fetch_detail_postcode(
+        client=type("C", (), {"get": _boom})(),
+        details_url="/details/19522441/",
+        listing_id="19522441",
+    )
+    assert result == "SE5 9AA"
--- a/finder/test_rightmove.py
+++ b/finder/test_rightmove.py
@ -0,0 +1,113 @@
+"""Tests for the Rightmove detail-page postcode extractor.
+
+The search API only returns an outcode-level ``displayAddress``; the property's
+TRUE full postcode lives on its detail page inside ``window.__PAGE_MODEL`` as
+``propertyData.address.{outcode, incode}``. ``parse_detail_postcode`` recovers
+it. These tests build a faithful __PAGE_MODEL: a devalue-style flattened object
+graph whose ``data`` field is a JSON STRING of a flat array where every integer
+inside a container is an index reference into that same array.
+"""
+
+import json
+
+from rightmove import _extract_page_model_literal, parse_detail_postcode
+
+
+def _page_model_html(flat: list, *, encoding: str = "json") -> str:
+    """Embed a flattened object-graph array in a detail-page style <script>.
+
+    Produces ``window.__PAGE_MODEL = {"data": "<json array>", "encoding": ...}``
+    where the array is JSON-encoded a second time inside the outer object, so
+    its quotes arrive escaped — as on the live page."""
+    compact = (",", ":")
+    inner_json = json.dumps(flat, separators=compact)
+    model_json = json.dumps(
+        {"data": inner_json, "encoding": encoding}, separators=compact
+    )
+    lines = [
+        "<html><head></head><body>",
+        "<script>",
+        "    window.__PAGE_MODEL = " + model_json + ";",
+        "</script>",
+        "</body></html>",
+    ]
+    return "\n".join(lines)
+
+
+# A faithful slice of a real listing: root -> propertyData -> address, with a
+# decoy nearestStations array (which carries NO postcodes on the live page) to
+# prove the parser anchors on the property's own address, not a nearby POI.
+# Every int value below is an index into this same list, so the position of
+# each entry matters: the trailing "# N:" comments give each entry's index.
+_FLAT_SW9 = [
+    {"propertyData": 1},  # 0: root
+    {
+        "id": "89089584",
+        "address": 2,
+        "location": 4,
+        "nearestStations": 6,
+    },  # 1: propertyData
+    {
+        "displayAddress": "Caldwell Street, Stockwell",
+        "countryCode": "GB",
+        "ukCountry": "England",
+        "outcode": "SW9",
+        "incode": "0HD",
+    },  # 2: address
+    None,  # 3: filler (not referenced by any entry)
+    {
+        "latitude": 51.477238,
+        "longitude": -0.116819,
+        "pinType": "ACCURATE_POINT",
+    },  # 4: location
+    None,  # 5: filler (not referenced by any entry)
+    [7, 8],  # 6: nearestStations (references)
+    {"name": "Oval Station", "distance": 0.36},  # 7: station, no postcode
+    {"name": "Stockwell Station", "distance": 0.41},  # 8: station, no postcode
+]
+
+
+def test_parses_full_postcode_from_outcode_and_incode() -> None:
+    """Outcode "SW9" plus incode "0HD" are joined into "SW9 0HD"."""
+    page = _page_model_html(_FLAT_SW9)
+    recovered = parse_detail_postcode(page)
+    assert recovered == "SW9 0HD"
+
+
+def test_extract_page_model_literal_brace_matches_nested_object() -> None:
+    """The extracted literal spans the whole object, not a prefix of it."""
+    # Braces inside the escaped data string must not end the match early.
+    literal = _extract_page_model_literal(_page_model_html(_FLAT_SW9))
+    assert literal is not None
+    assert literal.startswith("{")
+    assert literal.endswith("}")
+    # The literal parses back to a dict with exactly the expected keys.
+    top_level_keys = set(json.loads(literal))
+    assert top_level_keys == {"data", "encoding"}
+
+
+def test_normalises_unspaced_incode() -> None:
+    """A lower-case outcode/incode pair is normalised to "E20 1FH"."""
+    flat = [node.copy() if isinstance(node, dict) else node for node in _FLAT_SW9]
+    flat[2] = dict(_FLAT_SW9[2], outcode="e20", incode="1fh")
+    recovered = parse_detail_postcode(_page_model_html(flat))
+    assert recovered == "E20 1FH"
+
+
+def test_returns_none_when_address_missing() -> None:
+    """propertyData without an address gives None rather than raising."""
+    # Some listings have an empty/absent location wrapper; the caller then
+    # keeps the coordinate fallback.
+    root = {"propertyData": 1}
+    property_data = {"id": "1", "location": 2}
+    location = {"latitude": 51.5, "longitude": -0.1}
+    page = _page_model_html([root, property_data, location])
+    assert parse_detail_postcode(page) is None
+
+
+def test_returns_none_when_incode_blank() -> None:
+    """An empty incode means there is no full postcode to return."""
+    flat = [node.copy() if isinstance(node, dict) else node for node in _FLAT_SW9]
+    flat[2] = dict(_FLAT_SW9[2], incode="")
+    recovered = parse_detail_postcode(_page_model_html(flat))
+    assert recovered is None
+
+
+def test_returns_none_for_non_postcode_pair() -> None:
+    """An outcode/incode pair that is not postcode-shaped is rejected."""
+    flat = [node.copy() if isinstance(node, dict) else node for node in _FLAT_SW9]
+    flat[2] = dict(_FLAT_SW9[2], outcode="NOTAPC", incode="ZZ")
+    recovered = parse_detail_postcode(_page_model_html(flat))
+    assert recovered is None
+
+
+def test_returns_none_without_page_model() -> None:
+    """Empty, model-less and malformed pages all give None."""
+    # The last page has a __PAGE_MODEL whose data field is not valid JSON; it
+    # must degrade gracefully like the other two.
+    broken = '<script>window.__PAGE_MODEL = {"data":"[not json"};</script>'
+    pages = ("", "<html><body>no model</body></html>", broken)
+    for page in pages:
+        assert parse_detail_postcode(page) is None
--- a/finder/test_transform.py
+++ b/finder/test_transform.py
@ -1,13 +1,19 @@
 from transform import (
+    build_register_address,
    clean_listing_address,
    extract_full_postcode,
+    extract_outcode,
+    resolve_listing_postcode,
    transform_property,
 )


 class StubPostcodeIndex:
+    def __init__(self, postcode: str = "SW1A 9ZZ") -> None:
+        self._postcode = postcode
+
    def nearest(self, lat: float, lng: float) -> str:
-        return "SW1A 9ZZ"
+        return self._postcode


 def test_extract_full_postcode_normalizes_spacing() -> None:
@ -24,6 +30,46 @@ def test_clean_listing_address_removes_postcode_and_outcode_suffixes() -> None:
    assert clean_listing_address("Kings Avenue, Bromley") == "Kings Avenue, Bromley"


+def test_build_register_address_prepends_house_number_or_name() -> None:
+    """``build_register_address`` adds the house identifier only when it helps."""
+    cases = [
+        # House number/name prepended, with the trailing outcode/postcode stripped.
+        ("South Street, Bromley BR1", "12", "12, South Street, Bromley"),
+        (
+            "Riverside, Martham NR29",
+            "Martham Mill",
+            "Martham Mill, Riverside, Martham",
+        ),
+        # No number/name -> identical to the plain cleaned address.
+        ("Kings Avenue, Bromley", None, "Kings Avenue, Bromley"),
+        # Already starts with the number/name -> no duplication.
+        ("12 South Street, Bromley", "12", "12 South Street, Bromley"),
+        # Empty/whitespace number/name is ignored.
+        ("Kings Avenue, Bromley", "  ", "Kings Avenue, Bromley"),
+    ]
+    for raw_address, number_or_name, expected in cases:
+        assert build_register_address(raw_address, number_or_name) == expected
+
+
+def test_extract_outcode() -> None:
+    """Outcodes come back upper-cased, with or without the inner space."""
+    for postcode, expected in (
+        ("SW1A 2AA", "SW1A"),
+        ("n4 2ha", "N4"),
+        ("SW1A2AA", "SW1A"),
+    ):
+        assert extract_outcode(postcode) == expected
+    # Missing or empty input has no outcode.
+    for missing in (None, ""):
+        assert extract_outcode(missing) is None
+
+
+def test_resolve_listing_postcode() -> None:
+    """The extracted postcode wins only when its outcode matches the inferred one."""
+    inferred = "SW1A 9ZZ"
+    fallback = (inferred, "coordinates")
+    cases = [
+        # Outcode matches -> trust the more precise extracted postcode.
+        ("SW1A 2AA", ("SW1A 2AA", "address")),
+        # Outcode mismatch -> fall back to the spatially-correct inferred postcode.
+        ("E14 9SS", fallback),
+        # Well-formed but fabricated postcode in a different outcode is rejected.
+        ("ZZ9 9ZZ", fallback),
+        # No extracted postcode -> inferred is authoritative.
+        (None, fallback),
+    ]
+    for extracted, expected in cases:
+        assert resolve_listing_postcode(extracted, inferred) == expected
+
+
 def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
    prop = {
        "id": "123",
@ -46,3 +92,84 @@ def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
    assert result["Inferred postcode"] == "SW1A 9ZZ"
    assert result["Listing raw address"] == "Flat 2, 10 Downing Street, SW1A 2AA"
    assert result["Address per Property Register"] == "Flat 2, 10 Downing Street"
+
+
+def test_rightmove_transform_rejects_postcode_from_wrong_outcode() -> None:
+    """A display-address postcode in another outcode loses to the coordinates."""
+    prop = dict(
+        id="124",
+        location={"latitude": 51.5, "longitude": -0.1},
+        price={"amount": 750000, "displayPrices": []},
+        propertySubType="Terraced",
+        bedrooms=3,
+        bathrooms=1,
+        keyFeatures=[],
+        propertyUrl="/properties/124",
+        # Address postcode is in a different outcode than the coordinate-nearest one.
+        displayAddress="10 Downing Street, E14 9SS",
+    )
+
+    result = transform_property(prop, "SW1A", StubPostcodeIndex())
+
+    assert result is not None
+    # The spatially-correct inferred postcode wins over the mismatching extracted one.
+    observed = (
+        result["Postcode"],
+        result["Postcode source"],
+        result["Extracted postcode"],
+    )
+    assert observed == ("SW1A 9ZZ", "coordinates", "E14 9SS")
+
+
+def _rightmove_prop() -> dict:
+    """Return a fresh, minimal Rightmove search listing with an SW9 display address."""
+    listing = dict(
+        id="200",
+        location={"latitude": 51.5, "longitude": -0.1},
+        price={"amount": 750000, "displayPrices": []},
+        propertySubType="Terraced",
+        bedrooms=3,
+        bathrooms=1,
+        keyFeatures=[],
+        propertyUrl="/properties/200",
+        # Search API only ever exposes the outcode in the display address.
+        displayAddress="Caldwell Street, Stockwell, SW9",
+    )
+    return listing
+
+
+def test_rightmove_transform_prefers_detail_postcode() -> None:
+    """The detail page's full postcode beats the coordinate-nearest guess."""
+    # Detail postcode and coordinate-nearest postcode share the SW9 outcode.
+    nearest_index = StubPostcodeIndex("SW9 7AA")
+    result = transform_property(
+        _rightmove_prop(), "SW9", nearest_index, detail_postcode="SW9 0HD"
+    )
+
+    assert result is not None
+    chosen = (result["Postcode"], result["Postcode source"])
+    assert chosen == ("SW9 0HD", "detail_address")
+    # The coordinate inference is still surfaced separately.
+    assert result["Inferred postcode"] == "SW9 7AA"
+
+
+def test_rightmove_transform_rejects_detail_postcode_from_wrong_outcode() -> None:
+    """A detail postcode in a different outcode must not relocate the listing."""
+    nearest_index = StubPostcodeIndex("SW9 7AA")
+    result = transform_property(
+        _rightmove_prop(), "SW9", nearest_index, detail_postcode="E14 9SS"
+    )
+
+    assert result is not None
+    # The coordinate postcode wins instead of the disagreeing detail value.
+    chosen = (result["Postcode"], result["Postcode source"])
+    assert chosen == ("SW9 7AA", "coordinates")
+
+
+def test_rightmove_transform_without_detail_keeps_coordinate_logic() -> None:
+    """Omitting ``detail_postcode`` leaves the coordinate-nearest result in place."""
+    nearest_index = StubPostcodeIndex("SW9 7AA")
+    result = transform_property(_rightmove_prop(), "SW9", nearest_index)
+
+    assert result is not None
+    chosen = (result["Postcode"], result["Postcode source"])
+    assert chosen == ("SW9 7AA", "coordinates")
--- a/finder/test_zoopla.py
+++ b/finder/test_zoopla.py
@ -0,0 +1,288 @@
+from zoopla import _detail_cache_key, parse_detail_geo, transform_property
+
+
+def test_detail_cache_key_uses_listing_id() -> None:
+    """The cache key is the numeric listing id, or the URL when there is none."""
+    cases = {
+        "/for-sale/details/59888978/": "59888978",
+        "https://www.zoopla.co.uk/for-sale/details/59888978/": "59888978",
+        # No id in the URL -> fall back to the URL itself as the key.
+        "/for-sale/property/br1/": "/for-sale/property/br1/",
+    }
+    for url, expected_key in cases.items():
+        assert _detail_cache_key(url) == expected_key
+
+
+class StubPostcodeIndex:
+    """Spatial index stub whose nearest-lookup returns a fixed postcode."""
+
+    def __init__(self, postcode: str = "BR1 2AB") -> None:
+        # The single answer handed back for every lookup.
+        self._fixed_postcode = postcode
+
+    def nearest(self, lat: float, lng: float) -> str:
+        """Return the configured postcode regardless of the coordinates."""
+        return self._fixed_postcode
+
+
+# London-ish postcodes with coordinates, plus the Norfolk sample used by the
+# verified detail-page snippet (well inside the England bounds check).
+# Maps full postcode -> (latitude, longitude); the transform tests below pass
+# it to transform_property as its third positional argument.
+PC_COORDS = {
+    "BR1 2AB": (51.40, 0.01),
+    "SW1A 1AA": (51.50, -0.14),
+    "NR29 4RG": (52.716014, 1.614495),
+}
+
+# Verified RSC `location` object (listing 59888978), as it appears escaped inside
+# a self.__next_f flight chunk in page.content().
+# Each \\" in the literals below is a backslash followed by a double quote in
+# the resulting string, i.e. the JSON is embedded in its string-escaped form.
+_LOCATION_ESCAPED = (
+    '<script>self.__next_f.push([1,"...'
+    '\\"location\\":{\\"outcode\\":\\"NR29\\",'
+    '\\"coordinates\\":{\\"latitude\\":52.716014,\\"longitude\\":1.614495},'
+    '\\"uprn\\":\\"10023461458\\",\\"postalCode\\":\\"NR29 4RG\\",'
+    '\\"propertyNumberOrName\\":\\"Martham Mill\\"}'
+    '..."])</script>'
+)
+
+
+def test_parse_detail_geo_location_object_escaped() -> None:
+    """The escaped flight-chunk ``location`` object parses into a complete geo dict."""
+    expected = dict(
+        lat=52.716014,
+        lng=1.614495,
+        postcode="NR29 4RG",
+        outcode="NR29",
+        source="detail_location",
+        uprn="10023461458",
+        number_or_name="Martham Mill",
+        # No `address` twin in this snippet, so there is no full street address.
+        full_address=None,
+    )
+    assert parse_detail_geo(_LOCATION_ESCAPED, search_outcode="NR29") == expected
+
+
+def test_parse_detail_geo_location_object_unescaped() -> None:
+    """A plain (unescaped) ``location`` object is recognised as well."""
+    location_json = (
+        '"location":{"outcode":"NR29",'
+        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
+        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
+    )
+    parsed = parse_detail_geo(location_json)
+    assert parsed is not None
+    assert (parsed["source"], parsed["postcode"]) == ("detail_location", "NR29 4RG")
+
+
+def test_parse_detail_geo_address_twin() -> None:
+    """With only the ``address`` twin present, its fields populate the geo dict."""
+    twin = (
+        '"address":{"fullAddress":"Riverside, Martham NR29",'
+        '"latitude":52.716014,"longitude":1.614495,'
+        '"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}'
+    )
+    parsed = parse_detail_geo(twin)
+    assert parsed is not None
+    assert parsed["source"] == "detail_address_obj"
+    assert parsed["lat"] == 52.716014
+    assert parsed["lng"] == 1.614495
+    assert parsed["postcode"] == "NR29 4RG"
+    assert parsed["uprn"] == "10023461458"
+    assert parsed["full_address"] == "Riverside, Martham NR29"
+
+
+def test_parse_detail_geo_merges_location_uprn_with_address_full_address() -> None:
+    """A twin sharing the location's uprn contributes its ``fullAddress``."""
+    # Real detail pages carry both wrappers: the `location` object holds the
+    # uprn + house number/name ...
+    location = (
+        '"location":{"outcode":"NR29",'
+        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
+        '"uprn":"10023461458","postalCode":"NR29 4RG",'
+        '"propertyNumberOrName":"Martham Mill"}'
+    )
+    # ... and the `address` twin holds the full street address. They share a
+    # uprn, so the twin's fullAddress is attached.
+    twin = (
+        '"address":{"fullAddress":"Riverside, Martham NR29",'
+        '"latitude":52.716014,"longitude":1.614495,'
+        '"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}'
+    )
+    parsed = parse_detail_geo(location + twin)
+    assert parsed is not None
+    observed = {
+        key: parsed[key] for key in ("source", "uprn", "number_or_name", "full_address")
+    }
+    assert observed == {
+        "source": "detail_location",
+        "uprn": "10023461458",
+        "number_or_name": "Martham Mill",
+        "full_address": "Riverside, Martham NR29",
+    }
+
+
+def test_parse_detail_geo_does_not_borrow_comparable_full_address() -> None:
+    """An ``address`` twin with a different uprn never supplies the full address."""
+    location = (
+        '"location":{"outcode":"NR29",'
+        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
+        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
+    )
+    # The only `address` twin on the page belongs to a different uprn (a
+    # comparable listing).
+    comparable_twin = (
+        '"address":{"fullAddress":"Some Comparable, Elsewhere EN2",'
+        '"latitude":51.65,"longitude":-0.08,"uprn":"99999999"}'
+    )
+    parsed = parse_detail_geo(location + comparable_twin)
+    assert parsed is not None
+    assert parsed["uprn"] == "10023461458"
+    # With a uprn to match on, the unrelated twin is not borrowed: full_address
+    # stays None rather than grabbing the wrong street.
+    assert parsed["full_address"] is None
+
+
+def test_parse_detail_geo_ignores_poi_coordinates() -> None:
+    """Loose POI coordinates are skipped in favour of the ``location`` wrapper."""
+    # A charger POI whose coordinates are NOT wrapped in a "location" object.
+    charger_poi = (
+        '"name":"Martham Community Centre","numberOfConnectors":2,'
+        '"postcode":"NR29 4SN","coordinates":{"latitude":52.699379,"longitude":1.62921}'
+    )
+    # The property's own "location" wrapper, appearing after the POI.
+    property_location = (
+        '"location":{"outcode":"NR29",'
+        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
+        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
+    )
+    parsed = parse_detail_geo(charger_poi + property_location)
+    assert parsed is not None
+    assert parsed["source"] == "detail_location"
+    # The property's coords win, not the community centre's.
+    assert parsed["lat"] == 52.716014
+    assert parsed["lng"] == 1.614495
+    assert parsed["postcode"] == "NR29 4RG"
+
+
+def test_parse_detail_geo_prefers_location_matching_search_outcode() -> None:
+    """With two ``location`` objects, the search outcode decides which one wins."""
+    # Page embeds two location objects (e.g. a comparable then the property).
+    comparable = (
+        '"location":{"outcode":"EN2",'
+        '"coordinates":{"latitude":51.65,"longitude":-0.08},'
+        '"postalCode":"EN2 6SN"}'
+    )
+    target = (
+        '"location":{"outcode":"NR29",'
+        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
+        '"postalCode":"NR29 4RG"}'
+    )
+    page = comparable + target
+
+    # With a search outcode, the location in that outcode is preferred.
+    with_outcode = parse_detail_geo(page, search_outcode="NR29")
+    assert with_outcode is not None
+    assert with_outcode["postcode"] == "NR29 4RG"
+
+    # Without one, the first (document order = primary listing) is returned.
+    without_outcode = parse_detail_geo(page)
+    assert without_outcode is not None
+    assert without_outcode["postcode"] == "EN2 6SN"
+
+
+def test_parse_detail_geo_rejects_out_of_england() -> None:
+    """A ``location`` whose coordinates are (10.0, 10.0) is rejected outright."""
+    far_away = (
+        '"location":{"outcode":"NR29",'
+        '"coordinates":{"latitude":10.0,"longitude":10.0},'
+        '"uprn":"1","postalCode":"NR29 4RG"}'
+    )
+    parsed = parse_detail_geo(far_away)
+    assert parsed is None
+
+
+def test_parse_detail_geo_drops_inconsistent_postcode() -> None:
+    """Coordinates survive, but a postcode contradicting the outcode is dropped."""
+    # postalCode outcode (AB12) disagrees with the object's own outcode (NR29).
+    conflicting = (
+        '"location":{"outcode":"NR29",'
+        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
+        '"uprn":"1","postalCode":"AB12 3CD"}'
+    )
+    parsed = parse_detail_geo(conflicting)
+    assert parsed is not None
+    # Keep the coordinates, drop the untrustworthy postcode.
+    assert parsed["lat"] == 52.716014
+    assert parsed["postcode"] is None
+
+
+def test_parse_detail_geo_returns_none_for_garbage() -> None:
+    """Pages without a property location/address wrapper parse to ``None``."""
+    # Coordinates that are not inside a property location/address wrapper (e.g.
+    # only an unwrapped POI) yield nothing — safe degradation to the outcode.
+    unwrapped_poi = '"name":"X","coordinates":{"latitude":51.5,"longitude":-0.1}'
+    for html in ("<html><body>no data here</body></html>", "", unwrapped_poi):
+        assert parse_detail_geo(html) is None
+
+
+def _raw(**overrides) -> dict:
+    """Return a baseline raw Zoopla listing dict, with ``overrides`` applied on top."""
+    defaults = {
+        "id": "123",
+        "url": "/for-sale/details/123/",
+        "address": "South Street, Bromley BR1",
+        "price": 500000,
+        "beds": 2,
+        "baths": 1,
+        "property_type": "Flat",
+    }
+    # Later keys win, so overrides replace defaults and new keys are appended.
+    return {**defaults, **overrides}
+
+
+def test_transform_uses_detail_coordinates_with_agreeing_postcode() -> None:
+    """A detail postcode sharing the nearest postcode's outcode is trusted."""
+    detail = dict(lat=51.401, lng=0.011, postcode="BR1 3CD", outcode="BR1")
+    nearest_index = StubPostcodeIndex("BR1 2AB")
+    result = transform_property(
+        _raw(), nearest_index, PC_COORDS, search_outcode="BR1", detail=detail
+    )
+    assert result is not None
+    # Extracted detail postcode agrees with the coordinate-nearest outcode -> trusted.
+    chosen = (result["Postcode"], result["Postcode source"])
+    assert chosen == ("BR1 3CD", "detail_address")
+    assert result["Inferred postcode"] == "BR1 2AB"
+    assert (result["lat"], result["lon"]) == (51.401, 0.011)
+
+
+def test_transform_uses_nearest_when_detail_postcode_mismatches() -> None:
+    """A detail postcode in another outcode loses to the coordinate-nearest one."""
+    detail = dict(lat=51.401, lng=0.011, postcode="E14 9SS", outcode="E14")
+    nearest_index = StubPostcodeIndex("BR1 2AB")
+    result = transform_property(
+        _raw(), nearest_index, PC_COORDS, search_outcode="BR1", detail=detail
+    )
+    assert result is not None
+    # Mismatching detail postcode is rejected in favour of the spatial value.
+    chosen = (result["Postcode"], result["Postcode source"])
+    assert chosen == ("BR1 2AB", "detail_coordinates")
+
+
+def test_transform_geocodes_detail_postcode_without_coordinates() -> None:
+    """Without detail coordinates, the detail postcode is geocoded via ``PC_COORDS``."""
+    detail = dict(lat=None, lng=None, postcode="SW1A 1AA", outcode="SW1A")
+    nearest_index = StubPostcodeIndex()
+    result = transform_property(
+        _raw(), nearest_index, PC_COORDS, search_outcode="BR1", detail=detail
+    )
+    assert result is not None
+    chosen = (result["Postcode"], result["Postcode source"])
+    assert chosen == ("SW1A 1AA", "detail_address")
+    assert (result["lat"], result["lon"]) == PC_COORDS["SW1A 1AA"]
+
+
+def test_transform_without_detail_falls_back_to_search_outcode() -> None:
+    """With no detail and no outcode in the address, the search outcode is used."""
+    # No detail, address has no recognizable outcode -> coarse search-outcode centroid.
+    listing = _raw(address="A street with no postcode")
+    result = transform_property(
+        listing, StubPostcodeIndex(), PC_COORDS, search_outcode="BR1", detail=None
+    )
+    assert result is not None
+    chosen = (result["Postcode"], result["Postcode source"])
+    assert chosen == ("BR1 2AB", "search_outcode")
+    # No detail page -> no UPRN / house number recovered.
+    for field in ("UPRN", "Property number or name"):
+        assert result[field] is None
+
+
+def test_transform_emits_uprn_and_house_numbered_address_from_detail() -> None:
+    """Detail UPRN, house number and full address flow through to the output row."""
+    detail = dict(
+        lat=51.401,
+        lng=0.011,
+        postcode="BR1 3CD",
+        outcode="BR1",
+        uprn="100023461458",
+        number_or_name="12",
+        full_address="South Street, Bromley BR1",
+    )
+    nearest_index = StubPostcodeIndex("BR1 2AB")
+    result = transform_property(
+        _raw(), nearest_index, PC_COORDS, search_outcode="BR1", detail=detail
+    )
+    assert result is not None
+    expected = {
+        "UPRN": "100023461458",
+        "Property number or name": "12",
+        # The detail full address replaces the outcode-level card address, and the
+        # house number is prepended for a near-exact Property Register match.
+        "Listing raw address": "South Street, Bromley BR1",
+        "Address per Property Register": "12, South Street, Bromley",
+    }
+    for field, value in expected.items():
+        assert result[field] == value
+
+
+def test_transform_ignores_out_of_england_detail_coords() -> None:
+    """Detail coordinates of (10.0, 10.0) are discarded by the transform."""
+    detail = dict(lat=10.0, lng=10.0, postcode="ZZ9 9ZZ", outcode="ZZ9")
+    nearest_index = StubPostcodeIndex("BR1 2AB")
+    result = transform_property(
+        _raw(), nearest_index, PC_COORDS, search_outcode="BR1", detail=detail
+    )
+    assert result is not None
+    # Bad detail coords are discarded; falls through to the address outcode (BR1).
+    assert result["Postcode source"] == "address_outcode"
+    latitude = result["lat"]
+    assert 49 <= latitude <= 56
--- a/finder/transform.py
+++ b/finder/transform.py
@ -205,6 +205,41 @@ def extract_full_postcode(text: str | None) -> str | None:
    return normalize_postcode(match.group(1))


+def extract_outcode(postcode: str | None) -> str | None:
+    """Return the outward code (district) of a UK postcode.
+
+    For example ``'SW1A 1AA'`` gives ``'SW1A'``. ``None`` and the empty string
+    give ``None``, as does a normalized postcode with nothing before its first
+    space.
+    """
+    if postcode:
+        # Everything before the first space of the normalized form.
+        head, _, _ = normalize_postcode(postcode).partition(" ")
+        if head:
+            return head
+    return None
+
+
+def resolve_listing_postcode(
+    extracted_postcode: str | None, inferred_postcode: str
+) -> tuple[str, str]:
+    """Choose the authoritative postcode for a listing as ``(postcode, source)``.
+
+    A postcode extracted from the address is more precise than the
+    coordinate-nearest one, but it can be stale, mistyped or well-formed yet
+    fabricated (e.g. 'ZZ9 9ZZ'), and must not silently override the
+    spatially-correct value. The spatial index only supports nearest-lookup, so
+    agreement is judged by outcode: the extracted postcode is returned with
+    source ``"address"`` when its outcode equals the inferred postcode's
+    outcode. Otherwise the inferred postcode is returned with source
+    ``"coordinates"``.
+    """
+    fallback = (inferred_postcode, "coordinates")
+    if not extracted_postcode:
+        return fallback
+
+    extracted_outcode = extract_outcode(extracted_postcode)
+    inferred_outcode = extract_outcode(inferred_postcode)
+    if extracted_outcode == inferred_outcode:
+        return extracted_postcode, "address"
+
+    log.debug(
+        "Rejecting extracted postcode %s (outcode mismatch with inferred %s)",
+        extracted_postcode,
+        inferred_postcode,
+    )
+    return fallback
+
+
 def clean_listing_address(address: str | None) -> str:
    """Remove postcode/outcode suffixes from listing display addresses.

@ -222,10 +257,48 @@ def clean_listing_address(address: str | None) -> str:
    return cleaned.strip(" ,")


+def build_register_address(
+    raw_address: str | None, number_or_name: str | None = None
+) -> str:
+    """Build a Property Register-style address, prepending the house number/name.
+
+    Listing display addresses are usually street-level ("South Street, Bromley")
+    because the portals hide the exact unit. When a scraper can recover the
+    property's own number or name (e.g. Zoopla detail pages expose
+    ``propertyNumberOrName`` = "12" or "Martham Mill"), prepend it so the address
+    carries the house identifier that the EPC/Price-Paid register addresses also
+    use — turning a fuzzy street match into a near-exact one.
+
+    Args:
+        raw_address: Listing display address; it is passed through
+            ``clean_listing_address`` before use.
+        number_or_name: House number or name to prepend. ``None``, empty and
+            whitespace-only values are ignored.
+
+    Returns:
+        The cleaned address, prefixed with ``"<number_or_name>, "`` unless the
+        cleaned address already starts with that identifier as a whole token.
+        If the cleaned address is empty, the identifier alone.
+    """
+    cleaned = clean_listing_address(raw_address)
+    identifier = (number_or_name or "").strip()
+    if not identifier:
+        return cleaned
+    # Avoid duplicating a number/name the display address already starts with.
+    # The match must end on a token boundary: identifier "1" is not the start
+    # of "1st Avenue" or "12 South Street", and treating it as such would
+    # silently drop the real house number.
+    already_prefixed = cleaned.lower().startswith(identifier.lower())
+    # Empty slice (identifier reaches the end of the address) counts as a boundary.
+    following = cleaned[len(identifier) : len(identifier) + 1]
+    if already_prefixed and not following.isalnum():
+        return cleaned
+    return f"{identifier}, {cleaned}" if cleaned else identifier
+
+
 def transform_property(
-    prop: dict, outcode: str, pc_index: PostcodeSpatialIndex
+    prop: dict,
+    outcode: str,
+    pc_index: PostcodeSpatialIndex,
+    detail_postcode: str | None = None,
 ) -> dict | None:
-    """Transform a raw Rightmove property dict into our output schema."""
+    """Transform a raw Rightmove property dict into our output schema.
+
+    ``detail_postcode`` is the property's TRUE full postcode recovered from its
+    detail page (see ``rightmove.parse_detail_postcode``); the search API itself
+    only exposes the outcode-level ``displayAddress``. When supplied and it
+    agrees with the coordinate-nearest postcode's outcode, it is preferred over
+    the coordinate guess and recorded with source ``"detail_address"``. A
+    detail postcode whose outcode disagrees with the location is discarded in
+    favour of the spatially-correct coordinate postcode, so a stale or wrong
+    detail value can never silently relocate a listing.
+    """
    loc = prop.get("location")
    if not loc:
        return None
@ -268,8 +341,25 @@ def transform_property(
        return None
    raw_address = prop.get("displayAddress", "") or ""
    extracted_postcode = extract_full_postcode(raw_address)
-    postcode = extracted_postcode or inferred_postcode
-    postcode_source = "address" if extracted_postcode else "coordinates"
+
+    # Prefer the detail page's true full postcode when it agrees with the
+    # location; otherwise fall back to the (display-address-or-coordinate) logic.
+    detail_full = extract_full_postcode(detail_postcode)
+    if detail_full and extract_outcode(detail_full) == extract_outcode(
+        inferred_postcode
+    ):
+        postcode, postcode_source = detail_full, "detail_address"
+    else:
+        if detail_full:
+            log.debug(
+                "Rejecting Rightmove detail postcode %s (outcode mismatch with "
+                "inferred %s)",
+                detail_full,
+                inferred_postcode,
+            )
+        postcode, postcode_source = resolve_listing_postcode(
+            extracted_postcode, inferred_postcode
+        )

    property_url = prop.get("propertyUrl") or ""
    if not isinstance(property_url, str):
@ -291,6 +381,9 @@ def transform_property(
        "Inferred postcode": inferred_postcode,
        "Listing raw address": raw_address,
        "Address per Property Register": clean_listing_address(raw_address),
+        # Rightmove's displayAddress is street-level; no UPRN/house number.
+        "UPRN": None,
+        "Property number or name": None,
        "Leasehold/Freehold": extract_tenure(prop.get("tenure")),
        "Property type": map_property_type(sub_type),
        "Property sub-type": normalize_sub_type(sub_type),
--- a/finder/zoopla.py
+++ b/finder/zoopla.py
@ -32,16 +32,24 @@ import httpx
 from constants import (
    DATA_DIR,
    DELAY_BETWEEN_PAGES,
+    GLUETUN_API_KEY,
+    GLUETUN_CONTROL_URL,
+    GLUETUN_MAX_ROTATIONS,
+    GLUETUN_PROXY,
    MAX_BEDROOMS,
    PROPERTY_TYPE_MAP,
    ZOOPLA_BASE,
+    ZOOPLA_DETAIL_GOTO_TIMEOUT_MS,
 )
 from spatial import PostcodeSpatialIndex
 from transform import (
-    clean_listing_address,
+    build_register_address,
    extract_full_postcode,
+    extract_outcode,
+    fix_coords,
    normalize_sub_type,
    parse_int_value,
+    resolve_listing_postcode,
    validate_floor_area,
 )

@ -468,27 +476,20 @@ def _challenge_timeout_seconds() -> int:
 # cookies (bound to the previous IP), then reload and re-check the challenge.


-_GLUETUN_API_KEY = "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K"
-
-
 def _gluetun_base_url() -> str:
-    return os.environ.get("GLUETUN_URL", "http://gluetun:8000").rstrip("/")
+    return GLUETUN_CONTROL_URL.rstrip("/")


 def _gluetun_api_key() -> str | None:
-    return _GLUETUN_API_KEY
+    return GLUETUN_API_KEY


 def _gluetun_max_rotations() -> int:
-    raw = os.environ.get("GLUETUN_MAX_ROTATIONS", "3")
-    try:
-        value = int(raw)
-    except ValueError as exc:
-        raise ValueError("GLUETUN_MAX_ROTATIONS must be an integer") from exc
-    return max(value, 0)
+    return max(GLUETUN_MAX_ROTATIONS, 0)


 def _gluetun_client() -> httpx.Client:
+    # Talks to the control server directly (not through the VPN proxy).
    headers = {}
    api_key = _gluetun_api_key()
    if api_key:
@ -694,10 +695,19 @@ def launch_browser():
    profile_dir.mkdir(parents=True, exist_ok=True)
    _remove_stale_profile_locks(profile_dir)

+    # Route the browser through the Gluetun VPN proxy when configured. (geoip
+    # fingerprint alignment is intentionally not enabled: it needs the optional
+    # camoufox[geoip] extra and would spoof to the VPN exit's country, which
+    # fights the en-GB locale unless the exit is in the UK.)
+    proxy_options: dict = {}
+    if GLUETUN_PROXY:
+        proxy_options = {"proxy": {"server": GLUETUN_PROXY}}
+
    log.info(
-        "Launching Camoufox browser for Zoopla (headless=%s, profile=%s)...",
+        "Launching Camoufox browser for Zoopla (headless=%s, profile=%s, proxy=%s)...",
        headless_mode,
        profile_dir,
+        GLUETUN_PROXY or "direct",
    )
    camoufox = Camoufox(
        headless=headless_mode,
@ -705,6 +715,7 @@ def launch_browser():
        user_data_dir=str(profile_dir),
        locale=["en-GB", "en"],
        enable_cache=True,
+        **proxy_options,
    )
    raw_browser = camoufox.__enter__()
    browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
@ -926,13 +937,47 @@ def _paginate(
    page,
    total_results: int,
    max_properties: int | None = None,
+    fetch_detail=None,
+    detail_cap: int = 0,
+    detail_state: dict | None = None,
+    detail_deadline: float | None = None,
 ) -> list[dict]:
    """Extract listings from all pages of search results.

    Page 1 is already loaded. For subsequent pages, follow Zoopla's rendered
    next link when present, otherwise advance via the pn=N URL parameter while
-    the advertised result count says more listings remain."""
+    the advertised result count says more listings remain.
+
+    When ``fetch_detail`` is supplied, each listing has its detail page fetched
+    (up to ``detail_cap`` fresh loads per outcode, counted in the shared
+    ``detail_state`` dict, and only until ``detail_deadline``) and the parsed
+    geo stored under ``listing['_detail']`` for ``transform_property``. The
+    detail page is the only source of the listing's UPRN, full street address
+    and precise postcode, so it is fetched even when the search card already
+    pins a full postcode. Cached detail results are always attached but cost
+    neither a cap slot nor a delay."""
+
+    def _maybe_fetch(listing: dict) -> None:
+        """Attach the detail result to ``listing['_detail']`` when allowed.
+
+        Does nothing when detail fetching is not configured. A listing whose
+        cache key is already in ``_detail_cache`` is attached without
+        consuming a cap slot or sleeping; a fresh load is subject to
+        ``detail_cap`` and ``detail_deadline``.
+        """
+        if fetch_detail is None or detail_state is None:
+            return
+        url = listing.get("url", "")
+        if _detail_cache_key(url) in _detail_cache:
+            # Cached: attach without touching the cap counter or the delay.
+            listing["_detail"] = fetch_detail(url)
+            return
+        # Fresh loads are bounded by the per-outcode cap and the wall-clock
+        # deadline so detail fetching never starves the SIGALRM budget that
+        # also guards the search pagination for this outcode.
+        cap_reached = detail_state["fetched"] >= detail_cap
+        if cap_reached:
+            return
+        if detail_deadline is not None and time.monotonic() >= detail_deadline:
+            return
+        listing["_detail"] = fetch_detail(url)
+        detail_state["fetched"] += 1
+        time.sleep(DELAY_BETWEEN_PAGES)
+
    all_listings = _extract_listings(page)
+    for listing in all_listings:
+        _maybe_fetch(listing)
    if max_properties is not None and len(all_listings) >= max_properties:
        return all_listings[:max_properties]

@ -984,6 +1029,7 @@ def _paginate(
            if listing["id"] not in seen_ids:
                seen_ids.add(listing["id"])
                all_listings.append(listing)
+                _maybe_fetch(listing)
                new_count += 1
                if max_properties is not None and len(all_listings) >= max_properties:
                    return all_listings[:max_properties]
@ -1053,6 +1099,214 @@ def _extract_outcode(text: str) -> str | None:
    return None


+# ---------------------------------------------------------------------------
+# Detail-page geocoding
+# ---------------------------------------------------------------------------
+#
+# Zoopla search result cards only expose an outcode-level display address (e.g.
+# "South Street, Bromley BR1"); the full postcode and precise coordinates exist
+# only on each listing's detail page (/for-sale/details/{id}/). The detail page
+# is a Next.js App Router route whose React Server Components flight stream
+# embeds the property's own location object, e.g.
+#   "location":{"outcode":"NR29","coordinates":{"latitude":52.716,"longitude":1.614},
+#               "uprn":"10023461458","postalCode":"NR29 4RG",...}
+# plus a twin "address":{"fullAddress":...,"latitude":...,"longitude":...,
+#               "outcode":...,"postcode":...,"uprn":...} feeding the map widgets.
+# Nearby points of interest (stations, schools, EV chargers) and comparable
+# listings carry their own "coordinates" too, but never inside the property's
+# own "location" / "address":{"fullAddress" wrapper — so the wrapper, not a
+# loose coordinates object, is what we anchor on (see parse_detail_geo).
+
+# cache key -> parsed detail dict (or None). The key is the numeric listingId
+# when the URL contains one, otherwise the listing URL itself (see
+# _detail_cache_key). Failures are cached too, so a broken listing is not
+# re-fetched within a run (the same listing reappears across overlapping
+# outcode searches).
+_detail_cache: dict[str, dict | None] = {}
+
+# Captures the numeric id from ".../details/<id>" (trailing slash optional).
+_LISTING_ID_RE = re.compile(r"/details/(\d+)/?")
+
+# The property's own location is carried by a `"location":{...}` wrapper and a
+# twin `"address":{"fullAddress":...}` widget object. We anchor on those
+# wrappers (and capture their full object body; the pattern tolerates objects
+# nested one level deep, which covers the nested `coordinates` object) rather
+# than scanning for loose coordinate objects: nearby points of interest
+# (stations/schools/EV chargers) and comparable/"similar" listings also embed
+# coordinates, but never inside the property's own `"location"` /
+# `"address":{"fullAddress"` wrapper, so the wrapper is the discriminator.
+# Field order and an optional `uprn` are tolerated.
+_DETAIL_LOCATION_RE = re.compile(r'"location":\{((?:[^{}]|\{[^{}]*\})*)\}')
+# Group 1 is the fullAddress text, group 2 the remainder of the object body.
+_DETAIL_ADDRESS_RE = re.compile(r'"address":\{"fullAddress":"([^"]*)"((?:[^{}]|\{[^{}]*\})*)\}')
+# Both coordinate patterns require a decimal point in each number and expect
+# latitude immediately followed by longitude.
+_DETAIL_COORDS_IN_BODY_RE = re.compile(
+    r'"coordinates":\{"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)\}'
+)
+_DETAIL_LATLNG_IN_BODY_RE = re.compile(
+    r'"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)'
+)
+# Only upper-case letters and digits are accepted for the outcode value.
+_DETAIL_OUTCODE_IN_BODY_RE = re.compile(r'"outcode":"([A-Z0-9]+)"')
+# The location object spells it "postalCode"; the address twin uses "postcode".
+_DETAIL_POSTCODE_IN_BODY_RE = re.compile(r'"(?:postalCode|postcode)":"([A-Z0-9 ]+)"')
+# The UPRN (Unique Property Reference Number) appears in both the location and
+# address objects and is the linchpin for an exact listing->EPC join (EPC open
+# data is ~99% UPRN-keyed). propertyNumberOrName carries the house number/name
+# (e.g. "12", "Martham Mill") only in the location object.
+_DETAIL_UPRN_IN_BODY_RE = re.compile(r'"uprn":"(\d+)"')
+_DETAIL_NUMBER_OR_NAME_IN_BODY_RE = re.compile(r'"propertyNumberOrName":"([^"]*)"')
+
+
+def parse_detail_geo(html: str, search_outcode: str | None = None) -> dict | None:
+    """Extract the property's own coordinates/postcode from a Zoopla detail page.
+
+    Pure and browser-free: the live browser only produces the HTML string
+    (``page.content()``); this does the parsing so it is unit-testable.
+
+    Returns ``{"lat", "lng", "postcode", "outcode", "source", "uprn",
+    "number_or_name", "full_address"}`` (every field except the coordinates may
+    be ``None``) or ``None`` when no property location wrapper is found. The
+    ``uprn`` enables an exact listing->EPC join; ``number_or_name`` (house
+    number/name) and ``full_address`` give a register-style address for the
+    Price Paid join.
+    Coordinates are bounds-checked to England and a postcode is kept only when
+    it agrees with its own object's outcode. ``search_outcode``, when given, is
+    used only as a tie-break to pick the right ``location`` object on pages that
+    also embed comparable listings. The data model is described in the
+    `Detail-page geocoding` comment block that precedes this function.
+
+    ``source`` is ``"detail_location"`` when the ``location`` wrapper supplied
+    the coordinates and ``"detail_address_obj"`` when the ``address`` twin did."""
+    if not html:
+        return None
+
+    # RSC flight strings are embedded as escaped JS string literals, so quotes
+    # and slashes arrive escaped; normalize them so the regexes match.
+    buf = html.replace('\\"', '"').replace("\\u002F", "/").replace("\\/", "/")
+
+    def in_england(lat: float, lng: float) -> tuple[float, float] | None:
+        # Pass the pair through fix_coords (presumably repairs malformed or
+        # swapped pairs — confirm against its definition), then accept it only
+        # inside a rough bounding box: lat 49..56, lng -7..2.
+        lat, lng = fix_coords(lat, lng)
+        if 49 <= lat <= 56 and -7 <= lng <= 2:
+            return lat, lng
+        return None
+
+    def build(body: str, coords, source: str, full_address: str | None = None) -> dict:
+        # outcode and postcode are read from the SAME object body as the coords,
+        # so the postcode is self-consistent; drop it only if it somehow isn't.
+        outcode_match = _DETAIL_OUTCODE_IN_BODY_RE.search(body)
+        outcode = outcode_match.group(1) if outcode_match else None
+        postcode_match = _DETAIL_POSTCODE_IN_BODY_RE.search(body)
+        postcode = extract_full_postcode(postcode_match.group(1)) if postcode_match else None
+        if postcode and outcode and extract_outcode(postcode) != outcode.upper():
+            postcode = None
+        uprn_match = _DETAIL_UPRN_IN_BODY_RE.search(body)
+        number_match = _DETAIL_NUMBER_OR_NAME_IN_BODY_RE.search(body)
+        # Whitespace-only names collapse to None via the `or None` below.
+        number_or_name = number_match.group(1).strip() if number_match else None
+        return {
+            "lat": coords[0],
+            "lng": coords[1],
+            "postcode": postcode,
+            "outcode": outcode,
+            "source": source,
+            "uprn": uprn_match.group(1) if uprn_match else None,
+            "number_or_name": number_or_name or None,
+            "full_address": full_address,
+        }
+
+    def attach_full_address(result: dict | None) -> dict | None:
+        # The house-numbered street address lives in the `address` map-widget
+        # twin, not the `location` wrapper we anchor coordinates on. Pull it from
+        # the twin that shares this property's uprn; when there is no uprn to
+        # disambiguate, fall back to the first twin (document order = primary
+        # listing), but never guess a twin when a uprn exists and none matches —
+        # that would risk grabbing a comparable listing's address.
+        if result is None or result.get("full_address"):
+            return result
+        target = result.get("uprn")
+        first = None
+        for match in _DETAIL_ADDRESS_RE.finditer(buf):
+            # Twins with an empty fullAddress are ignored entirely.
+            full_address = match.group(1) or None
+            if full_address is None:
+                continue
+            if first is None:
+                first = full_address
+            uprn_match = _DETAIL_UPRN_IN_BODY_RE.search(match.group(2))
+            if target and uprn_match and uprn_match.group(1) == target:
+                result["full_address"] = full_address
+                return result
+        if target is None:
+            # `first` may still be None here, leaving full_address unset.
+            result["full_address"] = first
+        return result
+
+    # Strategy 1 — the property's own `location` wrapper (authoritative). Take
+    # the first match (the primary listing precedes any comparables in the
+    # flight stream), but prefer one whose outcode matches the searched outcode.
+    first_location = None
+    for match in _DETAIL_LOCATION_RE.finditer(buf):
+        body = match.group(1)
+        coords_match = _DETAIL_COORDS_IN_BODY_RE.search(body)
+        if not coords_match:
+            continue
+        coords = in_england(float(coords_match.group(1)), float(coords_match.group(2)))
+        if not coords:
+            continue
+        candidate = build(body, coords, "detail_location")
+        # Remember the first usable wrapper as the fallback; later wrappers can
+        # only win by matching the searched outcode.
+        if first_location is None:
+            first_location = candidate
+        if (
+            search_outcode
+            and candidate["outcode"]
+            and candidate["outcode"].upper() == search_outcode.upper()
+        ):
+            return attach_full_address(candidate)
+    if first_location is not None:
+        return attach_full_address(first_location)
+
+    # Strategy 2 — the `address` map-widget twin (same coordinates, backup).
+    # The first in-bounds twin wins; search_outcode is not consulted here.
+    for match in _DETAIL_ADDRESS_RE.finditer(buf):
+        full_address = match.group(1) or None
+        body = match.group(2)
+        latlng_match = _DETAIL_LATLNG_IN_BODY_RE.search(body)
+        if not latlng_match:
+            continue
+        coords = in_england(float(latlng_match.group(1)), float(latlng_match.group(2)))
+        if coords:
+            return build(body, coords, "detail_address_obj", full_address=full_address)
+
+    return None
+
+
+def _detail_cache_key(listing_url: str) -> str:
+    """Return the key under which a listing's detail result is cached.
+
+    The numeric listing id embedded in the URL is used when one is found;
+    otherwise the URL itself serves as the key."""
+    found = _LISTING_ID_RE.search(listing_url)
+    if found is None:
+        return listing_url
+    return found.group(1)
+
+
+def _fetch_listing_detail(
+    detail_page,
+    listing_url: str,
+    search_outcode: str | None = None,
+) -> dict | None:
+    """Fetch one listing's detail page and return its parsed geo dict.
+
+    The outcome, including ``None`` for a failed load or an unparseable page,
+    is memoised in ``_detail_cache`` under the listing's cache key, so a second
+    call for the same listing never touches the browser. Navigation and
+    parsing errors are logged at debug level and reported as ``None`` so the
+    caller can fall back to outcode-level resolution. ``TurnstileError`` is the
+    one exception re-raised (and not cached), preserving the scraper's
+    "Cloudflare ends the run" contract. Navigation uses the short
+    ``ZOOPLA_DETAIL_GOTO_TIMEOUT_MS`` timeout so a single slow page cannot
+    consume the per-outcode budget."""
+    key = _detail_cache_key(listing_url)
+    if key in _detail_cache:
+        return _detail_cache[key]
+
+    # Relative listing paths are resolved against the site root.
+    if listing_url.startswith("http"):
+        url = listing_url
+    else:
+        url = ZOOPLA_BASE + listing_url
+
+    try:
+        detail_page.goto(
+            url, wait_until="domcontentloaded", timeout=ZOOPLA_DETAIL_GOTO_TIMEOUT_MS
+        )
+        _ensure_not_challenged(detail_page)
+        parsed = parse_detail_geo(detail_page.content(), search_outcode=search_outcode)
+    except TurnstileError:
+        raise
+    except Exception as exc:
+        # NOTE(review): if the per-outcode timeout is delivered as an Exception
+        # subclass raised from a signal handler, this broad handler would
+        # absorb it as an ordinary failure — confirm against the handler.
+        log.debug("Zoopla detail fetch failed %s: %s", url, _exception_detail(exc))
+        parsed = None
+
+    _detail_cache[key] = parsed
+    return parsed
+
+
 def _map_property_type(raw_type: str | None) -> str:
    """Map Zoopla property type text to canonical type."""
    if not raw_type:
@ -1109,28 +1363,64 @@ def transform_property(
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    search_outcode: str | None = None,
+    detail: dict | None = None,
 ) -> dict | None:
    """Transform a raw Zoopla listing dict into the standard output schema.

-    Zoopla search cards do not include coordinates, so we resolve lat/lng
-    from postcodes extracted from the address text."""
+    Zoopla search cards only expose an outcode-level address, so precise
+    location comes from the listing's detail page (see ``parse_detail_geo`` /
+    ``_fetch_listing_detail``), passed in as ``detail``. When detail-page
+    coordinates are available we resolve the nearest postcode via the spatial
+    index — mirroring rightmove/onthemarket — and only fall back to the coarse
+    outcode centroid when no detail location could be obtained."""
    price = parse_int_value(raw.get("price")) or 0

    address = raw.get("address", "") or ""

-    # Resolve postcode and coordinates from address
    extracted_postcode = extract_full_postcode(address)
-    postcode = extracted_postcode
-    postcode_source = "address" if extracted_postcode else None
+    detail = detail or {}
+    detail_postcode = extract_full_postcode(detail.get("postcode"))
+    # Detail-page address fields: the UPRN keys an exact EPC join, and the
+    # full street address / house number-or-name beat the outcode-level card
+    # address for the Price-Paid join. All three are absent unless the detail
+    # page was fetched, so every consumer must tolerate None.
+    detail_uprn = detail.get("uprn") or None
+    detail_full_address = detail.get("full_address") or None
+    detail_number_or_name = detail.get("number_or_name") or None
+
+    postcode = postcode_source = inferred_postcode = None
    lat = lng = None

-    if postcode:
-        coords = pc_coords.get(postcode)
-        if coords:
-            lat, lng = coords
+    # (A) Best: detail-page coordinates -> nearest postcode (authoritative).
+    detail_lat, detail_lng = detail.get("lat"), detail.get("lng")
+    if detail_lat is not None and detail_lng is not None:
+        fixed_lat, fixed_lng = fix_coords(detail_lat, detail_lng)
+        if 49 <= fixed_lat <= 56 and -7 <= fixed_lng <= 2:
+            nearest = pc_index.nearest(fixed_lat, fixed_lng)
+            if nearest:
+                lat, lng, inferred_postcode = fixed_lat, fixed_lng, nearest
+                candidate = detail_postcode or extracted_postcode
+                postcode, resolved_source = resolve_listing_postcode(candidate, nearest)
+                postcode_source = (
+                    "detail_address"
+                    if resolved_source == "address"
+                    else "detail_coordinates"
+                )

+    # (B) Detail-page postcode without usable coordinates -> geocode it.
+    if lat is None and detail_postcode and detail_postcode in pc_coords:
+        lat, lng = pc_coords[detail_postcode]
+        postcode = inferred_postcode = detail_postcode
+        postcode_source = "detail_address"
+
+    # (C) Full postcode in the search-card address -> geocode it.
+    if lat is None and extracted_postcode and extracted_postcode in pc_coords:
+        lat, lng = pc_coords[extracted_postcode]
+        postcode = extracted_postcode
+        postcode_source = "address"
+
+    # (D) Last resort: coarse outcode-level centroid (loses per-listing precision).
    if lat is None:
-        # Try outcode-level fallback from address text
        addr_outcode = _extract_outcode(address)
        if addr_outcode:
            result = _resolve_outcode_coords(addr_outcode, pc_coords)
@ -1138,7 +1428,6 @@ def transform_property(
                postcode, lat, lng = result
                postcode_source = "address_outcode"

-    # Final fallback: use the outcode we know we're searching
    if lat is None and search_outcode:
        result = _resolve_outcode_coords(search_outcode, pc_coords)
        if result:
@ -1188,9 +1477,17 @@ def transform_property(
        "Postcode": postcode,
        "Postcode source": postcode_source or "unknown",
        "Extracted postcode": extracted_postcode,
-        "Inferred postcode": postcode if postcode_source != "address" else None,
-        "Listing raw address": address,
-        "Address per Property Register": clean_listing_address(address),
+        "Inferred postcode": (
+            inferred_postcode
+            if inferred_postcode is not None
+            else (postcode if postcode_source != "address" else None)
+        ),
+        "Listing raw address": detail_full_address or address,
+        "Address per Property Register": build_register_address(
+            detail_full_address or address, detail_number_or_name
+        ),
+        "UPRN": detail_uprn,
+        "Property number or name": detail_number_or_name,
        "Leasehold/Freehold": raw.get("tenure") or None,
        "Property type": _map_property_type(raw.get("property_type")),
        "Property sub-type": normalize_sub_type(raw.get("property_type")),
@ -1215,6 +1512,9 @@ def search_outcode(
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    max_properties: int | None = None,
+    detail_page=None,
+    detail_cap: int = 0,
+    detail_budget_seconds: float | None = None,
 ) -> tuple[list[dict], str | None]:
    """Search Zoopla for properties in one outcode.

@ -1222,6 +1522,12 @@ def search_outcode(
    search flow, extracts listings from rendered DOM, and transforms to the
    standard output schema.

+    When ``detail_page`` (a second browser tab) and a positive ``detail_cap``
+    are supplied, up to ``detail_cap`` listings per outcode have their detail
+    page fetched for a precise postcode (see ``_fetch_listing_detail``).
+    ``detail_budget_seconds`` caps the wall-clock time spent fetching details so
+    the per-outcode timeout that also guards search pagination is never starved.
+
    Returns (properties, search_url).

    Raises TurnstileError if Cloudflare blocks us mid-session.
@ -1231,12 +1537,25 @@ def search_outcode(

    total_results = _get_result_count(page)

+    fetch_detail = None
+    detail_deadline = None
+    if detail_page is not None and detail_cap > 0:
+        fetch_detail = lambda url: _fetch_listing_detail(  # noqa: E731
+            detail_page, url, search_outcode=outcode
+        )
+        if detail_budget_seconds is not None:
+            detail_deadline = time.monotonic() + detail_budget_seconds
+
    # Always try extraction even if result count is 0 — the count regex may
    # not match Zoopla's current text format, but listings may still be in DOM
    raw_listings = _paginate(
        page,
        total_results,
        max_properties=max_properties,
+        fetch_detail=fetch_detail,
+        detail_cap=detail_cap,
+        detail_state={"fetched": 0},
+        detail_deadline=detail_deadline,
    )
    if not raw_listings:
        if total_results > 0:
@ -1252,7 +1571,11 @@ def search_outcode(
    for raw in raw_listings:
        try:
            transformed = transform_property(
-                raw, pc_index, pc_coords, search_outcode=outcode
+                raw,
+                pc_index,
+                pc_coords,
+                search_outcode=outcode,
+                detail=raw.get("_detail"),
            )
        except Exception as exc:
            log.warning(
--- a/finder/zoopla_flaresolverr.py
+++ b/finder/zoopla_flaresolverr.py
@ -0,0 +1,164 @@
+"""Zoopla scraping via FlareSolverr (no browser/VNC needed).
+
+FlareSolverr solves Zoopla's Cloudflare and returns the rendered HTML, which
+still contains the React Server Components flight stream — so the existing pure
+parsers work unchanged:
+  - the search page yields the outcode's listing detail URLs, and
+  - each detail page's flight stream carries the property's location object
+    (postcode + coordinates) that ``parse_detail_geo`` extracts, plus the
+    listing fields (price/beds/baths/tenure/floor area) parsed here.
+
+Verified live (2026-05-30) against Zoopla through the Gluetun VPN: a warm
+FlareSolverr session solves the SW9 search + detail pages and the flight data
+is present (e.g. detail 73326946 -> SW9 0HD @ 51.477238,-0.116819).
+
+This is selected by constants.ZOOPLA_FETCHER == "flaresolverr"; the Camoufox
+path in zoopla.py remains for ZOOPLA_FETCHER == "camoufox".
+"""
+
+import logging
+import re
+import time
+
+from constants import DELAY_BETWEEN_PAGES, ZOOPLA_BASE
+from flaresolverr import FlareSolverrError, FlareSolverrSession
+from spatial import PostcodeSpatialIndex
+from zoopla import _url_with_page, parse_detail_geo, transform_property
+
+log = logging.getLogger("zoopla")
+
+# Safety bound on how many search-result pages to walk per outcode.
+_MAX_SERP_PAGES = 60
+
+# Listing detail paths as they appear in search-result HTML, under either the
+# /for-sale/ or /new-homes/ section; the trailing slash is required.
+_DETAIL_PATH_RE = re.compile(r"/(?:for-sale|new-homes)/details/\d+/")
+# Captures the numeric listing id from such a detail path.
+_LISTING_ID_RE = re.compile(r"/details/(\d+)/")
+
+
+def _int(pattern: str, buf: str) -> int | None:
+    """Return the first capture group of ``pattern`` in ``buf`` as an int.
+
+    Gives ``None`` when the pattern does not occur anywhere in ``buf``."""
+    found = re.search(pattern, buf)
+    if found is None:
+        return None
+    return int(found.group(1))
+
+
+def parse_detail_listing(html: str) -> dict:
+    """Extract the non-location listing fields from a Zoopla detail page.
+
+    Mirrors the fields the Camoufox SERP-card extractor produced, read from the
+    detail page's flight stream (validated against real Zoopla detail HTML).
+    All fields are best-effort; missing ones default to None so a listing with
+    a known location is still emitted.
+
+    Returns a dict with the keys ``price``, ``beds``, ``baths``,
+    ``receptions``, ``floor_area_sqft``, ``tenure``, ``property_type`` and
+    ``address``."""
+    # Flight strings arrive as escaped JS literals. Undo the quote and slash
+    # escapes, including the \u002F spelling that parse_detail_geo also
+    # normalizes, so a slash in the title/address is not left as a literal
+    # escape sequence in the extracted text.
+    buf = html.replace('\\"', '"').replace("\\u002F", "/").replace("\\/", "/")
+
+    # Prefer the internal numeric price; fall back to the unformatted one.
+    price = _int(r'"internalValue":(\d+)', buf)
+    if price is None:
+        price = _int(r'"priceUnformatted":(\d+)', buf)
+
+    tenure_match = re.search(r'"tenure":"([a-zA-Z]+)"', buf)
+    tenure = tenure_match.group(1).title() if tenure_match else None
+
+    # Address + property type come from the page <title>, e.g.
+    # "Caldwell Street, Stockwell SW9, 4 bed property for sale, £995,000 - Zoopla"
+    address = None
+    property_type = None
+    title_match = re.search(r'"children":"([^"]*? for sale[^"]*?)"', buf)
+    if title_match:
+        title = title_match.group(1)
+        # Everything before ", N bed" is the display address.
+        addr_match = re.match(r"(.+?),\s*\d+\s*bed", title)
+        if addr_match:
+            address = addr_match.group(1).strip()
+        # The words between "N bed" and "for sale" name the property type.
+        type_match = re.search(r"\d+\s*bed\s+([\w\s-]+?)\s+for sale", title)
+        if type_match:
+            property_type = type_match.group(1).strip()
+    # An explicit propertyType field, when present, overrides the title guess.
+    explicit_type = re.search(r'"propertyType":"([^"]+)"', buf)
+    if explicit_type:
+        property_type = explicit_type.group(1)
+
+    return {
+        "price": price,
+        "beds": _int(r'"numBedrooms":(\d+)', buf),
+        "baths": _int(r'"numBaths":(\d+)', buf),
+        "receptions": _int(r'"numLivingRooms":(\d+)', buf),
+        "floor_area_sqft": _int(r'"sizeSqft":(\d+)', buf),
+        "tenure": tenure,
+        "property_type": property_type,
+        "address": address,
+    }
+
+
+def _enumerate_detail_paths(fs: FlareSolverrSession, outcode: str, limit: int | None) -> list[str]:
+    """Walk the outcode's search-result pages and collect listing detail paths.
+
+    Paths are de-duplicated by listing id and returned in first-seen order.
+    Walking stops when ``limit`` paths have been collected, when a page adds
+    no new listing, or after ``_MAX_SERP_PAGES`` pages. ``limit=None`` means
+    no cap; a ``limit`` of zero or less yields an empty list without issuing
+    any request. Errors raised by ``fs.get`` propagate to the caller."""
+    if limit is not None and limit <= 0:
+        # The in-loop check only fires after a path has been appended, so a
+        # non-positive limit would otherwise still fetch a page and return
+        # one path.
+        return []
+    base = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"
+    seen: list[str] = []
+    seen_ids: set[str] = set()
+    for page_num in range(1, _MAX_SERP_PAGES + 1):
+        url = base if page_num == 1 else _url_with_page(base, page_num)
+        html = fs.get(url)
+        new = 0
+        for path in _DETAIL_PATH_RE.findall(html):
+            id_match = _LISTING_ID_RE.search(path)
+            listing_id = id_match.group(1) if id_match else path
+            if listing_id in seen_ids:
+                continue
+            seen_ids.add(listing_id)
+            seen.append(path)
+            new += 1
+            if limit is not None and len(seen) >= limit:
+                return seen
+        # A page contributing nothing new marks the end of the results.
+        if new == 0:
+            break
+        time.sleep(DELAY_BETWEEN_PAGES)
+    return seen
+
+
+def search_outcode(
+    outcode: str,
+    pc_index: PostcodeSpatialIndex,
+    pc_coords: dict[str, tuple[float, float]],
+    fs: FlareSolverrSession,
+    max_properties: int | None = None,
+    detail_cap: int = 0,
+    detail_budget_seconds: float | None = None,
+) -> tuple[list[dict], str | None]:
+    """Scrape one outcode via FlareSolverr. Returns (properties, search_url).
+
+    Every listing's detail page is fetched (that is where the postcode lives),
+    so the effective listing count is bounded by both ``max_properties`` and
+    ``detail_cap``; ``detail_budget_seconds`` caps wall-clock time on details.
+
+    A ``detail_cap`` of zero or less applies no cap of its own.
+    ``detail_budget_seconds=None`` disables the deadline, while ``0`` is an
+    already-expired budget. A listing whose fetch or transform fails is logged
+    and counted as dropped rather than aborting the outcode; errors raised
+    while enumerating the search pages are not caught here."""
+    limit = detail_cap if detail_cap and detail_cap > 0 else None
+    if max_properties is not None:
+        limit = max_properties if limit is None else min(limit, max_properties)
+
+    base = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"
+    paths = _enumerate_detail_paths(fs, outcode, limit)
+    if not paths:
+        return [], base
+
+    # Test against None rather than truthiness so a 0-second budget means
+    # "expired" instead of "unlimited", matching zoopla.search_outcode.
+    deadline = (
+        time.monotonic() + detail_budget_seconds
+        if detail_budget_seconds is not None
+        else None
+    )
+    properties: list[dict] = []
+    dropped = 0
+    for path in paths:
+        if deadline is not None and time.monotonic() >= deadline:
+            log.info("Zoopla %s: detail-fetch budget reached after %d", outcode, len(properties))
+            break
+        id_match = _LISTING_ID_RE.search(path)
+        listing_id = id_match.group(1) if id_match else path
+        try:
+            html = fs.get(ZOOPLA_BASE + path)
+            geo = parse_detail_geo(html, search_outcode=outcode)
+            raw = {"id": listing_id, "url": path, **parse_detail_listing(html)}
+            prop = transform_property(
+                raw, pc_index, pc_coords, search_outcode=outcode, detail=geo
+            )
+        except FlareSolverrError as exc:
+            log.warning("Zoopla %s detail %s fetch failed: %s", outcode, listing_id, exc)
+            prop = None
+        except Exception as exc:  # noqa: BLE001 - never let one listing kill the outcode
+            log.warning("Zoopla %s detail %s transform failed: %s", outcode, listing_id, exc)
+            prop = None
+        if prop:
+            properties.append(prop)
+        else:
+            dropped += 1
+        # Pace successive detail requests, including after a failed one.
+        time.sleep(DELAY_BETWEEN_PAGES)
+
+    log.info("Zoopla %s: %d listings (%d dropped)", outcode, len(properties), dropped)
+    return properties, base