scraping and data

2026-05-31 15:36:33 +01:00 · 2026-05-31 15:36:33 +01:00 · 8688b7475e
commit 8688b7475e
parent d98819b569
43 changed files with 4920 additions and 531 deletions
--- a/.gitignore
+++ b/.gitignore
@ -22,6 +22,8 @@ video/auth.*
 *.jpeg
 *.mp4
 **/*.log
 r5-java/tmp
 property-data
 property-data2
--- a/finder/Dockerfile
+++ b/finder/Dockerfile
@ -0,0 +1,25 @@
 # Finder scraper image. Runs via docker-compose sharing the media_gluetun VPN
 # network namespace; the source tree is bind-mounted at runtime, so this image
 # only needs the Python deps. The venv lives OUTSIDE the bind-mount target
 # (/opt/venv) so the mount doesn't shadow it.
 FROM python:3.12-slim
 ENV UV_PROJECT_ENVIRONMENT=/opt/venv \
    UV_COMPILE_BYTECODE=1 \
    UV_LINK_MODE=copy \
    PYTHONUNBUFFERED=1
 RUN apt-get update \
  && apt-get install -y --no-install-recommends ca-certificates curl \
  && rm -rf /var/lib/apt/lists/*
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
 WORKDIR /app/finder
 # Install dependencies into /opt/venv (cached layer; project code is mounted at runtime).
 COPY pyproject.toml uv.lock ./
 RUN uv sync --no-install-project --frozen
 # Source is bind-mounted over /app/finder by compose. `uv run` uses /opt/venv.
 CMD ["sleep", "infinity"]
--- a/finder/constants.py
+++ b/finder/constants.py
@ -6,7 +6,9 @@ REPO_DIR = FINDER_DIR.parent
 DATA_DIR = Path(os.environ.get("DATA_DIR", str(FINDER_DIR / "data")))
 ARCGIS_PATH = Path(
-    os.environ.get("ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet"))
+    os.environ.get(
        "ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet")
    )
 )
 PAGE_SIZE = 24
 DELAY_BETWEEN_PAGES = 0.3
@ -19,6 +21,19 @@ MAX_BEDROOMS = 20  # sanity cap — values above this are almost certainly parsi
 TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
 SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
 RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
 # Detail page (plain HTTPS GET, no Cloudflare). Its window.__PAGE_MODEL embeds
 # propertyData.address.{outcode,incode}, which together form the property's TRUE
 # full postcode — the search API only exposes the outcode. {id} is the numeric
 # listing id from the search response.
 RIGHTMOVE_DETAIL_URL = "https://www.rightmove.co.uk/properties/{id}"
 # The Rightmove search API gives only an outcode-level display address, so the
 # true full postcode is recovered from each listing's detail page (see
 # finder/rightmove.py::parse_detail_postcode). One extra GET per listing is a
 # big increase in request volume over the ~1000-result-per-outcode search, so
 # detail fetching is gated and capped per outcode (mirrors ZOOPLA_* below).
 # Default ON.
 RIGHTMOVE_FETCH_DETAILS = True  # fetch detail pages for true per-listing postcodes
 RIGHTMOVE_MAX_DETAILS_PER_OUTCODE = 4000  # max detail-page fetches per outcode
 # OnTheMarket
 ONTHEMARKET_BASE = "https://www.onthemarket.com"
@ -26,6 +41,41 @@ ONTHEMARKET_BASE = "https://www.onthemarket.com"
 # Zoopla
 ZOOPLA_BASE = "https://www.zoopla.co.uk"
 # Zoopla search cards only carry an outcode-level address, so the full postcode
 # and precise coordinates are scraped from each listing's detail page. These
 # bound that extra work (see finder/zoopla.py and finder/scraper.py).
 ZOOPLA_FETCH_DETAILS = True  # fetch detail pages for precise per-listing postcodes
 ZOOPLA_MAX_DETAILS_PER_OUTCODE = 4000  # max detail-page fetches per outcode
 # NOTE(review): 1500000 ms is 25 minutes for a single navigation — confirm this
 # is intended and not a typo for a much shorter timeout (e.g. 15000 ms).
 ZOOPLA_DETAIL_GOTO_TIMEOUT_MS = 1500000  # per detail-page navigation timeout
 # Fraction of a single outcode's wall-clock budget (ZOOPLA_OUTCODE_TIMEOUT_SECONDS)
 # spent fetching details; the remainder is reserved for search pagination so
 # detail fetches can never trip the timeout and discard collected listings.
 ZOOPLA_DETAIL_BUDGET_FRACTION = 0.6
 # Gluetun VPN. Network endpoints are env-overridable because they are
 # deployment-specific: when finder runs in a SEPARATE container they use the
 # `gluetun` hostname (defaults below); when finder SHARES gluetun's network
 # namespace (docker-compose.yml, network_mode container:media_gluetun) they
 # become localhost and GLUETUN_PROXY is empty (the shared netns already tunnels
 # all traffic, so no HTTP proxy is needed).
 #   GLUETUN_PROXY="" (empty) => direct connection (no proxy); used in shared-netns.
 GLUETUN_PROXY = os.environ.get("GLUETUN_PROXY", "http://gluetun:8888") or None
 GLUETUN_CONTROL_URL = os.environ.get("GLUETUN_CONTROL_URL", "http://gluetun:8000")
 # SECURITY: the control-server API key is a credential and should be supplied
 # via the GLUETUN_API_KEY environment variable, like the other deployment-
 # specific Gluetun settings. The literal default is kept only so existing
 # deployments keep working; because it has been committed to version control it
 # should be rotated and this default removed.
 GLUETUN_API_KEY = os.environ.get(
    "GLUETUN_API_KEY", "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K"
 )
 # Egress-IP rotations to try per Cloudflare challenge. Keep at 0 for Zoopla:
 # rotating among Gluetun's datacenter IPs doesn't clear Cloudflare and would
 # rotate away from the IP a cleared Cloudflare session was bound to, voiding it.
 # Raise only with residential IPs where rotation helps.
 GLUETUN_MAX_ROTATIONS = 0  # max egress-IP rotations per Cloudflare challenge
 # Zoopla fetcher: "flaresolverr" (default) solves Cloudflare via the FlareSolverr
 # sidecar (docker-compose.yml) and needs no display/VNC — verified to return the
 # RSC flight stream with postcode + coordinates; "camoufox" drives a local
 # anti-fingerprint browser (needs an interactive solve on datacenter IPs).
 ZOOPLA_FETCHER = os.environ.get("ZOOPLA_FETCHER", "flaresolverr")
 FLARESOLVERR_URL = os.environ.get("FLARESOLVERR_URL", "http://gluetun:8191/v1")
 FLARESOLVERR_MAX_TIMEOUT_MS = 120000  # per-request solve budget; first solve is slow
 # Greater London-ish postcode areas. This intentionally uses broad area
 # prefixes so a manual scrape can include central/inner London plus common
 # outer-London and near-London outcodes without maintaining a long borough list.
--- a/finder/docker-compose.yml
+++ b/finder/docker-compose.yml
@ -0,0 +1,57 @@
 # Finder scraper + FlareSolverr, both sharing the EXISTING media_gluetun VPN
 # container's network namespace. Everything egresses through the VPN, and
 # FlareSolverr solves Zoopla's Cloudflare automatically (no VNC needed).
 #
 # Prerequisites:
 #   - The `media_gluetun` container (qmcgaw/gluetun) is running on this host.
 #     It is managed by a different compose; it is referenced here as external
 #     via network_mode "container:media_gluetun".
 #   - Because these services share gluetun's netns, they reach each other and
 #     gluetun on localhost (flaresolverr :8191, gluetun control :8000) and need
 #     NO published ports (which is exactly why this avoids the dev-container
 #     port-forwarding pain).
 #
 # Usage:
 #   cd finder
 #   docker compose up -d --build flaresolverr finder     # start the sidecars
 #   docker compose exec finder uv run python main.py --source zoopla --outcodes SW9 --test
 #   docker compose exec finder uv run python main.py --source all       # full run
 #   docker compose down
 #
 # NOTE: a manually-started `finder_flaresolverr` container from testing must be
 # removed first (`docker rm -f finder_flaresolverr`) to avoid a name clash.
 services:
  flaresolverr:
    image: ghcr.io/flaresolverr/flaresolverr:latest
    container_name: finder_flaresolverr
    network_mode: "container:media_gluetun"
    environment:
      LOG_LEVEL: info
      TZ: Europe/London
    restart: unless-stopped
  finder:
    build:
      context: .
      dockerfile: Dockerfile
    image: finder-scraper:latest
    container_name: finder_scraper
    network_mode: "container:media_gluetun"
    depends_on:
      - flaresolverr
    volumes:
      - .:/app/finder                          # live-mounted finder source
      - ../property-data:/app/property-data:ro  # ARCGIS postcode data
    working_dir: /app/finder
    environment:
      # Shared netns: sidecars are on localhost, and the netns already tunnels
      # all traffic through the VPN, so no HTTP proxy is used.
      ZOOPLA_FETCHER: flaresolverr
      FLARESOLVERR_URL: http://localhost:8191/v1
      GLUETUN_CONTROL_URL: http://localhost:8000
      GLUETUN_PROXY: ""            # empty => direct (shared netns already tunnels)
      DATA_DIR: /app/finder/data
      ARCGIS_PATH: /app/property-data/arcgis_data.parquet
    restart: "no"
    command: ["sleep", "infinity"]   # stays up; run scrapes via `docker compose exec`
--- a/finder/flaresolverr.py
+++ b/finder/flaresolverr.py
@ -0,0 +1,91 @@
 """FlareSolverr client — fetch Cloudflare-protected pages as rendered HTML.
 FlareSolverr (https://github.com/FlareSolverr/FlareSolverr) drives an
 undetected browser to pass Cloudflare's challenge and returns the fully
 rendered HTML. It runs as a sidecar service (see docker-compose.yml) sharing
 the Gluetun VPN network namespace, so its browser egresses through the VPN.
 Verified working against Zoopla's managed Turnstile on a datacenter VPN IP,
 provided the session is reused and maxTimeout is generous (~120s): the first
 challenge solve is slow, but subsequent requests on the warm session are fast.
 """
 import logging
 import httpx
 from constants import FLARESOLVERR_MAX_TIMEOUT_MS, FLARESOLVERR_URL
 log = logging.getLogger("flaresolverr")
 class FlareSolverrError(Exception):
    """Signals that FlareSolverr failed to fetch or solve a requested URL."""
 class FlareSolverrSession:
    """A reusable FlareSolverr browser session (context manager).

    Reusing one session keeps the cleared Cloudflare cookies warm across
    requests, so only the first fetch pays the full challenge-solve cost.

    The underlying HTTP client is created in ``__init__`` and is always closed
    when the context exits, or when entering the context fails.
    """

    def __init__(
        self,
        url: str = FLARESOLVERR_URL,
        session: str = "finder",
        max_timeout_ms: int = FLARESOLVERR_MAX_TIMEOUT_MS,
    ) -> None:
        """Store the endpoint/session settings and open the HTTP client.

        Args:
            url: FlareSolverr API endpoint the commands are POSTed to.
            session: Name of the FlareSolverr session to create and reuse.
            max_timeout_ms: Value sent as ``maxTimeout`` with each fetch.
        """
        self._url = url
        self._session = session
        self._max_timeout = max_timeout_ms
        # Read timeout must comfortably exceed maxTimeout (FlareSolverr blocks
        # for up to maxTimeout while solving before responding).
        self._client = httpx.Client(timeout=httpx.Timeout(self._max_timeout / 1000 + 30))
        self._active = False

    def _post(self, payload: dict) -> dict:
        """POST one command to FlareSolverr and return its decoded JSON object.

        Raises:
            FlareSolverrError: on an HTTP/transport failure, a body that is not
                a JSON object, or a response whose ``status`` is not ``"ok"``.
        """
        try:
            resp = self._client.post(self._url, json=payload)
            resp.raise_for_status()
            data = resp.json()
        except (httpx.HTTPError, ValueError) as exc:
            raise FlareSolverrError(
                f"FlareSolverr request to {self._url} failed: {exc}"
            ) from exc
        # A valid-JSON but non-object body (list, string, null) would otherwise
        # surface as an AttributeError from ``data.get`` below.
        if not isinstance(data, dict):
            raise FlareSolverrError(
                f"FlareSolverr {payload.get('cmd')} failed: unexpected response body"
            )
        if data.get("status") != "ok":
            raise FlareSolverrError(
                f"FlareSolverr {payload.get('cmd')} failed: {data.get('message')}"
            )
        return data

    def __enter__(self) -> "FlareSolverrSession":
        """Create a fresh session and return ``self``.

        Raises:
            FlareSolverrError: if the session cannot be created. The HTTP client
                is closed first, since ``__exit__`` is not invoked when
                ``__enter__`` raises.
        """
        # Start from a clean session (ignore destroy errors for a fresh name).
        try:
            self._post({"cmd": "sessions.destroy", "session": self._session})
        except FlareSolverrError:
            pass
        try:
            self._post({"cmd": "sessions.create", "session": self._session})
        except BaseException:
            self._client.close()
            raise
        self._active = True
        log.info("FlareSolverr session %r ready at %s", self._session, self._url)
        return self

    def get(self, url: str) -> str:
        """Fetch a URL through FlareSolverr; return the solved HTML.

        Returns an empty string when the response carries no solution body.
        """
        data = self._post(
            {
                "cmd": "request.get",
                "session": self._session,
                "url": url,
                "maxTimeout": self._max_timeout,
            }
        )
        solution = data.get("solution") or {}
        return solution.get("response", "") or ""

    def __exit__(self, *exc_info) -> None:
        """Destroy the session (best effort) and always close the HTTP client."""
        try:
            if self._active:
                try:
                    self._post({"cmd": "sessions.destroy", "session": self._session})
                except FlareSolverrError as exc:
                    log.debug("FlareSolverr session destroy failed: %s", exc)
        finally:
            self._active = False
            self._client.close()
--- a/finder/gdal-ecw/Dockerfile
+++ b/finder/gdal-ecw/Dockerfile
@ -0,0 +1,53 @@
 # GDAL with ECW (read) support, for decoding Environment Agency Vertical Aerial
 # Photography in the satellite-highres pipeline (pipeline/download/satellite_highres.py).
 #
 # EA VAP ships as ECW **v2** rasters, which are readable by the open-source
 # libecwj2 3.3 SDK -- the same library the official OSGeo image uses when built
 # with WITH_ECW=yes. We therefore avoid the proprietary, login-gated Hexagon
 # ERDAS ECW/JP2 SDK (which is only needed for ECW v3) and its licensing
 # restrictions entirely.
 #
 # We build only the ECW driver as a GDAL *plugin* on top of the official runtime
 # image (no full GDAL rebuild). The plugin's GDAL sources are pinned to the exact
 # commit reported by the base image so libgdal and the plugin stay ABI-compatible.
 #
 # Build (from the repo root):  docker build -t perfect-postcode/gdal-ecw:latest finder/gdal-ecw
 # Verify: docker run --rm perfect-postcode/gdal-ecw:latest gdalinfo --formats | grep -i ECW
 FROM ghcr.io/osgeo/gdal:ubuntu-full-latest
 ARG LIBECWJ2_URL=https://github.com/rouault/libecwj2-3.3-builds/releases/download/v1/install-libecwj2-3.3-ubuntu-20.04.tar.gz
 RUN apt-get update && apt-get install -y --no-install-recommends \
        cmake g++ make git curl ca-certificates \
    && rm -rf /var/lib/apt/lists/*
 # Open-source ECW v2 SDK (extracts to /opt/libecwj2-3.3) + make its libs loadable.
 RUN curl --retry 3 --retry-all-errors --retry-delay 3 -fsSL -o /tmp/libecwj2.tar.gz "$LIBECWJ2_URL" \
    && tar -C / -xzf /tmp/libecwj2.tar.gz \
    && rm -f /tmp/libecwj2.tar.gz \
    && (cd /opt/libecwj2-3.3/lib && for so in *.so*; do \
            ln -sf "/opt/libecwj2-3.3/lib/$so" "/usr/lib/x86_64-linux-gnu/$so"; \
        done) \
    && ldconfig
 # Build the ECW driver plugin against the base image's exact GDAL sources.
 RUN set -eux; \
    GDAL_COMMIT="$(gdalinfo --version | sed -nE 's/.*-([0-9a-f]{8,}).*/\1/p')"; \
    test -n "$GDAL_COMMIT"; \
    echo "Building ECW plugin for GDAL commit ${GDAL_COMMIT}"; \
    mkdir -p /tmp/gdal && cd /tmp/gdal && git init -q; \
    git fetch --depth 1 -q https://github.com/OSGeo/gdal.git "$GDAL_COMMIT"; \
    git checkout -q FETCH_HEAD; \
    cmake -S frmts/ecw -B /tmp/ecw-build \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_PREFIX_PATH=/usr \
        -DECW_ROOT=/opt/libecwj2-3.3; \
    cmake --build /tmp/ecw-build -j"$(nproc)"; \
    PLUGIN_DIR=/usr/lib/x86_64-linux-gnu/gdalplugins; \
    mkdir -p "$PLUGIN_DIR"; \
    find /tmp/ecw-build -name 'gdal_ECW*.so' -exec cp {} "$PLUGIN_DIR/" \; ; \
    rm -rf /tmp/gdal /tmp/ecw-build
 # Fail the build if the driver is not actually available.
 RUN gdalinfo --formats | grep -iq 'ECW.*rw' && echo "ECW driver OK"
--- a/finder/http_client.py
+++ b/finder/http_client.py
@ -5,7 +5,7 @@ import time
 import httpx
 from fake_useragent import UserAgent
-from constants import MAX_RETRIES, RETRY_BASE_DELAY
+from constants import GLUETUN_PROXY, MAX_RETRIES, RETRY_BASE_DELAY
 log = logging.getLogger("rightmove")
@ -15,10 +15,12 @@ _ua = UserAgent(
 def make_client() -> httpx.Client:
    """Build an httpx client with a random User-Agent and a JSON Accept header.

    Traffic goes through the Gluetun HTTP proxy (VPN egress) when one is
    configured; a falsy GLUETUN_PROXY means no proxy is passed.
    """
    default_headers = {"User-Agent": _ua.random, "Accept": "application/json"}
    proxy_url = GLUETUN_PROXY or None
    return httpx.Client(
        timeout=30,
        headers=default_headers,
        follow_redirects=True,
        proxy=proxy_url,
    )
--- a/finder/main.py
+++ b/finder/main.py
@ -57,6 +57,16 @@ def parse_args() -> argparse.Namespace:
        default=DATA_DIR,
        help=f"Directory for parquet output. Defaults to {DATA_DIR}.",
    )
    parser.add_argument(
        "--outcodes",
        type=str,
        default=None,
        help=(
            "Comma-separated outcodes to scrape (e.g. 'SW9' or 'SW9,E14,BR1') "
            "instead of the full Greater London set. Must fall within the "
            "London-ish areas; takes precedence over --test/--limit-outcodes."
        ),
    )
    parser.add_argument(
        "--limit-outcodes",
        type=int,
@ -116,17 +126,32 @@ def main() -> int:
    from scraper import (
        build_postcode_coords,
        build_postcode_index,
        filter_londonish_outcodes,
        load_outcodes,
        run_scrape,
    )
-    outcodes = load_outcodes()
+    if args.outcodes is not None:
-    if args.test and args.limit_outcodes is None:
+        requested = [code.strip().upper() for code in args.outcodes.split(",") if code.strip()]
-        preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)]
+        if not requested:
-        if preferred:
+            raise SystemExit("--outcodes was empty")
-            outcodes = preferred
+        outcodes = filter_londonish_outcodes(requested)
-    if args.limit_outcodes is not None:
+        dropped = sorted(set(requested) - set(outcodes))
-        outcodes = outcodes[: args.limit_outcodes]
+        if dropped:
            log.warning("Ignoring outcodes outside the Greater London-ish areas: %s", ", ".join(dropped))
        if not outcodes:
            raise SystemExit(
                "None of the requested outcodes are within the Greater London-ish areas "
                f"({', '.join(requested)})."
            )
    else:
        outcodes = load_outcodes()
        if args.test and args.limit_outcodes is None:
            preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)]
            if preferred:
                outcodes = preferred
        if args.limit_outcodes is not None:
            outcodes = outcodes[: args.limit_outcodes]
    if not outcodes:
        raise SystemExit("No Greater London-ish outcodes loaded; nothing to scrape.")
--- a/finder/onthemarket.py
+++ b/finder/onthemarket.py
@ -10,6 +10,30 @@ Each rendered page contains 30 listings under
 `humanised-property-type`, `features` (a list where the first element is
 typically `"Tenure: <value>"`), and `details-url`. Pagination is via
 `?page=N`; the loop terminates when `paginationControls.next` is null.
 Postcodes
 ---------
 The search card exposes only an *outcode*-level address (e.g. "Padfield Road,
 London, SE5") and a map pin, so the old behaviour derived the postcode from the
 nearest postcode to that pin — a guess that frequently lands on a neighbouring
 unit (the pin can sit on the wrong side of a street boundary).
 Each *detail* page (`/details/{id}/`) is a plain HTTPS GET whose `__NEXT_DATA__`
 embeds the property's analytics dataLayer at
 `props.initialReduxState.metadata.dataLayer`, which carries the property's own
 `postcode` (full unit postcode, e.g. "SE5 9AA") keyed to this listing by
 `property-id`. Crucially this is NOT the agent's office postcode — that lives
 separately at `…property.agent.postcode` ("SE5 8RS" for the same listing) and
 is the classic trap when blindly scanning the page for a postcode. We read the
 dataLayer postcode, verify `property-id` matches the listing, and accept it only
 when its outcode agrees with the coordinate-nearest postcode (via
 ``resolve_listing_postcode``) — exactly the trust rule the other scrapers use.
 Measured over a sample of real listings this yields a trustworthy, usually
 exact-unit postcode for ~11/12 listings; the rest safely fall back to the
 coordinate-nearest postcode.
 Detail fetching costs one extra HTTPS GET per listing, so it is gated behind
 ``OTM_FETCH_DETAILS`` and capped at ``OTM_MAX_DETAILS_PER_OUTCODE`` per outcode.
 """
 import json
@ -31,14 +55,26 @@ from spatial import PostcodeSpatialIndex
 from transform import (
    clean_listing_address,
    extract_full_postcode,
    extract_outcode,
    fix_coords,
    map_property_type,
    normalize_sub_type,
    parse_display_size,
    resolve_listing_postcode,
 )
 log = logging.getLogger("rightmove")
 # Detail-page postcode recovery (see module docstring). When enabled, each
 # listing's detail page is fetched so its analytics dataLayer postcode — the
 # property's own full unit postcode — can replace the coordinate-nearest guess.
 # Bounded per outcode so a large outcode can't balloon into unbounded extra
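 # HTTPS GETs. The cap of 400 is meant to let a typical outcode's listings all
 # get their real postcode rather than a coordinate-nearest guess.
 # NOTE(review): this was described as "at parity" with the Rightmove/Zoopla
 # detail caps, but those are 4000 in constants.py — confirm the intended value.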
 OTM_FETCH_DETAILS = True
 OTM_MAX_DETAILS_PER_OUTCODE = 400
 _NEXT_DATA_RE = re.compile(
    r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
    re.DOTALL,
@ -51,6 +87,11 @@ _HTML_HEADERS = {
    "Accept-Language": "en-GB,en;q=0.9",
 }
 # listingId -> recovered full postcode (or None). Failures are cached too so a
 # broken or postcode-less detail page is not re-fetched within a run (the same
 # listing can reappear across overlapping outcode searches).
 _detail_postcode_cache: dict[str, str | None] = {}
 def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict | None:
    """GET one search-results page and return the embedded __NEXT_DATA__ JSON.
@ -119,6 +160,116 @@ def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict
    return None
 def parse_detail_postcode(html: str, listing_id: str | None = None) -> str | None:
    """Extract the property's own full postcode from an OnTheMarket detail page.
    Pure and network-free so it is unit-testable: callers pass `page.content()`
    / the GET body and this does the parsing.
    The postcode lives in the analytics dataLayer embedded in `__NEXT_DATA__` at
    ``props.initialReduxState.metadata.dataLayer.postcode`` and is the
    property's own unit postcode (e.g. "SE5 9AA"). It is deliberately NOT the
    agent's office postcode, which sits separately at
    ``…property.agent.postcode`` — the trap when scanning a detail page for "a"
    postcode. When ``listing_id`` is given, the dataLayer's ``property-id`` must
    match it, guaranteeing we read this listing's postcode and not a stray one.
    Returns a normalized full postcode (e.g. "SE5 9AA") or ``None`` when the
    page has no usable property postcode. Trust (outcode-vs-coordinates
    agreement) is enforced later in ``transform_property``.
    """
    if not html:
        return None
    match = _NEXT_DATA_RE.search(html)
    if not match:
        return None
    try:
        data = json.loads(match.group(1))
    except json.JSONDecodeError:
        return None
    try:
        data_layer = data["props"]["initialReduxState"]["metadata"]["dataLayer"]
    except (KeyError, TypeError):
        return None
    if not isinstance(data_layer, dict):
        return None
    # Guard against reading a different listing's postcode: the dataLayer is the
    # property's own analytics payload, so its property-id must match.
    if listing_id is not None:
        page_id = data_layer.get("property-id")
        if page_id is not None and str(page_id) != str(listing_id):
            return None
    raw_postcode = data_layer.get("postcode")
    if not isinstance(raw_postcode, str):
        return None
    return extract_full_postcode(raw_postcode)
 def _fetch_detail_postcode(
    client: httpx.Client, details_url: str, listing_id: str
 ) -> str | None:
    """GET one listing's detail page and return its dataLayer postcode (or None).

    Results (including failures) are cached in ``_detail_postcode_cache`` by
    listing id, so a listing that reappears across overlapping outcode searches
    is fetched at most once per run.

    Args:
        client: httpx client used for the GET.
        details_url: Absolute URL, or a path that is appended to
            ``ONTHEMARKET_BASE``. An empty value skips the fetch.
        listing_id: Listing id; used as the cache key and passed to
            ``parse_detail_postcode`` for its property-id check.

    Returns:
        The parsed postcode, or ``None`` when the page could not be fetched or
        yielded no postcode. Any httpx transport failure (connect/read/write/
        pool timeouts, connection and protocol errors) and HTTP
        429/500/502/503/504 are retried up to ``MAX_RETRIES`` attempts with
        exponential backoff; other statuses give up immediately. Nothing is
        raised for these failures, so the caller can fall back to the
        coordinate-nearest postcode.
    """
    if listing_id in _detail_postcode_cache:
        return _detail_postcode_cache[listing_id]
    full_url = (
        ONTHEMARKET_BASE + details_url
        if details_url and not details_url.startswith("http")
        else details_url
    )
    result: str | None = None
    if full_url:
        for attempt in range(MAX_RETRIES):
            try:
                resp = client.get(
                    full_url, headers=_HTML_HEADERS, follow_redirects=True
                )
            except httpx.TransportError as exc:
                # TransportError is the common base of every timeout, network,
                # protocol and proxy error, so none of them escape the caller.
                failure = type(exc).__name__
            else:
                if resp.status_code == 200:
                    result = parse_detail_postcode(resp.text, listing_id)
                    break
                if resp.status_code not in (429, 500, 502, 503, 504):
                    log.debug(
                        "OnTheMarket detail %s returned HTTP %d (no postcode)",
                        listing_id, resp.status_code,
                    )
                    break
                failure = f"HTTP {resp.status_code}"
            if attempt + 1 >= MAX_RETRIES:
                # No retry follows the last attempt, so don't sleep for it.
                log.warning(
                    "%s from %s, giving up after %d attempts",
                    failure, full_url, MAX_RETRIES,
                )
                break
            delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
            log.warning(
                "%s from %s, retry %d/%d in %.1fs",
                failure, full_url, attempt + 1, MAX_RETRIES, delay,
            )
            time.sleep(delay)
    _detail_postcode_cache[listing_id] = result
    return result
 def _parse_price(price_value) -> int:
    """Parse a formatted price string like '£450,000' into an integer.
    Returns 0 for POA/auction/null values."""
@ -166,9 +317,19 @@ def _extract_floor_area(features: list) -> float | None:
 def transform_property(
-    raw: dict, pc_index: PostcodeSpatialIndex
+    raw: dict,
    pc_index: PostcodeSpatialIndex,
    detail_postcode: str | None = None,
 ) -> dict | None:
-    """Transform a raw OnTheMarket listing dict into our output schema."""
+    """Transform a raw OnTheMarket listing dict into our output schema.
    ``detail_postcode`` is the property's own full postcode recovered from its
    detail page (see ``parse_detail_postcode`` / ``_fetch_detail_postcode``),
    or ``None`` when no detail fetch was done / no postcode was found. When
    present and trustworthy (its outcode agrees with the coordinate-nearest
    postcode) it supersedes the coordinate guess and is labelled
    ``"detail_address"``.
    """
    loc = raw.get("location") or {}
    raw_lat = loc.get("lat")
    raw_lng = loc.get("lon")
@ -184,8 +345,29 @@ def transform_property(
        return None
    raw_address = raw.get("address", "") or ""
    extracted_postcode = extract_full_postcode(raw_address)
-    postcode = extracted_postcode or inferred_postcode
+
-    postcode_source = "address" if extracted_postcode else "coordinates"
+    # Prefer the property's own detail-page postcode when we have one and it is
    # trustworthy. The detail postcode is a full unit postcode (better than the
    # coordinate-nearest guess and than the usually outcode-only card address),
    # but a stale/mislabelled value would silently override the spatially
    # correct one, so apply the same outcode-agreement trust rule the address
    # postcode uses: keep it only when its outcode matches the
    # coordinate-nearest postcode's outcode.
    detail_postcode = extract_full_postcode(detail_postcode)
    if detail_postcode and extract_outcode(detail_postcode) == extract_outcode(
        inferred_postcode
    ):
        postcode, postcode_source = detail_postcode, "detail_address"
    else:
        if detail_postcode:
            log.debug(
                "OnTheMarket %s: rejecting detail postcode %s "
                "(outcode mismatch with inferred %s)",
                raw.get("id", "?"), detail_postcode, inferred_postcode,
            )
        postcode, postcode_source = resolve_listing_postcode(
            extracted_postcode, inferred_postcode
        )
    raw_beds = raw.get("bedrooms") or 0
    raw_baths = raw.get("bathrooms") or 0
@ -223,6 +405,10 @@ def transform_property(
        "Inferred postcode": inferred_postcode,
        "Listing raw address": raw_address,
        "Address per Property Register": clean_listing_address(raw_address),
        # OnTheMarket search JSON exposes only a street-level address; no UPRN
        # or house number/name is available without a detail-page fetch.
        "UPRN": None,
        "Property number or name": None,
        "Leasehold/Freehold": _extract_tenure(features),
        "Property type": map_property_type(sub_type),
        "Property sub-type": normalize_sub_type(sub_type),
@ -242,10 +428,17 @@ def search_outcode(
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
 ) -> list[dict]:
-    """Paginate through OnTheMarket sale results for one outcode."""
+    """Paginate through OnTheMarket sale results for one outcode.
    When ``OTM_FETCH_DETAILS`` is enabled, up to
    ``OTM_MAX_DETAILS_PER_OUTCODE`` listings per outcode have their detail page
    fetched for the property's own postcode (see ``_fetch_detail_postcode``);
    the rest fall back to the coordinate-nearest postcode.
    """
    properties: list[dict] = []
    seen_ids: set[str] = set()
    page_num = 1
    details_fetched = 0
    while True:
        data = _fetch_page_json(client, outcode, page_num)
@ -269,8 +462,22 @@ def search_outcode(
            if listing_id and listing_id in seen_ids:
                continue
            seen_ids.add(listing_id)
            detail_postcode = None
            if OTM_FETCH_DETAILS and listing_id:
                # Cached lookups are free; only fresh GETs count toward the cap
                # and incur the inter-request delay.
                cached = listing_id in _detail_postcode_cache
                if cached or details_fetched < OTM_MAX_DETAILS_PER_OUTCODE:
                    detail_postcode = _fetch_detail_postcode(
                        client, raw.get("details-url") or "", listing_id
                    )
                    if not cached:
                        details_fetched += 1
                        time.sleep(DELAY_BETWEEN_PAGES)
            try:
-                transformed = transform_property(raw, pc_index)
+                transformed = transform_property(raw, pc_index, detail_postcode)
            except Exception as exc:
                log.warning(
                    "OnTheMarket %s property %s failed to transform: %s",
--- a/finder/rightmove.py
+++ b/finder/rightmove.py
@ -1,4 +1,6 @@
 import json
 import logging
 import re
 import time
 import httpx
@ -6,12 +8,15 @@ import httpx
 from constants import (
    PAGE_SIZE,
    DELAY_BETWEEN_PAGES,
    RIGHTMOVE_DETAIL_URL,
    RIGHTMOVE_FETCH_DETAILS,
    RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
    SEARCH_URL,
    TYPEAHEAD_URL,
 )
 from http_client import fetch_with_retry
 from spatial import PostcodeSpatialIndex
-from transform import transform_property
+from transform import extract_full_postcode, normalize_postcode, transform_property
 log = logging.getLogger("rightmove")
@ -23,6 +28,176 @@ outcode_cache: dict[str, str] = {}
 _MAX_INDEX = 1008
 # ---------------------------------------------------------------------------
 # Detail-page postcode extraction
 # ---------------------------------------------------------------------------
 #
 # The search API (_paginate) only returns an outcode-level `displayAddress`
 # (e.g. "Akerman Road, Brixton, London, SW9") — never the full postcode. Each
 # listing's detail page, however, embeds the property's OWN full postcode in a
 # `window.__PAGE_MODEL` script as `propertyData.address.{outcode, incode}`
 # (e.g. outcode "SW9" + incode "0HD" → "SW9 0HD"), independently corroborated by
 # `propertyData.propertyUrls.similarPropertiesUrl` ("/property-for-sale/SW9-0HD.html").
 # This is the property's own postcode, NOT a nearest station/school: the
 # `nearestStations`/`nearestAirports` arrays carry only names + distances, no
 # postcodes, and the address outcode always matches the searched outcode.
 # Recon over 24 live listings across SW9/E1/M1/LS6/E20 (incl. APPROXIMATE_POINT
 # new-builds) found the full postcode present 100% of the time. There is no
 # UPRN or house-number field anywhere in propertyData, so those stay None.
 #
 # __PAGE_MODEL is a "devalue"-style flattened object graph: its `data` field is
 # a JSON STRING holding a flat array where every integer inside a container is
 # an index reference into that same array (so the graph can dedupe). We
 # brace-match the (large, deeply-nested) object literal — a non-greedy regex
 # cannot — then rehydrate the reference graph before reading the address.
 _PAGE_MODEL_RE = re.compile(r"window\.__PAGE_MODEL\s*=\s*")
 def _extract_page_model_literal(html: str) -> str | None:
    """Return the `{...}` object literal assigned to window.__PAGE_MODEL.
    Brace-matches with string/escape awareness so embedded braces and quotes in
    string values don't end the match early. Returns None when absent."""
    marker = _PAGE_MODEL_RE.search(html)
    if not marker:
        return None
    start = marker.end()
    if start >= len(html) or html[start] != "{":
        return None
    depth = 0
    in_str = False
    esc = False
    for j in range(start, len(html)):
        ch = html[j]
        if in_str:
            if esc:
                esc = False
            elif ch == "\\":
                esc = True
            elif ch == '"':
                in_str = False
        elif ch == '"':
            in_str = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return html[start : j + 1]
    return None
 def _rehydrate(flat: list) -> object:
    """Resolve a devalue-style flattened reference array into a nested object.
    Index 0 is the root; every int inside a dict/list is an index back into
    ``flat``. Memoised so shared/cyclic references resolve once."""
    cache: dict[int, object] = {}
    def resolve(idx: int) -> object:
        if not isinstance(idx, int) or idx < 0 or idx >= len(flat):
            return None
        if idx in cache:
            return cache[idx]
        node = flat[idx]
        if isinstance(node, dict):
            out: dict = {}
            cache[idx] = out
            for key, value in node.items():
                out[key] = resolve(value) if isinstance(value, int) else value
            return out
        if isinstance(node, list):
            arr: list = []
            cache[idx] = arr
            for value in node:
                arr.append(resolve(value) if isinstance(value, int) else value)
            return arr
        cache[idx] = node
        return node
    return resolve(0)
 def parse_detail_postcode(html: str) -> str | None:
    """Recover a Rightmove listing's full postcode from its detail-page HTML.
    Network-free: the caller supplies the page HTML, which keeps this
    unit-testable. The postcode is assembled from
    ``propertyData.address.outcode`` and ``.incode`` inside
    ``window.__PAGE_MODEL`` and passed through the shared
    normaliser/validator, giving e.g. "SW9 0HD". Returns None when the page
    model is missing or unparseable, when the address node is absent, or when
    either part is missing/blank; the caller then keeps its coordinate
    fallback.
    """
    literal = _extract_page_model_literal(html) if html else None
    if not literal:
        return None
    try:
        # The outer literal is JSON whose "data" field is itself a JSON string.
        flat = json.loads(json.loads(literal)["data"])
    except (ValueError, KeyError, TypeError):
        return None
    if not (isinstance(flat, list) and flat):
        return None
    # Walk root -> propertyData -> address, bailing out on any non-dict hop.
    node = _rehydrate(flat)
    for key in ("propertyData", "address"):
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    if not isinstance(node, dict):
        return None
    parts = [node.get("outcode"), node.get("incode")]
    if not all(isinstance(part, str) for part in parts):
        return None
    outcode, incode = (part.strip() for part in parts)
    if not (outcode and incode):
        return None
    # The shared normaliser + extractor canonicalise spacing and validate the
    # joined outcode/incode pair.
    return extract_full_postcode(normalize_postcode(f"{outcode} {incode}"))
 # listingId -> true full postcode (or None when unavailable). Failures are
 # cached too, so a broken/duplicate listing is fetched at most once per run (the
 # same listing can reappear across overlapping outcode searches).
 _detail_postcode_cache: dict[str, str | None] = {}
 def _fetch_detail_postcode(client: httpx.Client, property_id: str) -> str | None:
    """Fetch one listing's detail page and return its full postcode (or None).
    The outcome is stored in ``_detail_postcode_cache`` under the listing id
    whether or not a postcode was found, so each id costs at most one GET.
    Non-200 responses and ``httpx.HTTPError`` failures are logged at debug
    level and yield None, leaving the caller on its coordinate fallback."""
    if not property_id:
        return None
    try:
        return _detail_postcode_cache[property_id]
    except KeyError:
        pass
    detail_url = RIGHTMOVE_DETAIL_URL.format(id=property_id)
    result: str | None = None
    try:
        response = client.get(detail_url, headers={"Accept": "text/html"})
        if response.status_code != 200:
            log.debug(
                "Rightmove detail %s returned HTTP %d", detail_url, response.status_code
            )
        else:
            result = parse_detail_postcode(response.text)
    except httpx.HTTPError as exc:
        log.debug("Rightmove detail fetch failed %s: %s", detail_url, exc)
    # Failures are remembered too (as None) so they are not retried this run.
    _detail_postcode_cache[property_id] = result
    return result
 def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    """Look up Rightmove's internal ID for an outcode via typeahead API."""
    if outcode in outcode_cache:
@ -44,6 +219,31 @@ def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    return None
 def _detail_postcode_for(
    client: httpx.Client,
    prop: dict,
    fetch_details: bool,
    detail_budget: dict,
 ) -> str | None:
    """Return a listing's detail-page postcode within the per-outcode budget.
    Returns None when detail fetching is off or the listing has no id. An id
    already in ``_detail_postcode_cache`` is answered from the cache without
    spending budget or issuing a GET. Otherwise a fresh fetch happens only
    while ``detail_budget['remaining'] > 0``; the budget is decremented first
    and the inter-request delay is applied after the fetch."""
    listing_id = str(prop.get("id") or "") if fetch_details else ""
    if not listing_id:
        return None
    try:
        return _detail_postcode_cache[listing_id]
    except KeyError:
        pass
    if detail_budget["remaining"] > 0:
        detail_budget["remaining"] -= 1
        fetched = _fetch_detail_postcode(client, listing_id)
        time.sleep(DELAY_BETWEEN_PAGES)
        return fetched
    return None
 def _paginate(
    client: httpx.Client,
    outcode_id: str,
@ -51,11 +251,19 @@ def _paginate(
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
    fetch_details: bool = False,
    detail_cap: int = 0,
 ) -> tuple[list[dict], int]:
-    """Paginate through search results. Returns (properties, result_count)."""
+    """Paginate through search results. Returns (properties, result_count).
    When ``fetch_details`` is set, up to ``detail_cap`` listings per outcode have
    their detail page fetched for the property's TRUE full postcode (see
    ``parse_detail_postcode``); the rest fall back to coordinate-derived
    postcodes."""
    properties = []
    index = 0
    result_count = 0
    detail_budget = {"remaining": detail_cap}
    while True:
        params = {
@ -82,7 +290,12 @@ def _paginate(
        for prop in raw_props:
            try:
-                transformed = transform_property(prop, outcode, pc_index)
+                detail_postcode = _detail_postcode_for(
                    client, prop, fetch_details, detail_budget
                )
                transformed = transform_property(
                    prop, outcode, pc_index, detail_postcode=detail_postcode
                )
            except Exception as exc:
                log.warning(
                    "Rightmove %s/%s property %s failed to transform: %s",
@ -127,7 +340,12 @@ def search_outcode(
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
 ) -> list[dict]:
-    """Paginate through unfiltered sale results for one outcode+channel."""
+    """Paginate through unfiltered sale results for one outcode+channel.
    Each listing's detail page is fetched for the property's TRUE full postcode
    (gated by ``RIGHTMOVE_FETCH_DETAILS`` and capped per outcode by
    ``RIGHTMOVE_MAX_DETAILS_PER_OUTCODE``); listings beyond the cap keep the
    coordinate-derived postcode."""
    properties, _ = _paginate(
        client,
        outcode_id,
@ -135,6 +353,8 @@ def search_outcode(
        channel_cfg,
        pc_index,
        max_properties=max_properties,
        fetch_details=RIGHTMOVE_FETCH_DETAILS,
        detail_cap=RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
    )
    if max_properties is not None and len(properties) >= max_properties:
--- a/finder/scraper.py
+++ b/finder/scraper.py
@ -15,6 +15,10 @@ from constants import (
    DATA_DIR,
    DELAY_BETWEEN_OUTCODES,
    LONDON_OUTCODE_PREFIXES,
    ZOOPLA_DETAIL_BUDGET_FRACTION,
    ZOOPLA_FETCH_DETAILS,
    ZOOPLA_FETCHER,
    ZOOPLA_MAX_DETAILS_PER_OUTCODE,
 )
 from http_client import make_client
@ -371,6 +375,36 @@ def _zoopla_outcode_timeout_seconds() -> int:
    return timeout
 def _zoopla_detail_cap() -> int:
    """Return the per-outcode limit on detail-page fetches (0 = disabled).
    Yields ``ZOOPLA_MAX_DETAILS_PER_OUTCODE`` when ``ZOOPLA_FETCH_DETAILS`` is
    truthy and 0 otherwise. Zoopla search cards only expose an outcode-level
    address, so the full postcode/coordinates come from each listing's detail
    page; the cap bounds those extra page loads so an outcode stays within
    ZOOPLA_OUTCODE_TIMEOUT_SECONDS. Both knobs live in constants.py."""
    if ZOOPLA_FETCH_DETAILS:
        return ZOOPLA_MAX_DETAILS_PER_OUTCODE
    return 0
 def _open_zoopla_detail_tab(page, detail_cap: int):
    """Open a second tab on the same context for detail-page fetches.
    Sharing the persistent context means the detail tab inherits the search
    tab's Cloudflare clearance cookies. Returns None when detail fetching is
    disabled or the tab cannot be created (the scrape then degrades to
    outcode-level postcodes rather than failing)."""
    if detail_cap <= 0:
        return None
    try:
        return page.context.new_page()
    except Exception as exc:
        log.warning(
            "Zoopla detail tab unavailable (%s); using outcode-level postcodes",
            _exception_detail(exc),
        )
        return None
@contextmanager
 def _wall_clock_timeout(seconds: int, label: str):
    """SIGALRM-based wall-clock guard (POSIX). Raises OutcodeTimeout on expiry.
@ -438,6 +472,50 @@ def _close_zoopla_browser(browser, label: str) -> None:
        log.warning("%s browser force-close failed: %s", label, _exception_detail(exc))
 def _scrape_zoopla_flaresolverr(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    results: dict[str, list[dict]],
    errors: list[str],
    max_properties_per_source: int | None,
 ) -> None:
    """Scrape Zoopla via the FlareSolverr sidecar (no browser/VNC).
    Opens one FlareSolverr session for the whole run and searches each outcode
    in turn, storing listings in ``results`` under "zoopla". If the session
    cannot be opened the source is skipped and the reason is appended to
    ``errors``. A failure on one outcode is recorded and the loop continues.
    The loop stops early once the per-source cap is reached. The session is
    always exited, even on early return.
    The detail-fetch cap comes from ``_zoopla_detail_cap()`` so that
    ``ZOOPLA_FETCH_DETAILS`` disables detail fetching here exactly as it does
    on the browser path.
    """
    from flaresolverr import FlareSolverrError, FlareSolverrSession
    from zoopla_flaresolverr import search_outcode as fs_search_outcode
    # 0 when ZOOPLA_FETCH_DETAILS is off; otherwise the configured maximum.
    detail_cap = _zoopla_detail_cap()
    try:
        session = FlareSolverrSession(session="zoopla")
        session.__enter__()
    except FlareSolverrError as exc:
        errors.append(f"zoopla: FlareSolverr unavailable: {exc}")
        log.warning("Zoopla skipped: FlareSolverr unavailable: %s", exc)
        return
    try:
        for outcode in outcodes:
            remaining = _source_remaining(results, "zoopla", max_properties_per_source)
            if remaining == 0:
                log.info("Zoopla cap reached")
                return
            try:
                props, _ = fs_search_outcode(
                    outcode,
                    pc_index,
                    pc_coords,
                    session,
                    max_properties=remaining,
                    detail_cap=detail_cap,
                )
                added = _store_properties(results, "zoopla", props, max_properties_per_source)
                log.info("Zoopla %s: +%d", outcode, added)
            except Exception as exc:  # noqa: BLE001 - one outcode must not kill the run
                _record_error(errors, "zoopla", outcode, exc)
            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
        session.__exit__(None, None, None)
 def _scrape_zoopla(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
@ -446,6 +524,12 @@ def _scrape_zoopla(
    errors: list[str],
    max_properties_per_source: int | None,
 ) -> None:
    if ZOOPLA_FETCHER == "flaresolverr":
        _scrape_zoopla_flaresolverr(
            outcodes, pc_index, pc_coords, results, errors, max_properties_per_source
        )
        return
    try:
        browser, page = _launch_zoopla_with_retries()
    except Exception as exc:
@ -454,6 +538,12 @@ def _scrape_zoopla(
        return
    outcode_timeout = _zoopla_outcode_timeout_seconds()
    detail_cap = _zoopla_detail_cap()
    detail_page = _open_zoopla_detail_tab(page, detail_cap)
    # Spend at most a fraction of each outcode's budget on detail fetches so the
    # SIGALRM guard never trips mid-outcode and discards already-collected
    # search listings; the rest is left for search pagination and transform.
    detail_budget_seconds = max(10.0, outcode_timeout * ZOOPLA_DETAIL_BUDGET_FRACTION)
    try:
        for outcode in outcodes:
@ -470,6 +560,9 @@ def _scrape_zoopla(
                            pc_index,
                            pc_coords,
                            max_properties=None,
                            detail_page=detail_page,
                            detail_cap=detail_cap,
                            detail_budget_seconds=detail_budget_seconds,
                        )
                    added = _store_properties(
                        results,
@ -496,6 +589,8 @@ def _scrape_zoopla(
                    _close_zoopla_browser(browser, f"zoopla {outcode}")
                    try:
                        browser, page = _launch_zoopla_with_retries()
                        # The old context (and its detail tab) is gone; reopen one.
                        detail_page = _open_zoopla_detail_tab(page, detail_cap)
                        log.info("Zoopla %s retrying with fresh browser", outcode)
                    except Exception as relaunch_exc:
                        _record_error(errors, "zoopla", outcode, relaunch_exc)
@ -503,6 +598,11 @@ def _scrape_zoopla(
            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
        if detail_page is not None:
            try:
                detail_page.close()
            except Exception:
                pass
        _close_zoopla_browser(browser, "zoopla final")
--- a/finder/storage.py
+++ b/finder/storage.py
@ -126,6 +126,14 @@ def write_parquet(properties: list[dict], path: Path) -> None:
            "Address per Property Register": [
                p["Address per Property Register"] for p in properties
            ],
            # UPRN (when the scraper recovered it) keys an exact listing->EPC
            # join; Property number or name is the house identifier for the
            # Price-Paid address join. Both are None for sources/listings without
            # a detail-page fetch.
            "UPRN": [p.get("UPRN") for p in properties],
            "Property number or name": [
                p.get("Property number or name") for p in properties
            ],
            "Leasehold/Freehold": [p["Leasehold/Freehold"] for p in properties],
            "Property type": [p["Property type"] for p in properties],
            "Property sub-type": [p["Property sub-type"] for p in properties],
@ -149,6 +157,8 @@ def write_parquet(properties: list[dict], path: Path) -> None:
            "Inferred postcode": pl.Utf8,
            "Listing raw address": pl.Utf8,
            "Address per Property Register": pl.Utf8,
            "UPRN": pl.Utf8,
            "Property number or name": pl.Utf8,
            "Leasehold/Freehold": pl.Utf8,
            "Property type": pl.Utf8,
            "Property sub-type": pl.Utf8,
--- a/finder/test_onthemarket.py
+++ b/finder/test_onthemarket.py
@ -0,0 +1,206 @@
 """Tests for the OnTheMarket scraper's detail-page postcode recovery.
 `parse_detail_postcode` is pure (takes the detail-page HTML, returns a postcode
 or None), so these tests use a trimmed but faithful copy of a real OnTheMarket
 detail page's `__NEXT_DATA__` payload. The fixture mirrors the live structure:
 the property's own postcode lives in the analytics dataLayer
 (`props.initialReduxState.metadata.dataLayer.postcode`) while the agent's office
 postcode sits separately under `…property.agent.postcode` — the trap we must not
 fall into.
 """
 import json
 import onthemarket
 from onthemarket import parse_detail_postcode, transform_property
 class _StubIndex:
    """Minimal stand-in for PostcodeSpatialIndex returning a fixed postcode."""
    def __init__(self, postcode: str | None):
        self._postcode = postcode
    def nearest(self, lat: float, lng: float) -> str | None:
        return self._postcode
 def _detail_html(
    *,
    property_id: int = 19522441,
    datalayer_postcode: str = "SE5 9AA",
    agent_postcode: str = "SE5 8RS",
 ) -> str:
    """Build detail-page HTML with a real-shaped __NEXT_DATA__ payload."""
    next_data = {
        "props": {
            "initialReduxState": {
                "metadata": {
                    "dataLayer": {
                        "page-type": "details-section",
                        "property-type": "homes",
                        # The property's own unit postcode.
                        "postcode": datalayer_postcode,
                        "property-id": property_id,
                        "price": "275,000",
                        "addressline_2": "Padfield Road",
                    }
                },
                "property": {
                    "displayAddress": "Padfield Road, London, SE5",
                    "location": {"lon": -0.100233, "lat": 51.466129},
                    # The agent block carries the AGENT'S office postcode — the
                    # trap. parse_detail_postcode must not return this.
                    "agent": {
                        "address": "29 Denmark Hill, Camberwell\nLondon\nSE5 8RS",
                        "postcode": agent_postcode,
                    },
                },
            }
        }
    }
    payload = json.dumps(next_data)
    return (
        "<html><body>"
        '<script id="__NEXT_DATA__" type="application/json">'
        f"{payload}"
        "</script></body></html>"
    )
 # ---------------------------------------------------------------------------
 # parse_detail_postcode
 # ---------------------------------------------------------------------------
 def test_parse_returns_property_postcode_not_agent():
    html = _detail_html(datalayer_postcode="SE5 9AA", agent_postcode="SE5 8RS")
    assert parse_detail_postcode(html, "19522441") == "SE5 9AA"
 def test_parse_normalizes_spacing():
    html = _detail_html(datalayer_postcode="se59aa")
    assert parse_detail_postcode(html, "19522441") == "SE5 9AA"
 def test_parse_ignores_mismatched_property_id():
    # dataLayer postcode belongs to property 19522441; asking for a different
    # listing id must refuse to return it.
    html = _detail_html(property_id=19522441)
    assert parse_detail_postcode(html, "99999999") is None
 def test_parse_accepts_when_no_listing_id_given():
    html = _detail_html(datalayer_postcode="SE5 9AA")
    assert parse_detail_postcode(html, None) == "SE5 9AA"
 def test_parse_handles_missing_postcode():
    html = _detail_html(datalayer_postcode="")
    assert parse_detail_postcode(html, "19522441") is None
 def test_parse_handles_no_next_data():
    assert parse_detail_postcode("<html><body>no script here</body></html>", "1") is None
 def test_parse_handles_empty_html():
    assert parse_detail_postcode("", "1") is None
 def test_parse_handles_malformed_json():
    html = (
        '<script id="__NEXT_DATA__" type="application/json">{not json}</script>'
    )
    assert parse_detail_postcode(html, "1") is None
 def test_parse_handles_missing_datalayer():
    next_data = {"props": {"initialReduxState": {"metadata": {}}}}
    html = (
        '<script id="__NEXT_DATA__" type="application/json">'
        f"{json.dumps(next_data)}</script>"
    )
    assert parse_detail_postcode(html, "1") is None
 # ---------------------------------------------------------------------------
 # transform_property — detail postcode wiring + trust rule
 # ---------------------------------------------------------------------------
 _RAW_LISTING = {
    "id": "19522441",
    "address": "Padfield Road, London, SE5",
    "location": {"lon": -0.100233, "lat": 51.466129},
    "bedrooms": 2,
    "bathrooms": 1,
    "price": "£275,000",
    "humanised-property-type": "Apartment",
    "features": ["Tenure: Leasehold (99 years remaining)"],
    "details-url": "/details/19522441/",
 }
 def test_transform_uses_trusted_detail_postcode():
    # Detail postcode SE5 9AA, coordinate-nearest SE5 1AA: same outcode -> trust
    # the (more precise) detail postcode and label it detail_address.
    index = _StubIndex("SE5 1AA")
    out = transform_property(_RAW_LISTING, index, detail_postcode="SE5 9AA")
    assert out is not None
    assert out["Postcode"] == "SE5 9AA"
    assert out["Postcode source"] == "detail_address"
 def test_transform_rejects_detail_postcode_on_outcode_mismatch():
    # Detail postcode SW9 6BZ but coordinate-nearest is SE5 1AA: different
    # outcode -> reject the detail postcode, fall back to coordinate logic.
    index = _StubIndex("SE5 1AA")
    out = transform_property(_RAW_LISTING, index, detail_postcode="SW9 6BZ")
    assert out is not None
    assert out["Postcode"] == "SE5 1AA"
    assert out["Postcode source"] == "coordinates"
 def test_transform_without_detail_postcode_uses_coordinates():
    index = _StubIndex("SE5 1AA")
    out = transform_property(_RAW_LISTING, index, detail_postcode=None)
    assert out is not None
    assert out["Postcode"] == "SE5 1AA"
    assert out["Postcode source"] == "coordinates"
    # No UPRN / house number is recoverable from OnTheMarket.
    assert out["UPRN"] is None
    assert out["Property number or name"] is None
 def test_transform_detail_postcode_via_search_address_outcode():
    # When the card address already carries a full postcode that agrees with the
    # coordinates, the existing "address" source still wins absent a detail
    # postcode — detail recovery never regresses that path.
    raw = dict(_RAW_LISTING, address="Padfield Road, London, SE5 1AA")
    index = _StubIndex("SE5 1AA")
    out = transform_property(raw, index, detail_postcode=None)
    assert out["Postcode"] == "SE5 1AA"
    assert out["Postcode source"] == "address"
 # ---------------------------------------------------------------------------
 # _fetch_detail_postcode caching (no real network)
 # ---------------------------------------------------------------------------
 def test_fetch_detail_postcode_is_cached(monkeypatch):
    """A listing id already in the cache must be served without any HTTP call.
    The module-level cache is seeded directly and a client whose ``get``
    raises is passed in, so any network attempt fails the test. The cache is
    reset in a ``finally`` so a failure here cannot leak the seeded entry into
    other tests."""
    def _boom(*args, **kwargs):  # pragma: no cover - must never be called
        raise AssertionError("network was hit despite a cached value")
    onthemarket._detail_postcode_cache.clear()
    onthemarket._detail_postcode_cache["19522441"] = "SE5 9AA"
    try:
        # Any httpx use would explode; the cache hit must short-circuit first.
        result = onthemarket._fetch_detail_postcode(
            client=type("C", (), {"get": _boom})(),
            details_url="/details/19522441/",
            listing_id="19522441",
        )
        assert result == "SE5 9AA"
    finally:
        onthemarket._detail_postcode_cache.clear()
--- a/finder/test_rightmove.py
+++ b/finder/test_rightmove.py
@ -0,0 +1,113 @@
 """Tests for the Rightmove detail-page postcode extractor.
 The search API only returns an outcode-level ``displayAddress``; the property's
 TRUE full postcode lives on its detail page inside ``window.__PAGE_MODEL`` as
 ``propertyData.address.{outcode, incode}``. ``parse_detail_postcode`` recovers
 it. These tests build a faithful __PAGE_MODEL: a devalue-style flattened object
 graph whose ``data`` field is a JSON STRING of a flat array where every integer
 inside a container is an index reference into that same array.
 """
 import json
 from rightmove import _extract_page_model_literal, parse_detail_postcode
 def _page_model_html(flat: list, *, encoding: str = "json") -> str:
    """Wrap a flattened object-graph array in a realistic detail-page <script>.
    Mirrors the live page: ``window.__PAGE_MODEL = {"data": "<json array>"}``
    where the array is itself JSON-encoded (so its quotes arrive escaped)."""
    outer = {"data": json.dumps(flat, separators=(",", ":")), "encoding": encoding}
    return (
        "<html><head></head><body>\n"
        "<script>\n"
        "    window.__PAGE_MODEL = " + json.dumps(outer, separators=(",", ":")) + ";\n"
        "</script>\n"
        "</body></html>"
    )
 # A faithful slice of a real listing: root -> propertyData -> address, with a
 # decoy nearestStations array (which carries NO postcodes on the live page) to
 # prove the parser anchors on the property's own address, not a nearby POI.
 _FLAT_SW9 = [
    {"propertyData": 1},  # 0: root
    {
        "id": "89089584",
        "address": 2,
        "location": 4,
        "nearestStations": 6,
    },  # 1: propertyData
    {
        "displayAddress": "Caldwell Street, Stockwell",
        "countryCode": "GB",
        "ukCountry": "England",
        "outcode": "SW9",
        "incode": "0HD",
    },  # 2: address
    None,  # 3: filler
    {
        "latitude": 51.477238,
        "longitude": -0.116819,
        "pinType": "ACCURATE_POINT",
    },  # 4: location
    None,  # 5: filler
    [7, 8],  # 6: nearestStations (references)
    {"name": "Oval Station", "distance": 0.36},  # 7: station, no postcode
    {"name": "Stockwell Station", "distance": 0.41},  # 8: station, no postcode
 ]
 def test_parses_full_postcode_from_outcode_and_incode() -> None:
    html = _page_model_html(_FLAT_SW9)
    assert parse_detail_postcode(html) == "SW9 0HD"
 def test_extract_page_model_literal_brace_matches_nested_object() -> None:
    # The literal must include the whole nested object, not stop at the first
    # closing brace inside the escaped data string.
    html = _page_model_html(_FLAT_SW9)
    literal = _extract_page_model_literal(html)
    assert literal is not None
    assert literal.startswith("{") and literal.endswith("}")
    # Round-trips back to a dict with the expected top-level keys.
    assert set(json.loads(literal)) == {"data", "encoding"}
 def test_normalises_unspaced_incode() -> None:
    # NOTE: despite the name, the fixture exercises case normalisation: the
    # address node carries lower-case "e20"/"1fh" and the expected result is
    # the upper-cased, single-spaced "E20 1FH".
    # Dict nodes are shallow-copied so the shared _FLAT_SW9 fixture is not mutated.
    flat = [dict(node) if isinstance(node, dict) else node for node in _FLAT_SW9]
    flat[2] = {**_FLAT_SW9[2], "outcode": "e20", "incode": "1fh"}
    assert parse_detail_postcode(_page_model_html(flat)) == "E20 1FH"
 def test_returns_none_when_address_missing() -> None:
    # The location wrapper can be empty/absent on some listings; the caller then
    # keeps the coordinate fallback, so we must return None (not raise).
    flat = [
        {"propertyData": 1},
        {"id": "1", "location": 2},
        {"latitude": 51.5, "longitude": -0.1},
    ]
    assert parse_detail_postcode(_page_model_html(flat)) is None
 def test_returns_none_when_incode_blank() -> None:
    flat = [dict(node) if isinstance(node, dict) else node for node in _FLAT_SW9]
    flat[2] = {**_FLAT_SW9[2], "incode": ""}
    assert parse_detail_postcode(_page_model_html(flat)) is None
 def test_returns_none_for_non_postcode_pair() -> None:
    # A structurally-invalid outcode/incode pair is rejected by the validator.
    flat = [dict(node) if isinstance(node, dict) else node for node in _FLAT_SW9]
    flat[2] = {**_FLAT_SW9[2], "outcode": "NOTAPC", "incode": "ZZ"}
    assert parse_detail_postcode(_page_model_html(flat)) is None
 def test_returns_none_without_page_model() -> None:
    assert parse_detail_postcode("") is None
    assert parse_detail_postcode("<html><body>no model</body></html>") is None
    # Malformed JSON in the data field degrades gracefully.
    broken = '<script>window.__PAGE_MODEL = {"data":"[not json"};</script>'
    assert parse_detail_postcode(broken) is None
--- a/finder/test_transform.py
+++ b/finder/test_transform.py
@ -1,13 +1,19 @@
 from transform import (
    build_register_address,
    clean_listing_address,
    extract_full_postcode,
    extract_outcode,
    resolve_listing_postcode,
    transform_property,
 )
 class StubPostcodeIndex:
    def __init__(self, postcode: str = "SW1A 9ZZ") -> None:
        self._postcode = postcode
    def nearest(self, lat: float, lng: float) -> str:
-        return "SW1A 9ZZ"
+        return self._postcode
 def test_extract_full_postcode_normalizes_spacing() -> None:
@ -24,6 +30,46 @@ def test_clean_listing_address_removes_postcode_and_outcode_suffixes() -> None:
    assert clean_listing_address("Kings Avenue, Bromley") == "Kings Avenue, Bromley"
 def test_build_register_address_prepends_house_number_or_name() -> None:
    # House number/name prepended, with the trailing outcode/postcode stripped.
    assert (
        build_register_address("South Street, Bromley BR1", "12")
        == "12, South Street, Bromley"
    )
    assert (
        build_register_address("Riverside, Martham NR29", "Martham Mill")
        == "Martham Mill, Riverside, Martham"
    )
    # No number/name -> identical to the plain cleaned address.
    assert build_register_address("Kings Avenue, Bromley", None) == "Kings Avenue, Bromley"
    # Already starts with the number/name -> no duplication.
    assert (
        build_register_address("12 South Street, Bromley", "12")
        == "12 South Street, Bromley"
    )
    # Empty/whitespace number/name is ignored.
    assert build_register_address("Kings Avenue, Bromley", "  ") == "Kings Avenue, Bromley"
 def test_extract_outcode() -> None:
    assert extract_outcode("SW1A 2AA") == "SW1A"
    assert extract_outcode("n4 2ha") == "N4"
    assert extract_outcode("SW1A2AA") == "SW1A"
    assert extract_outcode(None) is None
    assert extract_outcode("") is None
 def test_resolve_listing_postcode() -> None:
    # Outcode matches -> trust the more precise extracted postcode.
    assert resolve_listing_postcode("SW1A 2AA", "SW1A 9ZZ") == ("SW1A 2AA", "address")
    # Outcode mismatch -> fall back to the spatially-correct inferred postcode.
    assert resolve_listing_postcode("E14 9SS", "SW1A 9ZZ") == ("SW1A 9ZZ", "coordinates")
    # Well-formed but fabricated postcode in a different outcode is rejected.
    assert resolve_listing_postcode("ZZ9 9ZZ", "SW1A 9ZZ") == ("SW1A 9ZZ", "coordinates")
    # No extracted postcode -> inferred is authoritative.
    assert resolve_listing_postcode(None, "SW1A 9ZZ") == ("SW1A 9ZZ", "coordinates")
 def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
    prop = {
        "id": "123",
@ -46,3 +92,84 @@ def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
    assert result["Inferred postcode"] == "SW1A 9ZZ"
    assert result["Listing raw address"] == "Flat 2, 10 Downing Street, SW1A 2AA"
    assert result["Address per Property Register"] == "Flat 2, 10 Downing Street"
 def test_rightmove_transform_rejects_postcode_from_wrong_outcode() -> None:
    """A display-address postcode in another outcode must not override the location."""
    listing = {
        "id": "124",
        "location": {"latitude": 51.5, "longitude": -0.1},
        "price": {"amount": 750000, "displayPrices": []},
        "propertySubType": "Terraced",
        "bedrooms": 3,
        "bathrooms": 1,
        "keyFeatures": [],
        "propertyUrl": "/properties/124",
        # Address postcode is in a different outcode than the coordinate-nearest one.
        "displayAddress": "10 Downing Street, E14 9SS",
    }
    row = transform_property(listing, "SW1A", StubPostcodeIndex())
    assert row is not None
    # The spatially-correct inferred postcode wins over the mismatching
    # extracted one, which is still reported in its own column.
    observed = (row["Postcode"], row["Postcode source"], row["Extracted postcode"])
    assert observed == ("SW1A 9ZZ", "coordinates", "E14 9SS")
 def _rightmove_prop() -> dict:
    return {
        "id": "200",
        "location": {"latitude": 51.5, "longitude": -0.1},
        "price": {"amount": 750000, "displayPrices": []},
        "propertySubType": "Terraced",
        "bedrooms": 3,
        "bathrooms": 1,
        "keyFeatures": [],
        "propertyUrl": "/properties/200",
        # Search API only ever exposes the outcode in the display address.
        "displayAddress": "Caldwell Street, Stockwell, SW9",
    }
 def test_rightmove_transform_prefers_detail_postcode() -> None:
    """A detail-page postcode in the location's outcode beats the coordinate guess."""
    nearest_index = StubPostcodeIndex("SW9 7AA")
    row = transform_property(
        _rightmove_prop(), "SW9", nearest_index, detail_postcode="SW9 0HD"
    )
    assert row is not None
    # The detail page's true full postcode (same outcode as the location) is
    # preferred over the coordinate-nearest guess.
    assert (row["Postcode"], row["Postcode source"]) == ("SW9 0HD", "detail_address")
    # The coordinate inference is still surfaced separately.
    assert row["Inferred postcode"] == "SW9 7AA"
 def test_rightmove_transform_rejects_detail_postcode_from_wrong_outcode() -> None:
    """A detail postcode from another outcode must not relocate the listing."""
    nearest_index = StubPostcodeIndex("SW9 7AA")
    row = transform_property(
        _rightmove_prop(), "SW9", nearest_index, detail_postcode="E14 9SS"
    )
    assert row is not None
    # The coordinate postcode wins instead of the disagreeing detail value.
    assert (row["Postcode"], row["Postcode source"]) == ("SW9 7AA", "coordinates")
 def test_rightmove_transform_without_detail_keeps_coordinate_logic() -> None:
    """Omitting the detail postcode leaves the coordinate-nearest behaviour intact."""
    nearest_index = StubPostcodeIndex("SW9 7AA")
    row = transform_property(_rightmove_prop(), "SW9", nearest_index)
    assert row is not None
    # No detail postcode -> behaviour is unchanged (coordinate-nearest).
    assert (row["Postcode"], row["Postcode source"]) == ("SW9 7AA", "coordinates")
--- a/finder/test_zoopla.py
+++ b/finder/test_zoopla.py
@ -0,0 +1,288 @@
 from zoopla import _detail_cache_key, parse_detail_geo, transform_property
 def test_detail_cache_key_uses_listing_id() -> None:
    """The cache key is the numeric listing id for relative and absolute URLs."""
    for detail_url in (
        "/for-sale/details/59888978/",
        "https://www.zoopla.co.uk/for-sale/details/59888978/",
    ):
        assert _detail_cache_key(detail_url) == "59888978"
    # No id in the URL -> fall back to the URL itself as the key.
    search_url = "/for-sale/property/br1/"
    assert _detail_cache_key(search_url) == search_url
 class StubPostcodeIndex:
    """Test double for the spatial index: every lookup yields one canned postcode."""
    def __init__(self, postcode: str = "BR1 2AB") -> None:
        # Remember the canned answer handed back by nearest().
        self._canned = postcode
    def nearest(self, lat: float, lng: float) -> str:
        """Return the canned postcode; the coordinates are deliberately ignored."""
        return self._canned
 # Postcode -> (latitude, longitude) lookup handed to transform_property in the
 # tests below: two London-ish postcodes plus the Norfolk sample used by the
 # verified detail-page snippet (well inside the England bounds check).
 PC_COORDS = dict(
    [
        ("BR1 2AB", (51.40, 0.01)),
        ("SW1A 1AA", (51.50, -0.14)),
        ("NR29 4RG", (52.716014, 1.614495)),
    ]
 )
 # Verified RSC `location` object (listing 59888978), as it appears escaped inside
 # a self.__next_f flight chunk in page.content(). Every inner quote is written
 # as \" (backslash + quote), so this fixture exercises the parser's unescaping
 # step; the "..." runs stand in for the surrounding flight payload.
 _LOCATION_ESCAPED = (
    '<script>self.__next_f.push([1,"...'
    '\\"location\\":{\\"outcode\\":\\"NR29\\",'
    '\\"coordinates\\":{\\"latitude\\":52.716014,\\"longitude\\":1.614495},'
    '\\"uprn\\":\\"10023461458\\",\\"postalCode\\":\\"NR29 4RG\\",'
    '\\"propertyNumberOrName\\":\\"Martham Mill\\"}'
    '..."])</script>'
 )
 def test_parse_detail_geo_location_object_escaped() -> None:
    """An escaped flight-chunk `location` object parses into the full geo dict."""
    expected = {
        "lat": 52.716014,
        "lng": 1.614495,
        "postcode": "NR29 4RG",
        "outcode": "NR29",
        "source": "detail_location",
        "uprn": "10023461458",
        "number_or_name": "Martham Mill",
        # No `address` twin in this snippet, so there is no full street address.
        "full_address": None,
    }
    parsed = parse_detail_geo(_LOCATION_ESCAPED, search_outcode="NR29")
    assert parsed == expected
 def test_parse_detail_geo_location_object_unescaped() -> None:
    """A plain (unescaped) `location` object is parsed just the same."""
    fragments = [
        '"location":{"outcode":"NR29",',
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},',
        '"uprn":"10023461458","postalCode":"NR29 4RG"}',
    ]
    parsed = parse_detail_geo("".join(fragments))
    assert parsed is not None
    assert (parsed["source"], parsed["postcode"]) == ("detail_location", "NR29 4RG")
 def test_parse_detail_geo_address_twin() -> None:
    """With only the `address` twin present, its fields populate the result."""
    fragments = [
        '"address":{"fullAddress":"Riverside, Martham NR29",',
        '"latitude":52.716014,"longitude":1.614495,',
        '"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}',
    ]
    parsed = parse_detail_geo("".join(fragments))
    assert parsed is not None
    assert parsed["source"] == "detail_address_obj"
    position = (parsed["lat"], parsed["lng"], parsed["postcode"])
    assert position == (52.716014, 1.614495, "NR29 4RG")
    identity = (parsed["uprn"], parsed["full_address"])
    assert identity == ("10023461458", "Riverside, Martham NR29")
 def test_parse_detail_geo_merges_location_uprn_with_address_full_address() -> None:
    """The twin's fullAddress is attached when it shares the location's uprn."""
    # Real detail pages carry both wrappers: the `location` object holds the
    # uprn + house number/name, the `address` twin holds the full street
    # address.
    location_obj = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG",'
        '"propertyNumberOrName":"Martham Mill"}'
    )
    address_twin = (
        '"address":{"fullAddress":"Riverside, Martham NR29",'
        '"latitude":52.716014,"longitude":1.614495,'
        '"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}'
    )
    parsed = parse_detail_geo(location_obj + address_twin)
    assert parsed is not None
    expected = {
        "source": "detail_location",
        "uprn": "10023461458",
        "number_or_name": "Martham Mill",
        "full_address": "Riverside, Martham NR29",
    }
    for field, value in expected.items():
        assert parsed[field] == value
 def test_parse_detail_geo_does_not_borrow_comparable_full_address() -> None:
    """An `address` twin with a different uprn never supplies the full address."""
    # The only `address` twin on the page belongs to a different uprn (a
    # comparable listing). With a uprn to match on, an unrelated twin is never
    # borrowed — full_address stays None rather than grabbing the wrong street.
    own_location = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    foreign_twin = (
        '"address":{"fullAddress":"Some Comparable, Elsewhere EN2",'
        '"latitude":51.65,"longitude":-0.08,"uprn":"99999999"}'
    )
    parsed = parse_detail_geo(own_location + foreign_twin)
    assert parsed is not None
    assert parsed["uprn"] == "10023461458"
    assert parsed["full_address"] is None
 def test_parse_detail_geo_ignores_poi_coordinates() -> None:
    """Loose POI coordinates ahead of the property's wrapper are not picked up."""
    # A charger POI (its coordinates NOT wrapped in a "location" object) followed
    # by the property's own "location" wrapper. Anchoring on the wrapper means
    # the POI's coordinates are ignored and the property's are returned.
    charger_poi = (
        '"name":"Martham Community Centre","numberOfConnectors":2,'
        '"postcode":"NR29 4SN","coordinates":{"latitude":52.699379,"longitude":1.62921}'
    )
    own_location = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    parsed = parse_detail_geo(charger_poi + own_location)
    assert parsed is not None
    assert parsed["source"] == "detail_location"
    # The property's coords win, not the community centre's.
    assert (parsed["lat"], parsed["lng"]) == (52.716014, 1.614495)
    assert parsed["postcode"] == "NR29 4RG"
 def test_parse_detail_geo_prefers_location_matching_search_outcode() -> None:
    """The search outcode picks among several `location` objects; else first wins."""
    # Page embeds two location objects (e.g. a comparable then the property).
    comparable = (
        '"location":{"outcode":"EN2",'
        '"coordinates":{"latitude":51.65,"longitude":-0.08},'
        '"postalCode":"EN2 6SN"}'
    )
    target = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"postalCode":"NR29 4RG"}'
    )
    page = comparable + target
    # With a search outcode, the object in that outcode is preferred.
    hinted = parse_detail_geo(page, search_outcode="NR29")
    assert hinted is not None and hinted["postcode"] == "NR29 4RG"
    # Without one, the first (document order = primary listing) is returned.
    unhinted = parse_detail_geo(page)
    assert unhinted is not None and unhinted["postcode"] == "EN2 6SN"
 def test_parse_detail_geo_rejects_out_of_england() -> None:
    """Coordinates outside the England bounds make the whole object unusable."""
    far_away = "".join(
        [
            '"location":{"outcode":"NR29",',
            '"coordinates":{"latitude":10.0,"longitude":10.0},',
            '"uprn":"1","postalCode":"NR29 4RG"}',
        ]
    )
    assert parse_detail_geo(far_away) is None
 def test_parse_detail_geo_drops_inconsistent_postcode() -> None:
    """A postalCode contradicting the object's own outcode is discarded."""
    # postalCode outcode (AB12) disagrees with the object's own outcode (NR29):
    # keep the coordinates, drop the untrustworthy postcode.
    conflicted = "".join(
        [
            '"location":{"outcode":"NR29",',
            '"coordinates":{"latitude":52.716014,"longitude":1.614495},',
            '"uprn":"1","postalCode":"AB12 3CD"}',
        ]
    )
    parsed = parse_detail_geo(conflicted)
    assert parsed is not None
    assert parsed["lat"] == 52.716014
    assert parsed["postcode"] is None
 def test_parse_detail_geo_returns_none_for_garbage() -> None:
    """Inputs with no property location/address wrapper parse to None."""
    unusable_pages = (
        "<html><body>no data here</body></html>",
        "",
        # Coordinates that are not inside a property location/address wrapper
        # (e.g. only an unwrapped POI) yield nothing — safe degradation to the
        # outcode.
        '"name":"X","coordinates":{"latitude":51.5,"longitude":-0.1}',
    )
    for page in unusable_pages:
        assert parse_detail_geo(page) is None
 def _raw(**overrides) -> dict:
    raw = {
        "id": "123",
        "url": "/for-sale/details/123/",
        "address": "South Street, Bromley BR1",
        "price": 500000,
        "beds": 2,
        "baths": 1,
        "property_type": "Flat",
    }
    raw.update(overrides)
    return raw
 def test_transform_uses_detail_coordinates_with_agreeing_postcode() -> None:
    """Detail coordinates and postcode are both used when the outcodes agree."""
    detail = {"lat": 51.401, "lng": 0.011, "postcode": "BR1 3CD", "outcode": "BR1"}
    nearest_index = StubPostcodeIndex("BR1 2AB")
    row = transform_property(
        _raw(), nearest_index, PC_COORDS, search_outcode="BR1", detail=detail
    )
    assert row is not None
    # Extracted detail postcode agrees with the coordinate-nearest outcode -> trusted.
    assert (row["Postcode"], row["Postcode source"]) == ("BR1 3CD", "detail_address")
    assert row["Inferred postcode"] == "BR1 2AB"
    assert (row["lat"], row["lon"]) == (51.401, 0.011)
 def test_transform_uses_nearest_when_detail_postcode_mismatches() -> None:
    """A detail postcode from another outcode yields to the nearest postcode."""
    detail = {"lat": 51.401, "lng": 0.011, "postcode": "E14 9SS", "outcode": "E14"}
    nearest_index = StubPostcodeIndex("BR1 2AB")
    row = transform_property(
        _raw(), nearest_index, PC_COORDS, search_outcode="BR1", detail=detail
    )
    assert row is not None
    # Mismatching detail postcode is rejected in favour of the spatial value.
    assert (row["Postcode"], row["Postcode source"]) == ("BR1 2AB", "detail_coordinates")
 def test_transform_geocodes_detail_postcode_without_coordinates() -> None:
    """Without detail coordinates, the detail postcode is geocoded via PC_COORDS."""
    detail = {"lat": None, "lng": None, "postcode": "SW1A 1AA", "outcode": "SW1A"}
    row = transform_property(
        _raw(), StubPostcodeIndex(), PC_COORDS, search_outcode="BR1", detail=detail
    )
    assert row is not None
    assert (row["Postcode"], row["Postcode source"]) == ("SW1A 1AA", "detail_address")
    assert (row["lat"], row["lon"]) == PC_COORDS["SW1A 1AA"]
 def test_transform_without_detail_falls_back_to_search_outcode() -> None:
    """With no detail and no outcode in the address, the search outcode is used."""
    # No detail, address has no recognizable outcode -> coarse search-outcode centroid.
    card = _raw(address="A street with no postcode")
    row = transform_property(
        card, StubPostcodeIndex(), PC_COORDS, search_outcode="BR1", detail=None
    )
    assert row is not None
    assert (row["Postcode"], row["Postcode source"]) == ("BR1 2AB", "search_outcode")
    # No detail page -> no UPRN / house number recovered.
    for column in ("UPRN", "Property number or name"):
        assert row[column] is None
 def test_transform_emits_uprn_and_house_numbered_address_from_detail() -> None:
    """Detail uprn, house number and full address flow into the output row."""
    detail = {
        "lat": 51.401,
        "lng": 0.011,
        "postcode": "BR1 3CD",
        "outcode": "BR1",
        "uprn": "100023461458",
        "number_or_name": "12",
        "full_address": "South Street, Bromley BR1",
    }
    nearest_index = StubPostcodeIndex("BR1 2AB")
    row = transform_property(
        _raw(), nearest_index, PC_COORDS, search_outcode="BR1", detail=detail
    )
    assert row is not None
    expected_columns = {
        "UPRN": "100023461458",
        "Property number or name": "12",
        # The detail full address replaces the outcode-level card address, and
        # the house number is prepended for a near-exact Property Register match.
        "Listing raw address": "South Street, Bromley BR1",
        "Address per Property Register": "12, South Street, Bromley",
    }
    for column, value in expected_columns.items():
        assert row[column] == value
 def test_transform_ignores_out_of_england_detail_coords() -> None:
    """Detail coordinates outside England are dropped, not emitted."""
    detail = {"lat": 10.0, "lng": 10.0, "postcode": "ZZ9 9ZZ", "outcode": "ZZ9"}
    nearest_index = StubPostcodeIndex("BR1 2AB")
    row = transform_property(
        _raw(), nearest_index, PC_COORDS, search_outcode="BR1", detail=detail
    )
    assert row is not None
    # Bad detail coords are discarded; falls through to the address outcode (BR1).
    assert row["Postcode source"] == "address_outcode"
    latitude = row["lat"]
    assert 49 <= latitude <= 56
--- a/finder/transform.py
+++ b/finder/transform.py
@ -205,6 +205,41 @@ def extract_full_postcode(text: str | None) -> str | None:
    return normalize_postcode(match.group(1))
 def extract_outcode(postcode: str | None) -> str | None:
    """Return the outward code (district) of a UK postcode, e.g. 'SW1A 1AA' → 'SW1A'.

    A missing or empty postcode, or one whose normalized form has nothing
    before the first space, yields ``None``.
    """
    if postcode:
        # The outward code is everything before the first space of the
        # normalized postcode.
        outward, _, _ = normalize_postcode(postcode).partition(" ")
        if outward:
            return outward
    return None
 def resolve_listing_postcode(
    extracted_postcode: str | None, inferred_postcode: str
 ) -> tuple[str, str]:
    """Choose a listing's authoritative postcode and say where it came from.

    The postcode extracted from the address is more precise than the
    coordinate-nearest one, but a stale, mistyped or well-formed-but-fabricated
    value (e.g. 'ZZ9 9ZZ') must not silently override the spatially-correct
    postcode. It is therefore accepted only when its outcode equals the
    inferred postcode's outcode.

    Returns ``(extracted_postcode, "address")`` when the outcodes match,
    otherwise ``(inferred_postcode, "coordinates")``.
    """
    spatial_choice = (inferred_postcode, "coordinates")
    if not extracted_postcode:
        # Nothing extracted: the inferred postcode is the only candidate.
        return spatial_choice
    if extract_outcode(extracted_postcode) == extract_outcode(inferred_postcode):
        return extracted_postcode, "address"
    log.debug(
        "Rejecting extracted postcode %s (outcode mismatch with inferred %s)",
        extracted_postcode,
        inferred_postcode,
    )
    return spatial_choice
 def clean_listing_address(address: str | None) -> str:
    """Remove postcode/outcode suffixes from listing display addresses.
@ -222,10 +257,48 @@ def clean_listing_address(address: str | None) -> str:
    return cleaned.strip(" ,")
 def build_register_address(
    raw_address: str | None, number_or_name: str | None = None
 ) -> str:
    """Build a Property Register-style address, prepending the house number/name.

    Listing display addresses are usually street-level ("South Street, Bromley")
    because the portals hide the exact unit. When a scraper can recover the
    property's own number or name (e.g. Zoopla detail pages expose
    ``propertyNumberOrName`` = "12" or "Martham Mill"), prepend it so the address
    carries the house identifier that the EPC/Price-Paid register addresses also
    use — turning a fuzzy street match into a near-exact one.

    Args:
        raw_address: The listing's display address; it is passed through
            ``clean_listing_address`` first.
        number_or_name: The house number or name, if known. ``None``, empty and
            whitespace-only values are ignored.

    Returns:
        ``"<number_or_name>, <cleaned address>"``; the cleaned address alone
        when there is no number/name or the address already starts with it; or
        the number/name alone when the cleaned address is empty.
    """
    cleaned = clean_listing_address(raw_address)
    if not number_or_name:
        return cleaned
    number_or_name = number_or_name.strip()
    if not number_or_name:
        return cleaned
    # Avoid duplicating a number/name the display address already starts with.
    # A bare prefix match is not enough: "1" is a prefix of "12 South Street"
    # and "Mill" of "Millfield Road", yet neither address contains that house
    # identifier. Only treat it as a duplicate when the match ends at a token
    # boundary, i.e. the next character does not continue the same run of
    # digits or letters ("12" still matches "12 South Street" and "12a ...").
    lowered = cleaned.lower()
    prefix = number_or_name.lower()
    if lowered.startswith(prefix):
        following = lowered[len(prefix) : len(prefix) + 1]
        last = prefix[-1]
        continues_token = bool(following) and (
            (last.isdigit() and following.isdigit())
            or (last.isalpha() and following.isalpha())
        )
        if not continues_token:
            return cleaned
    return f"{number_or_name}, {cleaned}" if cleaned else number_or_name
 def transform_property(
-    prop: dict, outcode: str, pc_index: PostcodeSpatialIndex
+    prop: dict,
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    detail_postcode: str | None = None,
 ) -> dict | None:
-    """Transform a raw Rightmove property dict into our output schema."""
+    """Transform a raw Rightmove property dict into our output schema.
    ``detail_postcode`` is the property's TRUE full postcode recovered from its
    detail page (see ``rightmove.parse_detail_postcode``); the search API itself
    only exposes the outcode-level ``displayAddress``. When supplied and it
    agrees with the coordinate-nearest postcode's outcode, it is preferred over
    the coordinate guess and recorded with source ``"detail_address"``. A
    detail postcode whose outcode disagrees with the location is discarded in
    favour of the spatially-correct coordinate postcode, so a stale or wrong
    detail value can never silently relocate a listing.
    """
    loc = prop.get("location")
    if not loc:
        return None
@ -268,8 +341,25 @@ def transform_property(
        return None
    raw_address = prop.get("displayAddress", "") or ""
    extracted_postcode = extract_full_postcode(raw_address)
-    postcode = extracted_postcode or inferred_postcode
+
-    postcode_source = "address" if extracted_postcode else "coordinates"
+    # Prefer the detail page's true full postcode when it agrees with the
    # location; otherwise fall back to the (display-address-or-coordinate) logic.
    detail_full = extract_full_postcode(detail_postcode)
    if detail_full and extract_outcode(detail_full) == extract_outcode(
        inferred_postcode
    ):
        postcode, postcode_source = detail_full, "detail_address"
    else:
        if detail_full:
            log.debug(
                "Rejecting Rightmove detail postcode %s (outcode mismatch with "
                "inferred %s)",
                detail_full,
                inferred_postcode,
            )
        postcode, postcode_source = resolve_listing_postcode(
            extracted_postcode, inferred_postcode
        )
    property_url = prop.get("propertyUrl") or ""
    if not isinstance(property_url, str):
@ -291,6 +381,9 @@ def transform_property(
        "Inferred postcode": inferred_postcode,
        "Listing raw address": raw_address,
        "Address per Property Register": clean_listing_address(raw_address),
        # Rightmove's displayAddress is street-level; no UPRN/house number.
        "UPRN": None,
        "Property number or name": None,
        "Leasehold/Freehold": extract_tenure(prop.get("tenure")),
        "Property type": map_property_type(sub_type),
        "Property sub-type": normalize_sub_type(sub_type),
--- a/finder/zoopla.py
+++ b/finder/zoopla.py
@ -32,16 +32,24 @@ import httpx
 from constants import (
    DATA_DIR,
    DELAY_BETWEEN_PAGES,
    GLUETUN_API_KEY,
    GLUETUN_CONTROL_URL,
    GLUETUN_MAX_ROTATIONS,
    GLUETUN_PROXY,
    MAX_BEDROOMS,
    PROPERTY_TYPE_MAP,
    ZOOPLA_BASE,
    ZOOPLA_DETAIL_GOTO_TIMEOUT_MS,
 )
 from spatial import PostcodeSpatialIndex
 from transform import (
-    clean_listing_address,
+    build_register_address,
    extract_full_postcode,
    extract_outcode,
    fix_coords,
    normalize_sub_type,
    parse_int_value,
    resolve_listing_postcode,
    validate_floor_area,
 )
@ -468,27 +476,20 @@ def _challenge_timeout_seconds() -> int:
 # cookies (bound to the previous IP), then reload and re-check the challenge.
 _GLUETUN_API_KEY = "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K"
 def _gluetun_base_url() -> str:
-    return os.environ.get("GLUETUN_URL", "http://gluetun:8000").rstrip("/")
+    return GLUETUN_CONTROL_URL.rstrip("/")
 def _gluetun_api_key() -> str | None:
-    return _GLUETUN_API_KEY
+    return GLUETUN_API_KEY
 def _gluetun_max_rotations() -> int:
-    raw = os.environ.get("GLUETUN_MAX_ROTATIONS", "3")
+    return max(GLUETUN_MAX_ROTATIONS, 0)
    try:
        value = int(raw)
    except ValueError as exc:
        raise ValueError("GLUETUN_MAX_ROTATIONS must be an integer") from exc
    return max(value, 0)
 def _gluetun_client() -> httpx.Client:
    # Talks to the control server directly (not through the VPN proxy).
    headers = {}
    api_key = _gluetun_api_key()
    if api_key:
@ -694,10 +695,19 @@ def launch_browser():
    profile_dir.mkdir(parents=True, exist_ok=True)
    _remove_stale_profile_locks(profile_dir)
    # Route the browser through the Gluetun VPN proxy when configured. (geoip
    # fingerprint alignment is intentionally not enabled: it needs the optional
    # camoufox[geoip] extra and would spoof to the VPN exit's country, which
    # fights the en-GB locale unless the exit is in the UK.)
    proxy_options: dict = {}
    if GLUETUN_PROXY:
        proxy_options = {"proxy": {"server": GLUETUN_PROXY}}
    log.info(
-        "Launching Camoufox browser for Zoopla (headless=%s, profile=%s)...",
+        "Launching Camoufox browser for Zoopla (headless=%s, profile=%s, proxy=%s)...",
        headless_mode,
        profile_dir,
        GLUETUN_PROXY or "direct",
    )
    camoufox = Camoufox(
        headless=headless_mode,
@ -705,6 +715,7 @@ def launch_browser():
        user_data_dir=str(profile_dir),
        locale=["en-GB", "en"],
        enable_cache=True,
        **proxy_options,
    )
    raw_browser = camoufox.__enter__()
    browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
@ -926,13 +937,47 @@ def _paginate(
    page,
    total_results: int,
    max_properties: int | None = None,
    fetch_detail=None,
    detail_cap: int = 0,
    detail_state: dict | None = None,
    detail_deadline: float | None = None,
 ) -> list[dict]:
    """Extract listings from all pages of search results.
    Page 1 is already loaded. For subsequent pages, follow Zoopla's rendered
    next link when present, otherwise advance via the pn=N URL parameter while
-    the advertised result count says more listings remain."""
+    the advertised result count says more listings remain.
    When ``fetch_detail`` is supplied, each listing has its detail page fetched
    (up to ``detail_cap`` fresh loads per outcode, counted in the shared
    ``detail_state`` dict, and only until ``detail_deadline``) and the parsed
    geo stored under ``listing['_detail']`` for ``transform_property``. The
    detail page is the only source of the listing's UPRN, full street address
    and precise postcode, so it is fetched even when the search card already
    pins a full postcode. Cached detail results are always attached but cost
    neither a cap slot nor a delay."""
    def _maybe_fetch(listing: dict) -> None:
        if fetch_detail is None or detail_state is None:
            return
        url = listing.get("url", "")
        cached = _detail_cache_key(url) in _detail_cache
        if not cached:
            # Fresh loads are bounded by the per-outcode cap and the wall-clock
            # deadline so detail fetching never starves the SIGALRM budget that
            # also guards the search pagination for this outcode.
            if detail_state["fetched"] >= detail_cap:
                return
            if detail_deadline is not None and time.monotonic() >= detail_deadline:
                return
        listing["_detail"] = fetch_detail(url)
        if not cached:
            detail_state["fetched"] += 1
            time.sleep(DELAY_BETWEEN_PAGES)
    all_listings = _extract_listings(page)
    for listing in all_listings:
        _maybe_fetch(listing)
    if max_properties is not None and len(all_listings) >= max_properties:
        return all_listings[:max_properties]
@ -984,6 +1029,7 @@ def _paginate(
            if listing["id"] not in seen_ids:
                seen_ids.add(listing["id"])
                all_listings.append(listing)
                _maybe_fetch(listing)
                new_count += 1
                if max_properties is not None and len(all_listings) >= max_properties:
                    return all_listings[:max_properties]
@ -1053,6 +1099,214 @@ def _extract_outcode(text: str) -> str | None:
    return None
 # ---------------------------------------------------------------------------
 # Detail-page geocoding
 # ---------------------------------------------------------------------------
 #
 # Zoopla search result cards only expose an outcode-level display address (e.g.
 # "South Street, Bromley BR1"); the full postcode and precise coordinates exist
 # only on each listing's detail page (/for-sale/details/{id}/). The detail page
 # is a Next.js App Router route whose React Server Components flight stream
 # embeds the property's own location object, e.g.
 #   "location":{"outcode":"NR29","coordinates":{"latitude":52.716,"longitude":1.614},
 #               "uprn":"10023461458","postalCode":"NR29 4RG",...}
 # plus a twin "address":{"fullAddress":...,"latitude":...,"longitude":...,
 #               "outcode":...,"postcode":...,"uprn":...} feeding the map widgets.
 # Nearby points of interest (stations, schools, EV chargers) and comparable
 # listings carry their own "coordinates" too, but never inside the property's
 # own "location" / "address":{"fullAddress" wrapper — so the wrapper, not a
 # loose coordinates object, is what we anchor on (see parse_detail_geo).
 # listingId -> parsed detail dict (or None). Failures are cached too, so a
 # broken listing is not re-fetched within a run (the same listing reappears
 # across overlapping outcode searches).
 _detail_cache: dict[str, dict | None] = {}
 # Captures the numeric id from a "/details/{id}" URL path segment (the trailing
 # slash is optional).
 _LISTING_ID_RE = re.compile(r"/details/(\d+)/?")
 # The property's own location is carried by a `"location":{...}` wrapper and a
 # twin `"address":{"fullAddress":...}` widget object. We anchor on those
 # wrappers (and capture their full object body, which contains exactly one
 # nested object — `coordinates`) rather than scanning for loose coordinate
 # objects: nearby points of interest (stations/schools/EV chargers) and
 # comparable/"similar" listings also embed coordinates, but never inside the
 # property's own `"location"` / `"address":{"fullAddress"` wrapper, so the
 # wrapper is the discriminator. Field order and an optional `uprn` are tolerated.
 # NOTE: the body pattern accepts brace-free text plus nested objects that are
 # themselves brace-free, so a body nested two levels deep would not match.
 # Group 1 is the object body.
 _DETAIL_LOCATION_RE = re.compile(r'"location":\{((?:[^{}]|\{[^{}]*\})*)\}')
 # Group 1 is the fullAddress string (which must be the object's first key),
 # group 2 the remainder of the object body.
 _DETAIL_ADDRESS_RE = re.compile(r'"address":\{"fullAddress":"([^"]*)"((?:[^{}]|\{[^{}]*\})*)\}')
 # Latitude/longitude inside a nested `coordinates` object. Both numbers must
 # contain a decimal point, so integer-valued coordinates do not match.
 _DETAIL_COORDS_IN_BODY_RE = re.compile(
    r'"coordinates":\{"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)\}'
 )
 # Adjacent latitude/longitude keys with no `coordinates` wrapper; the same
 # decimal-point requirement applies.
 _DETAIL_LATLNG_IN_BODY_RE = re.compile(
    r'"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)'
 )
 # Matches upper-case letters and digits only; a lower-case outcode would not match.
 _DETAIL_OUTCODE_IN_BODY_RE = re.compile(r'"outcode":"([A-Z0-9]+)"')
 # The location object spells it "postalCode"; the address twin uses "postcode".
 _DETAIL_POSTCODE_IN_BODY_RE = re.compile(r'"(?:postalCode|postcode)":"([A-Z0-9 ]+)"')
 # The UPRN (Unique Property Reference Number) appears in both the location and
 # address objects and is the linchpin for an exact listing->EPC join (EPC open
 # data is ~99% UPRN-keyed). propertyNumberOrName carries the house number/name
 # (e.g. "12", "Martham Mill") only in the location object.
 _DETAIL_UPRN_IN_BODY_RE = re.compile(r'"uprn":"(\d+)"')
 _DETAIL_NUMBER_OR_NAME_IN_BODY_RE = re.compile(r'"propertyNumberOrName":"([^"]*)"')
 def parse_detail_geo(html: str, search_outcode: str | None = None) -> dict | None:
    """Extract the property's own coordinates/postcode from a Zoopla detail page.
    Pure and browser-free: the live browser only produces the HTML string
    (``page.content()``); this does the parsing so it is unit-testable.
    Returns ``{"lat", "lng", "postcode", "outcode", "source", "uprn",
    "number_or_name", "full_address"}`` (every field except the coordinates may
    be ``None``) or ``None`` when no property location wrapper is found. The
    ``uprn`` enables an exact listing->EPC join; ``number_or_name`` (house
    number/name) and ``full_address`` give a register-style address for the
    Price Paid join.
    Coordinates are bounds-checked to England and a postcode is kept only when
    it agrees with its own object's outcode. ``search_outcode``, when given, is
    used only as a tie-break to pick the right ``location`` object on pages that
    also embed comparable listings. See module docstring for the data model."""
    if not html:
        return None
    # RSC flight strings are embedded as escaped JS string literals, so quotes
    # and slashes arrive escaped; normalize them so the regexes match.
    buf = html.replace('\\"', '"').replace("\\u002F", "/").replace("\\/", "/")
    def in_england(lat: float, lng: float) -> tuple[float, float] | None:
        lat, lng = fix_coords(lat, lng)
        if 49 <= lat <= 56 and -7 <= lng <= 2:
            return lat, lng
        return None
    def build(body: str, coords, source: str, full_address: str | None = None) -> dict:
        # outcode and postcode are read from the SAME object body as the coords,
        # so the postcode is self-consistent; drop it only if it somehow isn't.
        outcode_match = _DETAIL_OUTCODE_IN_BODY_RE.search(body)
        outcode = outcode_match.group(1) if outcode_match else None
        postcode_match = _DETAIL_POSTCODE_IN_BODY_RE.search(body)
        postcode = extract_full_postcode(postcode_match.group(1)) if postcode_match else None
        if postcode and outcode and extract_outcode(postcode) != outcode.upper():
            postcode = None
        uprn_match = _DETAIL_UPRN_IN_BODY_RE.search(body)
        number_match = _DETAIL_NUMBER_OR_NAME_IN_BODY_RE.search(body)
        number_or_name = number_match.group(1).strip() if number_match else None
        return {
            "lat": coords[0],
            "lng": coords[1],
            "postcode": postcode,
            "outcode": outcode,
            "source": source,
            "uprn": uprn_match.group(1) if uprn_match else None,
            "number_or_name": number_or_name or None,
            "full_address": full_address,
        }
    def attach_full_address(result: dict | None) -> dict | None:
        # The house-numbered street address lives in the `address` map-widget
        # twin, not the `location` wrapper we anchor coordinates on. Pull it from
        # the twin that shares this property's uprn; when there is no uprn to
        # disambiguate, fall back to the first twin (document order = primary
        # listing), but never guess a twin when a uprn exists and none matches —
        # that would risk grabbing a comparable listing's address.
        if result is None or result.get("full_address"):
            return result
        target = result.get("uprn")
        first = None
        for match in _DETAIL_ADDRESS_RE.finditer(buf):
            full_address = match.group(1) or None
            if full_address is None:
                continue
            if first is None:
                first = full_address
            uprn_match = _DETAIL_UPRN_IN_BODY_RE.search(match.group(2))
            if target and uprn_match and uprn_match.group(1) == target:
                result["full_address"] = full_address
                return result
        if target is None:
            result["full_address"] = first
        return result
    # Strategy 1 — the property's own `location` wrapper (authoritative). Take
    # the first match (the primary listing precedes any comparables in the
    # flight stream), but prefer one whose outcode matches the searched outcode.
    first_location = None
    for match in _DETAIL_LOCATION_RE.finditer(buf):
        body = match.group(1)
        coords_match = _DETAIL_COORDS_IN_BODY_RE.search(body)
        if not coords_match:
            continue
        coords = in_england(float(coords_match.group(1)), float(coords_match.group(2)))
        if not coords:
            continue
        candidate = build(body, coords, "detail_location")
        if first_location is None:
            first_location = candidate
        if (
            search_outcode
            and candidate["outcode"]
            and candidate["outcode"].upper() == search_outcode.upper()
        ):
            return attach_full_address(candidate)
    if first_location is not None:
        return attach_full_address(first_location)
    # Strategy 2 — the `address` map-widget twin (same coordinates, backup).
    for match in _DETAIL_ADDRESS_RE.finditer(buf):
        full_address = match.group(1) or None
        body = match.group(2)
        latlng_match = _DETAIL_LATLNG_IN_BODY_RE.search(body)
        if not latlng_match:
            continue
        coords = in_england(float(latlng_match.group(1)), float(latlng_match.group(2)))
        if coords:
            return build(body, coords, "detail_address_obj", full_address=full_address)
    return None
 def _detail_cache_key(listing_url: str) -> str:
    """Derive the detail-cache key for ``listing_url``.

    The first capture group of ``_LISTING_ID_RE`` is used when the pattern
    matches (the listing's numeric id); otherwise the URL itself is the key."""
    found = _LISTING_ID_RE.search(listing_url)
    if not found:
        return listing_url
    return found.group(1)
 def _fetch_listing_detail(
    detail_page,
    listing_url: str,
    search_outcode: str | None = None,
 ) -> dict | None:
    """Fetch one listing detail page and return its parsed geo dict, or None.

    The outcome -- success or failure -- is memoised in ``_detail_cache`` under
    the key from ``_detail_cache_key``, so a listing is only ever loaded once.
    Any ordinary navigation/parsing exception is logged at debug level and
    turned into a cached ``None`` so the caller can fall back to outcode-level
    resolution. ``TurnstileError`` is the exception to that rule: it is
    re-raised untouched (and nothing is cached) so a Cloudflare challenge still
    ends the run. ``goto`` uses ``ZOOPLA_DETAIL_GOTO_TIMEOUT_MS`` so a slow
    detail page cannot consume the per-outcode budget."""
    key = _detail_cache_key(listing_url)
    if key in _detail_cache:
        return _detail_cache[key]
    # Relative listing paths are resolved against the site root.
    if listing_url.startswith("http"):
        target = listing_url
    else:
        target = ZOOPLA_BASE + listing_url
    parsed: dict | None = None
    try:
        detail_page.goto(
            target,
            wait_until="domcontentloaded",
            timeout=ZOOPLA_DETAIL_GOTO_TIMEOUT_MS,
        )
        _ensure_not_challenged(detail_page)
        page_html = detail_page.content()
        parsed = parse_detail_geo(page_html, search_outcode=search_outcode)
    except TurnstileError:
        # Must escape before the cache write below: a challenge is not a result.
        raise
    except Exception as exc:
        log.debug("Zoopla detail fetch failed %s: %s", target, _exception_detail(exc))
        parsed = None
    _detail_cache[key] = parsed
    return parsed
 def _map_property_type(raw_type: str | None) -> str:
    """Map Zoopla property type text to canonical type."""
    if not raw_type:
@ -1109,28 +1363,64 @@ def transform_property(
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    search_outcode: str | None = None,
    detail: dict | None = None,
 ) -> dict | None:
    """Transform a raw Zoopla listing dict into the standard output schema.
-    Zoopla search cards do not include coordinates, so we resolve lat/lng
+    Zoopla search cards only expose an outcode-level address, so precise
-    from postcodes extracted from the address text."""
+    location comes from the listing's detail page (see ``parse_detail_geo`` /
    ``_fetch_listing_detail``), passed in as ``detail``. When detail-page
    coordinates are available we resolve the nearest postcode via the spatial
    index — mirroring rightmove/onthemarket — and only fall back to the coarse
    outcode centroid when no detail location could be obtained."""
    price = parse_int_value(raw.get("price")) or 0
    address = raw.get("address", "") or ""
    # Resolve postcode and coordinates from address
    extracted_postcode = extract_full_postcode(address)
-    postcode = extracted_postcode
+    detail = detail or {}
-    postcode_source = "address" if extracted_postcode else None
+    detail_postcode = extract_full_postcode(detail.get("postcode"))
    # Detail-page address fields: the UPRN keys an exact EPC join, and the
    # full street address / house number-or-name beat the outcode-level card
    # address for the Price-Paid join. All three are absent unless the detail
    # page was fetched, so every consumer must tolerate None.
    detail_uprn = detail.get("uprn") or None
    detail_full_address = detail.get("full_address") or None
    detail_number_or_name = detail.get("number_or_name") or None
    postcode = postcode_source = inferred_postcode = None
    lat = lng = None
-    if postcode:
+    # (A) Best: detail-page coordinates -> nearest postcode (authoritative).
-        coords = pc_coords.get(postcode)
+    detail_lat, detail_lng = detail.get("lat"), detail.get("lng")
-        if coords:
+    if detail_lat is not None and detail_lng is not None:
-            lat, lng = coords
+        fixed_lat, fixed_lng = fix_coords(detail_lat, detail_lng)
        if 49 <= fixed_lat <= 56 and -7 <= fixed_lng <= 2:
            nearest = pc_index.nearest(fixed_lat, fixed_lng)
            if nearest:
                lat, lng, inferred_postcode = fixed_lat, fixed_lng, nearest
                candidate = detail_postcode or extracted_postcode
                postcode, resolved_source = resolve_listing_postcode(candidate, nearest)
                postcode_source = (
                    "detail_address"
                    if resolved_source == "address"
                    else "detail_coordinates"
                )
    # (B) Detail-page postcode without usable coordinates -> geocode it.
    if lat is None and detail_postcode and detail_postcode in pc_coords:
        lat, lng = pc_coords[detail_postcode]
        postcode = inferred_postcode = detail_postcode
        postcode_source = "detail_address"
    # (C) Full postcode in the search-card address -> geocode it.
    if lat is None and extracted_postcode and extracted_postcode in pc_coords:
        lat, lng = pc_coords[extracted_postcode]
        postcode = extracted_postcode
        postcode_source = "address"
    # (D) Last resort: coarse outcode-level centroid (loses per-listing precision).
    if lat is None:
        # Try outcode-level fallback from address text
        addr_outcode = _extract_outcode(address)
        if addr_outcode:
            result = _resolve_outcode_coords(addr_outcode, pc_coords)
@ -1138,7 +1428,6 @@ def transform_property(
                postcode, lat, lng = result
                postcode_source = "address_outcode"
    # Final fallback: use the outcode we know we're searching
    if lat is None and search_outcode:
        result = _resolve_outcode_coords(search_outcode, pc_coords)
        if result:
@ -1188,9 +1477,17 @@ def transform_property(
        "Postcode": postcode,
        "Postcode source": postcode_source or "unknown",
        "Extracted postcode": extracted_postcode,
-        "Inferred postcode": postcode if postcode_source != "address" else None,
+        "Inferred postcode": (
-        "Listing raw address": address,
+            inferred_postcode
-        "Address per Property Register": clean_listing_address(address),
+            if inferred_postcode is not None
            else (postcode if postcode_source != "address" else None)
        ),
        "Listing raw address": detail_full_address or address,
        "Address per Property Register": build_register_address(
            detail_full_address or address, detail_number_or_name
        ),
        "UPRN": detail_uprn,
        "Property number or name": detail_number_or_name,
        "Leasehold/Freehold": raw.get("tenure") or None,
        "Property type": _map_property_type(raw.get("property_type")),
        "Property sub-type": normalize_sub_type(raw.get("property_type")),
@ -1215,6 +1512,9 @@ def search_outcode(
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    max_properties: int | None = None,
    detail_page=None,
    detail_cap: int = 0,
    detail_budget_seconds: float | None = None,
 ) -> tuple[list[dict], str | None]:
    """Search Zoopla for properties in one outcode.
@ -1222,6 +1522,12 @@ def search_outcode(
    search flow, extracts listings from rendered DOM, and transforms to the
    standard output schema.
    When ``detail_page`` (a second browser tab) and a positive ``detail_cap``
    are supplied, up to ``detail_cap`` listings per outcode have their detail
    page fetched for a precise postcode (see ``_fetch_listing_detail``).
    ``detail_budget_seconds`` caps the wall-clock time spent fetching details so
    the per-outcode timeout that also guards search pagination is never starved.
    Returns (properties, search_url).
    Raises TurnstileError if Cloudflare blocks us mid-session.
@ -1231,12 +1537,25 @@ def search_outcode(
    total_results = _get_result_count(page)
    fetch_detail = None
    detail_deadline = None
    if detail_page is not None and detail_cap > 0:
        fetch_detail = lambda url: _fetch_listing_detail(  # noqa: E731
            detail_page, url, search_outcode=outcode
        )
        if detail_budget_seconds is not None:
            detail_deadline = time.monotonic() + detail_budget_seconds
    # Always try extraction even if result count is 0 — the count regex may
    # not match Zoopla's current text format, but listings may still be in DOM
    raw_listings = _paginate(
        page,
        total_results,
        max_properties=max_properties,
        fetch_detail=fetch_detail,
        detail_cap=detail_cap,
        detail_state={"fetched": 0},
        detail_deadline=detail_deadline,
    )
    if not raw_listings:
        if total_results > 0:
@ -1252,7 +1571,11 @@ def search_outcode(
    for raw in raw_listings:
        try:
            transformed = transform_property(
-                raw, pc_index, pc_coords, search_outcode=outcode
+                raw,
                pc_index,
                pc_coords,
                search_outcode=outcode,
                detail=raw.get("_detail"),
            )
        except Exception as exc:
            log.warning(
--- a/finder/zoopla_flaresolverr.py
+++ b/finder/zoopla_flaresolverr.py
@ -0,0 +1,164 @@
 """Zoopla scraping via FlareSolverr (no browser/VNC needed).
 FlareSolverr solves Zoopla's Cloudflare and returns the rendered HTML, which
 still contains the React Server Components flight stream — so the existing pure
 parsers work unchanged:
  - the search page yields the outcode's listing detail URLs, and
  - each detail page's flight stream carries the property's location object
    (postcode + coordinates) that ``parse_detail_geo`` extracts, plus the
    listing fields (price/beds/baths/tenure/floor area) parsed here.
 Verified live (2026-05-30) against Zoopla through the Gluetun VPN: a warm
 FlareSolverr session solves the SW9 search + detail pages and the flight data
 is present (e.g. detail 73326946 -> SW9 0HD @ 51.477238,-0.116819).
 This is selected by constants.ZOOPLA_FETCHER == "flaresolverr"; the Camoufox
 path in zoopla.py remains for ZOOPLA_FETCHER == "camoufox".
 """
 import logging
 import re
 import time
 from constants import DELAY_BETWEEN_PAGES, ZOOPLA_BASE
 from flaresolverr import FlareSolverrError, FlareSolverrSession
 from spatial import PostcodeSpatialIndex
 from zoopla import _url_with_page, parse_detail_geo, transform_property
 log = logging.getLogger("zoopla")
 # Safety bound on how many search-result pages to walk per outcode.
 _MAX_SERP_PAGES = 60
 _DETAIL_PATH_RE = re.compile(r"/(?:for-sale|new-homes)/details/\d+/")
 _LISTING_ID_RE = re.compile(r"/details/(\d+)/")
 def _int(pattern: str, buf: str) -> int | None:
    match = re.search(pattern, buf)
    return int(match.group(1)) if match else None
 def parse_detail_listing(html: str) -> dict:
    """Pull the non-location listing fields out of a Zoopla detail page.

    The values are read with regexes from the page's flight stream and cover
    the same fields the Camoufox SERP-card extractor produced. Extraction is
    best-effort: any field that cannot be found is returned as None, so a
    listing whose location is known can still be emitted.

    Returns a dict with keys ``price``, ``beds``, ``baths``, ``receptions``,
    ``floor_area_sqft``, ``tenure``, ``property_type`` and ``address``."""
    # Undo the JS-string escaping of quotes and slashes so the patterns match.
    text = html.replace('\\"', '"').replace("\\/", "/")

    # Price: "internalValue" first, "priceUnformatted" as the fallback.
    amount = _int(r'"internalValue":(\d+)', text)
    if amount is None:
        amount = _int(r'"priceUnformatted":(\d+)', text)

    # Tenure is normalised to title case (e.g. "freehold" -> "Freehold").
    tenure_found = re.search(r'"tenure":"([a-zA-Z]+)"', text)
    tenure = None if tenure_found is None else tenure_found.group(1).title()

    # Address + property type come from the page <title>, e.g.
    # "Caldwell Street, Stockwell SW9, 4 bed property for sale, £995,000 - Zoopla"
    street = None
    kind = None
    heading = re.search(r'"children":"([^"]*? for sale[^"]*?)"', text)
    if heading is not None:
        heading_text = heading.group(1)
        street_found = re.match(r"(.+?),\s*\d+\s*bed", heading_text)
        if street_found is not None:
            street = street_found.group(1).strip()
        kind_found = re.search(r"\d+\s*bed\s+([\w\s-]+?)\s+for sale", heading_text)
        if kind_found is not None:
            kind = kind_found.group(1).strip()

    # An explicit "propertyType" field overrides whatever the title implied.
    declared = re.search(r'"propertyType":"([^"]+)"', text)
    if declared is not None:
        kind = declared.group(1)

    return {
        "price": amount,
        "beds": _int(r'"numBedrooms":(\d+)', text),
        "baths": _int(r'"numBaths":(\d+)', text),
        "receptions": _int(r'"numLivingRooms":(\d+)', text),
        "floor_area_sqft": _int(r'"sizeSqft":(\d+)', text),
        "tenure": tenure,
        "property_type": kind,
        "address": street,
    }
 def _enumerate_detail_paths(fs: FlareSolverrSession, outcode: str, limit: int | None) -> list[str]:
    """Walk the outcode's search-result pages and collect listing detail paths.

    Args:
        fs: session whose ``get(url)`` returns the page HTML.
        outcode: the outcode to search (lower-cased for the URL path).
        limit: maximum number of paths to return; ``None`` means no cap. A
            limit of zero or less yields an empty list without any request.

    Returns:
        Detail paths in first-seen order, de-duplicated by listing id.

    Pagination stops at the first page that contributes no new listing, when
    ``limit`` is reached, or after ``_MAX_SERP_PAGES`` pages."""
    # The limit is otherwise only checked after a path has been appended, so a
    # non-positive limit would still cost a request and return one path.
    if limit is not None and limit <= 0:
        return []
    base = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"
    seen: list[str] = []
    seen_ids: set[str] = set()
    for page_num in range(1, _MAX_SERP_PAGES + 1):
        url = base if page_num == 1 else _url_with_page(base, page_num)
        html = fs.get(url)
        new = 0
        for path in _DETAIL_PATH_RE.findall(html):
            # De-duplicate on the numeric id; fall back to the path itself.
            id_match = _LISTING_ID_RE.search(path)
            listing_id = id_match.group(1) if id_match else path
            if listing_id in seen_ids:
                continue
            seen_ids.add(listing_id)
            seen.append(path)
            new += 1
            if limit is not None and len(seen) >= limit:
                return seen
        # A page with nothing new means we have run off the end of the results.
        if new == 0:
            break
        time.sleep(DELAY_BETWEEN_PAGES)
    return seen
 def search_outcode(
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    fs: FlareSolverrSession,
    max_properties: int | None = None,
    detail_cap: int = 0,
    detail_budget_seconds: float | None = None,
 ) -> tuple[list[dict], str | None]:
    """Scrape a single outcode through FlareSolverr.

    Each listing's detail page is fetched, because that is where the postcode
    lives. The number of listings is therefore capped by the smaller of
    ``max_properties`` and a positive ``detail_cap`` (either may be absent).
    ``detail_budget_seconds`` bounds the wall-clock time spent on detail pages;
    a falsy value (``None`` or ``0``) means no deadline.

    Returns ``(properties, search_url)``. A listing whose fetch or transform
    fails is logged and dropped rather than aborting the outcode."""
    # Combine whichever caps are active into one listing limit.
    caps: list[int] = []
    if detail_cap and detail_cap > 0:
        caps.append(detail_cap)
    if max_properties is not None:
        caps.append(max_properties)
    limit = min(caps) if caps else None

    base = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"
    paths = _enumerate_detail_paths(fs, outcode, limit)
    if not paths:
        return [], base

    deadline = None
    if detail_budget_seconds:
        deadline = time.monotonic() + detail_budget_seconds

    properties: list[dict] = []
    dropped = 0
    for path in paths:
        out_of_time = deadline is not None and time.monotonic() >= deadline
        if out_of_time:
            log.info("Zoopla %s: detail-fetch budget reached after %d", outcode, len(properties))
            break
        found_id = _LISTING_ID_RE.search(path)
        listing_id = found_id.group(1) if found_id else path
        prop = None
        try:
            html = fs.get(ZOOPLA_BASE + path)
            geo = parse_detail_geo(html, search_outcode=outcode)
            raw = {"id": listing_id, "url": path, **parse_detail_listing(html)}
            prop = transform_property(
                raw, pc_index, pc_coords, search_outcode=outcode, detail=geo
            )
        except FlareSolverrError as exc:
            log.warning("Zoopla %s detail %s fetch failed: %s", outcode, listing_id, exc)
        except Exception as exc:  # noqa: BLE001 - never let one listing kill the outcode
            log.warning("Zoopla %s detail %s transform failed: %s", outcode, listing_id, exc)
        if prop:
            properties.append(prop)
        else:
            dropped += 1
        # Pace the requests between consecutive detail pages.
        time.sleep(DELAY_BETWEEN_PAGES)
    log.info("Zoopla %s: %d listings (%d dropped)", outcode, len(properties), dropped)
    return properties, base
--- a/frontend/src/components/map/Map.tsx
+++ b/frontend/src/components/map/Map.tsx
@ -606,12 +606,13 @@ function OverlayTileLayers({
  const showTrees = activeOverlays.has('trees-outside-woodlands');
  const showPropertyBorders = activeOverlays.has('property-borders');
-  // Restrict the heatmap to the selected crime types. When every type is
+  // Restrict the heatmap to the selected crime types. This must always be a
-  // selected we omit the filter entirely so all features contribute.
+  // concrete expression: passing `filter={undefined}` makes react-map-gl call
-  const crimeFilter =
+  // map.addLayer({filter: undefined}), which MapLibre rejects at validation
-    activeCrimeTypes.size >= CRIME_TYPE_VALUES.length
+  // ("filter: array expected, undefined found"), so the layer is never created
-      ? undefined
+  // and the heatmap stays blank until a later setFilter call. An `in` over the
-      : ['in', ['get', 'crime_type'], ['literal', Array.from(activeCrimeTypes)]];
+  // selected types matches everything when all 14 are selected.
  const crimeFilter = ['in', ['get', 'crime_type'], ['literal', Array.from(activeCrimeTypes)]];
  return (
    <>
--- a/frontend/src/components/map/MobileDrawer.test.tsx
+++ b/frontend/src/components/map/MobileDrawer.test.tsx
@ -0,0 +1,107 @@
 import { cleanup, fireEvent, render, screen } from '@testing-library/react';
 import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
 import MobileDrawer from './MobileDrawer';
 vi.mock('react-i18next', () => ({
  useTranslation: () => ({
    t: (key: string) => key,
  }),
 }));
 const originalSetPointerCapture = HTMLElement.prototype.setPointerCapture;
 /**
  * Render a MobileDrawer on the "area" tab and return the render result plus
  * the drag handle, drawer root and panel elements (throws if any is missing).
  */
 function renderDrawer(onClose = vi.fn()) {
  const view = render(
    <MobileDrawer
      onClose={onClose}
      renderArea={() => <div>Area content</div>}
      renderProperties={() => <div>Properties content</div>}
      tab="area"
      onTabChange={vi.fn()}
    />
  );
  const { container } = view;
  // Look an element up and fail loudly when the drawer markup has changed.
  const requireElement = (selector: string, message: string): HTMLElement => {
    const found = container.querySelector(selector);
    if (!(found instanceof HTMLElement)) throw new Error(message);
    return found;
  };
  const handle = requireElement('[data-mobile-drawer-drag-handle]', 'Expected drawer drag handle');
  const root = requireElement('[data-tutorial="right-pane"]', 'Expected drawer root');
  const panel = requireElement(
    '[data-tutorial="right-pane"] > div:last-child',
    'Expected drawer panel'
  );
  return { ...view, handle, onClose, panel, root };
 }
 // Drag/close behaviour of MobileDrawer, driven through synthetic pointer
 // events on the drag handle and asserted via the panel's inline transform.
 describe('MobileDrawer', () => {
  beforeEach(() => {
    // Replace setPointerCapture with a stub for every test
    // (presumably because the test DOM does not implement it — TODO confirm).
    HTMLElement.prototype.setPointerCapture = vi.fn();
  });
  afterEach(() => {
    cleanup();
    // Put back the implementation captured at module load.
    Object.defineProperty(HTMLElement.prototype, 'setPointerCapture', {
      configurable: true,
      value: originalSetPointerCapture,
    });
  });
  it('lowers and stays open when swiped down from the handle', () => {
    const { handle, onClose, panel } = renderDrawer();
    // Drag 110px downwards (clientY 120 -> 230).
    fireEvent.pointerDown(handle, { pointerId: 1, clientY: 120 });
    fireEvent.pointerMove(handle, { pointerId: 1, clientY: 230 });
    fireEvent.pointerUp(handle, { pointerId: 1, clientY: 230 });
    expect(onClose).not.toHaveBeenCalled();
    expect(panel.style.transform).toBe('translateY(110px)');
  });
  it('can be raised again after being lowered', () => {
    const { handle, onClose, panel } = renderDrawer();
    // First gesture: lower by 110px.
    fireEvent.pointerDown(handle, { pointerId: 1, clientY: 120 });
    fireEvent.pointerMove(handle, { pointerId: 1, clientY: 230 });
    fireEvent.pointerUp(handle, { pointerId: 1, clientY: 230 });
    // Second gesture: raise by 60px (230 -> 170), leaving a 50px offset.
    fireEvent.pointerDown(handle, { pointerId: 2, clientY: 230 });
    fireEvent.pointerMove(handle, { pointerId: 2, clientY: 170 });
    fireEvent.pointerUp(handle, { pointerId: 2, clientY: 170 });
    expect(onClose).not.toHaveBeenCalled();
    expect(panel.style.transform).toBe('translateY(50px)');
  });
  it('keeps the close control reachable when dragged down far', () => {
    const { handle, panel } = renderDrawer();
    // Give the panel a known height so the expected offset is deterministic.
    Object.defineProperty(panel, 'offsetHeight', {
      configurable: true,
      value: 200,
    });
    // A 300px drag is expected to end at 96px rather than 300px
    // (presumably clamped relative to the 200px panel height — TODO confirm).
    fireEvent.pointerDown(handle, { pointerId: 1, clientY: 120 });
    fireEvent.pointerMove(handle, { pointerId: 1, clientY: 420 });
    fireEvent.pointerUp(handle, { pointerId: 1, clientY: 420 });
    expect(panel.style.transform).toBe('translateY(96px)');
  });
  it('leaves the rest of the mobile map usable while the panel is open', () => {
    const { panel, root } = renderDrawer();
    // The root's first child is treated as the spacer above the panel.
    const spacer = root.firstElementChild;
    if (!(spacer instanceof HTMLElement)) throw new Error('Expected drawer spacer');
    // Only the panel takes pointer events, and the spacer has no dark backdrop class.
    expect(root.className).toContain('pointer-events-none');
    expect(panel.className).toContain('pointer-events-auto');
    expect(spacer.className).not.toContain('bg-black');
  });
  it('closes from the close button', () => {
    const { onClose } = renderDrawer();
    // The mocked `t` returns the key itself, so the label is the i18n key.
    fireEvent.click(screen.getByLabelText('mobileDrawer.closeDrawer'));
    expect(onClose).toHaveBeenCalledTimes(1);
  });
 });
--- a/frontend/src/lib/color-opacity.ts
+++ b/frontend/src/lib/color-opacity.ts
@ -0,0 +1,11 @@
 export const DEFAULT_COLOR_OPACITY = 1;
 export const MIN_COLOR_OPACITY = 0.1;
 /**
  * Clamp an opacity into [MIN_COLOR_OPACITY, 1]. Null, undefined and
  * non-finite inputs yield DEFAULT_COLOR_OPACITY.
  */
 export function normalizeColorOpacity(value: number | null | undefined): number {
  if (value === null || value === undefined || !Number.isFinite(value)) {
    return DEFAULT_COLOR_OPACITY;
  }
  const raisedToFloor = Math.max(MIN_COLOR_OPACITY, value);
  return Math.min(1, raisedToFloor);
 }
 /** Convert an opacity to a whole-number percentage after normalising it. */
 export function colorOpacityToPercent(value: number): number {
  const clamped = normalizeColorOpacity(value);
  return Math.round(clamped * 100);
 }
--- a/frontend/src/lib/crime-types.ts
+++ b/frontend/src/lib/crime-types.ts
@ -0,0 +1,35 @@
 // Street-crime categories carried by the `crime_hotspots` vector tiles in the
 // `crime_type` feature property. The `value` strings must match the police.uk
 // "Crime type" values exactly (see pipeline/transform/crime_hotspot_tiles.py),
 // because they are used directly in the MapLibre heatmap `filter` expression.
 // `label` is a shorter, human-friendly name for the overlay-selector checkboxes.
 /** One selectable street-crime category. */
 export interface CrimeTypeDef {
  // Exact `crime_type` feature-property string; used verbatim in the heatmap filter.
  value: string;
  // Shorter, human-friendly name shown next to the overlay-selector checkbox.
  label: string;
 }
 export const CRIME_TYPES: readonly CrimeTypeDef[] = [
  { value: 'Violence and sexual offences', label: 'Violence & sexual offences' },
  { value: 'Anti-social behaviour', label: 'Anti-social behaviour' },
  { value: 'Criminal damage and arson', label: 'Criminal damage & arson' },
  { value: 'Public order', label: 'Public order' },
  { value: 'Shoplifting', label: 'Shoplifting' },
  { value: 'Vehicle crime', label: 'Vehicle crime' },
  { value: 'Burglary', label: 'Burglary' },
  { value: 'Other theft', label: 'Other theft' },
  { value: 'Theft from the person', label: 'Theft from the person' },
  { value: 'Bicycle theft', label: 'Bicycle theft' },
  { value: 'Drugs', label: 'Drugs' },
  { value: 'Robbery', label: 'Robbery' },
  { value: 'Possession of weapons', label: 'Possession of weapons' },
  { value: 'Other crime', label: 'Other crime' },
 ] as const;
 export const CRIME_TYPE_VALUES: readonly string[] = CRIME_TYPES.map((c) => c.value);
 const CRIME_TYPE_VALUE_SET = new Set<string>(CRIME_TYPE_VALUES);
 /** True when `value` is one of the known `crime_type` strings. */
 export function isCrimeTypeValue(value: string): boolean {
  const known = CRIME_TYPE_VALUE_SET.has(value);
  return known;
 }
--- a/pipeline/download/inspire.py
+++ b/pipeline/download/inspire.py
@ -4,7 +4,10 @@ Downloads GML files for all local authorities from the INSPIRE download page.
 Each ZIP contains a GML file with title extent polygons for that authority.
 Source: https://use-land-property-data.service.gov.uk/datasets/inspire/download
-License: INSPIRE End User Licence
+License: Open Government Licence v3.0 (since 1 July 2020, under the PSGA).
    Requires HM Land Registry + Ordnance Survey (AC0000851063) attribution; see
    the conditions page at the source URL. Boundaries are indicative "general
    boundaries", not the legal extent of title.
 """
 import argparse
--- a/pipeline/download/satellite_highres.py
+++ b/pipeline/download/satellite_highres.py
@ -0,0 +1,505 @@
 """Build a high-resolution England aerial PMTiles archive from EA Vertical Aerial Photography.
 The Environment Agency / Defra Vertical Aerial Photography (VAP) archive is open
 (OGL v3.0) RGB orthophotography at 10-50 cm, distributed as 5 km ECW tiles on the
 British National Grid. There is no public imagery tile service, so we mirror the
 Sentinel-2 ``satellite.pmtiles`` approach: query the Defra survey download API for
 an area of interest, pick the best RGB capture per OS tile, download and decode the
 ECW rasters, re-tile them into Web-Mercator raster tiles, and bake a single PMTiles
 archive that the server stacks *over* the Sentinel-2 base where coverage exists.
 ECW decoding needs a GDAL build that includes the (free, read-only) ERDAS ECW/JP2
 SDK, which is not present in the rasterio wheel. The mosaic + tiling step therefore
 runs inside a GDAL-with-ECW Docker image (see ``docker/gdal-ecw/Dockerfile``); the
 rest of the pipeline is plain Python plus the ``pmtiles`` CLI.
 """
 from __future__ import annotations
 import argparse
 import json
 import re
 import shutil
 import sqlite3
 import subprocess
 import tempfile
 import urllib.error
 import urllib.request
 import zipfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from pathlib import Path
 from pipeline.download.tiles import ensure_pmtiles_cli
 from pipeline.local_temp import local_tmp_dir
 # Defra Data Services Platform survey download API (reverse-engineered from the
 # environment.data.gov.uk/survey front-end; no official API is documented).
 SEARCH_URL = (
    "https://environment.data.gov.uk/backend/catalog/api/tiles/collections/survey/search"
 )
 SURVEY_PAGE_URL = "https://environment.data.gov.uk/survey"
 # Static public key baked into the survey page JS. May rotate -- we try to scrape a
 # fresh one from the page and only fall back to this literal.
 DEFAULT_SUBSCRIPTION_KEY = "dspui"
 SUBSCRIPTION_KEY_RE = re.compile(r"subscription-key=([A-Za-z0-9]+)")
 # True-colour RGB product only (skip IRRGB near-infra-red and Night Time variants).
 VAP_RGB_PRODUCT = "vertical_aerial_photography_tiles_rgb"
 # Greater London bounding box (lon/lat). The API only returns tiles where coverage
 # exists, so a generous bbox is fine -- it does not force blank downloads.
 DEFAULT_AOI: dict = {
    "type": "Polygon",
    "coordinates": [
        [
            [-0.55, 51.25],
            [0.30, 51.25],
            [0.30, 51.70],
            [-0.55, 51.70],
            [-0.55, 51.25],
        ]
    ],
 }
 DEFAULT_MIN_ZOOM = 14
 DEFAULT_MAX_ZOOM = 19
 # GDAL image with the ECW driver. The official OSGeo image does not ship ECW, so
 # this defaults to the locally-built image from docker/gdal-ecw/Dockerfile.
 DEFAULT_GDAL_IMAGE = "perfect-postcode/gdal-ecw:latest"
 USER_AGENT = "perfect-postcode-satellite-highres/1.0"
 ATTRIBUTION_TEMPLATE = (
    "Environment Agency Vertical Aerial Photography - "
    "© Environment Agency copyright and/or database right {year}. "
    "All rights reserved. Licensed under the Open Government Licence v3.0."
 )
@dataclass(frozen=True)
 class VapTile:
    """One survey download record from the Defra search API.

    Instances are immutable. Each field is copied from one entry of the search
    payload's ``results`` list by ``parse_search_results``.
    """
    # Product identifier (payload: product.id); compared against VAP_RGB_PRODUCT.
    product_id: str
    # Survey year, parsed to int from payload year.id.
    year: int
    # Resolution parsed to float from payload resolution.id; smaller values are
    # preferred. Presumably metres per pixel, as the name suggests -- confirm.
    resolution_m: float
    # Tile identifier (payload: tile.id); one capture is kept per identifier.
    os_tile_id: str
    # Download URL; the subscription key is appended as a query parameter.
    uri: str
    # Label from the payload; empty string when the record has none.
    label: str
 def parse_search_results(payload: dict) -> list[VapTile]:
    """Convert the search API's JSON payload into ``VapTile`` records.

    Entries under ``payload["results"]`` that lack a required field, or whose
    year/resolution cannot be converted to a number, are silently dropped so
    one bad record does not abort the whole search. A payload without a
    ``results`` key yields an empty list.
    """
    parsed: list[VapTile] = []
    for record in payload.get("results", []):
        try:
            tile = VapTile(
                product_id=record["product"]["id"],
                year=int(record["year"]["id"]),
                resolution_m=float(record["resolution"]["id"]),
                os_tile_id=record["tile"]["id"],
                uri=record["uri"],
                label=record.get("label", ""),
            )
        except (KeyError, TypeError, ValueError):
            # Malformed record: skip it and keep going.
            continue
        parsed.append(tile)
    return parsed
 def select_best_rgb_tiles(tiles: list[VapTile]) -> list[VapTile]:
    """Keep a single RGB capture for every OS tile id.

    Non-RGB products are ignored. Among captures of the same tile the winner is
    decided by ``_is_better`` (finest resolution, then latest year). The result
    is ordered by OS tile id.
    """
    winners: dict[str, VapTile] = {}
    for candidate in tiles:
        if candidate.product_id == VAP_RGB_PRODUCT:
            square = candidate.os_tile_id
            holder = winners.get(square)
            if holder is None or _is_better(candidate, holder):
                winners[square] = candidate
    return [winners[square] for square in sorted(winners)]
 def _is_better(candidate: VapTile, incumbent: VapTile) -> bool:
    """Finer resolution wins; ties broken by the most recent survey year."""
    if candidate.resolution_m != incumbent.resolution_m:
        return candidate.resolution_m < incumbent.resolution_m
    return candidate.year > incumbent.year
 def _http_get(url: str, timeout: float) -> bytes:
    """GET ``url`` with this module's User-Agent and return the raw body bytes."""
    headers = {"User-Agent": USER_AGENT}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as reply:
        body = reply.read()
    return body
 def resolve_subscription_key(explicit: str | None, timeout: float = 30.0) -> str:
    """Use an explicit key, else scrape the survey page JS, else the known default."""
    if explicit:
        return explicit
    try:
        page = _http_get(SURVEY_PAGE_URL, timeout).decode("utf-8", "ignore")
        match = SUBSCRIPTION_KEY_RE.search(page)
        if match:
            return match.group(1)
        # The key usually lives in a referenced JS chunk; scan the largest one.
        for chunk in re.findall(r'src="(/_next/static/[^"]+\.js)"', page):
            js = _http_get(f"https://environment.data.gov.uk{chunk}", timeout)
            match = SUBSCRIPTION_KEY_RE.search(js.decode("utf-8", "ignore"))
            if match:
                return match.group(1)
    except (urllib.error.URLError, TimeoutError, ConnectionError) as err:
        print(f"Could not scrape subscription key ({err}); using default", flush=True)
    return DEFAULT_SUBSCRIPTION_KEY
 def search_vap_tiles(aoi: dict, timeout: float = 60.0) -> list[VapTile]:
    """Query the survey search API for ``aoi`` and return the RGB tiles to fetch.

    The geometry is POSTed as JSON; the response is parsed and reduced to one
    best RGB capture per OS tile. A one-line summary is printed to stdout.
    """
    request_headers = {
        "Content-Type": "application/geo+json",
        "Referer": SURVEY_PAGE_URL,
        "User-Agent": USER_AGENT,
    }
    request = urllib.request.Request(
        SEARCH_URL,
        data=json.dumps(aoi).encode("utf-8"),
        headers=request_headers,
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as reply:
        payload = json.load(reply)
    parsed = parse_search_results(payload)
    selected = select_best_rgb_tiles(parsed)
    print(
        f"Search returned {payload.get('count', 0)} records; "
        f"selected {len(selected)} RGB tile(s)",
        flush=True,
    )
    return selected
 def _download_and_extract(
    tile: VapTile, ecw_dir: Path, key: str, timeout: float, retries: int
 ) -> list[Path]:
    """Download one survey zip and extract its ECW raster(s).

    The archive is fetched from ``tile.uri`` with the subscription key appended
    as a query parameter and streamed into ``ecw_dir``. Every ``.ecw`` member
    (matched case-insensitively) is written out as
    ``{os_tile_id}_{member stem}.ecw``. The zip itself is always removed, even
    when the download or the extraction fails.

    Args:
        tile: Record describing the capture to download.
        ecw_dir: Existing directory that receives the zip and the rasters.
        key: Subscription key appended to the download URL.
        timeout: Timeout in seconds passed to ``urlopen`` for each attempt.
        retries: Extra attempts after the first failed download.

    Returns:
        Paths of the extracted rasters; empty when the archive holds no ECW.

    Raises:
        RuntimeError: If every download attempt fails.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    url = f"{tile.uri}?subscription-key={key}"
    zip_path = ecw_dir / f"{tile.os_tile_id}.zip"
    extracted: list[Path] = []
    try:
        for attempt in range(retries + 1):
            try:
                with urllib.request.urlopen(
                    urllib.request.Request(url, headers={"User-Agent": USER_AGENT}),
                    timeout=timeout,
                ) as response, zip_path.open("wb") as out:
                    shutil.copyfileobj(response, out, length=1 << 20)
                break
            except (urllib.error.URLError, TimeoutError, ConnectionError) as err:
                if attempt == retries:
                    raise RuntimeError(f"Failed to download {url}: {err}") from err
        with zipfile.ZipFile(zip_path) as archive:
            for member in archive.infolist():
                if member.is_dir() or not member.filename.lower().endswith(".ecw"):
                    continue
                # Normalise the suffix to lowercase: the mosaic step globs
                # ``ecw/*.ecw`` in a case-sensitive shell, so a ``.ECW`` member
                # kept under its original name would be left out of the mosaic.
                stem = Path(member.filename).stem
                target = ecw_dir / f"{tile.os_tile_id}_{stem}.ecw"
                with archive.open(member) as src, target.open("wb") as dst:
                    shutil.copyfileobj(src, dst, length=1 << 20)
                extracted.append(target)
    finally:
        # Never leave a partial or corrupt archive behind.
        zip_path.unlink(missing_ok=True)
    if not extracted:
        print(f"  {tile.os_tile_id}: no ECW in archive (skipped)", flush=True)
    return extracted
 def download_tiles(
    tiles: list[VapTile],
    ecw_dir: Path,
    key: str,
    max_workers: int,
    timeout: float,
    retries: int,
 ) -> list[Path]:
    """Fetch all ``tiles`` on a thread pool and gather the extracted ECW paths.

    ``ecw_dir`` is created if needed. Progress is printed as each download
    finishes; an exception raised by any download propagates to the caller.
    """
    ecw_dir.mkdir(parents=True, exist_ok=True)
    total = len(tiles)
    collected: list[Path] = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        owner_of = {}
        for tile in tiles:
            job = pool.submit(
                _download_and_extract, tile, ecw_dir, key, timeout, retries
            )
            owner_of[job] = tile
        for finished, job in enumerate(as_completed(owner_of), start=1):
            source = owner_of[job]
            collected.extend(job.result())
            print(
                f"Downloaded {finished}/{total} tiles "
                f"(latest: {source.os_tile_id} {source.resolution_m}m {source.year})",
                flush=True,
            )
    return collected
 def _build_tiles_with_gdal(
    work_dir: Path,
    gdal_image: str,
    min_zoom: int,
    max_zoom: int,
    jobs: int,
    webp_quality: int,
 ) -> Path:
    """Mosaic the ECW rasters and emit XYZ WebP tiles inside the GDAL-with-ECW image.
    Returns the host path of the generated ``xyz`` directory. We use lossy WebP with
    an alpha channel: ~6x smaller than lossless PNG for photographic imagery while
    keeping transparency, so coverage gaps stay see-through and the Sentinel-2 base
    shows through them.

    ``work_dir`` is bind-mounted at ``/work`` in the container and must already
    contain the rasters under ``ecw/``. Raises ``subprocess.CalledProcessError``
    if the container exits non-zero and ``RuntimeError`` if no ``xyz`` directory
    exists afterwards.
    """
    xyz_dir = work_dir / "xyz"
    # EA "RGB" ECWs are 4-band RGBA (band 4 is a constant-255 validity/alpha mask),
    # so we build a plain 4-band VRT (no -addalpha, which would make a 5th band,
    # one more than an RGBA tile can hold). We then:
    #   * force EPSG:27700 -- the pixels are already British National Grid, and the
    #     EPSG code lets PROJ apply the OSTN15 datum shift (grid ships in the image)
    #     for metre-accurate reprojection to Web Mercator;
    #   * label band 4 as alpha so gdal2tiles writes transparent WebP tiles.
    #     Inter-block gaps the VRT fills with 0 then read as alpha=0 (transparent),
    #     letting the Sentinel-2 base show through wherever VAP coverage is missing.
    # The steps are joined into one bash command line; `set -euo pipefail` aborts
    # the whole script on the first failing step.
    script = (
        "set -euo pipefail; "
        "cd /work; "
        "gdalbuildvrt -resolution highest mosaic.vrt ecw/*.ecw; "
        "gdal_edit.py -a_srs EPSG:27700 "
        "-colorinterp_1 red -colorinterp_2 green -colorinterp_3 blue "
        "-colorinterp_4 alpha mosaic.vrt; "
        f"gdal2tiles.py --xyz --zoom={min_zoom}-{max_zoom} "
        f"--processes={jobs} --resampling=average --webviewer=none "
        f"--tiledriver=WEBP --webp-quality={webp_quality} "
        "mosaic.vrt xyz"
    )
    # check=True turns a non-zero container exit into CalledProcessError.
    subprocess.run(
        [
            "docker",
            "run",
            "--rm",
            "-v",
            f"{work_dir.resolve()}:/work",
            gdal_image,
            "bash",
            "-c",
            script,
        ],
        check=True,
    )
    if not xyz_dir.exists():
        raise RuntimeError("gdal2tiles produced no output directory")
    return xyz_dir
 def _pack_xyz_to_mbtiles(
    xyz_dir: Path,
    mbtiles_path: Path,
    bounds: tuple[float, float, float, float],
    min_zoom: int,
    max_zoom: int,
    attribution: str,
 ) -> int:
    """Pack a gdal2tiles XYZ WebP directory into an MBTiles SQLite file (TMS rows)."""
    if mbtiles_path.exists():
        mbtiles_path.unlink()
    conn = sqlite3.connect(mbtiles_path)
    try:
        conn.execute("PRAGMA journal_mode = WAL")
        conn.execute("PRAGMA synchronous = NORMAL")
        conn.execute("CREATE TABLE metadata (name TEXT, value TEXT)")
        conn.execute(
            "CREATE TABLE tiles (zoom_level INTEGER, tile_column INTEGER, "
            "tile_row INTEGER, tile_data BLOB)"
        )
        conn.execute(
            "CREATE UNIQUE INDEX tile_index ON tiles "
            "(zoom_level, tile_column, tile_row)"
        )
        conn.executemany(
            "INSERT INTO metadata (name, value) VALUES (?, ?)",
            [
                ("name", "EA Vertical Aerial Photography"),
                ("type", "overlay"),
                ("version", "1"),
                ("description", "Environment Agency high-resolution aerial imagery"),
                ("format", "webp"),
                ("attribution", attribution),
                ("bounds", ",".join(f"{value:.6f}" for value in bounds)),
                ("minzoom", str(min_zoom)),
                ("maxzoom", str(max_zoom)),
            ],
        )
        inserted = 0
        for zoom_dir in sorted(xyz_dir.iterdir()):
            if not zoom_dir.is_dir() or not zoom_dir.name.isdigit():
                continue
            zoom = int(zoom_dir.name)
            for col_dir in zoom_dir.iterdir():
                if not col_dir.is_dir() or not col_dir.name.isdigit():
                    continue
                col = int(col_dir.name)
                for tile_file in col_dir.glob("*.webp"):
                    if not tile_file.stem.isdigit():
                        continue
                    row = int(tile_file.stem)
                    tms_row = (1 << zoom) - 1 - row
                    conn.execute(
                        "INSERT OR REPLACE INTO tiles VALUES (?, ?, ?, ?)",
                        (zoom, col, tms_row, tile_file.read_bytes()),
                    )
                    inserted += 1
                    if inserted % 5000 == 0:
                        conn.commit()
                        print(f"  packed {inserted:,} tiles", flush=True)
        conn.commit()
    finally:
        conn.close()
    return inserted
 def build_satellite_highres_tiles(
    output_path: Path,
    pmtiles_bin: Path,
    pmtiles_version: str,
    aoi: dict,
    min_zoom: int,
    max_zoom: int,
    gdal_image: str,
    subscription_key: str | None,
    max_workers: int,
    timeout: float,
    retries: int,
    jobs: int,
    webp_quality: int,
 ) -> None:
    """Run the whole pipeline: search, download, tile, pack, convert to PMTiles.

    All intermediate files live in a temporary directory that is removed when
    the conversion finishes. Raises ``ValueError`` for an inverted zoom range
    and ``RuntimeError`` when the search, the extraction or the tiling yields
    nothing.
    """
    if max_zoom < min_zoom:
        raise ValueError("--min-zoom must be <= --max-zoom")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    ensure_pmtiles_cli(pmtiles_bin, pmtiles_version)
    selected = search_vap_tiles(aoi)
    if not selected:
        raise RuntimeError("No RGB Vertical Aerial Photography tiles for the AOI")
    api_key = resolve_subscription_key(subscription_key)
    newest_year = max(tile.year for tile in selected)
    attribution = ATTRIBUTION_TEMPLATE.format(year=newest_year)
    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as scratch:
        work_dir = Path(scratch)
        rasters = download_tiles(
            selected, work_dir / "ecw", api_key, max_workers, timeout, retries
        )
        if not rasters:
            raise RuntimeError("No ECW rasters were extracted from the downloads")
        tile_tree = _build_tiles_with_gdal(
            work_dir, gdal_image, min_zoom, max_zoom, jobs, webp_quality
        )
        packed_db = work_dir / "satellite_highres.mbtiles"
        tile_count = _pack_xyz_to_mbtiles(
            tile_tree, packed_db, _aoi_bounds(aoi), min_zoom, max_zoom, attribution
        )
        if tile_count == 0:
            raise RuntimeError("Tiling produced no tiles to pack")
        print(f"Packed {tile_count:,} tiles into MBTiles", flush=True)
        convert_cmd = [
            str(pmtiles_bin),
            "convert",
            str(packed_db),
            str(output_path),
            "--force",
        ]
        subprocess.run(convert_cmd, check=True)
    megabytes = output_path.stat().st_size / (1024 * 1024)
    print(f"Wrote {output_path} ({megabytes:.1f} MB) -- {attribution}", flush=True)
 def _aoi_bounds(aoi: dict) -> tuple[float, float, float, float]:
    coords = [point for ring in aoi["coordinates"] for point in ring]
    lons = [point[0] for point in coords]
    lats = [point[1] for point in coords]
    return min(lons), min(lats), max(lons), max(lats)
 def _load_aoi(path: Path | None) -> dict:
    if path is None:
        return DEFAULT_AOI
    data = json.loads(path.read_text())
    if data.get("type") == "FeatureCollection":
        return data["features"][0]["geometry"]
    if data.get("type") == "Feature":
        return data["geometry"]
    return data
 def main() -> None:
    """Parse the command line and build the high-resolution PMTiles archive.

    Worker, retry and job counts are clamped to sane minimums. An inverted zoom
    range or a WebP quality outside 1-100 is rejected as a usage error before
    any network or Docker work starts.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--output", type=Path, required=True)
    parser.add_argument("--pmtiles-bin", type=Path, default=Path("property-data/pmtiles"))
    parser.add_argument("--pmtiles-version", default="1.22.3")
    parser.add_argument(
        "--aoi-geojson",
        type=Path,
        default=None,
        help="GeoJSON Polygon/Feature/FeatureCollection for the area of interest "
        "(default: Greater London)",
    )
    parser.add_argument("--min-zoom", type=int, default=DEFAULT_MIN_ZOOM)
    parser.add_argument("--max-zoom", type=int, default=DEFAULT_MAX_ZOOM)
    parser.add_argument(
        "--gdal-image",
        default=DEFAULT_GDAL_IMAGE,
        help="Docker image with a GDAL that has the ECW driver",
    )
    parser.add_argument(
        "--subscription-key",
        default=None,
        help="Override the Defra survey API key (default: scrape, then 'dspui')",
    )
    parser.add_argument("--max-workers", type=int, default=4)
    parser.add_argument("--timeout", type=float, default=600.0)
    parser.add_argument("--retries", type=int, default=3)
    parser.add_argument(
        "--jobs",
        type=int,
        default=8,
        help="Parallel processes for gdal2tiles",
    )
    parser.add_argument(
        "--webp-quality",
        type=int,
        default=85,
        help="WebP tile quality (1-100); lower is smaller",
    )
    args = parser.parse_args()
    # Fail fast on bad values: otherwise they only surface inside the GDAL
    # container, after every tile has already been downloaded.
    if args.min_zoom > args.max_zoom:
        parser.error("--min-zoom must be <= --max-zoom")
    if not 1 <= args.webp_quality <= 100:
        parser.error("--webp-quality must be between 1 and 100")
    build_satellite_highres_tiles(
        output_path=args.output,
        pmtiles_bin=args.pmtiles_bin,
        pmtiles_version=args.pmtiles_version,
        aoi=_load_aoi(args.aoi_geojson),
        min_zoom=args.min_zoom,
        max_zoom=args.max_zoom,
        gdal_image=args.gdal_image,
        subscription_key=args.subscription_key,
        max_workers=max(1, args.max_workers),
        timeout=args.timeout,
        retries=max(0, args.retries),
        jobs=max(1, args.jobs),
        webp_quality=args.webp_quality,
    )
 if __name__ == "__main__":
    main()
--- a/pipeline/download/test_satellite_highres.py
+++ b/pipeline/download/test_satellite_highres.py
@ -0,0 +1,97 @@
 from pipeline.download import satellite_highres
 from pipeline.download.satellite_highres import (
    VapTile,
    parse_search_results,
    select_best_rgb_tiles,
 )
 def _result(product: str, year: str, resolution: str, tile: str) -> dict:
    """One search-API record in the real response shape."""
    return {
        "product": {"id": product, "label": product},
        "year": {"id": year, "label": year},
        "resolution": {"id": resolution, "label": f"{resolution}m"},
        "tile": {"id": tile, "label": tile},
        "label": f"{product}-{year}-{resolution}m-{tile}",
        "uri": (
            "https://environment.data.gov.uk/tiles/collections/survey/"
            f"{product}/{year}/{resolution}/{tile}"
        ),
    }
 # Mirrors a real Greater-London response: TQ2575 has a single RGB capture at
 # 0.4m (2008); TQ3080 has RGB at 0.1m (2008) and 0.25m (2011). The Night Time,
 # IRRGB and LIDAR records must be ignored.
 SAMPLE_PAYLOAD = {
    "count": 6,
    "results": [
        _result("vertical_aerial_photography_tiles_rgb", "2008", "0.4", "TQ2575"),
        _result("vertical_aerial_photography_tiles_night_time", "2012", "0.2", "TQ2575"),
        _result("lidar_composite_dtm", "2022", "1", "TQ2575"),
        # TQ3080 has two RGB captures: a finer-but-older and a coarser-but-newer.
        _result("vertical_aerial_photography_tiles_rgb", "2008", "0.1", "TQ3080"),
        _result("vertical_aerial_photography_tiles_rgb", "2011", "0.25", "TQ3080"),
        _result("vertical_aerial_photography_tiles_irrgb", "2012", "0.5", "TQ3080"),
    ],
 }
 def test_parse_search_results_skips_malformed_records() -> None:
    """A record missing required fields is dropped; the valid one is parsed."""
    well_formed = _result(
        "vertical_aerial_photography_tiles_rgb", "2008", "0.4", "TQ2575"
    )
    broken = {"product": {"id": "broken"}}  # missing year/resolution/tile/uri
    expected = VapTile(
        product_id="vertical_aerial_photography_tiles_rgb",
        year=2008,
        resolution_m=0.4,
        os_tile_id="TQ2575",
        uri="https://environment.data.gov.uk/tiles/collections/survey/"
        "vertical_aerial_photography_tiles_rgb/2008/0.4/TQ2575",
        label="vertical_aerial_photography_tiles_rgb-2008-0.4m-TQ2575",
    )
    parsed = parse_search_results({"results": [well_formed, broken]})
    assert len(parsed) == 1
    assert parsed[0] == expected
 def test_select_best_rgb_filters_non_rgb_products() -> None:
    """Only the true-colour RGB product survives selection."""
    picks = select_best_rgb_tiles(parse_search_results(SAMPLE_PAYLOAD))
    products_seen = {pick.product_id for pick in picks}
    assert products_seen == {satellite_highres.VAP_RGB_PRODUCT}
 def test_select_best_rgb_one_tile_per_os_square() -> None:
    """Each OS grid square appears exactly once in the selection."""
    picks = select_best_rgb_tiles(parse_search_results(SAMPLE_PAYLOAD))
    squares = [pick.os_tile_id for pick in picks]
    squares.sort()
    assert squares == ["TQ2575", "TQ3080"]
 def test_select_best_rgb_prefers_finest_resolution_then_latest_year() -> None:
    """Resolution outranks recency when choosing between captures."""
    by_square = {}
    for pick in select_best_rgb_tiles(parse_search_results(SAMPLE_PAYLOAD)):
        by_square[pick.os_tile_id] = pick
    # TQ2575: only one RGB capture.
    uncontested = by_square["TQ2575"]
    assert uncontested.resolution_m == 0.4
    # TQ3080: finest resolution (0.1m) wins even though it is the older survey.
    contested = by_square["TQ3080"]
    assert contested.resolution_m == 0.1
    assert contested.year == 2008
 def test_select_best_rgb_breaks_resolution_ties_by_year() -> None:
    """With equal resolution, the most recent survey year is kept."""
    rgb = satellite_highres.VAP_RGB_PRODUCT
    captures = ((2009, "a"), (2018, "b"), (2015, "c"))
    candidates = [
        VapTile(rgb, year, 0.25, "TQ0101", "u", label) for year, label in captures
    ]
    picks = select_best_rgb_tiles(candidates)
    assert len(picks) == 1
    assert picks[0].year == 2018
 def test_select_best_rgb_empty_when_no_rgb() -> None:
    """A payload with no RGB product selects nothing."""
    lidar_only = {"results": [_result("lidar_composite_dtm", "2022", "1", "TQ2575")]}
    parsed = parse_search_results(lidar_only)
    assert select_best_rgb_tiles(parsed) == []
--- a/pipeline/test_validate_outputs.py
+++ b/pipeline/test_validate_outputs.py
@ -1,12 +1,25 @@
 from __future__ import annotations
 import zipfile
 import json
 import polars as pl
 from pipeline.validate_outputs import main
 def write_boundary(path, postcodes):
    """Write ``<path>/units/AA1.geojson`` with one geometry-less feature per postcode."""
    units_dir = path / "units"
    units_dir.mkdir(parents=True)
    collection = {"type": "FeatureCollection", "features": []}
    for postcode in postcodes:
        collection["features"].append(
            {"type": "Feature", "properties": {"postcodes": postcode}, "geometry": None}
        )
    target = units_dir / "AA1.geojson"
    target.write_text(json.dumps(collection))
 def test_validates_parquet_file_and_zip(tmp_path, monkeypatch):
    parquet_path = tmp_path / "data.parquet"
    file_path = tmp_path / "plain.txt"
@ -59,3 +72,42 @@ def test_rejects_missing_and_empty_outputs(tmp_path, monkeypatch, capsys):
    assert "empty file" in stderr
    assert "missing" in stderr
    assert "no files matched" in stderr
 def test_validates_postcode_boundary_matches(tmp_path, monkeypatch):
    """Identical postcode sets in the parquet and the boundaries pass validation."""
    shared_postcodes = ["AA1 1AA", "AA1 1AB"]
    postcodes_path = tmp_path / "postcodes.parquet"
    boundaries_path = tmp_path / "postcode_boundaries"
    frame = pl.DataFrame({"postcode": list(shared_postcodes)})
    frame.write_parquet(postcodes_path)
    write_boundary(boundaries_path, list(shared_postcodes))
    argv = [
        "validate_outputs",
        "--postcode-boundary-match",
        f"{postcodes_path}::{boundaries_path}",
    ]
    monkeypatch.setattr("sys.argv", argv)
    exit_code = main()
    assert exit_code == 0
 def test_rejects_postcode_boundary_mismatch(tmp_path, monkeypatch, capsys):
    """Diverging postcode sets fail validation and report both directions."""
    postcodes_path = tmp_path / "postcodes.parquet"
    boundaries_path = tmp_path / "postcode_boundaries"
    table_postcodes = ["AA1 1AA", "AA1 1AB"]
    boundary_postcodes = ["AA1 1AA", "AA1 1AC"]
    pl.DataFrame({"postcode": table_postcodes}).write_parquet(postcodes_path)
    write_boundary(boundaries_path, boundary_postcodes)
    argv = [
        "validate_outputs",
        "--postcode-boundary-match",
        f"{postcodes_path}::{boundaries_path}",
    ]
    monkeypatch.setattr("sys.argv", argv)
    assert main() == 1
    captured_err = capsys.readouterr().err
    assert "missing boundaries" in captured_err
    assert "boundary postcodes are absent" in captured_err
--- a/pipeline/transform/crime_spatial.py
+++ b/pipeline/transform/crime_spatial.py
@ -0,0 +1,358 @@
 """Aggregate police.uk street crime to postcodes by 50m spatial proximity.
 Instead of attributing each incident to its published LSOA code, this transform
 counts the anonymised incident *points* that fall within 50m of each postcode's
 boundary polygon (the polygon buffered outward by 50m). A point inside several
 overlapping buffers counts for each postcode -- the same multiplicity the
 tree-density filter uses for features near more than one postcode.
 The metric is a raw annualised count ("incidents/year within 50m"); there is no
 per-capita denominator. Outputs mirror the old LSOA transform's shape but are
 keyed on ``postcode`` instead of ``LSOA code``:
 * ``crime_by_postcode.parquet`` -- ``postcode`` + ``"{type} (avg/yr)"`` columns.
 * ``crime_by_postcode_by_year.parquet`` -- ``postcode`` + ``"{type} (by year)"``
  nested ``list[struct{year, count}]`` columns, with Serious/Minor rollups.
 Caveat: police.uk coordinates are snapped to a fixed set of anonymous "map
 points", not true locations, and a share of rows have no coordinate at all
 (dropped here). Spatial totals are therefore lower than, and fuzzier than, the
 old LSOA-tagged counts -- by design, not a regression.
 """
 from __future__ import annotations
 import argparse
 import re
 from pathlib import Path
 import numpy as np
 import polars as pl
 import shapely
 from pyproj import Transformer
 from pipeline.transform.crime import (
    MINOR_CRIME_TYPES,
    SERIOUS_CRIME_TYPES,
    find_street_crime_csvs,
 )
 from pipeline.transform.postcode_boundaries.loader import load_postcode_polygons
 # Serious types first so column order is stable and self-documenting.
 ALL_CRIME_TYPES: tuple[str, ...] = SERIOUS_CRIME_TYPES + MINOR_CRIME_TYPES
 # Default outward buffer, in metres, applied to each postcode polygon.
 DEFAULT_BUFFER_M = 50.0
 # Matches a monthly directory name of the form YYYY-MM.
 MONTH_DIR_RE = re.compile(r"^\d{4}-\d{2}$")
 # Generous GB bounds; points outside fall in no English postcode anyway, but
 # filtering first keeps the WGS84->BNG transform out of its undefined region.
 LON_BOUNDS = (-9.5, 2.5)
 LAT_BOUNDS = (49.0, 61.5)
 # Read CSVs in chunks of files to bound peak memory while keeping the STRtree
 # query vectorised over a useful number of points.
 _CSV_BATCH = 64
 def _month_calendar(csvs: list[Path]) -> tuple[list[int], dict[int, int], int]:
    """Derive annualisation denominators from the monthly directory names.
    Each police.uk file lives under ``{crime_dir}/{YYYY-MM}/...`` and holds that
    month's incidents, so the set of month directories is the set of observed
    months. Returns the sorted distinct years, months-observed-per-year, and the
    total month count (the avg/yr denominator).
    """
    months = sorted(
        {path.parent.name for path in csvs if MONTH_DIR_RE.fullmatch(path.parent.name)}
    )
    if not months:
        raise ValueError("No valid YYYY-MM month directories found among crime CSVs")
    months_in_year: dict[int, int] = {}
    for month in months:
        year = int(month[:4])
        months_in_year[year] = months_in_year.get(year, 0) + 1
    years = sorted(months_in_year)
    return years, months_in_year, len(months)
 def _build_tree(
    polygons: np.ndarray, buffer_m: float
 ) -> tuple[np.ndarray, shapely.STRtree]:
    """Grow each postcode polygon by ``buffer_m`` and build an STRtree over them.

    Position ``i`` in the returned buffer array corresponds to postcode ``i``.
    A buffer that comes back missing or invalid is swapped for an empty polygon
    so positions stay aligned; such an entry can never match a point.
    """
    buffered = shapely.buffer(polygons, buffer_m, quad_segs=8)
    unusable = shapely.is_missing(buffered) | ~shapely.is_valid(buffered)
    if unusable.any():
        print(f"  {int(unusable.sum()):,} postcode buffers unusable; left empty")
        buffered[unusable] = shapely.from_wkt("POLYGON EMPTY")
    index = shapely.STRtree(buffered)
    return buffered, index
 def _accumulate_counts(
    csvs: list[Path],
    tree: shapely.STRtree,
    type_to_idx: dict[str, int],
    year_to_idx: dict[int, int],
    transformer: Transformer,
    counts: np.ndarray,
 ) -> None:
    """Stream the crime CSVs, counting points-in-buffer per (postcode, type, year).

    ``counts`` is updated in place and indexed as
    ``[postcode index, type index, year index]``; nothing is returned. The CSVs
    are processed ``_CSV_BATCH`` files at a time and a progress line is printed
    after each batch that yields usable points.

    Args:
        csvs: Street-crime CSV paths to read.
        tree: Spatial index over the buffered postcode polygons; a geometry's
            position in the tree is used directly as the postcode index.
        type_to_idx: Crime type name -> index on the second axis of ``counts``.
        year_to_idx: Year -> index on the third axis of ``counts``.
        transformer: Converts (lon, lat) into the coordinates the tree was built
            in (the caller passes a WGS84 -> EPSG:27700 transformer).
        counts: Integer accumulator array, mutated in place.
    """
    # Force dtypes for the only four columns that are read.
    schema = {
        "Longitude": pl.Float64,
        "Latitude": pl.Float64,
        "Month": pl.Utf8,
        "Crime type": pl.Utf8,
    }
    known_types = list(type_to_idx)
    total_points = 0
    total_matches = 0
    total_dropped = 0
    for start in range(0, len(csvs), _CSV_BATCH):
        batch = csvs[start : start + _CSV_BATCH]
        frame = (
            pl.scan_csv(
                batch,
                schema_overrides=schema,
                ignore_errors=True,
            )
            .select("Longitude", "Latitude", "Month", "Crime type")
            # The year is the first four characters of the Month column.
            .with_columns(pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"))
            # Keep only rows with coordinates inside the bounding box, a known
            # crime type and a year that has an index.
            .filter(
                pl.col("Longitude").is_not_null()
                & pl.col("Latitude").is_not_null()
                & pl.col("Longitude").is_between(*LON_BOUNDS)
                & pl.col("Latitude").is_between(*LAT_BOUNDS)
                & pl.col("Crime type").is_in(known_types)
                & pl.col("year").is_in(list(year_to_idx))
            )
            # Translate type names and years into their array indices.
            .with_columns(
                pl.col("Crime type")
                .replace_strict(type_to_idx, return_dtype=pl.Int32)
                .alias("tidx"),
                pl.col("year")
                .replace_strict(year_to_idx, return_dtype=pl.Int32)
                .alias("yidx"),
            )
            .select("Longitude", "Latitude", "tidx", "yidx")
            .collect(engine="streaming")
        )
        rows_in = frame.height
        if rows_in == 0:
            continue
        lon = frame["Longitude"].to_numpy()
        lat = frame["Latitude"].to_numpy()
        tidx = frame["tidx"].to_numpy()
        yidx = frame["yidx"].to_numpy()
        x, y = transformer.transform(lon, lat)
        # Points whose transformed coordinates are not finite are discarded and
        # tallied for the final report.
        finite = np.isfinite(x) & np.isfinite(y)
        total_dropped += int((~finite).sum())
        if not finite.any():
            continue
        x, y, tidx, yidx = x[finite], y[finite], tidx[finite], yidx[finite]
        total_points += x.size
        points = shapely.points(x, y)
        # One (point, buffer) pair per intersection, so a point lying in several
        # buffers appears once for each of them.
        point_index, postcode_index = tree.query(points, predicate="intersects")
        if point_index.size:
            # np.add.at accumulates correctly when the same cell is hit repeatedly.
            np.add.at(
                counts,
                (postcode_index, tidx[point_index], yidx[point_index]),
                1,
            )
            total_matches += point_index.size
        print(
            f"  files {start + len(batch):,}/{len(csvs):,}: "
            f"{total_points:,} located points, {total_matches:,} postcode matches"
        )
    if total_dropped:
        print(f"Dropped {total_dropped:,} points outside the BNG transform domain")
 def _rollup_long(
    long: pl.DataFrame, types: tuple[str, ...], rollup_name: str
 ) -> pl.DataFrame:
    """Collapse the rows of ``types`` into one ``rollup_name`` series.

    Counts are summed per (postcode, year), rounded to one decimal place, and
    returned in the long frame's column order with ``rollup_name`` as the
    ``Crime type``.
    """
    members = list(types)
    selected = long.filter(pl.col("Crime type").is_in(members))
    summed = selected.group_by("postcode", "year").agg(
        pl.col("count").sum().round(1).alias("count")
    )
    labelled = summed.with_columns(pl.lit(rollup_name).alias("Crime type"))
    return labelled.select("postcode", "Crime type", "year", "count")
 def _write_avg_yr(
    postcodes: np.ndarray,
    counts: np.ndarray,
    valid_month_count: int,
    output_path: Path,
 ) -> None:
    """Write the per-postcode annualised totals, one ``"{type} (avg/yr)"`` column each.

    Counts are summed over the year axis, scaled to a 12-month rate using the
    number of observed months, rounded to one decimal place and stored as
    float32 in a zstd-compressed parquet file.
    """
    per_type_totals = counts.sum(axis=2)  # (n_postcodes, n_types)
    annualised = np.round(per_type_totals / valid_month_count * 12.0, 1).astype(
        np.float32
    )
    columns: dict[str, np.ndarray] = {"postcode": postcodes}
    columns.update(
        {
            f"{name} (avg/yr)": annualised[:, position]
            for position, name in enumerate(ALL_CRIME_TYPES)
        }
    )
    output_path.parent.mkdir(parents=True, exist_ok=True)
    table = pl.DataFrame(columns)
    table.write_parquet(output_path, compression="zstd")
    print(f"Wrote postcode crime averages: {output_path}")
 def _write_by_year(
    postcodes: np.ndarray,
    counts: np.ndarray,
    years: list[int],
    months_in_year: dict[int, int],
    output_path: Path,
 ) -> None:
    """Write nested ``"{type} (by year)"`` series plus Serious/Minor rollups.

    Only non-zero cells of ``counts`` are written, so a postcode with no
    matches at all does not appear in the output.

    Args:
        postcodes: Postcode labels, aligned with the first axis of ``counts``.
        counts: Raw counts indexed ``[postcode, type, year]``.
        years: Years aligned with the third axis of ``counts``.
        months_in_year: Months observed for each year; used to annualise.
        output_path: Destination parquet file; parent directories are created.

    Raises:
        ValueError: If ``counts`` is zero everywhere.
    """
    # Months observed per year, in the same order as the year axis.
    months = np.array([months_in_year[year] for year in years], dtype=np.float64)
    # Scale each year's count to a 12-month rate, rounded to one decimal place.
    annual = np.round(counts.astype(np.float64) * 12.0 / months[None, None, :], 1)
    # Index triples of every non-zero cell; these become the long-format rows.
    pc_i, ty_i, yr_i = np.nonzero(counts)
    if pc_i.size == 0:
        raise ValueError("No crime points matched any postcode buffer")
    type_names = np.array(ALL_CRIME_TYPES, dtype=object)
    year_values = np.array(years, dtype=np.int32)
    long = pl.DataFrame(
        {
            "postcode": postcodes[pc_i],
            "Crime type": type_names[ty_i],
            "year": year_values[yr_i],
            "count": annual[pc_i, ty_i, yr_i].astype(np.float32),
        }
    )
    # The rollups sum the already-annualised per-type values.
    serious = _rollup_long(long, SERIOUS_CRIME_TYPES, "Serious crime")
    minor = _rollup_long(long, MINOR_CRIME_TYPES, "Minor crime")
    combined = pl.concat([long, serious, minor])
    # Sorted by year before grouping so each collected series comes out in year
    # order; each (postcode, type) group becomes a list of {year, count} structs.
    by_type = (
        combined.sort("year")
        .group_by("postcode", "Crime type")
        .agg(pl.struct("year", "count").alias("series"))
    )
    # One row per postcode, one column per crime type (including the rollups).
    wide = by_type.pivot(on="Crime type", index="postcode", values="series")
    type_cols = [c for c in wide.columns if c != "postcode"]
    wide = wide.rename({col: f"{col} (by year)" for col in type_cols})
    output_path.parent.mkdir(parents=True, exist_ok=True)
    wide.write_parquet(output_path, compression="zstd")
    print(f"Wrote postcode crime by-year series: {output_path}  {wide.shape}")
 def transform_crime_spatial(
    crime_dir: Path,
    boundaries_dir: Path,
    output_path: Path,
    by_year_output_path: Path,
    buffer_m: float = DEFAULT_BUFFER_M,
    max_postcodes: int | None = None,
    max_files: int | None = None,
 ) -> None:
    """Count crime points near every postcode and write both output parquets.

    Finds the street-crime CSVs, derives the month calendar, buffers and
    indexes the postcode polygons, accumulates counts per
    (postcode, type, year), then writes the avg/yr and by-year files. Raises
    ``FileNotFoundError`` when ``crime_dir`` holds no street-crime CSVs.
    ``max_postcodes`` and ``max_files`` truncate the inputs for testing.
    """
    csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
    if not csvs:
        raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")
    if max_files is not None:
        csvs = csvs[:max_files]
    years, months_in_year, valid_month_count = _month_calendar(csvs)
    summary = (
        f"Found {len(csvs):,} street crime CSVs across {valid_month_count} months "
        f"({years[0]}-{years[-1]})"
    )
    if ignored_csv_count:
        summary += f" (ignored {ignored_csv_count} non-street CSVs)"
    print(summary)
    postcodes, polygons = load_postcode_polygons(boundaries_dir, max_postcodes)
    print(f"Buffering {len(postcodes):,} postcode polygons by {buffer_m:g}m...")
    _unused_buffers, tree = _build_tree(polygons, buffer_m)
    type_to_idx = {name: position for position, name in enumerate(ALL_CRIME_TYPES)}
    year_to_idx = {year: position for position, year in enumerate(years)}
    shape = (len(postcodes), len(ALL_CRIME_TYPES), len(years))
    counts = np.zeros(shape, dtype=np.int32)
    # always_xy=True fixes the axis order to (lon, lat) in and (x, y) out.
    transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
    _accumulate_counts(csvs, tree, type_to_idx, year_to_idx, transformer, counts)
    _write_avg_yr(postcodes, counts, valid_month_count, output_path)
    _write_by_year(postcodes, counts, years, months_in_year, by_year_output_path)
 def main() -> None:
    """Command-line entry point: parse the flags and run the spatial crime transform."""
    cli = argparse.ArgumentParser(
        description="Count police.uk crime points within 50m of each postcode boundary"
    )
    cli.add_argument(
        "--input",
        type=Path,
        default=Path("property-data/crime"),
        help="Directory containing police.uk street crime CSVs",
    )
    cli.add_argument(
        "--boundaries",
        type=Path,
        default=Path("property-data/postcode_boundaries/units"),
        help="Directory of per-district postcode boundary GeoJSONs",
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet: postcode + '{type} (avg/yr)' columns",
    )
    cli.add_argument(
        "--output-by-year",
        type=Path,
        required=True,
        help="Output parquet: postcode + nested '{type} (by year)' columns",
    )
    cli.add_argument(
        "--buffer-m",
        type=float,
        default=DEFAULT_BUFFER_M,
        help="Outward buffer (metres) added to each postcode boundary",
    )
    # The two --max-* flags exist only to shrink test runs.
    cli.add_argument(
        "--max-postcodes",
        type=int,
        default=None,
        help="Testing only: process the first N postcodes",
    )
    cli.add_argument(
        "--max-files",
        type=int,
        default=None,
        help="Testing only: process the first N monthly CSV files",
    )
    options = cli.parse_args()
    if options.buffer_m <= 0:
        raise SystemExit("--buffer-m must be greater than zero")
    transform_crime_spatial(
        crime_dir=options.input,
        boundaries_dir=options.boundaries,
        output_path=options.output,
        by_year_output_path=options.output_by_year,
        buffer_m=options.buffer_m,
        max_postcodes=options.max_postcodes,
        max_files=options.max_files,
    )
 if __name__ == "__main__":
    main()
--- a/pipeline/transform/join_epc_pp.py
+++ b/pipeline/transform/join_epc_pp.py
@ -26,6 +26,7 @@ MIN_PRICE = 50_000
 EPC_SOURCE_COLUMNS = [
    "address",
    "postcode",
    "uprn",
    "current_energy_rating",
    "potential_energy_rating",
    "property_type",
@ -57,6 +58,8 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
        raw.select(
            _clean_string("address").alias("epc_address"),
            _clean_string("postcode").str.to_uppercase().alias("epc_postcode"),
            # UPRN keys an exact listing->EPC join downstream (~99% populated).
            _clean_string("uprn").alias("uprn"),
            _clean_string("current_energy_rating")
            .str.to_uppercase()
            .alias("current_energy_rating"),
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -48,7 +48,7 @@ _AREA_COLUMNS = [
    "lon",
    # Runtime provenance for deciding whether missing coordinates are skippable.
    "ctry25cd",
-    # Keyed lookup for postcode-level side tables (e.g. crime time series).
+    # Join key for LSOA-level side tables (e.g. median age).
    "lsoa21",
    # Deprivation
    "Income Score",
@ -81,8 +81,6 @@ _AREA_COLUMNS = [
    "Other crime (avg/yr)",
    "Serious crime (avg/yr)",
    "Minor crime (avg/yr)",
    "Serious crime per 1k residents (avg/yr)",
    "Minor crime per 1k residents (avg/yr)",
    # Amenities
    "Number of restaurants within 2km",
    "Number of grocery shops and supermarkets within 2km",
@ -742,16 +740,13 @@ _PROPERTY_TYPE_VALUES = [
    "Other",
 ]
 _EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"]
-_PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
-_PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
-_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS = 82
-_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS = 96
-_PROPERTY_MATCH_MIN_MARGIN = 4.0
-_DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
-_DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
+# Listings are matched to EPC certificates and Price-Paid properties first by
+# UPRN (exact) and otherwise by fuzzy street-address similarity within the same
+# postcode. A house number in the listing address is the strong disambiguator,
+# so a numbered listing may match on a lower street-similarity score than a
+# number-less one (which must match the street almost exactly to be trusted).
+_LISTING_MATCH_MIN_SCORE_WITH_NUMBERS = 82
+_LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 90
 _DIRECT_EPC_MATCH_MIN_MARGIN = 4.0
 _DIRECT_EPC_NEARBY_RADIUS_M = 500.0
 _DIRECT_EPC_NEAREST_POSTCODES = 40
 _DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = (
    ("_direct_epc_address", pl.Utf8),
    ("_direct_current_energy_rating", pl.Utf8),
@ -764,7 +759,7 @@ _DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = (
    ("_direct_was_council_house", pl.Utf8),
    ("_direct_epc_match_status", pl.Utf8),
    ("_direct_epc_match_score", pl.Float32),
-    ("_direct_epc_match_margin", pl.Float32),
+    ("_direct_epc_match_method", pl.Utf8),
 )
 _DIRECT_EPC_RAW_COLUMN_MAP = {
    "epc_address": "_direct_epc_address",
@ -840,46 +835,6 @@ def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
    )
 def _ratio_bonus(
    left: float | int | None, right: float | int | None, pct: float, cap: float
 ) -> float:
    if left is None or right is None:
        return 0.0
    try:
        left_f = float(left)
        right_f = float(right)
    except (TypeError, ValueError):
        return 0.0
    if left_f <= 0 or right_f <= 0:
        return 0.0
    rel = abs(left_f - right_f) / max(left_f, right_f)
    if rel > pct:
        return 0.0
    return cap * (1.0 - rel / pct)
 def _rooms_bonus(left: int | None, right: int | None) -> float:
    if left is None or right is None:
        return 0.0
    try:
        diff = abs(int(left) - int(right))
    except (TypeError, ValueError):
        return 0.0
    if diff == 0:
        return 4.0
    if diff == 1:
        return 2.0
    return 0.0
 def _enum_bonus(
    left: str | None, right: str | None, *, exact: float, mismatch: float
 ) -> float:
    if not left or not right:
        return 0.0
    return exact if left == right else mismatch
 def _address_score(query: str, candidate: str | None) -> int:
    if not candidate:
        return 0
@ -893,6 +848,85 @@ def _has_number(address: str | None) -> bool:
    return bool(address and _NUMBER_RE.search(address))
 def _normalize_uprn(value: object) -> str | None:
    """Canonical UPRN string (digits only) or None.
    UPRNs arrive as strings or ints from the scraper / EPC register; normalise
    so a listing UPRN and an EPC/property UPRN compare equal regardless of dtype
    or stray whitespace. A float (e.g. a NaN-bearing column read as Float) is
    rejected unless it is an exact integer, so "123.0"/"1.5e11" can never be
    silently mangled into a bogus all-digits key.
    """
    if value is None:
        return None
    if isinstance(value, float):
        if not value.is_integer():
            return None
        value = int(value)
    digits = re.sub(r"\D", "", str(value))
    return digits or None
 def _best_listing_match(
    listing_uprn: str | None,
    query: str | None,
    uprn_index: dict[str, dict],
    bucket_candidates: list[dict],
    addressed_fields: list[str],
 ) -> tuple[dict, float, str, str | None] | None:
    """Pick the best candidate for a listing.
    Matching is, in order: (1) an exact UPRN equality against the global
    ``uprn_index`` (postcode-independent, so it is robust even when the
    listing's postcode is slightly off); (2) failing that, the highest
    fuzzy street-address similarity within the listing's own postcode bucket.
    No property-attribute heuristics are used — a house number in the listing
    address gates the fuzzy match (`_numbers_compatible`) and lowers the score
    threshold; a number-less address must match the street almost exactly.
    ``addressed_fields`` names the candidate columns to fuzzy-match against (a
    candidate may carry both a register and an EPC address). Returns
    ``(candidate, score, method, matched_field)`` or None. ``method`` is
    "uprn" or "address"; ``matched_field`` is the winning address column (or
    None for a UPRN match).
    """
    if listing_uprn:
        hit = uprn_index.get(listing_uprn)
        if hit is not None:
            return hit, 100.0, "uprn", None
    if not query:
        return None
    listing_has_numbers = _has_number(query)
    best: dict | None = None
    best_score = 0
    best_field: str | None = None
    for candidate in bucket_candidates:
        for field in addressed_fields:
            address = candidate.get(field)
            if not address:
                continue
            if listing_has_numbers and not _numbers_compatible(query, address):
                continue
            score = _address_score(query, address)
            if score > best_score:
                best_score = score
                best = candidate
                best_field = field
    if best is None:
        return None
    threshold = (
        _LISTING_MATCH_MIN_SCORE_WITH_NUMBERS
        if listing_has_numbers
        else _LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS
    )
    if best_score < threshold:
        return None
    return best, float(best_score), "address", best_field
 def _load_listings_for_merge(
    listings_path: Path, arcgis_path: Path
 ) -> pl.DataFrame:
@ -908,6 +942,20 @@ def _load_listings_for_merge(
    raw = pl.scan_parquet(listings_path).with_row_index("_listing_idx")
    postcode_mapping = build_postcode_mapping(arcgis_path).lazy()
    # UPRN is only present on scraped listings that carry it (Zoopla detail
    # pages); tolerate its absence so older parquets and test fixtures still
    # load. Digits-only so it compares equal to the EPC register's UPRN.
    if "UPRN" in raw.collect_schema().names():
        uprn_digits = pl.col("UPRN").cast(pl.Utf8).str.replace_all(r"\D", "")
        listing_uprn_expr = (
            pl.when(uprn_digits.str.len_chars() > 0)
            .then(uprn_digits)
            .otherwise(None)
            .alias("_listing_uprn")
        )
    else:
        listing_uprn_expr = pl.lit(None, dtype=pl.Utf8).alias("_listing_uprn")
    # Listings parquets occasionally carry Float NaNs (e.g. floor area). Polars
    # treats NaN as distinct from null and the downstream `latest_price /
    # total_floor_area` cast to Int32 explodes on a NaN, so we normalise floats
@ -936,12 +984,14 @@ def _load_listings_for_merge(
                "postcode"
            ),
            pl.col("Address per Property Register").alias("pp_address"),
            listing_uprn_expr,
            *overlay,
        )
        .select(
            "_listing_idx",
            "postcode",
            "pp_address",
            "_listing_uprn",
            *[dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES],
        )
        .collect(engine="streaming")
@ -972,7 +1022,6 @@ def _empty_direct_epc_matches() -> pl.DataFrame:
 def _load_direct_epc_candidates(
    epc_path: Path,
    arcgis_path: Path,
    listing_outcodes: list[str],
    temp_dir: Path,
 ) -> pl.DataFrame:
@ -982,8 +1031,7 @@ def _load_direct_epc_candidates(
        "_direct_epc_match_postcode": pl.Utf8,
        "_direct_epc_outcode": pl.Utf8,
        "_direct_epc_canonical_property_type": pl.Utf8,
-        "_direct_epc_east": pl.Float64,
+        "_direct_epc_uprn": pl.Utf8,
        "_direct_epc_north": pl.Float64,
        **{column: dtype for column, dtype in _DIRECT_EPC_COLUMNS if column.startswith("_direct_")},
    }
    if not listing_outcodes:
@ -1016,12 +1064,6 @@ def _load_direct_epc_candidates(
        .with_columns(pl.lit("Yes").alias("_direct_was_council_house"))
    )
    arcgis = pl.scan_parquet(arcgis_path).select(
        normalize_postcode_key(pl.col("pcds")).alias("_direct_epc_match_postcode"),
        pl.col("east1m").alias("_direct_epc_east"),
        pl.col("north1m").alias("_direct_epc_north"),
    )
    return (
        epc_base.sort("inspection_date", descending=True)
        .group_by("_direct_epc_match_address", "_direct_epc_match_postcode")
@ -1031,7 +1073,6 @@ def _load_direct_epc_candidates(
            on=["_direct_epc_match_address", "_direct_epc_match_postcode"],
            how="left",
        )
        .join(arcgis, on="_direct_epc_match_postcode", how="left")
        .with_columns(
            _canonical_epc_property_type_expr().alias(
                "_direct_epc_canonical_property_type"
@ -1046,6 +1087,7 @@ def _load_direct_epc_candidates(
            .otherwise(None)
            .alias("_direct_potential_energy_rating"),
            pl.col("epc_address").alias("_direct_epc_address"),
            pl.col("uprn").alias("_direct_epc_uprn"),
            pl.col("total_floor_area").alias("_direct_total_floor_area"),
            pl.col("number_habitable_rooms").alias(
                "_direct_number_habitable_rooms"
@ -1066,8 +1108,7 @@ def _load_direct_epc_candidates(
            "_direct_epc_match_postcode",
            "_direct_epc_outcode",
            "_direct_epc_canonical_property_type",
-            "_direct_epc_east",
+            "_direct_epc_uprn",
            "_direct_epc_north",
            "_direct_epc_address",
            "_direct_current_energy_rating",
            "_direct_potential_energy_rating",
@ -1083,7 +1124,14 @@ def _load_direct_epc_candidates(
 def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
-    match = listings.with_columns(
+    """Add the normalised address/postcode/outcode keys used to match listings.
    Listings are matched to EPC certificates and properties by UPRN and by
    fuzzy street address within their (now accurate, detail-page-sourced)
    postcode — never by coordinate proximity — so no projected easting/northing
    is computed here. `_listing_uprn` flows through from the loaded listings.
    """
    return listings.with_columns(
        normalize_address_key(pl.col("pp_address")).alias("_listing_match_address"),
        normalize_postcode_key(pl.col("postcode")).alias("_listing_match_postcode"),
    ).with_columns(
@ -1092,21 +1140,6 @@ def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
        .alias("_listing_outcode")
    )
    if match.is_empty():
        return match.with_columns(
            pl.Series("_listing_east", [], dtype=pl.Float64),
            pl.Series("_listing_north", [], dtype=pl.Float64),
        )
    transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
    east, north = transformer.transform(
        match["_actual_lon"].to_numpy(), match["_actual_lat"].to_numpy()
    )
    return match.with_columns(
        pl.Series("_listing_east", east, dtype=pl.Float64),
        pl.Series("_listing_north", north, dtype=pl.Float64),
    )
 def _optional_lazy_col(
    schema: pl.Schema, column: str, dtype: pl.DataType
@ -1122,8 +1155,7 @@ def _listing_property_match_schema() -> dict[str, pl.DataType]:
        "_matched_postcode": pl.Utf8,
        "_matched_pp_address": pl.Utf8,
        "_property_match_score": pl.Float32,
-        "_property_match_address_score": pl.Int32,
+        "_property_match_method": pl.Utf8,
        "_property_match_margin": pl.Float32,
        "_property_match_field": pl.Utf8,
    }
@ -1139,11 +1171,8 @@ def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame:
            pl.col("postcode").cast(pl.Utf8).alias("postcode"),
            pl.col("pp_address").cast(pl.Utf8).alias("pp_address"),
            _optional_lazy_col(schema, "epc_address", pl.Utf8),
-            _optional_lazy_col(schema, "pp_property_type", pl.Utf8),
+            # UPRN keys the exact match; present once epc_pp is rebuilt with it.
-            _optional_lazy_col(schema, "duration", pl.Utf8),
+            _optional_lazy_col(schema, "uprn", pl.Utf8),
            _optional_lazy_col(schema, "total_floor_area", pl.Float64),
            _optional_lazy_col(schema, "number_habitable_rooms", pl.Int16),
            _optional_lazy_col(schema, "latest_price", pl.Int64),
        )
        .with_row_index("_property_row")
        .with_columns(
@ -1167,110 +1196,52 @@ def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame:
    )
-def _property_candidates_by_postcode(
+def _index_candidates(
-    candidates: pl.DataFrame,
+    candidates: pl.DataFrame, postcode_key: str, uprn_key: str
-) -> dict[str, list[dict]]:
+) -> tuple[dict[str, list[dict]], dict[str, dict]]:
    """Index candidate rows for matching, in a single pass over the frame.
    Returns ``(postcode_buckets, uprn_index)``. The postcode buckets drive the
    fuzzy street-address match; the UPRN index drives the exact match and is
    postcode-independent, so it still resolves when a listing's postcode is
    slightly off.
    """
    buckets: dict[str, list[dict]] = {}
    uprn_index: dict[str, dict] = {}
    for row in candidates.iter_rows(named=True):
-        postcode = row.get("_property_match_postcode")
+        postcode = row.get(postcode_key)
        if postcode:
            buckets.setdefault(postcode, []).append(row)
-    return buckets
+        uprn = _normalize_uprn(row.get(uprn_key))
        if uprn and uprn not in uprn_index:
            uprn_index[uprn] = row
    return buckets, uprn_index
 def _best_listing_property_candidate(
-    listing: dict, candidates: list[dict]
+    listing: dict, uprn_index: dict[str, dict], candidates: list[dict]
 ) -> dict | None:
-    query = listing.get("_listing_match_address")
+    result = _best_listing_match(
-    if not query:
+        listing.get("_listing_uprn"),
-        return None
+        listing.get("_listing_match_address"),
-
+        uprn_index,
-    listing_has_numbers = _has_number(query)
+        candidates,
-    scored: list[tuple[float, int, dict, str]] = []
+        ["_property_match_address", "_property_epc_match_address"],
    for candidate in candidates:
        register_address = candidate.get("_property_match_address")
        epc_address = candidate.get("_property_epc_match_address")
        register_numbers_compatible = bool(
            register_address and _numbers_compatible(query, register_address)
        )
        epc_numbers_compatible = bool(
            epc_address and _numbers_compatible(query, epc_address)
        )
        if not (register_numbers_compatible or epc_numbers_compatible):
            continue
        register_score = _address_score(query, register_address)
        epc_score = _address_score(query, epc_address)
        base_score = max(register_score, epc_score)
        if base_score == 0:
            continue
        score = float(base_score)
        score += _enum_bonus(
            listing.get("_actual_property_type"),
            candidate.get("pp_property_type"),
            exact=7.0,
            mismatch=-8.0,
        )
        score += _enum_bonus(
            listing.get("_actual_leasehold_freehold"),
            candidate.get("duration"),
            exact=3.0,
            mismatch=-3.0,
        )
        score += _ratio_bonus(
            listing.get("_actual_total_floor_area"),
            candidate.get("total_floor_area"),
            pct=0.15,
            cap=8.0,
        )
        score += _rooms_bonus(
            listing.get("_actual_number_habitable_rooms"),
            candidate.get("number_habitable_rooms"),
        )
        score += _ratio_bonus(
            listing.get("_actual_asking_price"),
            candidate.get("latest_price"),
            pct=0.25,
            cap=3.0,
        )
        matched_field = (
            "pp_address" if register_score >= epc_score else "epc_address"
        )
        scored.append((score, base_score, candidate, matched_field))
    if not scored:
        return None
    scored.sort(key=lambda item: item[0], reverse=True)
    top = scored[0]
    runner_up = scored[1][0] if len(scored) > 1 else None
    margin = top[0] - runner_up if runner_up is not None else top[0]
    score_threshold = (
        _PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS
        if listing_has_numbers
        else _PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS
    )
-    address_threshold = (
+    if result is None:
        _PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS
        if listing_has_numbers
        else _PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS
    )
    if (
        top[0] < score_threshold
        or top[1] < address_threshold
        or margin < _PROPERTY_MATCH_MIN_MARGIN
    ):
        return None
-
+    candidate, score, method, field = result
-    candidate = top[2]
+    matched_field = {
        "_property_match_address": "pp_address",
        "_property_epc_match_address": "epc_address",
    }.get(field, method)
    return {
        "_listing_idx": listing["_listing_idx"],
        "_matched_postcode": candidate.get("postcode"),
        "_matched_pp_address": candidate.get("pp_address"),
-        "_property_match_score": round(top[0], 1),
+        "_property_match_score": round(score, 1),
-        "_property_match_address_score": top[1],
+        "_property_match_method": method,
-        "_property_match_margin": round(margin, 1),
+        "_property_match_field": matched_field,
        "_property_match_field": top[3],
    }
@ -1280,23 +1251,32 @@ def _match_listing_properties(
    if listing_matches.is_empty() or property_candidates.is_empty():
        return _empty_listing_property_matches()
-    buckets = _property_candidates_by_postcode(property_candidates)
+    buckets, uprn_index = _index_candidates(
        property_candidates, "_property_match_postcode", "uprn"
    )
    best_matches = []
    for listing in listing_matches.iter_rows(named=True):
        postcode = listing.get("_listing_match_postcode")
-        if not postcode:
+        bucket = buckets.get(postcode, []) if postcode else []
-            continue
+        match = _best_listing_property_candidate(listing, uprn_index, bucket)
        match = _best_listing_property_candidate(listing, buckets.get(postcode, []))
        if match is not None:
            best_matches.append(match)
    if not best_matches:
        return _empty_listing_property_matches()
    # When two listings claim the same property, keep the most authoritative
    # match: an exact UPRN match always wins over a fuzzy address match (both can
    # score 100, so method must break the tie before score and listing index).
    matches = pl.DataFrame(best_matches, schema=_listing_property_match_schema())
    return (
        matches.sort(
-            ["_property_match_score", "_listing_idx"], descending=[True, False]
+            [
                pl.col("_property_match_method") == "uprn",
                "_property_match_score",
                "_listing_idx",
            ],
            descending=[True, True, False],
        )
        .unique(
            ["_matched_postcode", "_matched_pp_address"],
@ -1307,133 +1287,19 @@ def _match_listing_properties(
    )
-def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, list[dict]]:
+def _best_direct_epc_candidate(
-    buckets: dict[str, list[dict]] = {}
+    listing: dict, uprn_index: dict[str, dict], candidates: list[dict]
-    for row in candidates.iter_rows(named=True):
+) -> dict | None:
-        postcode = row.get("_direct_epc_match_postcode")
+    result = _best_listing_match(
-        if postcode:
+        listing.get("_listing_uprn"),
-            buckets.setdefault(postcode, []).append(row)
+        listing.get("_listing_match_address"),
-    return buckets
+        uprn_index,
-
+        candidates,
-
+        ["_direct_epc_match_address"],
 def _epc_postcode_tree(
    candidates: pl.DataFrame,
 ) -> tuple[cKDTree | None, list[str]]:
    postcode_points = (
        candidates.select(
            "_direct_epc_match_postcode",
            "_direct_epc_east",
            "_direct_epc_north",
        )
        .drop_nulls()
        .filter(
            pl.col("_direct_epc_east").is_finite()
            & pl.col("_direct_epc_north").is_finite()
        )
        .unique("_direct_epc_match_postcode")
    )
-    if postcode_points.is_empty():
+    if result is None:
        return None, []
    coords = np.column_stack(
        [
            postcode_points["_direct_epc_east"].to_numpy(),
            postcode_points["_direct_epc_north"].to_numpy(),
        ]
    )
    return cKDTree(coords), postcode_points["_direct_epc_match_postcode"].to_list()
 def _candidate_postcodes_for_listing(
    listing: dict,
    postcode_tree: cKDTree | None,
    postcode_values: list[str],
 ) -> list[str]:
    postcodes: list[str] = []
    exact = listing.get("_listing_match_postcode")
    if exact:
        postcodes.append(exact)
    if postcode_tree is None:
        return postcodes
    east = listing.get("_listing_east")
    north = listing.get("_listing_north")
    try:
        east_f = float(east)
        north_f = float(north)
    except (TypeError, ValueError):
        return postcodes
    if not np.isfinite(east_f) or not np.isfinite(north_f):
        return postcodes
    k = min(_DIRECT_EPC_NEAREST_POSTCODES, len(postcode_values))
    distances, indices = postcode_tree.query(
        [east_f, north_f],
        k=k,
        distance_upper_bound=_DIRECT_EPC_NEARBY_RADIUS_M,
    )
    distances = np.atleast_1d(distances)
    indices = np.atleast_1d(indices)
    seen = set(postcodes)
    for distance, idx in zip(distances, indices, strict=False):
        if not np.isfinite(distance) or idx >= len(postcode_values):
            continue
        postcode = postcode_values[int(idx)]
        if postcode not in seen:
            postcodes.append(postcode)
            seen.add(postcode)
    return postcodes
 def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None:
    query = listing.get("_listing_match_address")
    if not query:
        return None
-
+    candidate, score, method, _field = result
    listing_has_numbers = _has_number(query)
    scored: list[tuple[float, int, dict]] = []
    for candidate in candidates:
        address = candidate.get("_direct_epc_match_address")
        if listing_has_numbers and not _numbers_compatible(query, address or ""):
            continue
        base_score = _address_score(query, address)
        if base_score == 0:
            continue
        score = float(base_score)
        score += _enum_bonus(
            listing.get("_actual_property_type"),
            candidate.get("_direct_epc_canonical_property_type"),
            exact=6.0,
            mismatch=-6.0,
        )
        score += _ratio_bonus(
            listing.get("_actual_total_floor_area"),
            candidate.get("_direct_total_floor_area"),
            pct=0.12,
            cap=8.0,
        )
        score += _rooms_bonus(
            listing.get("_actual_number_habitable_rooms"),
            candidate.get("_direct_number_habitable_rooms"),
        )
        scored.append((score, base_score, candidate))
    if not scored:
        return None
    scored.sort(key=lambda item: item[0], reverse=True)
    top = scored[0]
    runner_up = scored[1][0] if len(scored) > 1 else None
    margin = top[0] - runner_up if runner_up is not None else top[0]
    threshold = (
        _DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS
        if listing_has_numbers
        else _DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS
    )
    if top[0] < threshold or margin < _DIRECT_EPC_MATCH_MIN_MARGIN:
        return None
    candidate = top[2]
    return {
        "_listing_idx": listing["_listing_idx"],
        "_direct_epc_address": candidate.get("_direct_epc_address"),
@ -1452,8 +1318,8 @@ def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict |
        ),
        "_direct_was_council_house": candidate.get("_direct_was_council_house"),
        "_direct_epc_match_status": "matched",
-        "_direct_epc_match_score": round(top[0], 1),
+        "_direct_epc_match_score": round(score, 1),
-        "_direct_epc_match_margin": round(margin, 1),
+        "_direct_epc_match_method": method,
    }
@ -1463,25 +1329,14 @@ def _match_direct_epc(
    if listing_matches.is_empty() or epc_candidates.is_empty():
        return _empty_direct_epc_matches()
-    buckets = _epc_candidates_by_postcode(epc_candidates)
+    buckets, uprn_index = _index_candidates(
-    postcode_tree, postcode_values = _epc_postcode_tree(epc_candidates)
+        epc_candidates, "_direct_epc_match_postcode", "_direct_epc_uprn"
-
+    )
    matches = []
    for listing in listing_matches.iter_rows(named=True):
-        candidate_postcodes = _candidate_postcodes_for_listing(
+        postcode = listing.get("_listing_match_postcode")
-            listing, postcode_tree, postcode_values
+        bucket = buckets.get(postcode, []) if postcode else []
-        )
+        match = _best_direct_epc_candidate(listing, uprn_index, bucket)
        candidate_rows: list[dict] = []
        seen_rows: set[int] = set()
        for postcode in candidate_postcodes:
            for candidate in buckets.get(postcode, []):
                row = candidate.get("_direct_epc_row")
                if row in seen_rows:
                    continue
                candidate_rows.append(candidate)
                if row is not None:
                    seen_rows.add(row)
        match = _best_direct_epc_candidate(listing, candidate_rows)
        if match is not None:
            matches.append(match)
@ -1493,7 +1348,6 @@ def _match_direct_epc(
 def _enrich_listings_with_direct_epc(
    listings: pl.DataFrame,
    epc_path: Path | None,
    arcgis_path: Path,
 ) -> pl.DataFrame:
    if epc_path is None:
        return _ensure_direct_epc_columns(listings)
@ -1513,7 +1367,7 @@ def _enrich_listings_with_direct_epc(
        prefix="direct_listing_epc_", dir=local_tmp_dir()
    ) as tmpdir:
        epc_candidates = _load_direct_epc_candidates(
-            epc_path, arcgis_path, listing_outcodes, Path(tmpdir)
+            epc_path, listing_outcodes, Path(tmpdir)
        )
        print(f"Direct listing EPC candidates: {epc_candidates.height}")
        direct_matches = _match_direct_epc(listing_matches, epc_candidates)
@ -1604,7 +1458,7 @@ def _integrate_listings(
    """
    listings = _load_listings_for_merge(listings_path, arcgis_path)
    print(f"Listings loaded: {listings.height}")
-    listings = _enrich_listings_with_direct_epc(listings, epc_path, arcgis_path)
+    listings = _enrich_listings_with_direct_epc(listings, epc_path)
    overlay_columns = [dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES]
    listing_attachment_columns = [
@ -1660,6 +1514,14 @@ def _finalize_listings(df: pl.DataFrame) -> pl.DataFrame:
    """Project the post-rename wide frame down to enriched-listing rows."""
    df = df.filter(pl.col(_LISTING_FLAG_COLUMN).is_not_null())
    # A matched listing's overlay attaches to every wide row sharing its
    # (postcode, pp_address). The terminated-postcode remap can collapse several
    # distinct wide rows onto one such key, which would otherwise emit one duplicate
    # listing per collapsed row. Each listing matches exactly one (postcode,
    # pp_address) and each seed row carries a unique URL, so keeping a single row per
    # listing URL collapses only that fan-out and never merges distinct listings.
    df = df.unique(subset=[_LISTING_FLAG_COLUMN], keep="first", maintain_order=True)
    df = df.with_columns(
        pl.col("_actual_listing_url").alias("Listing URL"),
        pl.col("_actual_listing_date").alias("Listing date"),
@ -1750,7 +1612,6 @@ def _build(
    broadband_path: Path,
    conservation_areas_path: Path,
    rental_prices_path: Path,
    lsoa_population_path: Path,
    median_age_path: Path,
    election_results_path: Path,
    tree_density_postcodes_path: Path | None = None,
@ -1881,8 +1742,10 @@ def _build(
        how="left",
    )
    # Crime is counted spatially per postcode (incidents within 50m of the
    # postcode boundary), so it joins on postcode rather than LSOA.
    crime = pl.scan_parquet(crime_path)
-    wide = wide.join(crime, left_on="lsoa21", right_on="LSOA code", how="left")
+    wide = wide.join(crime, on="postcode", how="left")
    wide = wide.with_columns(
        pl.sum_horizontal(
@ -1905,17 +1768,6 @@ def _build(
        ).alias("minor_crime_avg_yr"),
    )
    lsoa_pop = pl.scan_parquet(lsoa_population_path)
    wide = wide.join(lsoa_pop, on="lsoa21", how="left")
    wide = wide.with_columns(
        pl.when(pl.col("population") > 0)
        .then((pl.col("serious_crime_avg_yr") / pl.col("population") * 1000).round(1))
        .alias("serious_crime_per_1k"),
        pl.when(pl.col("population") > 0)
        .then((pl.col("minor_crime_avg_yr") / pl.col("population") * 1000).round(1))
        .alias("minor_crime_per_1k"),
    ).drop("population")
    median_age = pl.scan_parquet(median_age_path)
    wide = wide.join(median_age, on="lsoa21", how="left")
@ -2082,8 +1934,6 @@ def _build(
                "max_download_speed": "Max available download speed (Mbps)",
                "serious_crime_avg_yr": "Serious crime (avg/yr)",
                "minor_crime_avg_yr": "Minor crime (avg/yr)",
                "serious_crime_per_1k": "Serious crime per 1k residents (avg/yr)",
                "minor_crime_per_1k": "Minor crime per 1k residents (avg/yr)",
                "mean_monthly_rent": "Estimated monthly rent",
                "floor_height": "Interior height (m)",
                "was_council_house": "Former council house",
@ -2189,12 +2039,6 @@ def main():
        required=True,
        help="ONS rental prices by LA and bedroom count parquet file",
    )
    parser.add_argument(
        "--lsoa-population",
        type=Path,
        required=True,
        help="Census 2021 population by LSOA parquet file",
    )
    parser.add_argument(
        "--median-age",
        type=Path,
@ -2279,7 +2123,6 @@ def main():
        broadband_path=args.broadband,
        conservation_areas_path=args.conservation_areas,
        rental_prices_path=args.rental_prices,
        lsoa_population_path=args.lsoa_population,
        median_age_path=args.median_age,
        election_results_path=args.election_results,
        tree_density_postcodes_path=args.tree_density_postcodes,
--- a/pipeline/transform/noise_overlay_tiles.py
+++ b/pipeline/transform/noise_overlay_tiles.py
@ -376,7 +376,7 @@ def main() -> None:
        "--pmtiles-bin", type=Path, default=Path("property-data/pmtiles")
    )
    parser.add_argument("--pmtiles-version", default="1.22.3")
-    parser.add_argument("--min-zoom", type=int, default=13)
+    parser.add_argument("--min-zoom", type=int, default=12)
    parser.add_argument("--max-zoom", type=int, default=14)
    parser.add_argument("--tile-size", type=int, default=256)
    args = parser.parse_args()
--- a/pipeline/transform/postcode_boundaries/main.py
+++ b/pipeline/transform/postcode_boundaries/main.py
@ -22,6 +22,12 @@ def main() -> None:
        description="Generate postcode boundary polygons from OA + INSPIRE + UPRN data"
    )
    parser.add_argument("--uprn", type=Path, required=True, help="UPRN lookup parquet")
    parser.add_argument(
        "--arcgis",
        type=Path,
        default=None,
        help="Optional ArcGIS postcode parquet used to remap terminated postcodes",
    )
    parser.add_argument(
        "--oa-boundaries", type=Path, required=True, help="OA boundaries GeoPackage"
    )
@ -46,7 +52,7 @@ def main() -> None:
    print("=" * 60)
    oa_geoms = load_oa_boundaries(args.oa_boundaries)
-    uprn_df, uprn_offsets = load_uprns(args.uprn)
+    uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
    # Phase 2: Parse/load INSPIRE
    print()
--- a/pipeline/transform/postcode_boundaries/loader.py
+++ b/pipeline/transform/postcode_boundaries/loader.py
@ -0,0 +1,105 @@
 """Load per-district postcode boundary GeoJSONs as EPSG:27700 polygons.
 The postcode-boundary pipeline (:mod:`output`) writes one WGS84 GeoJSON per
 postcode district under ``units/{district}.geojson``, each feature carrying a
 ``postcodes`` (full unit string, e.g. "AL1 1AG") property. Spatial transforms
 that test points against postcode geometry want those polygons back in British
 National Grid (EPSG:27700) so buffers/distances are in metres.
 :func:`load_postcode_polygons` reads the files, reprojects WGS84→27700, repairs
 invalid rings, and returns parallel ``(postcodes, polygons)`` arrays sorted by
 postcode so callers can use the array index as a stable postcode id -- the same
 "buffer index == postcode index" convention used by ``tree_density``.
 """
 from __future__ import annotations
 import json
 from pathlib import Path
 import numpy as np
 import shapely
 from pyproj import Transformer
 def _read_district(
    path: Path, transformer: Transformer
 ) -> tuple[np.ndarray, np.ndarray]:
    """Return ``(postcodes, polygons_27700)`` for one district GeoJSON.

    ``path`` is a FeatureCollection whose features each carry a ``postcodes``
    property. ``transformer`` is applied to every vertex as
    ``transform(x_array, y_array)``; the loader in this module builds it as
    EPSG:4326 -> EPSG:27700 with ``always_xy=True``.

    Both returned values are 1-D object arrays of equal length, and both are
    empty when the collection has no features. A feature without a
    ``properties.postcodes`` entry raises ``KeyError``.
    """
    # GeoJSON is UTF-8 by specification (RFC 7946); decode it explicitly
    # instead of depending on the platform's locale encoding.
    with path.open(encoding="utf-8") as file:
        collection = json.load(file)
    features = collection.get("features", [])
    if not features:
        return np.empty(0, dtype=object), np.empty(0, dtype=object)
    postcodes = np.array(
        [feature["properties"]["postcodes"] for feature in features], dtype=object
    )
    # shapely parses GeoJSON from strings, so each geometry is re-serialised.
    geom_json = np.array(
        [json.dumps(feature["geometry"]) for feature in features], dtype=object
    )
    geoms = shapely.from_geojson(geom_json)
    # Reproject every vertex in a single pyproj call, then rebuild the polygons.
    coords = shapely.get_coordinates(geoms)
    if coords.size:
        x, y = transformer.transform(coords[:, 0], coords[:, 1])
        geoms = shapely.set_coordinates(geoms, np.column_stack([x, y]))
    # Only the geometries flagged invalid after reprojection are repaired.
    invalid = ~shapely.is_valid(geoms)
    if invalid.any():
        geoms[invalid] = shapely.make_valid(geoms[invalid])
    return postcodes, geoms
 def load_postcode_polygons(
    units_dir: Path, max_postcodes: int | None = None
 ) -> tuple[np.ndarray, np.ndarray]:
    """Read every district GeoJSON in ``units_dir`` as EPSG:27700 polygons.

    The result is a pair of parallel object arrays ``(postcodes, polygons)``
    ordered by postcode, holding one entry per distinct postcode.

    ``max_postcodes`` is a testing aid: district files are read in filename
    order only until at least that many features have been collected, and the
    sorted result is then cut down to its first ``max_postcodes`` entries.

    Raises ``FileNotFoundError`` when the directory has no ``*.geojson`` files
    and ``ValueError`` when none of them contains a feature.
    """
    units_dir = Path(units_dir)
    district_files = sorted(units_dir.glob("*.geojson"))
    if len(district_files) == 0:
        raise FileNotFoundError(f"No postcode-boundary GeoJSONs found in {units_dir}")
    transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
    capped = max_postcodes is not None
    loaded: list[tuple[np.ndarray, np.ndarray]] = []
    seen = 0
    for district_file in district_files:
        district_postcodes, district_geoms = _read_district(district_file, transformer)
        if len(district_postcodes) == 0:
            continue
        loaded.append((district_postcodes, district_geoms))
        seen += len(district_postcodes)
        if capped and seen >= max_postcodes:
            break
    if not loaded:
        raise ValueError(f"No postcode features found in {units_dir}")
    postcodes = np.concatenate([pair[0] for pair in loaded])
    geoms = np.concatenate([pair[1] for pair in loaded])
    # A stable sort keeps "index == postcode id" deterministic; duplicates are
    # then collapsed to their first occurrence (a postcode is expected to live
    # in exactly one district file, so this is purely defensive).
    sorted_idx = np.argsort(postcodes, kind="stable")
    postcodes, geoms = postcodes[sorted_idx], geoms[sorted_idx]
    first_seen = np.unique(postcodes, return_index=True)[1]
    postcodes, geoms = postcodes[first_seen], geoms[first_seen]
    if capped and len(postcodes) > max_postcodes:
        postcodes, geoms = postcodes[:max_postcodes], geoms[:max_postcodes]
    print(f"Loaded {len(postcodes):,} postcode polygons from {units_dir}")
    return postcodes, geoms
--- a/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
+++ b/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
@ -121,6 +121,50 @@ class TestWhitespacePostcodes:
        loaded_df, _ = load_uprns(path)
        assert len(loaded_df) == 0
    def test_non_english_oas_excluded(self, tmp_path):
        """Only rows whose OA code is English survive ``load_uprns``."""
        columns = {
            "GRIDGB1E": [500010, 300010],
            "GRIDGB1N": [180010, 220010],
            "PCDS": ["AA1 1AA", "CF1 1AA"],
            "OA21CD": ["E00000001", "W00000001"],
        }
        uprn_file = tmp_path / "uprn.parquet"
        pl.DataFrame(columns).write_parquet(uprn_file)
        result = load_uprns(uprn_file)
        loaded_df, offsets = result
        # The Welsh (W-prefixed) output area must be gone from both outputs.
        assert set(offsets) == {"E00000001"}
        assert loaded_df["PCDS"].to_list() == ["AA1 1AA"]
    def test_terminated_postcodes_are_remapped(self, tmp_path):
        """A UPRN on a terminated postcode is reassigned via the ArcGIS table."""
        uprn_path = tmp_path / "uprn.parquet"
        arcgis_path = tmp_path / "arcgis.parquet"
        # Lower-case on purpose: the postcode has to be canonicalised before
        # it can be looked up in the ArcGIS data.
        uprn_columns = {
            "GRIDGB1E": [500010],
            "GRIDGB1N": [180010],
            "PCDS": ["aa1 1aa"],
            "OA21CD": ["E00000001"],
        }
        pl.DataFrame(uprn_columns).write_parquet(uprn_path)
        # AA1 1AA carries a termination date; AA1 1AB is live.
        arcgis_columns = {
            "pcds": ["AA1 1AA", "AA1 1AB"],
            "east1m": [500010, 500030],
            "north1m": [180010, 180020],
            "doterm": ["2020-01-01", None],
            "ctry25cd": ["E92000001", "E92000001"],
        }
        pl.DataFrame(arcgis_columns).write_parquet(arcgis_path)
        loaded_df, _offsets = load_uprns(uprn_path, arcgis_path)
        remapped = loaded_df["PCDS"].to_list()
        assert remapped == ["AA1 1AB"]
 # ---------------------------------------------------------------------------
 # Bug 3: Voronoi deduplication is first-seen-wins
--- a/pipeline/transform/postcode_boundaries/uprn.py
+++ b/pipeline/transform/postcode_boundaries/uprn.py
@ -4,11 +4,18 @@ import numpy as np
 import polars as pl
 from pipeline.local_temp import local_tmp_dir
 from pipeline.utils.postcode_mapping import build_postcode_mapping
 from .memory import release_memory
-def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
+def _canonical_postcode_expr(name: str) -> pl.Expr:
    return pl.col(name).str.strip_chars().str.to_uppercase()
 def load_uprns(
    uprn_path: Path, arcgis_path: Path | None = None
 ) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
    """Load UPRNs as a sorted polars DataFrame with OA offset lookup.
    Returns (df, offsets) where offsets[oa_code] = (start_row, end_row).
@ -17,29 +24,46 @@ def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]
    import tempfile
    print("Loading UPRN lookup...")
    mapping = None
    if arcgis_path is not None:
        mapping = (
            build_postcode_mapping(arcgis_path)
            .with_columns(
                _canonical_postcode_expr("old_postcode").alias("old_postcode"),
                _canonical_postcode_expr("new_postcode").alias("new_postcode"),
            )
            .unique("old_postcode")
        )
    # Sort via streaming sink to avoid polars doubling memory during in-memory sort
    with tempfile.NamedTemporaryFile(
        suffix=".parquet", delete=False, dir=local_tmp_dir()
    ) as tmp:
        tmp_path = Path(tmp.name)
-    (
+    uprns = (
        pl.scan_parquet(uprn_path)
        .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
-        .filter(~pl.col("OA21CD").str.starts_with("S"))
+        .filter(pl.col("OA21CD").str.starts_with("E"))
        .filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null())
-        .with_columns(pl.col("PCDS").str.strip_chars())
+        .with_columns(_canonical_postcode_expr("PCDS").alias("PCDS"))
        .filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
        .sort("OA21CD")
        .sink_parquet(tmp_path)
    )
    if mapping is not None and mapping.height > 0:
        uprns = (
            uprns.join(mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left")
            .with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
            .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
        )
    uprns.sort("OA21CD").sink_parquet(tmp_path)
    release_memory()
    # Read the sorted data — only one copy in memory (~2GB)
    df = pl.read_parquet(tmp_path)
    tmp_path.unlink()
    n = len(df)
-    print(f"  Loaded {n:,} UPRNs (England & Wales)")
+    print(f"  Loaded {n:,} UPRNs (England)")
    # Compute OA group offsets using polars (avoids 37M Python string creation)
    boundary_df = (
--- a/pipeline/transform/property_border_tiles.py
+++ b/pipeline/transform/property_border_tiles.py
@ -0,0 +1,138 @@
 """Build PMTiles polygon tiles for the INSPIRE property-border overlay.
 Reads the HM Land Registry INSPIRE Index Polygons (per-local-authority GML ZIPs
 in EPSG:27700), reprojects each parcel to WGS84, and tiles the outlines with
 tippecanoe. The dashboard serves the resulting archive through
 ``/api/overlays/property-borders`` and renders it as thin outlines only at the
 postcode zoom level.
 The same ZIPs are already downloaded for postcode-boundary generation; this
 target re-uses :func:`parse_inspire_zip` to stay self-contained and is wired to
 the ``$(INSPIRE_STAMP)`` make dependency rather than the boundary cache.
 Data: HM Land Registry INSPIRE Index Polygons, Open Government Licence v3.0.
 Boundaries are indicative "general boundaries", not the legal extent of title.
 """
 from __future__ import annotations
 import argparse
 import shutil
 import subprocess
 import tempfile
 from pathlib import Path
 import numpy as np
 import shapely
 from pyproj import Transformer
 from shapely.geometry import Polygon
 from tqdm import tqdm
 from pipeline.local_temp import local_tmp_dir
 from pipeline.transform.postcode_boundaries.inspire import parse_inspire_zip
 def _require_tippecanoe() -> str:
    """Return the path of the ``tippecanoe`` executable found on ``PATH``.

    Raises ``RuntimeError`` when no such executable can be located.
    """
    located = shutil.which("tippecanoe")
    if located is not None:
        return located
    raise RuntimeError(
        "tippecanoe is required to build property border PMTiles. "
        "Install tippecanoe and rerun this target."
    )
 def _write_property_geojsonseq(inspire_dir: Path, output_path: Path) -> int:
    """Write every INSPIRE parcel as one WGS84 GeoJSON feature per line.

    Each ``*.zip`` in ``inspire_dir`` is parsed, its rings are turned into
    polygons, reprojected from EPSG:27700 to EPSG:4326 and appended to
    ``output_path``. Features are written with empty ``properties`` because
    the overlay only draws outlines. Work is batched per ZIP so that only one
    archive's geometries are held at a time.

    Returns the number of features written. Raises ``RuntimeError`` when the
    directory contains no ZIP files.
    """
    to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
    archives = sorted(inspire_dir.glob("*.zip"))
    if len(archives) == 0:
        raise RuntimeError(f"No INSPIRE ZIP files found in {inspire_dir}")
    written = 0
    with output_path.open("w") as sink:
        emit = sink.write
        for archive in tqdm(archives, desc="INSPIRE ZIPs", unit="file"):
            # list of Nx2 (easting, northing) rings for this archive
            rings = parse_inspire_zip(archive)
            if not rings:
                continue
            parcels = np.array([Polygon(ring) for ring in rings], dtype=object)
            # interleaved=False → transform(x, y) called once with full arrays.
            parcels = shapely.transform(
                parcels, to_wgs84.transform, interleaved=False
            )
            for encoded in shapely.to_geojson(parcels):
                emit('{"type":"Feature","properties":{},"geometry":')
                emit(encoded)
                emit("}\n")
                written += 1
    return written
 def build_property_border_tiles(
    inspire_dir: Path,
    output_path: Path,
    min_zoom: int,
    max_zoom: int,
 ) -> None:
    """Tile the INSPIRE parcels in ``inspire_dir`` with tippecanoe.

    The parcels are first streamed to a temporary GeoJSONSeq file, which is
    then handed to tippecanoe with ``output_path`` as its output and
    ``min_zoom`` / ``max_zoom`` as the requested zoom range. The parent
    directory of ``output_path`` is created if needed.

    Raises ``RuntimeError`` if tippecanoe is not installed and
    ``subprocess.CalledProcessError`` if tippecanoe exits non-zero.
    """
    tippecanoe = _require_tippecanoe()
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as scratch:
        features_path = Path(scratch) / "property_borders.geojsonseq"
        parcel_total = _write_property_geojsonseq(inspire_dir, features_path)
        print(f"Writing {parcel_total:,} INSPIRE parcel polygons")
        zoom_flags = [
            "--minimum-zoom",
            str(min_zoom),
            "--maximum-zoom",
            str(max_zoom),
        ]
        # Borders are only meaningful at street level; thin the densest
        # tiles at low zoom but keep full geometry at max zoom.
        thinning_flags = [
            "--drop-smallest-as-needed",
            "--simplify-only-low-zooms",
            "--extend-zooms-if-still-dropping",
        ]
        command = [
            tippecanoe,
            "--force",
            "--output",
            str(output_path),
            "--layer",
            "property_borders",
            *zoom_flags,
            *thinning_flags,
            "--temporary-directory",
            scratch,
            str(features_path),
        ]
        subprocess.run(command, check=True)
 def main() -> None:
    """Parse the command line and build the property-border tiles."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--inspire", type=Path, required=True, help="INSPIRE ZIP directory")
    cli.add_argument("--output", type=Path, required=True, help="Output .pmtiles path")
    cli.add_argument("--min-zoom", type=int, default=12)
    cli.add_argument("--max-zoom", type=int, default=16)
    options = cli.parse_args()
    build_property_border_tiles(
        options.inspire,
        options.output,
        options.min_zoom,
        options.max_zoom,
    )
 if __name__ == "__main__":
    main()
--- a/pipeline/transform/test_crime_spatial.py
+++ b/pipeline/transform/test_crime_spatial.py
@ -0,0 +1,147 @@
 import json
 import polars as pl
 from pyproj import Transformer
 from pipeline.transform.crime_spatial import transform_crime_spatial
 _TO_WGS84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
 _CSV_HEADER = (
    "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,"
    "LSOA code,LSOA name,Crime type,Last outcome category,Context"
 )
 def _bng_to_wgs84(x: float, y: float) -> tuple[float, float]:
    """Convert an EPSG:27700 ``(x, y)`` point to ``(lon, lat)`` in EPSG:4326."""
    longitude, latitude = _TO_WGS84.transform(x, y)
    return (longitude, latitude)
 def _square_feature(postcode: str, x0: float, y0: float, x1: float, y1: float) -> dict:
    """Build a GeoJSON feature for the axis-aligned box ``(x0, y0)-(x1, y1)``.

    Corners are given in EPSG:27700 and written out as WGS84 lon/lat; the ring
    is closed by repeating its first corner.
    """
    corners = ((x0, y0), (x1, y0), (x1, y1), (x0, y1), (x0, y0))
    exterior = []
    for easting, northing in corners:
        exterior.append(list(_bng_to_wgs84(easting, northing)))
    properties = {"postcodes": postcode, "mapit_code": postcode.replace(" ", "")}
    geometry = {"type": "Polygon", "coordinates": [exterior]}
    return {"type": "Feature", "properties": properties, "geometry": geometry}
 def _write_boundaries(units_dir, features_by_district: dict[str, list[dict]]) -> None:
    """Create ``units_dir`` and write one ``{district}.geojson`` per entry.

    ``units_dir`` must not exist yet; each file holds a FeatureCollection
    wrapping that district's features.
    """
    units_dir.mkdir(parents=True)
    for district in features_by_district:
        target = units_dir / f"{district}.geojson"
        payload = json.dumps(
            {"type": "FeatureCollection", "features": features_by_district[district]}
        )
        target.write_text(payload)
 def _crime_row(month: str, x, y, crime_type: str) -> str:
    """Format one CSV line for a crime at EPSG:27700 point ``(x, y)``.

    When either coordinate is ``None`` the longitude and latitude fields are
    left blank; otherwise they hold the point converted to WGS84.
    """
    has_point = x is not None and y is not None
    lon, lat = _bng_to_wgs84(x, y) if has_point else ("", "")
    return f",{month},F,F,{lon},{lat},On or near X,E01000001,L,{crime_type},U,"
 def _write_month(crime_dir, month: str, rows: list[str]) -> None:
    """Write a ``{month}-test-force-street.csv`` file under ``crime_dir/month``.

    The month directory is created (it must not exist yet) and the file holds
    the CSV header followed by ``rows``, every line newline-terminated.
    """
    target_dir = crime_dir / month
    target_dir.mkdir(parents=True)
    lines = [_CSV_HEADER]
    lines.extend(rows)
    csv_path = target_dir / f"{month}-test-force-street.csv"
    csv_path.write_text("".join(line + "\n" for line in lines))
 def test_buffer_overlap_counts_for_each_postcode(tmp_path):
    """A crime inside several postcode buffers is counted for each of them."""
    units = tmp_path / "units"
    crime = tmp_path / "crime"
    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    # A and B sit 70m apart; their +50m buffers overlap in x in [1030, 1060].
    district_features = [
        _square_feature("AB1 1AA", 1000, 1000, 1010, 1010),
        _square_feature("AB1 1AB", 1080, 1000, 1090, 1010),
        _square_feature("AB1 1AC", 5000, 5000, 5010, 5010),
    ]
    _write_boundaries(units, {"AB1": district_features})
    incidents = [
        # In the overlap: 35m east of A, 35m west of B -> counts for both.
        _crime_row("2024-01", 1045, 1005, "Burglary"),
        # 49m east of C's edge -> inside C's buffer.
        _crime_row("2024-01", 5059, 5005, "Robbery"),
        # 51m east of C's edge -> outside every buffer.
        _crime_row("2024-01", 5061, 5005, "Robbery"),
        # No coordinate -> dropped entirely.
        _crime_row("2024-01", None, None, "Anti-social behaviour"),
    ]
    _write_month(crime, "2024-01", incidents)
    transform_crime_spatial(crime, units, output, by_year)
    rows = {}
    for record in pl.read_parquet(output).to_dicts():
        rows[record["postcode"]] = record
    first, second, third = rows["AB1 1AA"], rows["AB1 1AB"], rows["AB1 1AC"]
    # Single month -> annualised x12.
    assert first["Burglary (avg/yr)"] == 12.0
    assert second["Burglary (avg/yr)"] == 12.0
    assert first["Robbery (avg/yr)"] == 0.0
    # Only the 49m robbery counts for C; the 51m one and the blank row do not.
    assert third["Robbery (avg/yr)"] == 12.0
    assert third["Burglary (avg/yr)"] == 0.0
    # Anti-social behaviour had no coordinate -> nobody gets it.
    for record in rows.values():
        assert record["Anti-social behaviour (avg/yr)"] == 0.0
 def test_by_year_annualises_and_rolls_up(tmp_path):
    """Per-year output is annualised and includes the serious/minor roll-ups."""
    units = tmp_path / "units"
    crime = tmp_path / "crime"
    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    square = _square_feature("AB1 1AA", 1000, 1000, 1010, 1010)
    _write_boundaries(units, {"AB1": [square]})
    # Every point is at the centre of AB1 1AA, well inside its buffer.
    monthly_crimes = {
        "2023-01": ["Burglary", "Robbery"],
        "2024-01": ["Burglary"],
        "2024-02": ["Burglary", "Anti-social behaviour"],
    }
    for month, crime_types in monthly_crimes.items():
        _write_month(
            crime,
            month,
            [_crime_row(month, 1005, 1005, crime_type) for crime_type in crime_types],
        )
    transform_crime_spatial(crime, units, output, by_year)
    by_year_df = pl.read_parquet(by_year)
    assert by_year_df.height == 1
    expected_columns = {
        "Burglary (by year)",
        "Serious crime (by year)",
        "Minor crime (by year)",
    }
    assert expected_columns.issubset(set(by_year_df.columns))
    row = by_year_df.row(0, named=True)
    burglary = sorted(row["Burglary (by year)"], key=lambda point: point["year"])
    # 2023: 1 burglary in 1 month -> 12/yr; 2024: 2 in 2 months -> 12/yr.
    assert burglary == [
        {"year": 2023, "count": 12.0},
        {"year": 2024, "count": 12.0},
    ]
    serious = {}
    for point in row["Serious crime (by year)"]:
        serious[point["year"]] = point["count"]
    # 2023 serious = Burglary(12) + Robbery(12) = 24; 2024 = Burglary(12).
    assert serious[2023] == 24.0
    assert serious[2024] == 12.0
--- a/pipeline/transform/test_join_epc_pp.py
+++ b/pipeline/transform/test_join_epc_pp.py
@ -24,6 +24,7 @@ def _row(**overrides: str) -> dict[str, str]:
    row = {
        "address": "1 Example Street",
        "postcode": " aa1 1aa ",
        "uprn": "100012345678",
        "current_energy_rating": "c",
        "potential_energy_rating": "b",
        "property_type": "House",
@ -52,6 +53,7 @@ def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path):
        {
            "epc_address": "1 Example Street",
            "epc_postcode": "AA1 1AA",
            "uprn": "100012345678",
            "current_energy_rating": "C",
            "potential_energy_rating": "B",
            "epc_property_type": "House",
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@ -15,6 +15,8 @@ from pipeline.transform.merge import (
    _finalize_listings,
    _integrate_listings,
    _match_direct_epc,
    _match_listing_properties,
    _normalize_uprn,
    _is_dynamic_poi_metric_column,
    _less_deprived_percentile_expr,
    _load_conservation_area_geometries,
@ -68,6 +70,15 @@ def test_conservation_area_feature_is_area_level() -> None:
    assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
 def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
    # Crime is now a raw spatial count per postcode; the per-1k-residents
    # variants were dropped along with the LSOA population denominator.
    for kept in ("Serious crime (avg/yr)", "Minor crime (avg/yr)"):
        assert kept in _AREA_COLUMNS
    for dropped in (
        "Serious crime per 1k residents (avg/yr)",
        "Minor crime per 1k residents (avg/yr)",
    ):
        assert dropped not in _AREA_COLUMNS
 def test_listed_building_feature_is_property_level() -> None:
    assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS
@ -471,71 +482,166 @@ def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
    assert seed["was_council_house"].to_list() == ["No"]
-def test_match_direct_epc_considers_nearby_postcodes() -> None:
+_DIRECT_EPC_CANDIDATE_SCHEMA = {
-    listing_matches = pl.DataFrame(
+    "_direct_epc_row": pl.UInt32,
-        {
+    "_direct_epc_match_address": pl.Utf8,
-            "_listing_idx": [0],
+    "_direct_epc_match_postcode": pl.Utf8,
-            "_listing_match_address": ["1 EXAMPLE ROAD"],
+    "_direct_epc_outcode": pl.Utf8,
-            "_listing_match_postcode": ["AA11AA"],
+    "_direct_epc_canonical_property_type": pl.Utf8,
-            "_listing_east": [1000.0],
+    "_direct_epc_uprn": pl.Utf8,
-            "_listing_north": [1000.0],
+    "_direct_epc_address": pl.Utf8,
-            "_actual_property_type": ["Terraced"],
+    "_direct_current_energy_rating": pl.Utf8,
-            "_actual_total_floor_area": [100.0],
+    "_direct_potential_energy_rating": pl.Utf8,
-            "_actual_number_habitable_rooms": [4],
+    "_direct_total_floor_area": pl.Float64,
-        },
+    "_direct_number_habitable_rooms": pl.Int16,
-        schema={
+    "_direct_floor_height": pl.Float64,
-            "_listing_idx": pl.UInt32,
+    "_direct_construction_age_band": pl.UInt16,
-            "_listing_match_address": pl.Utf8,
+    "_direct_is_construction_date_approximate": pl.UInt8,
-            "_listing_match_postcode": pl.Utf8,
+    "_direct_was_council_house": pl.Utf8,
-            "_listing_east": pl.Float64,
+}
-            "_listing_north": pl.Float64,
+
-            "_actual_property_type": pl.Utf8,
+_LISTING_MATCH_SCHEMA = {
-            "_actual_total_floor_area": pl.Float64,
+    "_listing_idx": pl.UInt32,
-            "_actual_number_habitable_rooms": pl.Int16,
+    "_listing_match_address": pl.Utf8,
-        },
+    "_listing_match_postcode": pl.Utf8,
-    )
+    "_listing_uprn": pl.Utf8,
-    epc_candidates = pl.DataFrame(
+}
-        {
+
-            "_direct_epc_row": [0],
+
-            "_direct_epc_match_address": ["1 EXAMPLE ROAD"],
+def _direct_epc_candidates(rows: list[dict]) -> pl.DataFrame:
-            "_direct_epc_match_postcode": ["BB11BB"],
+    base = {
-            "_direct_epc_east": [1020.0],
+        "_direct_epc_row": 0,
-            "_direct_epc_north": [1010.0],
+        "_direct_epc_match_address": "1 EXAMPLE ROAD",
-            "_direct_epc_canonical_property_type": ["Terraced"],
+        "_direct_epc_match_postcode": "AA11AA",
-            "_direct_epc_address": ["1, Example Road"],
+        "_direct_epc_outcode": "AA1",
-            "_direct_current_energy_rating": ["C"],
+        "_direct_epc_canonical_property_type": "Terraced",
-            "_direct_potential_energy_rating": ["B"],
+        "_direct_epc_uprn": None,
-            "_direct_total_floor_area": [101.0],
+        "_direct_epc_address": "1, Example Road",
-            "_direct_number_habitable_rooms": [4],
+        "_direct_current_energy_rating": "C",
-            "_direct_floor_height": [2.5],
+        "_direct_potential_energy_rating": "B",
-            "_direct_construction_age_band": [1930],
+        "_direct_total_floor_area": 101.0,
-            "_direct_is_construction_date_approximate": [1],
+        "_direct_number_habitable_rooms": 4,
-            "_direct_was_council_house": ["No"],
+        "_direct_floor_height": 2.5,
-        },
+        "_direct_construction_age_band": 1930,
-        schema={
+        "_direct_is_construction_date_approximate": 1,
-            "_direct_epc_row": pl.UInt32,
+        "_direct_was_council_house": "No",
-            "_direct_epc_match_address": pl.Utf8,
+    }
-            "_direct_epc_match_postcode": pl.Utf8,
+    return pl.DataFrame(
-            "_direct_epc_east": pl.Float64,
+        [{**base, **row} for row in rows], schema=_DIRECT_EPC_CANDIDATE_SCHEMA
            "_direct_epc_north": pl.Float64,
            "_direct_epc_canonical_property_type": pl.Utf8,
            "_direct_epc_address": pl.Utf8,
            "_direct_current_energy_rating": pl.Utf8,
            "_direct_potential_energy_rating": pl.Utf8,
            "_direct_total_floor_area": pl.Float64,
            "_direct_number_habitable_rooms": pl.Int16,
            "_direct_floor_height": pl.Float64,
            "_direct_construction_age_band": pl.UInt16,
            "_direct_is_construction_date_approximate": pl.UInt8,
            "_direct_was_council_house": pl.Utf8,
        },
    )
-    matches = _match_direct_epc(listing_matches, epc_candidates)
+
 def _listing_matches(rows: list[dict]) -> pl.DataFrame:
    """Build a listing-match frame, filling unspecified columns with defaults."""
    defaults = {
        "_listing_idx": 0,
        "_listing_match_address": "1 EXAMPLE ROAD",
        "_listing_match_postcode": "AA11AA",
        "_listing_uprn": None,
    }
    records = []
    for overrides in rows:
        # Per-row values win over the defaults.
        record = dict(defaults)
        record.update(overrides)
        records.append(record)
    return pl.DataFrame(records, schema=_LISTING_MATCH_SCHEMA)
 def test_match_direct_epc_matches_by_uprn_across_postcodes() -> None:
    # UPRN is matched globally (not within a postcode bucket), so a listing
    # whose detail-page postcode is slightly off still resolves to the right
    # EPC certificate by its UPRN.
    listing = {"_listing_uprn": "100000000001", "_listing_match_postcode": "ZZ99ZZ"}
    certificate = {
        "_direct_epc_uprn": "100000000001",
        "_direct_epc_match_postcode": "AA11AA",
    }
    listings = _listing_matches([listing])
    candidates = _direct_epc_candidates([certificate])
    matches = _match_direct_epc(listings, candidates)
    assert matches.height == 1
    assert matches["_listing_idx"].to_list() == [0]
    assert matches["_direct_epc_address"].to_list() == ["1, Example Road"]
    assert matches["_direct_epc_match_method"].to_list() == ["uprn"]
 def test_match_direct_epc_matches_by_address_in_same_postcode() -> None:
    # Neither side has a UPRN, so the match has to come from the address.
    listings = _listing_matches([{"_listing_match_address": "1 EXAMPLE ROAD"}])
    candidates = _direct_epc_candidates(
        [{"_direct_epc_match_address": "1 EXAMPLE ROAD"}]
    )
    matches = _match_direct_epc(listings, candidates)
    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["1, Example Road"]
    assert matches["_direct_epc_match_method"].to_list() == ["address"]
 def test_normalize_uprn_handles_types_and_floats() -> None:
    # Missing / blank input, and non-integral or NaN floats, are rejected
    # rather than mangled.
    for rejected in (None, "", 1.5, float("nan")):
        assert _normalize_uprn(rejected) is None
    accepted = [
        ("  100012345678 ", "100012345678"),
        (100012345678, "100012345678"),
        # An integral float normalises to its digits, NOT "1230".
        (123.0, "123"),
    ]
    for raw, expected in accepted:
        assert _normalize_uprn(raw) == expected
 def _property_candidates(rows: list[dict]) -> pl.DataFrame:
    """Build a property-candidate frame, filling unspecified columns with defaults."""
    defaults = {
        "postcode": "AA1 1AA",
        "pp_address": "1 Example Road",
        "_property_match_postcode": "AA11AA",
        "_property_match_address": "1 EXAMPLE ROAD",
        "_property_epc_match_address": "1 EXAMPLE ROAD",
        "uprn": None,
    }
    # Every column is a string column, in the same order as the defaults.
    schema = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "_property_match_postcode": pl.Utf8,
        "_property_match_address": pl.Utf8,
        "_property_epc_match_address": pl.Utf8,
        "uprn": pl.Utf8,
    }
    records = []
    for overrides in rows:
        record = dict(defaults)
        record.update(overrides)
        records.append(record)
    return pl.DataFrame(records, schema=schema)
 def test_match_listing_properties_uprn_wins_dedup_tie() -> None:
    # Two listings claim the same property: one by UPRN, one by exact address
    # (both score 100). The UPRN match must win even though it has the higher
    # _listing_idx (which would otherwise break the tie the wrong way).
    uprn_listing = {
        "_listing_idx": 5,
        "_listing_uprn": "100000000001",
        "_listing_match_address": "SOMETHING ELSE",
    }
    address_listing = {
        "_listing_idx": 1,
        "_listing_uprn": None,
        "_listing_match_address": "1 EXAMPLE ROAD",
    }
    listings = _listing_matches([uprn_listing, address_listing])
    properties = _property_candidates([{"uprn": "100000000001"}])
    matches = _match_listing_properties(listings, properties)
    assert matches.height == 1
    assert matches["_listing_idx"].to_list() == [5]
    assert matches["_property_match_method"].to_list() == ["uprn"]
 def test_match_direct_epc_does_not_match_other_postcode_without_uprn() -> None:
    # Matching is by postcode/UPRN/street — never by coordinate proximity — so a
    # same-street EPC in a different postcode with no shared UPRN is skipped.
    listings = _listing_matches([{"_listing_match_postcode": "AA11AA"}])
    other_postcode_epc = {
        "_direct_epc_match_postcode": "BB22BB",
        "_direct_epc_uprn": None,
    }
    candidates = _direct_epc_candidates([other_postcode_epc])
    matches = _match_direct_epc(listings, candidates)
    assert matches.height == 0
 def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -> None:
@ -588,11 +694,72 @@ def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -
    assert other["_actual_listing_url"].to_list() == [None]
-def test_integrate_listings_rejects_low_confidence_no_number_match(tmp_path) -> None:
+def test_integrate_listings_matches_by_uprn_over_address(tmp_path) -> None:
    # The listing's address deliberately does not match the property's, but the
    # shared UPRN drives an exact match anyway (UPRN beats fuzzy street).
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    # Override the sample listing's register address and UPRN, then persist it
    # as the listings input read by the integration step.
    _sample_listings_frame().with_columns(
-        pl.lit("Rose Cottage High Street").alias("Address per Property Register"),
+        pl.lit("Totally Different Road").alias("Address per Property Register"),
        pl.lit("100000000009").alias("UPRN"),
    ).write_parquet(listings_path)
    _stub_arcgis(arcgis_path)
    # A single property row: same UPRN as the listing, different address.
    wide = pl.DataFrame(
        {
            "postcode": ["SW1A 1AA"],
            "pp_address": ["1 Example Road"],
            "uprn": ["100000000009"],
            "pp_property_type": ["Terraced"],
            "duration": ["Freehold"],
            "total_floor_area": [90.0],
            "number_habitable_rooms": [4],
            "latest_price": [600_000],
            "epc_address": ["1 Example Road"],
            "current_energy_rating": ["C"],
            "potential_energy_rating": ["B"],
            "floor_height": [2.4],
            "construction_age_band": [1930],
            "is_construction_date_approximate": [1],
            "was_council_house": ["No"],
        },
        schema={
            "postcode": pl.Utf8,
            "pp_address": pl.Utf8,
            "uprn": pl.Utf8,
            "pp_property_type": pl.Utf8,
            "duration": pl.Utf8,
            "total_floor_area": pl.Float64,
            "number_habitable_rooms": pl.Int16,
            "latest_price": pl.Int64,
            "epc_address": pl.Utf8,
            "current_energy_rating": pl.Utf8,
            "potential_energy_rating": pl.Utf8,
            "floor_height": pl.Float64,
            "construction_age_band": pl.UInt16,
            "is_construction_date_approximate": pl.UInt8,
            "was_council_house": pl.Utf8,
        },
    )
    # epc_path=None: no separate EPC input is supplied for this run.
    integrated = _integrate_listings(
        wide.lazy(), listings_path, arcgis_path, epc_path=None
    ).collect()
    matched = integrated.filter(pl.col("pp_address") == "1 Example Road")
    # The listing overlay attached to the UPRN-matched property row.
    assert matched["_actual_listing_url"].to_list() == ["https://example.test/abc"]
    # No spurious seed row for the listing's (non-matching) address.
    assert "Totally Different Road" not in integrated["pp_address"].to_list()
 def test_integrate_listings_seeds_listing_with_unmatched_street(tmp_path) -> None:
    # A number-less listing whose street is not the property's street (and which
    # shares no UPRN) must not be force-matched onto it; it becomes its own seed
    # row instead of stamping the wrong property's overlay.
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().with_columns(
        pl.lit("Juniper Crescent").alias("Address per Property Register"),
    ).write_parquet(listings_path)
    _stub_arcgis(arcgis_path)
    wide = pl.DataFrame(
@ -635,7 +802,7 @@ def test_integrate_listings_rejects_low_confidence_no_number_match(tmp_path) ->
    ).collect()
    existing = integrated.filter(pl.col("pp_address") == "Old Cottage High Street")
-    seed = integrated.filter(pl.col("pp_address") == "Rose Cottage High Street")
+    seed = integrated.filter(pl.col("pp_address") == "Juniper Crescent")
    assert existing["_actual_listing_url"].to_list() == [None]
    assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"]
@ -731,3 +898,77 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
    # Overlay scaffolding is dropped.
    for src, dst, _dt in _LISTING_OVERLAY_SOURCES:
        assert dst not in finalized.columns, src
 def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
    # When the terminated-postcode remap folds two distinct wide rows onto one
    # (postcode, pp_address), a single matched listing ends up attached to both.
    # Finalize has to emit one row per listing URL rather than one per
    # collapsed wide row.
    #
    # Each entry is (column name, dtype, the two row values); the frame's data
    # and schema are both derived from this single table.
    column_specs = [
        ("Postcode", pl.Utf8, ["SW1A 1AA", "SW1A 1AA"]),
        (
            "Address per Property Register",
            pl.Utf8,
            ["1 Example Road", "1 Example Road"],
        ),
        ("Address per EPC", pl.Utf8, ["1 Example Road", "1 Example Road"]),
        ("Date of last transaction", pl.Float64, [1990.0, 1995.0]),
        ("lat", pl.Float64, [51.5, 51.5]),
        ("lon", pl.Float64, [-0.1, -0.1]),
        ("Total floor area (sqm)", pl.Float64, [100.0, 95.0]),
        ("Number of bedrooms & living rooms", pl.Int16, [3, 3]),
        ("Property type", pl.Utf8, ["Terraced", "Terraced"]),
        ("Leasehold/Freehold", pl.Utf8, ["Leasehold", "Leasehold"]),
        ("Last known price", pl.Int64, [500_000, 480_000]),
        ("Street tree density percentile", pl.Float32, [42.0, 42.0]),
        # Same listing URL on both collapsed rows — the fan-out to fix.
        ("_actual_listing_url", pl.Utf8, ["url0", "url0"]),
        ("_actual_asking_price", pl.Int64, [600_000, 600_000]),
        ("_actual_asking_price_per_sqm", pl.Int32, [5_000, 5_000]),
        ("_actual_listing_date", pl.Datetime("us"), [None, None]),
        ("_actual_listing_status", pl.Utf8, ["For sale", "For sale"]),
        ("_actual_listing_features", pl.List(pl.Utf8), [["Garden"], ["Garden"]]),
        ("_actual_bedrooms", pl.Int32, [3, 3]),
        ("_actual_bathrooms", pl.Int32, [1, 1]),
        ("_actual_price_qualifier", pl.Utf8, ["", ""]),
        ("_actual_property_sub_type", pl.Utf8, ["Mid-Terrace", "Mid-Terrace"]),
        ("_actual_lat", pl.Float64, [51.51, 51.51]),
        ("_actual_lon", pl.Float64, [-0.11, -0.11]),
        ("_actual_total_floor_area", pl.Float64, [110.0, 110.0]),
        ("_actual_number_habitable_rooms", pl.Int16, [4, 4]),
        ("_actual_property_type", pl.Utf8, ["Terraced", "Terraced"]),
        ("_actual_leasehold_freehold", pl.Utf8, ["Freehold", "Freehold"]),
    ]
    df = pl.DataFrame(
        {name: values for name, _dtype, values in column_specs},
        schema={name: dtype for name, dtype, _values in column_specs},
    )
    finalized = _finalize_listings(df)
    assert finalized.height == 1
    assert finalized["Listing URL"].to_list() == ["url0"]
--- a/pipeline/transform/test_tree_density.py
+++ b/pipeline/transform/test_tree_density.py
@ -1,19 +1,83 @@
 import math
 from pathlib import Path
 import numpy as np
 import polars as pl
 import pytest
 import shapely
 from pipeline.transform.tree_density import (
    STREET_TREE_COVERAGE_COL,
    STREET_TREE_DENSITY_COL,
    _add_nfi_batch,
    _coverage_percentile_expr,
    _metric_columns,
    _postcode_buffers,
    _postcode_density_percentile_col,
    _with_postcode_density_percentiles,
    _write_street_rollups,
 )
 def test_nfi_accumulation_adds_only_clipped_overlap_area() -> None:
    radius_m = 50
    points = pl.DataFrame({"postcode": ["A", "B"], "x": [0.0, 1000.0], "y": [0.0, 0.0]})
    circles, tree = _postcode_buffers(points, radius_m)
    buffer_area = math.pi * radius_m * radius_m

    def accumulate(parcel, label):
        # Run one single-feature batch against fresh, zeroed accumulators.
        area = np.zeros(2)
        count = np.zeros(2, dtype=np.uint32)
        _add_nfi_batch(
            np.array([parcel], dtype=object),
            np.array([label], dtype=object),
            circles,
            tree,
            area,
            count,
            radius_m,
        )
        return area, count

    # Case 1: a 1,000,000 sqm woodland square centred on postcode A swallows
    # A's whole circle. Only the clipped circle area may be added (the 32-gon
    # buffer approximates the circle to ~1%), NOT the full polygon area.
    big = shapely.box(-500, -500, 500, 500)
    canopy_area, feature_count = accumulate(big, "Woodland")
    assert canopy_area[0] == pytest.approx(buffer_area, rel=1e-2)
    assert canopy_area[0] <= buffer_area  # never exceeds the buffer area
    assert canopy_area[1] == 0.0  # postcode B is 1km away, no overlap
    assert feature_count.tolist() == [1, 0]

    # Case 2: a large parcel whose left edge sits 10m inside B's circle must
    # contribute only that sliver, not its full area -- the failure mode the
    # old centroid path could not avoid.
    sliver = shapely.box(1040, -500, 2000, 500)
    canopy_area, feature_count = accumulate(sliver, "Woodland")
    assert canopy_area[0] == 0.0
    assert 0.0 < canopy_area[1] < buffer_area  # tiny segment, far below 1M sqm

    # Case 3: non-woodland categories contribute nothing.
    canopy_area, feature_count = accumulate(big, "Non woodland")
    assert canopy_area.tolist() == [0.0, 0.0]
    assert feature_count.tolist() == [0, 0]
 def test_coverage_percentile_expr_ranks_higher_coverage_higher() -> None:
    df = pl.DataFrame({"coverage": [0.0, 5.0, 10.0, None]})
--- a/pipeline/transform/tree_density.py
+++ b/pipeline/transform/tree_density.py
@ -1,10 +1,16 @@
-"""Derive street-scale tree density metrics from Forest Research TOW data.
+"""Derive street-scale tree density metrics from Forest Research TOW + NFI data.
 The Forest Research Trees Outside Woodland release is an Esri File Geodatabase
 inside property-data/FR_TOW_V1_ALL.zip. This transformer computes a compact
 postcode-level metric from the tree polygons, then optionally rolls that up to
 Price Paid street names so the dashboard can answer "what is this address's
 street like?" without loading the full geodatabase at runtime.
 TOW only covers trees *outside* woodland, so the National Forest Inventory (NFI)
 woodland layer is optionally unioned in. TOW canopy is accumulated by centroid
 proximity (tiny crowns), while large NFI woodland parcels are accumulated by
 true buffer-clipped intersection area so they cannot saturate a postcode from
 mere centroid proximity.
 """
 from __future__ import annotations
@ -22,7 +28,6 @@ import shapely
 from scipy.spatial import cKDTree
 DEFAULT_TOW_TYPES = ("Lone Tree", "Group of Trees")
 TOW_GDB_NAME = "FR_TOW_V1_ALL.gdb"
 STREET_TREE_DENSITY_COL = "Street tree density percentile"
 STREET_TREE_COVERAGE_COL = "Street tree coverage (%)"
@ -32,6 +37,14 @@ POSTCODE_AREA_COL = "Tree canopy area within {radius}m (sqm)"
 POSTCODE_COUNT_COL = "Tree features within {radius}m"
 POSTCODE_HEIGHT_COL = "Mean TOW height within {radius}m (m)"
 # National Forest Inventory (NFI) woodland — the geometric complement of TOW.
 # NFI ships as a zipped shapefile of woodland parcels (>=0.5 ha) in EPSG:27700.
 # Field names are from the NFI Woodland England 2022 release; re-check on bumps.
 # Attribute column compared against NFI_WOODLAND_VALUE to select woodland rows.
 NFI_CATEGORY_COL = "CATEGORY"
 # Value of NFI_CATEGORY_COL that marks a parcel as woodland.
 NFI_WOODLAND_VALUE = "Woodland"
 # Attribute column exported as the overlay's "woodland_type" property.
 NFI_TYPE_COL = "IFT_IOA"
 # Parcel area attribute; the overlay multiplies it by 10000 to report sqm.
 NFI_AREA_HA_COL = "Area_ha"
 def _safe_extract_zip(zip_path: Path, extract_dir: Path, force: bool) -> Path:
    """Extract the TOW zip and return the extracted .gdb path."""
@ -83,12 +96,60 @@ def _tow_dataset_path(
    return str(_safe_extract_zip(zip_path, extract_dir, force_extract))
-def _where_for_tow_types(tow_types: tuple[str, ...] | None) -> str | None:
+def _safe_extract_zip_dir(zip_path: Path, extract_dir: Path, force: bool) -> Path:
-    if not tow_types:
+    """Extract an arbitrary zip into extract_dir and return the directory."""
-        return None
+    if extract_dir.exists() and not force:
-    escaped = [tow_type.replace("'", "''") for tow_type in tow_types]
+        print(f"Using existing extraction directory: {extract_dir}")
-    values = ", ".join(f"'{tow_type}'" for tow_type in escaped)
+        return extract_dir
-    return f"Woodland_Type IN ({values})"
+    if extract_dir.exists():
        shutil.rmtree(extract_dir)
    # Extract into a hidden sibling directory and rename it at the very end, so
    # extract_dir only appears once every member has been written.
    tmp_dir = extract_dir.with_name(f".{extract_dir.name}.tmp")
    if tmp_dir.exists():
        # Leftover from an earlier run; start from a clean directory.
        shutil.rmtree(tmp_dir)
    tmp_dir.mkdir(parents=True)
    root = tmp_dir.resolve()
    print(f"Extracting {zip_path} to {extract_dir}...")
    with zipfile.ZipFile(zip_path) as archive:
        for member in archive.infolist():
            target = (tmp_dir / member.filename).resolve()
            # Refuse members whose resolved path lands outside the extraction
            # root (e.g. "../" entries) instead of writing them.
            if root != target and root not in target.parents:
                raise ValueError(f"Unsafe path in zip archive: {member.filename}")
            if member.is_dir():
                target.mkdir(parents=True, exist_ok=True)
                continue
            target.parent.mkdir(parents=True, exist_ok=True)
            # Stream the member to disk in 1 MiB chunks.
            with archive.open(member) as source, target.open("wb") as dest:
                shutil.copyfileobj(source, dest, length=1024 * 1024)
    tmp_dir.rename(extract_dir)
    print(f"Extracted archive: {extract_dir}")
    return extract_dir
 def _nfi_dataset_path(
    zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool
 ) -> str:
    """Return the path from which the NFI woodland shapefile should be read.

    With ``use_vsizip`` the zip is not extracted; a ``/vsizip/`` path to the
    resolved archive is returned instead. Otherwise the archive is extracted
    (subject to ``force_extract``) and the first ``.shp`` found, in sorted
    order, is returned.

    Raises:
        FileNotFoundError: if the extracted archive contains no ``.shp`` file.
    """
    if not use_vsizip:
        extraction_root = _safe_extract_zip_dir(zip_path, extract_dir, force_extract)
        candidates = sorted(extraction_root.rglob("*.shp"))
        if candidates:
            return str(candidates[0])
        raise FileNotFoundError(f"No .shp found inside {zip_path}")
    return f"/vsizip/{zip_path.resolve()}"
 def _geometry_column(metadata: dict, column_names: list[str]) -> str:
    """Pick the geometry column for a batch read through pyogrio's Arrow API.

    Preference order: the ``geometry_name`` entry of the metadata when it is
    set, then the first of ``wkb_geometry`` / ``geometry`` / ``geom`` that is
    present among the columns, and finally the last column as a fallback.
    """
    declared = metadata.get("geometry_name")
    if declared:
        return str(declared)
    conventional = ("wkb_geometry", "geometry", "geom")
    present = [candidate for candidate in conventional if candidate in column_names]
    return present[0] if present else column_names[-1]
 def _postcode_points(arcgis_path: Path, max_postcodes: int | None) -> pl.DataFrame:
@ -172,26 +233,20 @@ def _accumulate_tree_metrics(
    dataset_path: str,
    points: pl.DataFrame,
    radius_m: int,
    tow_types: tuple[str, ...] | None,
    batch_size: int,
    layer_names: tuple[str, ...] | None,
    max_features_per_layer: int | None,
    workers: int,
-) -> pl.DataFrame:
+    canopy_area: np.ndarray,
    feature_count: np.ndarray,
    height_weighted_sum: np.ndarray,
    height_weight: np.ndarray,
 ) -> None:
    xy = points.select("x", "y").to_numpy()
    tree = cKDTree(xy)
    n_points = points.height
    canopy_area = np.zeros(n_points, dtype=np.float64)
    feature_count = np.zeros(n_points, dtype=np.uint32)
    height_weighted_sum = np.zeros(n_points, dtype=np.float64)
    height_weight = np.zeros(n_points, dtype=np.float64)
    where = _where_for_tow_types(tow_types)
    layers = _layers(dataset_path, layer_names)
    print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
    if where:
        print(f"TOW type filter: {where}")
    columns = ["Woodland_Type", "TOW_Area_M", "MEANHT"]
    total_features_seen = 0
@ -206,7 +261,6 @@ def _accumulate_tree_metrics(
            dataset_path,
            layer=layer,
            columns=columns,
            where=where,
            batch_size=batch_size,
            use_pyarrow=True,
        ) as (_meta, reader):
@ -297,6 +351,132 @@ def _accumulate_tree_metrics(
                        f"{total_features_used:,} features with usable centroids"
                    )
 def _postcode_buffers(
    points: pl.DataFrame, radius_m: int
 ) -> tuple[np.ndarray, shapely.STRtree]:
    """Return one radius-r buffer polygon per postcode and an STRtree over them.

    Buffers are produced in the row order of ``points``, so buffer index i
    belongs to postcode row i -- the same ordering the cKDTree path relies on.
    """
    centroid_xy = points.select("x", "y").to_numpy()
    centroids = shapely.points(centroid_xy)
    # quad_segs=8 approximates each circle with 8 segments per quarter turn.
    buffers = shapely.buffer(centroids, radius_m, quad_segs=8)
    spatial_index = shapely.STRtree(buffers)
    return buffers, spatial_index
 def _add_nfi_batch(
    geoms: np.ndarray,
    category: np.ndarray,
    circles: np.ndarray,
    tree: shapely.STRtree,
    canopy_area: np.ndarray,
    feature_count: np.ndarray,
    radius_m: int,
 ) -> None:
    """Add NFI woodland into the shared arrays by true buffer-clipped area.

    Unlike the TOW centroid path, this clips each woodland polygon to each
    overlapping postcode circle and adds only area(polygon ∩ circle); a large
    parcel therefore cannot saturate a postcode from mere centroid proximity,
    and a buffer-filling parcel whose centroid is outside the radius is not
    missed.

    Args:
        geoms: Object array of shapely geometries (entries may be None).
        category: Object array, parallel to ``geoms``, of NFI category values;
            only rows equal to ``NFI_WOODLAND_VALUE`` are used.
        circles: Postcode buffer polygons, indexed like the accumulators.
        tree: STRtree built over ``circles``.
        canopy_area: Per-postcode area accumulator, updated in place.
        feature_count: Per-postcode count accumulator, updated in place; it is
            incremented once per (polygon, postcode) pair with positive overlap.
        radius_m: Unused here -- the radius is already baked into ``circles``.
            Kept so the call signature stays unchanged.
    """
    # Keep woodland rows with a real, non-empty geometry. is_empty() is False
    # for missing entries, so the three masks can be combined directly.
    usable = (
        (category == NFI_WOODLAND_VALUE)
        & ~shapely.is_missing(geoms)
        & ~shapely.is_empty(geoms)
    )
    geoms = geoms[usable]
    if geoms.size == 0:
        return
    # The tree indexes the radius-r buffers themselves, so "intersects" yields
    # exactly the (polygon, postcode) pairs that can overlap. A dwithin query
    # at distance r against those buffers would also return postcodes up to 2r
    # away, whose clipped area is always zero.
    nfi_index, postcode_index = tree.query(geoms, predicate="intersects")
    if nfi_index.size == 0:
        return
    clipped_area = shapely.area(
        shapely.intersection(geoms[nfi_index], circles[postcode_index])
    )
    # Pairs that merely touch have zero overlap area and must not be counted.
    positive = clipped_area > 0
    postcode_index = postcode_index[positive]
    clipped_area = clipped_area[positive]
    # np.add.at accumulates correctly when a postcode index repeats.
    np.add.at(canopy_area, postcode_index, clipped_area)
    np.add.at(feature_count, postcode_index, 1)
 def _accumulate_nfi_metrics(
    dataset_path: str,
    circles: np.ndarray,
    tree: shapely.STRtree,
    canopy_area: np.ndarray,
    feature_count: np.ndarray,
    radius_m: int,
    batch_size: int,
    max_nfi_features: int | None,
 ) -> None:
    """Stream NFI features from ``dataset_path`` into the shared accumulators.

    Every layer of the dataset is read in Arrow batches of ``batch_size`` rows
    and handed to ``_add_nfi_batch``, which updates ``canopy_area`` and
    ``feature_count`` in place.

    Args:
        dataset_path: Dataset path understood by pyogrio.
        circles: Postcode buffer polygons, indexed like the accumulators.
        tree: STRtree built over ``circles``.
        canopy_area: Per-postcode area accumulator, updated in place.
        feature_count: Per-postcode count accumulator, updated in place.
        radius_m: Buffer radius, forwarded to ``_add_nfi_batch``.
        batch_size: Number of rows per Arrow batch.
        max_nfi_features: Optional cap on rows read across all layers; None
            means no cap.
    """
    layers = _layers(dataset_path, None)
    print(f"Processing {len(layers)} NFI layer(s): {', '.join(layers)}")
    # Density only needs the woodland flag + geometry; area is clipped from the
    # postcode buffer, not read from the file.
    columns = [NFI_CATEGORY_COL]
    features_seen = 0
    for layer in layers:
        # Once the cap is reached, do not open any further layer just to
        # discard its first batch.
        if max_nfi_features is not None and features_seen >= max_nfi_features:
            break
        with pyogrio.open_arrow(
            dataset_path,
            layer=layer,
            columns=columns,
            batch_size=batch_size,
            use_pyarrow=True,
        ) as (meta, reader):
            for batch_index, batch in enumerate(reader, start=1):
                if max_nfi_features is not None:
                    remaining = max_nfi_features - features_seen
                    if remaining <= 0:
                        break
                    # Trim the final batch so the cap is honoured exactly.
                    if batch.num_rows > remaining:
                        batch = batch.slice(0, remaining)
                features_seen += batch.num_rows
                names = batch.schema.names
                geometry_column = _geometry_column(meta, names)
                category = np.asarray(
                    batch.column(names.index(NFI_CATEGORY_COL)).to_numpy(
                        zero_copy_only=False
                    ),
                    dtype=object,
                )
                geometry = np.asarray(
                    batch.column(names.index(geometry_column)).to_numpy(
                        zero_copy_only=False
                    ),
                    dtype=object,
                )
                _add_nfi_batch(
                    shapely.from_wkb(geometry),
                    category,
                    circles,
                    tree,
                    canopy_area,
                    feature_count,
                    radius_m,
                )
                # Progress line on the first batch and every 25th thereafter.
                if batch_index == 1 or batch_index % 25 == 0:
                    print(f"  NFI batch {batch_index:,}: {features_seen:,} rows read")
 def _finalize_metrics(
    points: pl.DataFrame,
    canopy_area: np.ndarray,
    feature_count: np.ndarray,
    height_weighted_sum: np.ndarray,
    height_weight: np.ndarray,
    radius_m: int,
 ) -> pl.DataFrame:
    n_points = points.height
    density_col, area_col, count_col, height_col = _metric_columns(radius_m)
    buffer_area = math.pi * radius_m * radius_m
    density_pct = np.minimum(canopy_area / buffer_area * 100.0, 100.0)
@ -518,6 +698,18 @@ def main() -> None:
        action="store_true",
        help="Read the geodatabase directly from the zip instead of extracting it",
    )
    parser.add_argument(
        "--nfi-zip",
        type=Path,
        default=Path("property-data/NFI_WOODLAND_ENGLAND.zip"),
        help="Optional NFI woodland shapefile zip to union with TOW (skipped if absent)",
    )
    parser.add_argument(
        "--nfi-extract-dir",
        type=Path,
        default=Path("property-data/nfi_woodland_england"),
        help="Directory where the NFI zip is extracted",
    )
    parser.add_argument(
        "--arcgis",
        type=Path,
@ -554,11 +746,6 @@ def main() -> None:
        default=50,
        help="Radius around each postcode centroid used as the street-scale buffer",
    )
    parser.add_argument(
        "--tow-types",
        default=",".join(DEFAULT_TOW_TYPES),
        help='Comma-separated Woodland_Type values to include, or "all"',
    )
    parser.add_argument(
        "--layers",
        default=None,
@ -588,6 +775,12 @@ def main() -> None:
        default=None,
        help="Testing only: process at most N TOW features per layer",
    )
    parser.add_argument(
        "--max-nfi-features",
        type=int,
        default=None,
        help="Testing only: process at most N NFI woodland features",
    )
    args = parser.parse_args()
    if (args.output_streets or args.output_addresses) and args.price_paid is None:
@ -600,18 +793,53 @@ def main() -> None:
        args.tow_zip, args.extract_dir, args.force_extract, args.use_vsizip
    )
    points = _postcode_points(args.arcgis, args.max_postcodes)
    tow_types = _parse_csv_arg(args.tow_types)
    layer_names = _parse_csv_arg(args.layers)
-    postcode_metrics = _accumulate_tree_metrics(
+    n_points = points.height
    canopy_area = np.zeros(n_points, dtype=np.float64)
    feature_count = np.zeros(n_points, dtype=np.uint32)
    height_weighted_sum = np.zeros(n_points, dtype=np.float64)
    height_weight = np.zeros(n_points, dtype=np.float64)
    _accumulate_tree_metrics(
        dataset_path=dataset_path,
        points=points,
        radius_m=args.radius_m,
        tow_types=tow_types,
        batch_size=args.batch_size,
        layer_names=layer_names,
        max_features_per_layer=args.max_features_per_layer,
        workers=args.workers,
        canopy_area=canopy_area,
        feature_count=feature_count,
        height_weighted_sum=height_weighted_sum,
        height_weight=height_weight,
    )
    if args.nfi_zip is not None and args.nfi_zip.exists():
        nfi_path = _nfi_dataset_path(
            args.nfi_zip, args.nfi_extract_dir, args.force_extract, args.use_vsizip
        )
        circles, nfi_tree = _postcode_buffers(points, args.radius_m)
        _accumulate_nfi_metrics(
            dataset_path=nfi_path,
            circles=circles,
            tree=nfi_tree,
            canopy_area=canopy_area,
            feature_count=feature_count,
            radius_m=args.radius_m,
            batch_size=args.batch_size,
            max_nfi_features=args.max_nfi_features,
        )
    elif args.nfi_zip is not None:
        print(f"NFI zip not found, skipping woodland union: {args.nfi_zip}")
    postcode_metrics = _finalize_metrics(
        points,
        canopy_area,
        feature_count,
        height_weighted_sum,
        height_weight,
        args.radius_m,
    )
    postcode_metrics = _with_postcode_density_percentiles(
        postcode_metrics, args.radius_m
--- a/pipeline/transform/tree_overlay_tiles.py
+++ b/pipeline/transform/tree_overlay_tiles.py
@ -1,4 +1,4 @@
-"""Build PMTiles polygon tiles for the Trees Outside Woodland overlay."""
+"""Build PMTiles polygon tiles for the Trees Outside Woodland + NFI overlay."""
 from __future__ import annotations
@ -16,10 +16,14 @@ from pyproj import Transformer
 from pipeline.local_temp import local_tmp_dir
 from pipeline.transform.tree_density import (
-    DEFAULT_TOW_TYPES,
+    NFI_AREA_HA_COL,
    NFI_CATEGORY_COL,
    NFI_TYPE_COL,
    NFI_WOODLAND_VALUE,
    _geometry_column,
    _layers,
    _nfi_dataset_path,
    _tow_dataset_path,
    _where_for_tow_types,
 )
@ -55,17 +59,13 @@ def _number_or_none(value) -> float | int | None:
 def _write_tree_geojsonseq(
    dataset_path: str,
    output_path: Path,
    tow_types: tuple[str, ...],
    batch_size: int,
    layer_names: tuple[str, ...] | None,
    max_features_per_layer: int | None,
 ) -> int:
    to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
    where = _where_for_tow_types(tow_types)
    layers = _layers(dataset_path, layer_names)
    print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
    if where:
        print(f"TOW type filter: {where}")
    columns = [
        "TOW_ID",
@ -88,7 +88,6 @@ def _write_tree_geojsonseq(
                dataset_path,
                layer=layer,
                columns=columns,
                where=where,
                batch_size=batch_size,
                use_pyarrow=True,
            ) as (_meta, reader):
@ -136,6 +135,7 @@ def _write_tree_geojsonseq(
                    for idx, geometry_json in zip(valid_indexes, geometries_json):
                        properties = {
                            "source": "tow",
                            "tow_id": str(tow_id[idx]) if tow_id is not None else "",
                            "woodland_type": (
                                str(woodland_type[idx])
@ -176,11 +176,105 @@ def _write_tree_geojsonseq(
    return feature_count
 def _append_nfi_geojsonseq(
    dataset_path: str,
    output_path: Path,
    batch_size: int,
    max_nfi_features: int | None,
 ) -> int:
    """Append NFI woodland polygons to the same GeoJSONSeq as the TOW features.

    Rows whose category equals ``NFI_WOODLAND_VALUE`` are reprojected from
    EPSG:27700 to EPSG:4326 and written one GeoJSON Feature per line, using
    the same property keys as the TOW features (height and lidar fields are
    None for NFI). Rows with a missing or empty geometry are skipped.

    Args:
        dataset_path: Dataset path understood by pyogrio.
        output_path: Existing GeoJSONSeq file; opened in append mode.
        batch_size: Number of rows per Arrow batch.
        max_nfi_features: Optional cap on rows read across all layers; None
            means no cap.

    Returns:
        The number of features written.
    """
    to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
    layers = _layers(dataset_path, None)
    print(f"Processing {len(layers)} NFI layer(s): {', '.join(layers)}")
    columns = [NFI_CATEGORY_COL, NFI_TYPE_COL, NFI_AREA_HA_COL]
    feature_count = 0
    features_seen = 0
    with output_path.open("a") as file:
        for layer in layers:
            # Once the cap is reached, do not open any further layer.
            if max_nfi_features is not None and features_seen >= max_nfi_features:
                break
            with pyogrio.open_arrow(
                dataset_path,
                layer=layer,
                columns=columns,
                batch_size=batch_size,
                use_pyarrow=True,
            ) as (meta, reader):
                for batch in reader:
                    if max_nfi_features is not None:
                        remaining = max_nfi_features - features_seen
                        if remaining <= 0:
                            break
                        # Trim the final batch so the cap is honoured exactly.
                        if batch.num_rows > remaining:
                            batch = batch.slice(0, remaining)
                    features_seen += batch.num_rows
                    names = batch.schema.names
                    geometry_column = _geometry_column(meta, names)
                    category = np.asarray(
                        batch.column(names.index(NFI_CATEGORY_COL)).to_numpy(
                            zero_copy_only=False
                        ),
                        dtype=object,
                    )
                    geometry = np.asarray(
                        batch.column(names.index(geometry_column)).to_numpy(
                            zero_copy_only=False
                        ),
                        dtype=object,
                    )
                    woodland_indexes = np.flatnonzero(category == NFI_WOODLAND_VALUE)
                    if woodland_indexes.size == 0:
                        continue
                    geometries = shapely.from_wkb(geometry[woodland_indexes])
                    # A null geometry would come back from to_geojson as None
                    # and make json.loads raise; drop missing and empty
                    # geometries up front, as the density accumulation does.
                    usable = ~(
                        shapely.is_missing(geometries) | shapely.is_empty(geometries)
                    )
                    if not usable.any():
                        continue
                    geometries = geometries[usable]
                    valid_indexes = woodland_indexes[usable]
                    woodland_type = _column_or_none(batch, names, NFI_TYPE_COL)
                    area_ha = _column_or_none(batch, names, NFI_AREA_HA_COL)
                    geometries = shapely.transform(
                        geometries,
                        to_wgs84.transform,
                        interleaved=False,
                    )
                    geometries_json = shapely.to_geojson(geometries)
                    for idx, geometry_json in zip(valid_indexes, geometries_json):
                        # Guard a null area value: None * 10000.0 would raise.
                        raw_area_ha = area_ha[idx] if area_ha is not None else None
                        area_sqm = (
                            _number_or_none(raw_area_ha * 10000.0)
                            if raw_area_ha is not None
                            else None
                        )
                        properties = {
                            "source": "nfi",
                            "tow_id": "",
                            "woodland_type": (
                                str(woodland_type[idx])
                                if woodland_type is not None
                                else ""
                            ),
                            "area_sqm": area_sqm,
                            "mean_height_m": None,
                            "min_height_m": None,
                            "max_height_m": None,
                            "lidar_year": None,
                            "source_layer": layer,
                        }
                        feature = {
                            "type": "Feature",
                            "geometry": json.loads(geometry_json),
                            "properties": properties,
                        }
                        file.write(json.dumps(feature, separators=(",", ":")) + "\n")
                        feature_count += 1
    return feature_count
 def build_tree_overlay_tiles(
    tow_zip: Path,
    output_path: Path,
    extract_dir: Path,
    tow_types: tuple[str, ...],
    batch_size: int,
    layer_names: tuple[str, ...] | None,
    max_features_per_layer: int | None,
@ -188,6 +282,9 @@ def build_tree_overlay_tiles(
    max_zoom: int,
    force_extract: bool,
    use_vsizip: bool,
    nfi_zip: Path | None = None,
    nfi_extract_dir: Path = Path("property-data/nfi_woodland_england"),
    max_nfi_features: int | None = None,
 ) -> None:
    tippecanoe = _require_tippecanoe()
    dataset_path = _tow_dataset_path(tow_zip, extract_dir, force_extract, use_vsizip)
@ -198,13 +295,26 @@ def build_tree_overlay_tiles(
        feature_count = _write_tree_geojsonseq(
            dataset_path,
            ndjson_path,
            tow_types,
            batch_size,
            layer_names,
            max_features_per_layer,
        )
        print(f"Writing {feature_count:,} TOW polygon features")
        if nfi_zip is not None and nfi_zip.exists():
            nfi_path = _nfi_dataset_path(
                nfi_zip, nfi_extract_dir, force_extract, use_vsizip
            )
            nfi_count = _append_nfi_geojsonseq(
                nfi_path,
                ndjson_path,
                batch_size,
                max_nfi_features,
            )
            print(f"Writing {nfi_count:,} NFI woodland polygon features")
        elif nfi_zip is not None:
            print(f"NFI zip not found, skipping woodland union: {nfi_zip}")
        subprocess.run(
            [
                tippecanoe,
@ -237,26 +347,32 @@ def main() -> None:
        default=Path("property-data/fr_tow_v1_all"),
        help="Directory used to extract the FileGDB",
    )
    parser.add_argument(
        "--tow-type",
        action="append",
        dest="tow_types",
        help="Woodland_Type to include; repeatable. Defaults to TOW outside-woodland classes.",
    )
    parser.add_argument("--batch-size", type=int, default=50_000)
    parser.add_argument("--layer", action="append", dest="layers")
    parser.add_argument("--max-features-per-layer", type=int)
-    parser.add_argument("--min-zoom", type=int, default=15)
+    parser.add_argument("--min-zoom", type=int, default=12)
    parser.add_argument("--max-zoom", type=int, default=17)
    parser.add_argument("--force-extract", action="store_true")
    parser.add_argument("--use-vsizip", action="store_true")
    parser.add_argument(
        "--nfi-zip",
        type=Path,
        default=None,
        help="Optional NFI woodland shapefile zip to union into the overlay",
    )
    parser.add_argument(
        "--nfi-extract-dir",
        type=Path,
        default=Path("property-data/nfi_woodland_england"),
        help="Directory used to extract the NFI zip",
    )
    parser.add_argument("--max-nfi-features", type=int)
    args = parser.parse_args()
    build_tree_overlay_tiles(
        tow_zip=args.tow_zip,
        output_path=args.output,
        extract_dir=args.extract_dir,
        tow_types=tuple(args.tow_types or DEFAULT_TOW_TYPES),
        batch_size=args.batch_size,
        layer_names=tuple(args.layers) if args.layers else None,
        max_features_per_layer=args.max_features_per_layer,
@ -264,6 +380,9 @@ def main() -> None:
        max_zoom=args.max_zoom,
        force_extract=args.force_extract,
        use_vsizip=args.use_vsizip,
        nfi_zip=args.nfi_zip,
        nfi_extract_dir=args.nfi_extract_dir,
        max_nfi_features=args.max_nfi_features,
    )
--- a/pipeline/validate_outputs.py
+++ b/pipeline/validate_outputs.py
@ -3,6 +3,7 @@
 from __future__ import annotations
 import argparse
 import json
 import sys
 import zipfile
 from pathlib import Path
@ -76,6 +77,24 @@ def _split_glob(spec: str) -> tuple[Path, str]:
    return Path(base), pattern
 def _split_pair(spec: str, label: str) -> tuple[Path, Path]:
    """Split a ``LEFT::RIGHT`` spec into its two paths.

    Only the first ``::`` acts as the separator, so the right-hand side may
    itself contain ``::``. *label* describes what the pair is for and is used
    solely in the error message.

    Raises:
        argparse.ArgumentTypeError: if *spec* contains no ``::`` separator,
            or if either side of the separator is empty.
    """
    head, separator, tail = spec.partition("::")
    if not separator:
        message = f"{spec!r} must use LEFT::RIGHT for {label}"
        raise argparse.ArgumentTypeError(message)
    if not (head and tail):
        raise argparse.ArgumentTypeError(f"{spec!r} must include both paths")
    return Path(head), Path(tail)
 def _canonical_postcode(value: object) -> str:
    """Return *value* as upper-case text in a single canonical spacing.

    The value is converted with ``str``, every whitespace character is
    removed and the result is upper-cased. When five or more characters
    remain, a single space is inserted before the final three; shorter
    results are returned without any space.
    """
    squeezed = "".join(str(value).split()).upper()
    if len(squeezed) < 5:
        return squeezed
    head, tail = squeezed[:-3], squeezed[-3:]
    return f"{head} {tail}"
 def _matched_files(spec: str) -> tuple[Path, str, list[Path]]:
    base, pattern = _split_glob(spec)
    if not base.exists():
@ -105,6 +124,79 @@ def _failures_for_zip_glob(spec: str) -> list[str]:
    return failures
 def _postcode_column(columns: list[str]) -> str | None:
    for name in ("postcode", "Postcode", "pcds", "PCDS"):
        if name in columns:
            return name
    return None
 def _parquet_postcodes(path: Path) -> set[str]:
    """Return the canonical postcodes stored in the parquet file at *path*.

    The postcode column is chosen by ``_postcode_column`` from the file's
    schema. Nulls are dropped and duplicates removed by polars before each
    remaining value is normalised with ``_canonical_postcode``; values that
    normalise to an empty string are left out of the result.

    Raises:
        ValueError: if the schema contains no recognised postcode column.
    """
    # One lazy scan serves both the schema lookup and the column read.
    frame = pl.scan_parquet(path)
    column = _postcode_column(frame.collect_schema().names())
    if column is None:
        raise ValueError(f"{path}: missing postcode column")
    values = (
        frame.select(pl.col(column).drop_nulls().unique())
        .collect()
        .get_column(column)
        .to_list()
    )
    # Normalise every value exactly once, then drop the empty result (if any)
    # instead of canonicalising twice per value to filter and to keep.
    postcodes = {_canonical_postcode(value) for value in values}
    postcodes.discard("")
    return postcodes
 def _boundary_postcodes(path: Path) -> set[str]:
    """Collect canonical postcodes from the boundary GeoJSON files under *path*.

    Files matching ``*.geojson`` are read from ``path / "units"`` when that
    directory exists, otherwise from *path* itself. For every feature, the
    ``postcodes`` property is normalised with ``_canonical_postcode``;
    features with no properties, a missing/null ``postcodes`` property, or a
    value that normalises to an empty string contribute nothing.
    """
    units_candidate = path / "units"
    search_root = units_candidate if units_candidate.is_dir() else path
    found: set[str] = set()
    for unit_file in sorted(search_root.glob("*.geojson")):
        with unit_file.open("r", encoding="utf-8") as stream:
            collection = json.load(stream)
        for item in collection.get("features", []):
            # "properties" may be absent or null; treat both as empty.
            raw = (item.get("properties") or {}).get("postcodes")
            if raw is None:
                continue
            normalised = _canonical_postcode(raw)
            if normalised:
                found.add(normalised)
    return found
 def _sample(values: set[str]) -> str:
    """Return up to ten of *values*, in sorted order, joined by ``", "``."""
    ordered = sorted(values)
    shown = ordered[:10]
    return ", ".join(shown)
 def _failures_for_postcode_boundary_match(spec: str) -> list[str]:
    """Check that parquet postcodes and boundary postcodes are the same set.

    *spec* has the form ``PARQUET::DIR`` and is split with ``_split_pair``;
    whatever that helper raises for a malformed spec propagates to the caller.

    Returns a list of failure messages, empty when the two sets match:

    * If the parquet file or the boundary directory fails its own basic
      check, only those failures are returned and no comparison is attempted.
    * If reading either side raises, a single failure describing the
      exception is returned.
    * Otherwise failures are reported for an empty boundary set, for parquet
      postcodes without a boundary, and for boundary postcodes absent from
      the parquet file (each with a sample of up to ten postcodes).
    """
    parquet_path, boundaries_path = _split_pair(spec, "postcode boundary matching")
    parquet_issues = _failures_for_parquet(parquet_path)
    dir_issues = _failures_for_dir(boundaries_path)
    blocking = parquet_issues + dir_issues
    if blocking:
        # Inputs are unusable; comparing postcodes would only add noise.
        return blocking
    try:
        from_parquet = _parquet_postcodes(parquet_path)
        from_boundaries = _boundary_postcodes(boundaries_path)
    except Exception as exc:
        return [f"{parquet_path} / {boundaries_path}: postcode match check failed: {exc}"]
    problems: list[str] = []
    if not from_boundaries:
        problems.append(f"{boundaries_path}: no boundary postcodes found")
    without_boundary = from_parquet - from_boundaries
    if without_boundary:
        problems.append(
            f"{boundaries_path}: {len(without_boundary):,} postcodes from {parquet_path} "
            f"are missing boundaries; sample: {_sample(without_boundary)}"
        )
    without_row = from_boundaries - from_parquet
    if without_row:
        problems.append(
            f"{boundaries_path}: {len(without_row):,} boundary postcodes are absent from "
            f"{parquet_path}; sample: {_sample(without_row)}"
        )
    return problems
 def main() -> int:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--file", action="append", default=[], type=Path)
@ -123,6 +215,12 @@ def main() -> int:
        default=[],
        help="Require at least one readable zip matching BASE::PATTERN",
    )
    parser.add_argument(
        "--postcode-boundary-match",
        action="append",
        default=[],
        help="Require postcode parquet keys to exactly match boundary GeoJSON postcodes: PARQUET::DIR",
    )
    args = parser.parse_args()
    failures: list[str] = []
@ -138,6 +236,8 @@ def main() -> int:
        failures.extend(_failures_for_glob(spec))
    for spec in args.zip_glob:
        failures.extend(_failures_for_zip_glob(spec))
    for spec in args.postcode_boundary_match:
        failures.extend(_failures_for_postcode_boundary_match(spec))
    if failures:
        print("Output validation failed:", file=sys.stderr)