From 8688b7475e6d9a270b04e7318d6d5d248f14bdff Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Sun, 31 May 2026 15:36:33 +0100 Subject: [PATCH] scraping and data --- .gitignore | 2 + finder/Dockerfile | 25 + finder/constants.py | 52 +- finder/docker-compose.yml | 57 ++ finder/flaresolverr.py | 91 +++ finder/gdal-ecw/Dockerfile | 53 ++ finder/http_client.py | 4 +- finder/main.py | 39 +- finder/onthemarket.py | 219 ++++++- finder/rightmove.py | 228 +++++++- finder/scraper.py | 100 ++++ finder/storage.py | 10 + finder/test_onthemarket.py | 206 +++++++ finder/test_rightmove.py | 113 ++++ finder/test_transform.py | 129 +++- finder/test_zoopla.py | 288 +++++++++ finder/transform.py | 101 +++- finder/zoopla.py | 381 +++++++++++- finder/zoopla_flaresolverr.py | 164 ++++++ frontend/src/components/map/Map.tsx | 13 +- .../src/components/map/MobileDrawer.test.tsx | 107 ++++ frontend/src/lib/color-opacity.ts | 11 + frontend/src/lib/crime-types.ts | 35 ++ pipeline/download/inspire.py | 5 +- pipeline/download/satellite_highres.py | 505 ++++++++++++++++ pipeline/download/test_satellite_highres.py | 97 ++++ pipeline/test_validate_outputs.py | 52 ++ pipeline/transform/crime_spatial.py | 358 ++++++++++++ pipeline/transform/join_epc_pp.py | 3 + pipeline/transform/merge.py | 549 +++++++----------- pipeline/transform/noise_overlay_tiles.py | 2 +- .../transform/postcode_boundaries/__main__.py | 8 +- .../transform/postcode_boundaries/loader.py | 105 ++++ .../test_postcode_boundaries.py | 44 ++ .../transform/postcode_boundaries/uprn.py | 38 +- pipeline/transform/property_border_tiles.py | 138 +++++ pipeline/transform/test_crime_spatial.py | 147 +++++ pipeline/transform/test_join_epc_pp.py | 2 + pipeline/transform/test_merge.py | 367 ++++++++++-- pipeline/transform/test_tree_density.py | 64 ++ pipeline/transform/tree_density.py | 284 ++++++++- pipeline/transform/tree_overlay_tiles.py | 155 ++++- pipeline/validate_outputs.py | 100 ++++ 43 files changed, 4920 insertions(+), 531 
deletions(-) create mode 100644 finder/Dockerfile create mode 100644 finder/docker-compose.yml create mode 100644 finder/flaresolverr.py create mode 100644 finder/gdal-ecw/Dockerfile create mode 100644 finder/test_onthemarket.py create mode 100644 finder/test_rightmove.py create mode 100644 finder/test_zoopla.py create mode 100644 finder/zoopla_flaresolverr.py create mode 100644 frontend/src/components/map/MobileDrawer.test.tsx create mode 100644 frontend/src/lib/color-opacity.ts create mode 100644 frontend/src/lib/crime-types.ts create mode 100644 pipeline/download/satellite_highres.py create mode 100644 pipeline/download/test_satellite_highres.py create mode 100644 pipeline/transform/crime_spatial.py create mode 100644 pipeline/transform/postcode_boundaries/loader.py create mode 100644 pipeline/transform/property_border_tiles.py create mode 100644 pipeline/transform/test_crime_spatial.py diff --git a/.gitignore b/.gitignore index 4b1aeaa..6ab0ba9 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,8 @@ video/auth.* *.jpeg *.mp4 +**/*.log + r5-java/tmp property-data property-data2 diff --git a/finder/Dockerfile b/finder/Dockerfile new file mode 100644 index 0000000..bb5f2b5 --- /dev/null +++ b/finder/Dockerfile @@ -0,0 +1,25 @@ +# Finder scraper image. Runs via docker-compose sharing the media_gluetun VPN +# network namespace; the source tree is bind-mounted at runtime, so this image +# only needs the Python deps. The venv lives OUTSIDE the bind-mount target +# (/opt/venv) so the mount doesn't shadow it. +FROM python:3.12-slim + +ENV UV_PROJECT_ENVIRONMENT=/opt/venv \ + UV_COMPILE_BYTECODE=1 \ + UV_LINK_MODE=copy \ + PYTHONUNBUFFERED=1 + +RUN apt-get update \ + && apt-get install -y --no-install-recommends ca-certificates curl \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv + +WORKDIR /app/finder + +# Install dependencies into /opt/venv (cached layer; project code is mounted at runtime). 
+COPY pyproject.toml uv.lock ./ +RUN uv sync --no-install-project --frozen + +# Source is bind-mounted over /app/finder by compose. `uv run` uses /opt/venv. +CMD ["sleep", "infinity"] diff --git a/finder/constants.py b/finder/constants.py index 82a834b..17e6938 100644 --- a/finder/constants.py +++ b/finder/constants.py @@ -6,7 +6,9 @@ REPO_DIR = FINDER_DIR.parent DATA_DIR = Path(os.environ.get("DATA_DIR", str(FINDER_DIR / "data"))) ARCGIS_PATH = Path( - os.environ.get("ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet")) + os.environ.get( + "ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet") + ) ) PAGE_SIZE = 24 DELAY_BETWEEN_PAGES = 0.3 @@ -19,6 +21,19 @@ MAX_BEDROOMS = 20 # sanity cap — values above this are almost certainly parsi TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead" SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search" RIGHTMOVE_BASE = "https://www.rightmove.co.uk" +# Detail page (plain HTTPS GET, no Cloudflare). Its window.__PAGE_MODEL embeds +# propertyData.address.{outcode,incode}, which together form the property's TRUE +# full postcode — the search API only exposes the outcode. {id} is the numeric +# listing id from the search response. +RIGHTMOVE_DETAIL_URL = "https://www.rightmove.co.uk/properties/{id}" + +# The Rightmove search API gives only an outcode-level display address, so the +# true full postcode is recovered from each listing's detail page (see +# finder/rightmove.py::parse_detail_postcode). One extra GET per listing is a +# big throughput increase over the ~1000-result-per-outcode search, so detail +# fetching is gated and capped per outcode (mirrors ZOOPLA_* below). Default ON. 
RIGHTMOVE_FETCH_DETAILS = True  # fetch detail pages for true per-listing postcodes
RIGHTMOVE_MAX_DETAILS_PER_OUTCODE = 4000  # max detail-page fetches per outcode

# OnTheMarket
ONTHEMARKET_BASE = "https://www.onthemarket.com"

# Zoopla
ZOOPLA_BASE = "https://www.zoopla.co.uk"
# Zoopla search cards only carry an outcode-level address, so the full postcode
# and precise coordinates are scraped from each listing's detail page. These
# bound that extra work (see finder/zoopla.py and finder/scraper.py).
ZOOPLA_FETCH_DETAILS = True  # fetch detail pages for precise per-listing postcodes
ZOOPLA_MAX_DETAILS_PER_OUTCODE = 4000  # max detail-page fetches per outcode
# NOTE(review): 1_500_000 ms is 25 minutes per detail-page navigation, which
# looks large next to the per-outcode budget described below — confirm this is
# not a typo for 15_000 / 150_000. Value left unchanged.
ZOOPLA_DETAIL_GOTO_TIMEOUT_MS = 1500000  # per detail-page navigation timeout
# Fraction of a single outcode's wall-clock budget (ZOOPLA_OUTCODE_TIMEOUT_SECONDS)
# spent fetching details; the remainder is reserved for search pagination so
# detail fetches can never trip the timeout and discard collected listings.
ZOOPLA_DETAIL_BUDGET_FRACTION = 0.6

# Gluetun VPN. Network endpoints are env-overridable because they are
# deployment-specific: when finder runs in a SEPARATE container they use the
# `gluetun` hostname (defaults below); when finder SHARES gluetun's network
# namespace (docker-compose.yml, network_mode container:media_gluetun) they
# become localhost and GLUETUN_PROXY is empty (the shared netns already tunnels
# all traffic, so no HTTP proxy is needed).
# GLUETUN_PROXY="" (empty) => direct connection (no proxy); used in shared-netns.
GLUETUN_PROXY = os.environ.get("GLUETUN_PROXY", "http://gluetun:8888") or None
GLUETUN_CONTROL_URL = os.environ.get("GLUETUN_CONTROL_URL", "http://gluetun:8000")
# Credential (presumably for the Gluetun control server — confirm against the
# caller). It is read from the environment so the secret never lives in the
# repository; it is an empty string when GLUETUN_API_KEY is unset. The literal
# key that used to be hard-coded on this line has been committed to history and
# should be treated as leaked and rotated.
GLUETUN_API_KEY = os.environ.get("GLUETUN_API_KEY", "")
# Egress-IP rotations to try per Cloudflare challenge. Keep at 0 for Zoopla:
# rotating among Gluetun's datacenter IPs doesn't clear Cloudflare and would
# rotate away from the IP a cleared Cloudflare session was bound to, voiding it.
# Raise only with residential IPs where rotation helps.
GLUETUN_MAX_ROTATIONS = 0  # max egress-IP rotations per Cloudflare challenge

# Zoopla fetcher: "flaresolverr" (default) solves Cloudflare via the FlareSolverr
# sidecar (docker-compose.yml) and needs no display/VNC — verified to return the
# RSC flight stream with postcode + coordinates; "camoufox" drives a local
# anti-fingerprint browser (needs an interactive solve on datacenter IPs).
ZOOPLA_FETCHER = os.environ.get("ZOOPLA_FETCHER", "flaresolverr")
FLARESOLVERR_URL = os.environ.get("FLARESOLVERR_URL", "http://gluetun:8191/v1")
FLARESOLVERR_MAX_TIMEOUT_MS = 120000  # per-request solve budget; first solve is slow
+# +# Usage: +# cd finder +# docker compose up -d --build flaresolverr finder # start the sidecars +# docker compose exec finder uv run python main.py --source zoopla --outcodes SW9 --test +# docker compose exec finder uv run python main.py --source all # full run +# docker compose down +# +# NOTE: a manually-started `finder_flaresolverr` container from testing must be +# removed first (`docker rm -f finder_flaresolverr`) to avoid a name clash. + +services: + flaresolverr: + image: ghcr.io/flaresolverr/flaresolverr:latest + container_name: finder_flaresolverr + network_mode: "container:media_gluetun" + environment: + LOG_LEVEL: info + TZ: Europe/London + restart: unless-stopped + + finder: + build: + context: . + dockerfile: Dockerfile + image: finder-scraper:latest + container_name: finder_scraper + network_mode: "container:media_gluetun" + depends_on: + - flaresolverr + volumes: + - .:/app/finder # live-mounted finder source + - ../property-data:/app/property-data:ro # ARCGIS postcode data + working_dir: /app/finder + environment: + # Shared netns: sidecars are on localhost, and the netns already tunnels + # all traffic through the VPN, so no HTTP proxy is used. + ZOOPLA_FETCHER: flaresolverr + FLARESOLVERR_URL: http://localhost:8191/v1 + GLUETUN_CONTROL_URL: http://localhost:8000 + GLUETUN_PROXY: "" # empty => direct (shared netns already tunnels) + DATA_DIR: /app/finder/data + ARCGIS_PATH: /app/property-data/arcgis_data.parquet + restart: "no" + command: ["sleep", "infinity"] # stays up; run scrapes via `docker compose exec` diff --git a/finder/flaresolverr.py b/finder/flaresolverr.py new file mode 100644 index 0000000..dd91222 --- /dev/null +++ b/finder/flaresolverr.py @@ -0,0 +1,91 @@ +"""FlareSolverr client — fetch Cloudflare-protected pages as rendered HTML. + +FlareSolverr (https://github.com/FlareSolverr/FlareSolverr) drives an +undetected browser to pass Cloudflare's challenge and returns the fully +rendered HTML. 
log = logging.getLogger("flaresolverr")


class FlareSolverrError(Exception):
    """Raised when FlareSolverr cannot fetch/solve a URL."""


class FlareSolverrSession:
    """A reusable FlareSolverr browser session (context manager).

    Reusing one session keeps the cleared Cloudflare cookies warm across
    requests, so only the first fetch pays the full challenge-solve cost.

    An instance is single-use: the underlying HTTP client is closed when the
    ``with`` block exits, and also when entering the block fails, so the
    connection pool is never leaked.
    """

    def __init__(
        self,
        url: str = FLARESOLVERR_URL,
        session: str = "finder",
        max_timeout_ms: int = FLARESOLVERR_MAX_TIMEOUT_MS,
    ) -> None:
        self._url = url
        self._session = session
        self._max_timeout = max_timeout_ms
        # Read timeout must comfortably exceed maxTimeout (FlareSolverr blocks
        # for up to maxTimeout while solving before responding).
        self._client = httpx.Client(timeout=httpx.Timeout(self._max_timeout / 1000 + 30))
        self._active = False

    def _post(self, payload: dict) -> dict:
        """POST one command to FlareSolverr and return the decoded JSON object.

        Raises:
            FlareSolverrError: on any HTTP/transport failure, an undecodable or
                non-object body, or a reply whose ``status`` is not ``"ok"``.
        """
        try:
            resp = self._client.post(self._url, json=payload)
            resp.raise_for_status()
            data = resp.json()
        except (httpx.HTTPError, ValueError) as exc:
            raise FlareSolverrError(
                f"FlareSolverr request to {self._url} failed: {exc}"
            ) from exc
        # A JSON array/scalar has no .get(); report it as a FlareSolverr failure
        # instead of leaking an AttributeError to callers that only catch
        # FlareSolverrError.
        if not isinstance(data, dict):
            raise FlareSolverrError(
                f"FlareSolverr {payload.get('cmd')} failed: unexpected "
                f"{type(data).__name__} response"
            )
        if data.get("status") != "ok":
            raise FlareSolverrError(
                f"FlareSolverr {payload.get('cmd')} failed: {data.get('message')}"
            )
        return data

    def __enter__(self) -> "FlareSolverrSession":
        try:
            # Start from a clean session (ignore destroy errors for a fresh name).
            try:
                self._post({"cmd": "sessions.destroy", "session": self._session})
            except FlareSolverrError:
                pass
            self._post({"cmd": "sessions.create", "session": self._session})
        except BaseException:
            # __exit__ is never called when __enter__ raises, so release the
            # HTTP client here rather than leaking its connection pool.
            self._client.close()
            raise
        self._active = True
        log.info("FlareSolverr session %r ready at %s", self._session, self._url)
        return self

    def get(self, url: str) -> str:
        """Fetch a URL through FlareSolverr; return the solved HTML.

        Returns an empty string when the reply carries no ``solution.response``.
        Raises FlareSolverrError when the request itself fails.
        """
        data = self._post(
            {
                "cmd": "request.get",
                "session": self._session,
                "url": url,
                "maxTimeout": self._max_timeout,
            }
        )
        solution = data.get("solution")
        if not isinstance(solution, dict):
            return ""
        return solution.get("response") or ""

    def __exit__(self, *exc_info) -> None:
        try:
            if self._active:
                try:
                    self._post({"cmd": "sessions.destroy", "session": self._session})
                except FlareSolverrError as exc:
                    # Best effort: a failed destroy must not mask the block's
                    # own exception.
                    log.debug("FlareSolverr session destroy failed: %s", exc)
        finally:
            # Always release the client, even if the destroy call raised
            # something other than FlareSolverrError (e.g. KeyboardInterrupt).
            self._active = False
            self._client.close()
+# +# Build: docker build -t perfect-postcode/gdal-ecw:latest finder/gdal-ecw +# Verify: docker run --rm perfect-postcode/gdal-ecw:latest gdalinfo --formats | grep -i ECW + +FROM ghcr.io/osgeo/gdal:ubuntu-full-latest + +ARG LIBECWJ2_URL=https://github.com/rouault/libecwj2-3.3-builds/releases/download/v1/install-libecwj2-3.3-ubuntu-20.04.tar.gz + +RUN apt-get update && apt-get install -y --no-install-recommends \ + cmake g++ make git curl ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Open-source ECW v2 SDK (extracts to /opt/libecwj2-3.3) + make its libs loadable. +RUN curl --retry 3 --retry-all-errors --retry-delay 3 -fsSL -o /tmp/libecwj2.tar.gz "$LIBECWJ2_URL" \ + && tar -C / -xzf /tmp/libecwj2.tar.gz \ + && rm -f /tmp/libecwj2.tar.gz \ + && (cd /opt/libecwj2-3.3/lib && for so in *.so*; do \ + ln -sf "/opt/libecwj2-3.3/lib/$so" "/usr/lib/x86_64-linux-gnu/$so"; \ + done) \ + && ldconfig + +# Build the ECW driver plugin against the base image's exact GDAL sources. +RUN set -eux; \ + GDAL_COMMIT="$(gdalinfo --version | sed -nE 's/.*-([0-9a-f]{8,}).*/\1/p')"; \ + test -n "$GDAL_COMMIT"; \ + echo "Building ECW plugin for GDAL commit ${GDAL_COMMIT}"; \ + mkdir -p /tmp/gdal && cd /tmp/gdal && git init -q; \ + git fetch --depth 1 -q https://github.com/OSGeo/gdal.git "$GDAL_COMMIT"; \ + git checkout -q FETCH_HEAD; \ + cmake -S frmts/ecw -B /tmp/ecw-build \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_PREFIX_PATH=/usr \ + -DECW_ROOT=/opt/libecwj2-3.3; \ + cmake --build /tmp/ecw-build -j"$(nproc)"; \ + PLUGIN_DIR=/usr/lib/x86_64-linux-gnu/gdalplugins; \ + mkdir -p "$PLUGIN_DIR"; \ + find /tmp/ecw-build -name 'gdal_ECW*.so' -exec cp {} "$PLUGIN_DIR/" \; ; \ + rm -rf /tmp/gdal /tmp/ecw-build + +# Fail the build if the driver is not actually available. 
def make_client() -> httpx.Client:
    """Create the httpx client used for scraping requests.

    The client has a 30-second timeout, follows redirects, asks for JSON and
    sends a User-Agent drawn from ``_ua``. When ``GLUETUN_PROXY`` is set,
    requests go through that HTTP proxy (VPN egress); a falsy value means a
    direct connection.
    """
    request_headers = {
        "User-Agent": _ua.random,
        "Accept": "application/json",
    }
    # Normalise an empty proxy setting to None so httpx connects directly.
    proxy_url = GLUETUN_PROXY or None
    return httpx.Client(
        timeout=30,
        headers=request_headers,
        follow_redirects=True,
        proxy=proxy_url,
    )
+ ), + ) parser.add_argument( "--limit-outcodes", type=int, @@ -116,17 +126,32 @@ def main() -> int: from scraper import ( build_postcode_coords, build_postcode_index, + filter_londonish_outcodes, load_outcodes, run_scrape, ) - outcodes = load_outcodes() - if args.test and args.limit_outcodes is None: - preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)] - if preferred: - outcodes = preferred - if args.limit_outcodes is not None: - outcodes = outcodes[: args.limit_outcodes] + if args.outcodes is not None: + requested = [code.strip().upper() for code in args.outcodes.split(",") if code.strip()] + if not requested: + raise SystemExit("--outcodes was empty") + outcodes = filter_londonish_outcodes(requested) + dropped = sorted(set(requested) - set(outcodes)) + if dropped: + log.warning("Ignoring outcodes outside the Greater London-ish areas: %s", ", ".join(dropped)) + if not outcodes: + raise SystemExit( + "None of the requested outcodes are within the Greater London-ish areas " + f"({', '.join(requested)})." + ) + else: + outcodes = load_outcodes() + if args.test and args.limit_outcodes is None: + preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)] + if preferred: + outcodes = preferred + if args.limit_outcodes is not None: + outcodes = outcodes[: args.limit_outcodes] if not outcodes: raise SystemExit("No Greater London-ish outcodes loaded; nothing to scrape.") diff --git a/finder/onthemarket.py b/finder/onthemarket.py index 7a96e4e..00b90c4 100644 --- a/finder/onthemarket.py +++ b/finder/onthemarket.py @@ -10,6 +10,30 @@ Each rendered page contains 30 listings under `humanised-property-type`, `features` (a list where the first element is typically `"Tenure: "`), and `details-url`. Pagination is via `?page=N`; the loop terminates when `paginationControls.next` is null. + +Postcodes +--------- +The search card exposes only an *outcode*-level address (e.g. 
"Padfield Road, +London, SE5") and a map pin, so the old behaviour derived the postcode from the +nearest postcode to that pin — a guess that frequently lands on a neighbouring +unit (the pin can sit on the wrong side of a street boundary). + +Each *detail* page (`/details/{id}/`) is a plain HTTPS GET whose `__NEXT_DATA__` +embeds the property's analytics dataLayer at +`props.initialReduxState.metadata.dataLayer`, which carries the property's own +`postcode` (full unit postcode, e.g. "SE5 9AA") keyed to this listing by +`property-id`. Crucially this is NOT the agent's office postcode — that lives +separately at `…property.agent.postcode` ("SE5 8RS" for the same listing) and +is the classic trap when blindly scanning the page for a postcode. We read the +dataLayer postcode, verify `property-id` matches the listing, and accept it only +when its outcode agrees with the coordinate-nearest postcode (via +``resolve_listing_postcode``) — exactly the trust rule the other scrapers use. +Measured over a sample of real listings this yields a trustworthy, usually +exact-unit postcode for ~11/12 listings; the rest safely fall back to the +coordinate-nearest postcode. + +Detail fetching costs one extra HTTPS GET per listing, so it is gated behind +``OTM_FETCH_DETAILS`` and capped at ``OTM_MAX_DETAILS_PER_OUTCODE`` per outcode. """ import json @@ -31,14 +55,26 @@ from spatial import PostcodeSpatialIndex from transform import ( clean_listing_address, extract_full_postcode, + extract_outcode, fix_coords, map_property_type, normalize_sub_type, parse_display_size, + resolve_listing_postcode, ) log = logging.getLogger("rightmove") +# Detail-page postcode recovery (see module docstring). When enabled, each +# listing's detail page is fetched so its analytics dataLayer postcode — the +# property's own full unit postcode — can replace the coordinate-nearest guess. +# Bounded per outcode so a large outcode can't balloon into unbounded extra +# HTTPS GETs. 
Kept at parity with the Rightmove/Zoopla detail caps (400) so a +# typical outcode's listings all get their real postcode rather than a +# coordinate-nearest guess. +OTM_FETCH_DETAILS = True +OTM_MAX_DETAILS_PER_OUTCODE = 400 + _NEXT_DATA_RE = re.compile( r'', re.DOTALL, @@ -51,6 +87,11 @@ _HTML_HEADERS = { "Accept-Language": "en-GB,en;q=0.9", } +# listingId -> recovered full postcode (or None). Failures are cached too so a +# broken or postcode-less detail page is not re-fetched within a run (the same +# listing can reappear across overlapping outcode searches). +_detail_postcode_cache: dict[str, str | None] = {} + def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict | None: """GET one search-results page and return the embedded __NEXT_DATA__ JSON. @@ -119,6 +160,116 @@ def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict return None +def parse_detail_postcode(html: str, listing_id: str | None = None) -> str | None: + """Extract the property's own full postcode from an OnTheMarket detail page. + + Pure and network-free so it is unit-testable: callers pass `page.content()` + / the GET body and this does the parsing. + + The postcode lives in the analytics dataLayer embedded in `__NEXT_DATA__` at + ``props.initialReduxState.metadata.dataLayer.postcode`` and is the + property's own unit postcode (e.g. "SE5 9AA"). It is deliberately NOT the + agent's office postcode, which sits separately at + ``…property.agent.postcode`` — the trap when scanning a detail page for "a" + postcode. When ``listing_id`` is given, the dataLayer's ``property-id`` must + match it, guaranteeing we read this listing's postcode and not a stray one. + + Returns a normalized full postcode (e.g. "SE5 9AA") or ``None`` when the + page has no usable property postcode. Trust (outcode-vs-coordinates + agreement) is enforced later in ``transform_property``. 
+ """ + if not html: + return None + + match = _NEXT_DATA_RE.search(html) + if not match: + return None + try: + data = json.loads(match.group(1)) + except json.JSONDecodeError: + return None + + try: + data_layer = data["props"]["initialReduxState"]["metadata"]["dataLayer"] + except (KeyError, TypeError): + return None + if not isinstance(data_layer, dict): + return None + + # Guard against reading a different listing's postcode: the dataLayer is the + # property's own analytics payload, so its property-id must match. + if listing_id is not None: + page_id = data_layer.get("property-id") + if page_id is not None and str(page_id) != str(listing_id): + return None + + raw_postcode = data_layer.get("postcode") + if not isinstance(raw_postcode, str): + return None + return extract_full_postcode(raw_postcode) + + +def _fetch_detail_postcode( + client: httpx.Client, details_url: str, listing_id: str +) -> str | None: + """GET one listing's detail page and return its dataLayer postcode (or None). + + Results (including failures) are cached by listing id so a listing that + reappears across overlapping outcode searches is fetched at most once. Plain + HTTPS GET — OnTheMarket detail pages have no Cloudflare challenge. Network / + parse errors degrade gracefully to None so the caller falls back to the + coordinate-nearest postcode. 
+ """ + if listing_id in _detail_postcode_cache: + return _detail_postcode_cache[listing_id] + + full_url = ( + ONTHEMARKET_BASE + details_url + if details_url and not details_url.startswith("http") + else details_url + ) + result: str | None = None + if full_url: + for attempt in range(MAX_RETRIES): + try: + resp = client.get( + full_url, headers=_HTML_HEADERS, follow_redirects=True + ) + except ( + httpx.ConnectError, + httpx.ReadTimeout, + httpx.WriteTimeout, + httpx.PoolTimeout, + ) as exc: + delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) + log.warning( + "%s from %s, retry %d/%d in %.1fs", + type(exc).__name__, full_url, attempt + 1, MAX_RETRIES, delay, + ) + time.sleep(delay) + continue + + if resp.status_code == 200: + result = parse_detail_postcode(resp.text, listing_id) + break + if resp.status_code in (429, 500, 502, 503, 504): + delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) + log.warning( + "HTTP %d from %s, retry %d/%d in %.1fs", + resp.status_code, full_url, attempt + 1, MAX_RETRIES, delay, + ) + time.sleep(delay) + continue + log.debug( + "OnTheMarket detail %s returned HTTP %d (no postcode)", + listing_id, resp.status_code, + ) + break + + _detail_postcode_cache[listing_id] = result + return result + + def _parse_price(price_value) -> int: """Parse a formatted price string like '£450,000' into an integer. Returns 0 for POA/auction/null values.""" @@ -166,9 +317,19 @@ def _extract_floor_area(features: list) -> float | None: def transform_property( - raw: dict, pc_index: PostcodeSpatialIndex + raw: dict, + pc_index: PostcodeSpatialIndex, + detail_postcode: str | None = None, ) -> dict | None: - """Transform a raw OnTheMarket listing dict into our output schema.""" + """Transform a raw OnTheMarket listing dict into our output schema. 
+ + ``detail_postcode`` is the property's own full postcode recovered from its + detail page (see ``parse_detail_postcode`` / ``_fetch_detail_postcode``), + or ``None`` when no detail fetch was done / no postcode was found. When + present and trustworthy (its outcode agrees with the coordinate-nearest + postcode) it supersedes the coordinate guess and is labelled + ``"detail_address"``. + """ loc = raw.get("location") or {} raw_lat = loc.get("lat") raw_lng = loc.get("lon") @@ -184,8 +345,29 @@ def transform_property( return None raw_address = raw.get("address", "") or "" extracted_postcode = extract_full_postcode(raw_address) - postcode = extracted_postcode or inferred_postcode - postcode_source = "address" if extracted_postcode else "coordinates" + + # Prefer the property's own detail-page postcode when we have one and it is + # trustworthy. The detail postcode is a full unit postcode (better than the + # coordinate-nearest guess and than the usually outcode-only card address), + # but a stale/mislabelled value would silently override the spatially + # correct one, so apply the same outcode-agreement trust rule the address + # postcode uses: keep it only when its outcode matches the + # coordinate-nearest postcode's outcode. 
+ detail_postcode = extract_full_postcode(detail_postcode) + if detail_postcode and extract_outcode(detail_postcode) == extract_outcode( + inferred_postcode + ): + postcode, postcode_source = detail_postcode, "detail_address" + else: + if detail_postcode: + log.debug( + "OnTheMarket %s: rejecting detail postcode %s " + "(outcode mismatch with inferred %s)", + raw.get("id", "?"), detail_postcode, inferred_postcode, + ) + postcode, postcode_source = resolve_listing_postcode( + extracted_postcode, inferred_postcode + ) raw_beds = raw.get("bedrooms") or 0 raw_baths = raw.get("bathrooms") or 0 @@ -223,6 +405,10 @@ def transform_property( "Inferred postcode": inferred_postcode, "Listing raw address": raw_address, "Address per Property Register": clean_listing_address(raw_address), + # OnTheMarket search JSON exposes only a street-level address; no UPRN + # or house number/name is available without a detail-page fetch. + "UPRN": None, + "Property number or name": None, "Leasehold/Freehold": _extract_tenure(features), "Property type": map_property_type(sub_type), "Property sub-type": normalize_sub_type(sub_type), @@ -242,10 +428,17 @@ def search_outcode( pc_index: PostcodeSpatialIndex, max_properties: int | None = None, ) -> list[dict]: - """Paginate through OnTheMarket sale results for one outcode.""" + """Paginate through OnTheMarket sale results for one outcode. + + When ``OTM_FETCH_DETAILS`` is enabled, up to + ``OTM_MAX_DETAILS_PER_OUTCODE`` listings per outcode have their detail page + fetched for the property's own postcode (see ``_fetch_detail_postcode``); + the rest fall back to the coordinate-nearest postcode. 
+ """ properties: list[dict] = [] seen_ids: set[str] = set() page_num = 1 + details_fetched = 0 while True: data = _fetch_page_json(client, outcode, page_num) @@ -269,8 +462,22 @@ def search_outcode( if listing_id and listing_id in seen_ids: continue seen_ids.add(listing_id) + + detail_postcode = None + if OTM_FETCH_DETAILS and listing_id: + # Cached lookups are free; only fresh GETs count toward the cap + # and incur the inter-request delay. + cached = listing_id in _detail_postcode_cache + if cached or details_fetched < OTM_MAX_DETAILS_PER_OUTCODE: + detail_postcode = _fetch_detail_postcode( + client, raw.get("details-url") or "", listing_id + ) + if not cached: + details_fetched += 1 + time.sleep(DELAY_BETWEEN_PAGES) + try: - transformed = transform_property(raw, pc_index) + transformed = transform_property(raw, pc_index, detail_postcode) except Exception as exc: log.warning( "OnTheMarket %s property %s failed to transform: %s", diff --git a/finder/rightmove.py b/finder/rightmove.py index 883c68a..956b73d 100644 --- a/finder/rightmove.py +++ b/finder/rightmove.py @@ -1,4 +1,6 @@ +import json import logging +import re import time import httpx @@ -6,12 +8,15 @@ import httpx from constants import ( PAGE_SIZE, DELAY_BETWEEN_PAGES, + RIGHTMOVE_DETAIL_URL, + RIGHTMOVE_FETCH_DETAILS, + RIGHTMOVE_MAX_DETAILS_PER_OUTCODE, SEARCH_URL, TYPEAHEAD_URL, ) from http_client import fetch_with_retry from spatial import PostcodeSpatialIndex -from transform import transform_property +from transform import extract_full_postcode, normalize_postcode, transform_property log = logging.getLogger("rightmove") @@ -23,6 +28,176 @@ outcode_cache: dict[str, str] = {} _MAX_INDEX = 1008 +# --------------------------------------------------------------------------- +# Detail-page postcode extraction +# --------------------------------------------------------------------------- +# +# The search API (_paginate) only returns an outcode-level `displayAddress` +# (e.g. 
"Akerman Road, Brixton, London, SW9") — never the full postcode. Each +# listing's detail page, however, embeds the property's OWN full postcode in a +# `window.__PAGE_MODEL` script as `propertyData.address.{outcode, incode}` +# (e.g. outcode "SW9" + incode "0HD" → "SW9 0HD"), independently corroborated by +# `propertyData.propertyUrls.similarPropertiesUrl` ("/property-for-sale/SW9-0HD.html"). +# This is the property's own postcode, NOT a nearest station/school: the +# `nearestStations`/`nearestAirports` arrays carry only names + distances, no +# postcodes, and the address outcode always matches the searched outcode. +# Recon over 24 live listings across SW9/E1/M1/LS6/E20 (incl. APPROXIMATE_POINT +# new-builds) found the full postcode present 100% of the time. There is no +# UPRN or house-number field anywhere in propertyData, so those stay None. +# +# __PAGE_MODEL is a "devalue"-style flattened object graph: its `data` field is +# a JSON STRING holding a flat array where every integer inside a container is +# an index reference into that same array (so the graph can dedupe). We +# brace-match the (large, deeply-nested) object literal — a non-greedy regex +# cannot — then rehydrate the reference graph before reading the address. + +_PAGE_MODEL_RE = re.compile(r"window\.__PAGE_MODEL\s*=\s*") + + +def _extract_page_model_literal(html: str) -> str | None: + """Return the `{...}` object literal assigned to window.__PAGE_MODEL. + + Brace-matches with string/escape awareness so embedded braces and quotes in + string values don't end the match early. 
Returns None when absent.""" + marker = _PAGE_MODEL_RE.search(html) + if not marker: + return None + start = marker.end() + if start >= len(html) or html[start] != "{": + return None + depth = 0 + in_str = False + esc = False + for j in range(start, len(html)): + ch = html[j] + if in_str: + if esc: + esc = False + elif ch == "\\": + esc = True + elif ch == '"': + in_str = False + elif ch == '"': + in_str = True + elif ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0: + return html[start : j + 1] + return None + + +def _rehydrate(flat: list) -> object: + """Resolve a devalue-style flattened reference array into a nested object. + + Index 0 is the root; every int inside a dict/list is an index back into + ``flat``. Memoised so shared/cyclic references resolve once.""" + cache: dict[int, object] = {} + + def resolve(idx: int) -> object: + if not isinstance(idx, int) or idx < 0 or idx >= len(flat): + return None + if idx in cache: + return cache[idx] + node = flat[idx] + if isinstance(node, dict): + out: dict = {} + cache[idx] = out + for key, value in node.items(): + out[key] = resolve(value) if isinstance(value, int) else value + return out + if isinstance(node, list): + arr: list = [] + cache[idx] = arr + for value in node: + arr.append(resolve(value) if isinstance(value, int) else value) + return arr + cache[idx] = node + return node + + return resolve(0) + + +def parse_detail_postcode(html: str) -> str | None: + """Extract a Rightmove property's TRUE full postcode from its detail HTML. + + Pure and network-free so it is unit-testable: callers pass the page HTML. + Reads ``propertyData.address.outcode`` + ``.incode`` from window.__PAGE_MODEL + and returns a normalised full postcode (e.g. "SW9 0HD"), or None when the + page has no parseable address (the property location wrapper can be empty — + the caller then keeps the coordinate fallback). The returned outcode is + re-validated against the joined postcode so a malformed incode is dropped. 
+ """ + if not html: + return None + literal = _extract_page_model_literal(html) + if not literal: + return None + try: + outer = json.loads(literal) + flat = json.loads(outer["data"]) + except (ValueError, KeyError, TypeError): + return None + if not isinstance(flat, list) or not flat: + return None + + root = _rehydrate(flat) + if not isinstance(root, dict): + return None + property_data = root.get("propertyData") + if not isinstance(property_data, dict): + return None + address = property_data.get("address") + if not isinstance(address, dict): + return None + + outcode = address.get("outcode") + incode = address.get("incode") + if not isinstance(outcode, str) or not isinstance(incode, str): + return None + outcode, incode = outcode.strip(), incode.strip() + if not outcode or not incode: + return None + + # Round-trip through the shared postcode validator/normaliser: this both + # canonicalises spacing and rejects an outcode/incode pair that doesn't form + # a structurally-valid UK postcode. + return extract_full_postcode(normalize_postcode(f"{outcode} {incode}")) + + +# listingId -> true full postcode (or None when unavailable). Failures are +# cached too, so a broken/duplicate listing is fetched at most once per run (the +# same listing can reappear across overlapping outcode searches). +_detail_postcode_cache: dict[str, str | None] = {} + + +def _fetch_detail_postcode(client: httpx.Client, property_id: str) -> str | None: + """GET a listing detail page and return its true full postcode (or None). + + Results (including failures) are cached by listing id. 
The detail page is a + plain HTML GET — no Cloudflare, unlike Zoopla — so a single httpx call + suffices; any error degrades gracefully to the coordinate fallback.""" + if not property_id: + return None + if property_id in _detail_postcode_cache: + return _detail_postcode_cache[property_id] + + postcode: str | None = None + url = RIGHTMOVE_DETAIL_URL.format(id=property_id) + try: + resp = client.get(url, headers={"Accept": "text/html"}) + if resp.status_code == 200: + postcode = parse_detail_postcode(resp.text) + else: + log.debug("Rightmove detail %s returned HTTP %d", url, resp.status_code) + except httpx.HTTPError as exc: + log.debug("Rightmove detail fetch failed %s: %s", url, exc) + + _detail_postcode_cache[property_id] = postcode + return postcode + + def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None: """Look up Rightmove's internal ID for an outcode via typeahead API.""" if outcode in outcode_cache: @@ -44,6 +219,31 @@ def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None: return None +def _detail_postcode_for( + client: httpx.Client, + prop: dict, + fetch_details: bool, + detail_budget: dict, +) -> str | None: + """Look up a listing's true postcode, honouring the per-outcode fetch cap. 
+ + Cached listings are always served (they cost neither a cap slot nor a GET); + a fresh fetch is made only while ``detail_budget['remaining'] > 0``.""" + if not fetch_details: + return None + property_id = str(prop.get("id") or "") + if not property_id: + return None + if property_id in _detail_postcode_cache: + return _detail_postcode_cache[property_id] + if detail_budget["remaining"] <= 0: + return None + detail_budget["remaining"] -= 1 + postcode = _fetch_detail_postcode(client, property_id) + time.sleep(DELAY_BETWEEN_PAGES) + return postcode + + def _paginate( client: httpx.Client, outcode_id: str, @@ -51,11 +251,19 @@ def _paginate( channel_cfg: dict, pc_index: PostcodeSpatialIndex, max_properties: int | None = None, + fetch_details: bool = False, + detail_cap: int = 0, ) -> tuple[list[dict], int]: - """Paginate through search results. Returns (properties, result_count).""" + """Paginate through search results. Returns (properties, result_count). + + When ``fetch_details`` is set, up to ``detail_cap`` listings per outcode have + their detail page fetched for the property's TRUE full postcode (see + ``parse_detail_postcode``); the rest fall back to coordinate-derived + postcodes.""" properties = [] index = 0 result_count = 0 + detail_budget = {"remaining": detail_cap} while True: params = { @@ -82,7 +290,12 @@ def _paginate( for prop in raw_props: try: - transformed = transform_property(prop, outcode, pc_index) + detail_postcode = _detail_postcode_for( + client, prop, fetch_details, detail_budget + ) + transformed = transform_property( + prop, outcode, pc_index, detail_postcode=detail_postcode + ) except Exception as exc: log.warning( "Rightmove %s/%s property %s failed to transform: %s", @@ -127,7 +340,12 @@ def search_outcode( pc_index: PostcodeSpatialIndex, max_properties: int | None = None, ) -> list[dict]: - """Paginate through unfiltered sale results for one outcode+channel.""" + """Paginate through unfiltered sale results for one outcode+channel. 
+ + Each listing's detail page is fetched for the property's TRUE full postcode + (gated by ``RIGHTMOVE_FETCH_DETAILS`` and capped per outcode by + ``RIGHTMOVE_MAX_DETAILS_PER_OUTCODE``); listings beyond the cap keep the + coordinate-derived postcode.""" properties, _ = _paginate( client, outcode_id, @@ -135,6 +353,8 @@ def search_outcode( channel_cfg, pc_index, max_properties=max_properties, + fetch_details=RIGHTMOVE_FETCH_DETAILS, + detail_cap=RIGHTMOVE_MAX_DETAILS_PER_OUTCODE, ) if max_properties is not None and len(properties) >= max_properties: diff --git a/finder/scraper.py b/finder/scraper.py index 1111577..25ddf90 100644 --- a/finder/scraper.py +++ b/finder/scraper.py @@ -15,6 +15,10 @@ from constants import ( DATA_DIR, DELAY_BETWEEN_OUTCODES, LONDON_OUTCODE_PREFIXES, + ZOOPLA_DETAIL_BUDGET_FRACTION, + ZOOPLA_FETCH_DETAILS, + ZOOPLA_FETCHER, + ZOOPLA_MAX_DETAILS_PER_OUTCODE, ) from http_client import make_client @@ -371,6 +375,36 @@ def _zoopla_outcode_timeout_seconds() -> int: return timeout +def _zoopla_detail_cap() -> int: + """Max detail-page fetches per outcode (0 disables detail fetching). + + Zoopla search cards only expose an outcode-level address, so the full + postcode/coordinates come from each listing's detail page. The cap bounds + the extra page loads so an outcode stays within ZOOPLA_OUTCODE_TIMEOUT_SECONDS + (the per-outcode SIGALRM budget covers the detail fetches too). Configure via + ZOOPLA_FETCH_DETAILS / ZOOPLA_MAX_DETAILS_PER_OUTCODE in constants.py.""" + return ZOOPLA_MAX_DETAILS_PER_OUTCODE if ZOOPLA_FETCH_DETAILS else 0 + + +def _open_zoopla_detail_tab(page, detail_cap: int): + """Open a second tab on the same context for detail-page fetches. + + Sharing the persistent context means the detail tab inherits the search + tab's Cloudflare clearance cookies. 
Returns None when detail fetching is + disabled or the tab cannot be created (the scrape then degrades to + outcode-level postcodes rather than failing).""" + if detail_cap <= 0: + return None + try: + return page.context.new_page() + except Exception as exc: + log.warning( + "Zoopla detail tab unavailable (%s); using outcode-level postcodes", + _exception_detail(exc), + ) + return None + + @contextmanager def _wall_clock_timeout(seconds: int, label: str): """SIGALRM-based wall-clock guard (POSIX). Raises OutcodeTimeout on expiry. @@ -438,6 +472,50 @@ def _close_zoopla_browser(browser, label: str) -> None: log.warning("%s browser force-close failed: %s", label, _exception_detail(exc)) +def _scrape_zoopla_flaresolverr( + outcodes: list[str], + pc_index: PostcodeSpatialIndex, + pc_coords: dict[str, tuple[float, float]], + results: dict[str, list[dict]], + errors: list[str], + max_properties_per_source: int | None, +) -> None: + """Scrape Zoopla via the FlareSolverr sidecar (no browser/VNC).""" + from flaresolverr import FlareSolverrError, FlareSolverrSession + from zoopla_flaresolverr import search_outcode as fs_search_outcode + + try: + session = FlareSolverrSession(session="zoopla") + session.__enter__() + except FlareSolverrError as exc: + errors.append(f"zoopla: FlareSolverr unavailable: {exc}") + log.warning("Zoopla skipped: FlareSolverr unavailable: %s", exc) + return + + try: + for outcode in outcodes: + remaining = _source_remaining(results, "zoopla", max_properties_per_source) + if remaining == 0: + log.info("Zoopla cap reached") + return + try: + props, _ = fs_search_outcode( + outcode, + pc_index, + pc_coords, + session, + max_properties=remaining, + detail_cap=ZOOPLA_MAX_DETAILS_PER_OUTCODE, + ) + added = _store_properties(results, "zoopla", props, max_properties_per_source) + log.info("Zoopla %s: +%d", outcode, added) + except Exception as exc: # noqa: BLE001 - one outcode must not kill the run + _record_error(errors, "zoopla", outcode, exc) + 
time.sleep(DELAY_BETWEEN_OUTCODES) + finally: + session.__exit__(None, None, None) + + def _scrape_zoopla( outcodes: list[str], pc_index: PostcodeSpatialIndex, @@ -446,6 +524,12 @@ def _scrape_zoopla( errors: list[str], max_properties_per_source: int | None, ) -> None: + if ZOOPLA_FETCHER == "flaresolverr": + _scrape_zoopla_flaresolverr( + outcodes, pc_index, pc_coords, results, errors, max_properties_per_source + ) + return + try: browser, page = _launch_zoopla_with_retries() except Exception as exc: @@ -454,6 +538,12 @@ def _scrape_zoopla( return outcode_timeout = _zoopla_outcode_timeout_seconds() + detail_cap = _zoopla_detail_cap() + detail_page = _open_zoopla_detail_tab(page, detail_cap) + # Spend at most a fraction of each outcode's budget on detail fetches so the + # SIGALRM guard never trips mid-outcode and discards already-collected + # search listings; the rest is left for search pagination and transform. + detail_budget_seconds = max(10.0, outcode_timeout * ZOOPLA_DETAIL_BUDGET_FRACTION) try: for outcode in outcodes: @@ -470,6 +560,9 @@ def _scrape_zoopla( pc_index, pc_coords, max_properties=None, + detail_page=detail_page, + detail_cap=detail_cap, + detail_budget_seconds=detail_budget_seconds, ) added = _store_properties( results, @@ -496,6 +589,8 @@ def _scrape_zoopla( _close_zoopla_browser(browser, f"zoopla {outcode}") try: browser, page = _launch_zoopla_with_retries() + # The old context (and its detail tab) is gone; reopen one. 
+ detail_page = _open_zoopla_detail_tab(page, detail_cap) log.info("Zoopla %s retrying with fresh browser", outcode) except Exception as relaunch_exc: _record_error(errors, "zoopla", outcode, relaunch_exc) @@ -503,6 +598,11 @@ def _scrape_zoopla( time.sleep(DELAY_BETWEEN_OUTCODES) finally: + if detail_page is not None: + try: + detail_page.close() + except Exception: + pass _close_zoopla_browser(browser, "zoopla final") diff --git a/finder/storage.py b/finder/storage.py index 6c6822e..13008eb 100644 --- a/finder/storage.py +++ b/finder/storage.py @@ -126,6 +126,14 @@ def write_parquet(properties: list[dict], path: Path) -> None: "Address per Property Register": [ p["Address per Property Register"] for p in properties ], + # UPRN (when the scraper recovered it) keys an exact listing->EPC + # join; Property number or name is the house identifier for the + # Price-Paid address join. Both are None for sources/listings without + # a detail-page fetch. + "UPRN": [p.get("UPRN") for p in properties], + "Property number or name": [ + p.get("Property number or name") for p in properties + ], "Leasehold/Freehold": [p["Leasehold/Freehold"] for p in properties], "Property type": [p["Property type"] for p in properties], "Property sub-type": [p["Property sub-type"] for p in properties], @@ -149,6 +157,8 @@ def write_parquet(properties: list[dict], path: Path) -> None: "Inferred postcode": pl.Utf8, "Listing raw address": pl.Utf8, "Address per Property Register": pl.Utf8, + "UPRN": pl.Utf8, + "Property number or name": pl.Utf8, "Leasehold/Freehold": pl.Utf8, "Property type": pl.Utf8, "Property sub-type": pl.Utf8, diff --git a/finder/test_onthemarket.py b/finder/test_onthemarket.py new file mode 100644 index 0000000..4e0b3dd --- /dev/null +++ b/finder/test_onthemarket.py @@ -0,0 +1,206 @@ +"""Tests for the OnTheMarket scraper's detail-page postcode recovery. 
+ +`parse_detail_postcode` is pure (takes the detail-page HTML, returns a postcode +or None), so these tests use a trimmed but faithful copy of a real OnTheMarket +detail page's `__NEXT_DATA__` payload. The fixture mirrors the live structure: +the property's own postcode lives in the analytics dataLayer +(`props.initialReduxState.metadata.dataLayer.postcode`) while the agent's office +postcode sits separately under `…property.agent.postcode` — the trap we must not +fall into. +""" + +import json + +import onthemarket +from onthemarket import parse_detail_postcode, transform_property + + +class _StubIndex: + """Minimal stand-in for PostcodeSpatialIndex returning a fixed postcode.""" + + def __init__(self, postcode: str | None): + self._postcode = postcode + + def nearest(self, lat: float, lng: float) -> str | None: + return self._postcode + + +def _detail_html( + *, + property_id: int = 19522441, + datalayer_postcode: str = "SE5 9AA", + agent_postcode: str = "SE5 8RS", +) -> str: + """Build detail-page HTML with a real-shaped __NEXT_DATA__ payload.""" + next_data = { + "props": { + "initialReduxState": { + "metadata": { + "dataLayer": { + "page-type": "details-section", + "property-type": "homes", + # The property's own unit postcode. + "postcode": datalayer_postcode, + "property-id": property_id, + "price": "275,000", + "addressline_2": "Padfield Road", + } + }, + "property": { + "displayAddress": "Padfield Road, London, SE5", + "location": {"lon": -0.100233, "lat": 51.466129}, + # The agent block carries the AGENT'S office postcode — the + # trap. parse_detail_postcode must not return this. 
+ "agent": { + "address": "29 Denmark Hill, Camberwell\nLondon\nSE5 8RS", + "postcode": agent_postcode, + }, + }, + } + } + } + payload = json.dumps(next_data) + return ( + "" + '" + ) + + +# --------------------------------------------------------------------------- +# parse_detail_postcode +# --------------------------------------------------------------------------- + + +def test_parse_returns_property_postcode_not_agent(): + html = _detail_html(datalayer_postcode="SE5 9AA", agent_postcode="SE5 8RS") + assert parse_detail_postcode(html, "19522441") == "SE5 9AA" + + +def test_parse_normalizes_spacing(): + html = _detail_html(datalayer_postcode="se59aa") + assert parse_detail_postcode(html, "19522441") == "SE5 9AA" + + +def test_parse_ignores_mismatched_property_id(): + # dataLayer postcode belongs to property 19522441; asking for a different + # listing id must refuse to return it. + html = _detail_html(property_id=19522441) + assert parse_detail_postcode(html, "99999999") is None + + +def test_parse_accepts_when_no_listing_id_given(): + html = _detail_html(datalayer_postcode="SE5 9AA") + assert parse_detail_postcode(html, None) == "SE5 9AA" + + +def test_parse_handles_missing_postcode(): + html = _detail_html(datalayer_postcode="") + assert parse_detail_postcode(html, "19522441") is None + + +def test_parse_handles_no_next_data(): + assert parse_detail_postcode("no script here", "1") is None + + +def test_parse_handles_empty_html(): + assert parse_detail_postcode("", "1") is None + + +def test_parse_handles_malformed_json(): + html = ( + '' + ) + assert parse_detail_postcode(html, "1") is None + + +def test_parse_handles_missing_datalayer(): + next_data = {"props": {"initialReduxState": {"metadata": {}}}} + html = ( + '" + ) + assert parse_detail_postcode(html, "1") is None + + +# --------------------------------------------------------------------------- +# transform_property — detail postcode wiring + trust rule +# 
--------------------------------------------------------------------------- + + +_RAW_LISTING = { + "id": "19522441", + "address": "Padfield Road, London, SE5", + "location": {"lon": -0.100233, "lat": 51.466129}, + "bedrooms": 2, + "bathrooms": 1, + "price": "£275,000", + "humanised-property-type": "Apartment", + "features": ["Tenure: Leasehold (99 years remaining)"], + "details-url": "/details/19522441/", +} + + +def test_transform_uses_trusted_detail_postcode(): + # Detail postcode SE5 9AA, coordinate-nearest SE5 1AA: same outcode -> trust + # the (more precise) detail postcode and label it detail_address. + index = _StubIndex("SE5 1AA") + out = transform_property(_RAW_LISTING, index, detail_postcode="SE5 9AA") + assert out is not None + assert out["Postcode"] == "SE5 9AA" + assert out["Postcode source"] == "detail_address" + + +def test_transform_rejects_detail_postcode_on_outcode_mismatch(): + # Detail postcode SW9 6BZ but coordinate-nearest is SE5 1AA: different + # outcode -> reject the detail postcode, fall back to coordinate logic. + index = _StubIndex("SE5 1AA") + out = transform_property(_RAW_LISTING, index, detail_postcode="SW9 6BZ") + assert out is not None + assert out["Postcode"] == "SE5 1AA" + assert out["Postcode source"] == "coordinates" + + +def test_transform_without_detail_postcode_uses_coordinates(): + index = _StubIndex("SE5 1AA") + out = transform_property(_RAW_LISTING, index, detail_postcode=None) + assert out is not None + assert out["Postcode"] == "SE5 1AA" + assert out["Postcode source"] == "coordinates" + # No UPRN / house number is recoverable from OnTheMarket. + assert out["UPRN"] is None + assert out["Property number or name"] is None + + +def test_transform_detail_postcode_via_search_address_outcode(): + # When the card address already carries a full postcode that agrees with the + # coordinates, the existing "address" source still wins absent a detail + # postcode — detail recovery never regresses that path. 
+ raw = dict(_RAW_LISTING, address="Padfield Road, London, SE5 1AA") + index = _StubIndex("SE5 1AA") + out = transform_property(raw, index, detail_postcode=None) + assert out["Postcode"] == "SE5 1AA" + assert out["Postcode source"] == "address" + + +# --------------------------------------------------------------------------- +# _fetch_detail_postcode caching (no real network) +# --------------------------------------------------------------------------- + + +def test_fetch_detail_postcode_is_cached(monkeypatch): + onthemarket._detail_postcode_cache.clear() + onthemarket._detail_postcode_cache["19522441"] = "SE5 9AA" + + def _boom(*args, **kwargs): # pragma: no cover - must never be called + raise AssertionError("network was hit despite a cached value") + + # Any httpx use would explode; the cache hit must short-circuit first. + result = onthemarket._fetch_detail_postcode( + client=type("C", (), {"get": _boom})(), + details_url="/details/19522441/", + listing_id="19522441", + ) + assert result == "SE5 9AA" + onthemarket._detail_postcode_cache.clear() diff --git a/finder/test_rightmove.py b/finder/test_rightmove.py new file mode 100644 index 0000000..2ea382b --- /dev/null +++ b/finder/test_rightmove.py @@ -0,0 +1,113 @@ +"""Tests for the Rightmove detail-page postcode extractor. + +The search API only returns an outcode-level ``displayAddress``; the property's +TRUE full postcode lives on its detail page inside ``window.__PAGE_MODEL`` as +``propertyData.address.{outcode, incode}``. ``parse_detail_postcode`` recovers +it. These tests build a faithful __PAGE_MODEL: a devalue-style flattened object +graph whose ``data`` field is a JSON STRING of a flat array where every integer +inside a container is an index reference into that same array. 
+""" + +import json + +from rightmove import _extract_page_model_literal, parse_detail_postcode + + +def _page_model_html(flat: list, *, encoding: str = "json") -> str: + """Wrap a flattened object-graph array in a realistic detail-page \n" + "" + ) + + +# A faithful slice of a real listing: root -> propertyData -> address, with a +# decoy nearestStations array (which carries NO postcodes on the live page) to +# prove the parser anchors on the property's own address, not a nearby POI. +_FLAT_SW9 = [ + {"propertyData": 1}, # 0: root + { + "id": "89089584", + "address": 2, + "location": 4, + "nearestStations": 6, + }, # 1: propertyData + { + "displayAddress": "Caldwell Street, Stockwell", + "countryCode": "GB", + "ukCountry": "England", + "outcode": "SW9", + "incode": "0HD", + }, # 2: address + None, # 3: filler + { + "latitude": 51.477238, + "longitude": -0.116819, + "pinType": "ACCURATE_POINT", + }, # 4: location + None, # 5: filler + [7, 8], # 6: nearestStations (references) + {"name": "Oval Station", "distance": 0.36}, # 7: station, no postcode + {"name": "Stockwell Station", "distance": 0.41}, # 8: station, no postcode +] + + +def test_parses_full_postcode_from_outcode_and_incode() -> None: + html = _page_model_html(_FLAT_SW9) + assert parse_detail_postcode(html) == "SW9 0HD" + + +def test_extract_page_model_literal_brace_matches_nested_object() -> None: + # The literal must include the whole nested object, not stop at the first + # closing brace inside the escaped data string. + html = _page_model_html(_FLAT_SW9) + literal = _extract_page_model_literal(html) + assert literal is not None + assert literal.startswith("{") and literal.endswith("}") + # Round-trips back to a dict with the expected top-level keys. 
+ assert set(json.loads(literal)) == {"data", "encoding"} + + +def test_normalises_unspaced_incode() -> None: + flat = [dict(node) if isinstance(node, dict) else node for node in _FLAT_SW9] + flat[2] = {**_FLAT_SW9[2], "outcode": "e20", "incode": "1fh"} + assert parse_detail_postcode(_page_model_html(flat)) == "E20 1FH" + + +def test_returns_none_when_address_missing() -> None: + # The location wrapper can be empty/absent on some listings; the caller then + # keeps the coordinate fallback, so we must return None (not raise). + flat = [ + {"propertyData": 1}, + {"id": "1", "location": 2}, + {"latitude": 51.5, "longitude": -0.1}, + ] + assert parse_detail_postcode(_page_model_html(flat)) is None + + +def test_returns_none_when_incode_blank() -> None: + flat = [dict(node) if isinstance(node, dict) else node for node in _FLAT_SW9] + flat[2] = {**_FLAT_SW9[2], "incode": ""} + assert parse_detail_postcode(_page_model_html(flat)) is None + + +def test_returns_none_for_non_postcode_pair() -> None: + # A structurally-invalid outcode/incode pair is rejected by the validator. + flat = [dict(node) if isinstance(node, dict) else node for node in _FLAT_SW9] + flat[2] = {**_FLAT_SW9[2], "outcode": "NOTAPC", "incode": "ZZ"} + assert parse_detail_postcode(_page_model_html(flat)) is None + + +def test_returns_none_without_page_model() -> None: + assert parse_detail_postcode("") is None + assert parse_detail_postcode("no model") is None + # Malformed JSON in the data field degrades gracefully. 
+ broken = '' + assert parse_detail_postcode(broken) is None diff --git a/finder/test_transform.py b/finder/test_transform.py index c90296b..1da0a7e 100644 --- a/finder/test_transform.py +++ b/finder/test_transform.py @@ -1,13 +1,19 @@ from transform import ( + build_register_address, clean_listing_address, extract_full_postcode, + extract_outcode, + resolve_listing_postcode, transform_property, ) class StubPostcodeIndex: + def __init__(self, postcode: str = "SW1A 9ZZ") -> None: + self._postcode = postcode + def nearest(self, lat: float, lng: float) -> str: - return "SW1A 9ZZ" + return self._postcode def test_extract_full_postcode_normalizes_spacing() -> None: @@ -24,6 +30,46 @@ def test_clean_listing_address_removes_postcode_and_outcode_suffixes() -> None: assert clean_listing_address("Kings Avenue, Bromley") == "Kings Avenue, Bromley" +def test_build_register_address_prepends_house_number_or_name() -> None: + # House number/name prepended, with the trailing outcode/postcode stripped. + assert ( + build_register_address("South Street, Bromley BR1", "12") + == "12, South Street, Bromley" + ) + assert ( + build_register_address("Riverside, Martham NR29", "Martham Mill") + == "Martham Mill, Riverside, Martham" + ) + # No number/name -> identical to the plain cleaned address. + assert build_register_address("Kings Avenue, Bromley", None) == "Kings Avenue, Bromley" + # Already starts with the number/name -> no duplication. + assert ( + build_register_address("12 South Street, Bromley", "12") + == "12 South Street, Bromley" + ) + # Empty/whitespace number/name is ignored. 
+ assert build_register_address("Kings Avenue, Bromley", " ") == "Kings Avenue, Bromley" + + +def test_extract_outcode() -> None: + assert extract_outcode("SW1A 2AA") == "SW1A" + assert extract_outcode("n4 2ha") == "N4" + assert extract_outcode("SW1A2AA") == "SW1A" + assert extract_outcode(None) is None + assert extract_outcode("") is None + + +def test_resolve_listing_postcode() -> None: + # Outcode matches -> trust the more precise extracted postcode. + assert resolve_listing_postcode("SW1A 2AA", "SW1A 9ZZ") == ("SW1A 2AA", "address") + # Outcode mismatch -> fall back to the spatially-correct inferred postcode. + assert resolve_listing_postcode("E14 9SS", "SW1A 9ZZ") == ("SW1A 9ZZ", "coordinates") + # Well-formed but fabricated postcode in a different outcode is rejected. + assert resolve_listing_postcode("ZZ9 9ZZ", "SW1A 9ZZ") == ("SW1A 9ZZ", "coordinates") + # No extracted postcode -> inferred is authoritative. + assert resolve_listing_postcode(None, "SW1A 9ZZ") == ("SW1A 9ZZ", "coordinates") + + def test_rightmove_transform_prefers_postcode_from_display_address() -> None: prop = { "id": "123", @@ -46,3 +92,84 @@ def test_rightmove_transform_prefers_postcode_from_display_address() -> None: assert result["Inferred postcode"] == "SW1A 9ZZ" assert result["Listing raw address"] == "Flat 2, 10 Downing Street, SW1A 2AA" assert result["Address per Property Register"] == "Flat 2, 10 Downing Street" + + +def test_rightmove_transform_rejects_postcode_from_wrong_outcode() -> None: + prop = { + "id": "124", + "location": {"latitude": 51.5, "longitude": -0.1}, + "price": {"amount": 750000, "displayPrices": []}, + "propertySubType": "Terraced", + "bedrooms": 3, + "bathrooms": 1, + "keyFeatures": [], + "propertyUrl": "/properties/124", + # Address postcode is in a different outcode than the coordinate-nearest one. 
+ "displayAddress": "10 Downing Street, E14 9SS", + } + + result = transform_property(prop, "SW1A", StubPostcodeIndex()) + + assert result is not None + # The spatially-correct inferred postcode wins over the mismatching extracted one. + assert result["Postcode"] == "SW1A 9ZZ" + assert result["Postcode source"] == "coordinates" + assert result["Extracted postcode"] == "E14 9SS" + + +def _rightmove_prop() -> dict: + return { + "id": "200", + "location": {"latitude": 51.5, "longitude": -0.1}, + "price": {"amount": 750000, "displayPrices": []}, + "propertySubType": "Terraced", + "bedrooms": 3, + "bathrooms": 1, + "keyFeatures": [], + "propertyUrl": "/properties/200", + # Search API only ever exposes the outcode in the display address. + "displayAddress": "Caldwell Street, Stockwell, SW9", + } + + +def test_rightmove_transform_prefers_detail_postcode() -> None: + # The detail page's true full postcode (same outcode as the location) is + # preferred over the coordinate-nearest guess. + result = transform_property( + _rightmove_prop(), + "SW9", + StubPostcodeIndex("SW9 7AA"), + detail_postcode="SW9 0HD", + ) + + assert result is not None + assert result["Postcode"] == "SW9 0HD" + assert result["Postcode source"] == "detail_address" + # The coordinate inference is still surfaced separately. + assert result["Inferred postcode"] == "SW9 7AA" + + +def test_rightmove_transform_rejects_detail_postcode_from_wrong_outcode() -> None: + # A detail postcode whose outcode disagrees with the location must not + # relocate the listing; the coordinate postcode wins instead. + result = transform_property( + _rightmove_prop(), + "SW9", + StubPostcodeIndex("SW9 7AA"), + detail_postcode="E14 9SS", + ) + + assert result is not None + assert result["Postcode"] == "SW9 7AA" + assert result["Postcode source"] == "coordinates" + + +def test_rightmove_transform_without_detail_keeps_coordinate_logic() -> None: + # No detail postcode -> behaviour is unchanged (coordinate-nearest). 
+ result = transform_property( + _rightmove_prop(), "SW9", StubPostcodeIndex("SW9 7AA") + ) + + assert result is not None + assert result["Postcode"] == "SW9 7AA" + assert result["Postcode source"] == "coordinates" diff --git a/finder/test_zoopla.py b/finder/test_zoopla.py new file mode 100644 index 0000000..228e21b --- /dev/null +++ b/finder/test_zoopla.py @@ -0,0 +1,288 @@ +from zoopla import _detail_cache_key, parse_detail_geo, transform_property + + +def test_detail_cache_key_uses_listing_id() -> None: + assert _detail_cache_key("/for-sale/details/59888978/") == "59888978" + assert _detail_cache_key("https://www.zoopla.co.uk/for-sale/details/59888978/") == "59888978" + # No id in the URL -> fall back to the URL itself as the key. + assert _detail_cache_key("/for-sale/property/br1/") == "/for-sale/property/br1/" + + +class StubPostcodeIndex: + """Spatial index stub whose nearest-lookup returns a fixed postcode.""" + + def __init__(self, postcode: str = "BR1 2AB") -> None: + self._postcode = postcode + + def nearest(self, lat: float, lng: float) -> str: + return self._postcode + + +# London-ish postcodes with coordinates, plus the Norfolk sample used by the +# verified detail-page snippet (well inside the England bounds check). +PC_COORDS = { + "BR1 2AB": (51.40, 0.01), + "SW1A 1AA": (51.50, -0.14), + "NR29 4RG": (52.716014, 1.614495), +} + +# Verified RSC `location` object (listing 59888978), as it appears escaped inside +# a self.__next_f flight chunk in page.content(). +_LOCATION_ESCAPED = ( + '' +) + + +def test_parse_detail_geo_location_object_escaped() -> None: + geo = parse_detail_geo(_LOCATION_ESCAPED, search_outcode="NR29") + assert geo == { + "lat": 52.716014, + "lng": 1.614495, + "postcode": "NR29 4RG", + "outcode": "NR29", + "source": "detail_location", + "uprn": "10023461458", + "number_or_name": "Martham Mill", + # No `address` twin in this snippet, so there is no full street address. 
+ "full_address": None, + } + + +def test_parse_detail_geo_location_object_unescaped() -> None: + html = ( + '"location":{"outcode":"NR29",' + '"coordinates":{"latitude":52.716014,"longitude":1.614495},' + '"uprn":"10023461458","postalCode":"NR29 4RG"}' + ) + geo = parse_detail_geo(html) + assert geo is not None + assert geo["source"] == "detail_location" + assert geo["postcode"] == "NR29 4RG" + + +def test_parse_detail_geo_address_twin() -> None: + html = ( + '"address":{"fullAddress":"Riverside, Martham NR29",' + '"latitude":52.716014,"longitude":1.614495,' + '"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}' + ) + geo = parse_detail_geo(html) + assert geo is not None + assert geo["source"] == "detail_address_obj" + assert (geo["lat"], geo["lng"], geo["postcode"]) == (52.716014, 1.614495, "NR29 4RG") + assert geo["uprn"] == "10023461458" + assert geo["full_address"] == "Riverside, Martham NR29" + + +def test_parse_detail_geo_merges_location_uprn_with_address_full_address() -> None: + # Real detail pages carry both wrappers: the `location` object holds the + # uprn + house number/name, the `address` twin holds the full street + # address. They share a uprn, so the twin's fullAddress is attached. 
+ html = ( + '"location":{"outcode":"NR29",' + '"coordinates":{"latitude":52.716014,"longitude":1.614495},' + '"uprn":"10023461458","postalCode":"NR29 4RG",' + '"propertyNumberOrName":"Martham Mill"}' + '"address":{"fullAddress":"Riverside, Martham NR29",' + '"latitude":52.716014,"longitude":1.614495,' + '"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}' + ) + geo = parse_detail_geo(html) + assert geo is not None + assert geo["source"] == "detail_location" + assert geo["uprn"] == "10023461458" + assert geo["number_or_name"] == "Martham Mill" + assert geo["full_address"] == "Riverside, Martham NR29" + + +def test_parse_detail_geo_does_not_borrow_comparable_full_address() -> None: + # The only `address` twin on the page belongs to a different uprn (a + # comparable listing). With a uprn to match on, an unrelated twin is never + # borrowed — full_address stays None rather than grabbing the wrong street. + html = ( + '"location":{"outcode":"NR29",' + '"coordinates":{"latitude":52.716014,"longitude":1.614495},' + '"uprn":"10023461458","postalCode":"NR29 4RG"}' + '"address":{"fullAddress":"Some Comparable, Elsewhere EN2",' + '"latitude":51.65,"longitude":-0.08,"uprn":"99999999"}' + ) + geo = parse_detail_geo(html) + assert geo is not None + assert geo["uprn"] == "10023461458" + assert geo["full_address"] is None + + +def test_parse_detail_geo_ignores_poi_coordinates() -> None: + # A charger POI (its coordinates NOT wrapped in a "location" object) followed + # by the property's own "location" wrapper. Anchoring on the wrapper means + # the POI's coordinates are ignored and the property's are returned. 
+ poi = ( + '"name":"Martham Community Centre","numberOfConnectors":2,' + '"postcode":"NR29 4SN","coordinates":{"latitude":52.699379,"longitude":1.62921}' + ) + prop = ( + '"location":{"outcode":"NR29",' + '"coordinates":{"latitude":52.716014,"longitude":1.614495},' + '"uprn":"10023461458","postalCode":"NR29 4RG"}' + ) + geo = parse_detail_geo(poi + prop) + assert geo is not None + assert geo["source"] == "detail_location" + # The property's coords win, not the community centre's. + assert (geo["lat"], geo["lng"]) == (52.716014, 1.614495) + assert geo["postcode"] == "NR29 4RG" + + +def test_parse_detail_geo_prefers_location_matching_search_outcode() -> None: + # Page embeds two location objects (e.g. a comparable then the property). + # With a search outcode, the one in that outcode is preferred; without one, + # the first (document order = primary listing) is returned. + comparable = ( + '"location":{"outcode":"EN2",' + '"coordinates":{"latitude":51.65,"longitude":-0.08},' + '"postalCode":"EN2 6SN"}' + ) + target = ( + '"location":{"outcode":"NR29",' + '"coordinates":{"latitude":52.716014,"longitude":1.614495},' + '"postalCode":"NR29 4RG"}' + ) + geo = parse_detail_geo(comparable + target, search_outcode="NR29") + assert geo is not None and geo["postcode"] == "NR29 4RG" + geo_first = parse_detail_geo(comparable + target) + assert geo_first is not None and geo_first["postcode"] == "EN2 6SN" + + +def test_parse_detail_geo_rejects_out_of_england() -> None: + html = ( + '"location":{"outcode":"NR29",' + '"coordinates":{"latitude":10.0,"longitude":10.0},' + '"uprn":"1","postalCode":"NR29 4RG"}' + ) + assert parse_detail_geo(html) is None + + +def test_parse_detail_geo_drops_inconsistent_postcode() -> None: + # postalCode outcode (AB12) disagrees with the object's own outcode (NR29): + # keep the coordinates, drop the untrustworthy postcode. 
+ html = ( + '"location":{"outcode":"NR29",' + '"coordinates":{"latitude":52.716014,"longitude":1.614495},' + '"uprn":"1","postalCode":"AB12 3CD"}' + ) + geo = parse_detail_geo(html) + assert geo is not None + assert geo["lat"] == 52.716014 + assert geo["postcode"] is None + + +def test_parse_detail_geo_returns_none_for_garbage() -> None: + assert parse_detail_geo("no data here") is None + assert parse_detail_geo("") is None + # Coordinates that are not inside a property location/address wrapper (e.g. + # only an unwrapped POI) yield nothing — safe degradation to the outcode. + assert parse_detail_geo('"name":"X","coordinates":{"latitude":51.5,"longitude":-0.1}') is None + + +def _raw(**overrides) -> dict: + raw = { + "id": "123", + "url": "/for-sale/details/123/", + "address": "South Street, Bromley BR1", + "price": 500000, + "beds": 2, + "baths": 1, + "property_type": "Flat", + } + raw.update(overrides) + return raw + + +def test_transform_uses_detail_coordinates_with_agreeing_postcode() -> None: + detail = {"lat": 51.401, "lng": 0.011, "postcode": "BR1 3CD", "outcode": "BR1"} + result = transform_property( + _raw(), StubPostcodeIndex("BR1 2AB"), PC_COORDS, search_outcode="BR1", detail=detail + ) + assert result is not None + # Extracted detail postcode agrees with the coordinate-nearest outcode -> trusted. + assert result["Postcode"] == "BR1 3CD" + assert result["Postcode source"] == "detail_address" + assert result["Inferred postcode"] == "BR1 2AB" + assert (result["lat"], result["lon"]) == (51.401, 0.011) + + +def test_transform_uses_nearest_when_detail_postcode_mismatches() -> None: + detail = {"lat": 51.401, "lng": 0.011, "postcode": "E14 9SS", "outcode": "E14"} + result = transform_property( + _raw(), StubPostcodeIndex("BR1 2AB"), PC_COORDS, search_outcode="BR1", detail=detail + ) + assert result is not None + # Mismatching detail postcode is rejected in favour of the spatial value. 
+ assert result["Postcode"] == "BR1 2AB" + assert result["Postcode source"] == "detail_coordinates" + + +def test_transform_geocodes_detail_postcode_without_coordinates() -> None: + detail = {"lat": None, "lng": None, "postcode": "SW1A 1AA", "outcode": "SW1A"} + result = transform_property( + _raw(), StubPostcodeIndex(), PC_COORDS, search_outcode="BR1", detail=detail + ) + assert result is not None + assert result["Postcode"] == "SW1A 1AA" + assert result["Postcode source"] == "detail_address" + assert (result["lat"], result["lon"]) == PC_COORDS["SW1A 1AA"] + + +def test_transform_without_detail_falls_back_to_search_outcode() -> None: + # No detail, address has no recognizable outcode -> coarse search-outcode centroid. + result = transform_property( + _raw(address="A street with no postcode"), + StubPostcodeIndex(), + PC_COORDS, + search_outcode="BR1", + detail=None, + ) + assert result is not None + assert result["Postcode"] == "BR1 2AB" + assert result["Postcode source"] == "search_outcode" + # No detail page -> no UPRN / house number recovered. + assert result["UPRN"] is None + assert result["Property number or name"] is None + + +def test_transform_emits_uprn_and_house_numbered_address_from_detail() -> None: + detail = { + "lat": 51.401, + "lng": 0.011, + "postcode": "BR1 3CD", + "outcode": "BR1", + "uprn": "100023461458", + "number_or_name": "12", + "full_address": "South Street, Bromley BR1", + } + result = transform_property( + _raw(), StubPostcodeIndex("BR1 2AB"), PC_COORDS, search_outcode="BR1", detail=detail + ) + assert result is not None + assert result["UPRN"] == "100023461458" + assert result["Property number or name"] == "12" + # The detail full address replaces the outcode-level card address, and the + # house number is prepended for a near-exact Property Register match. 
+ assert result["Listing raw address"] == "South Street, Bromley BR1" + assert result["Address per Property Register"] == "12, South Street, Bromley" + + +def test_transform_ignores_out_of_england_detail_coords() -> None: + detail = {"lat": 10.0, "lng": 10.0, "postcode": "ZZ9 9ZZ", "outcode": "ZZ9"} + result = transform_property( + _raw(), StubPostcodeIndex("BR1 2AB"), PC_COORDS, search_outcode="BR1", detail=detail + ) + assert result is not None + # Bad detail coords are discarded; falls through to the address outcode (BR1). + assert result["Postcode source"] == "address_outcode" + assert 49 <= result["lat"] <= 56 diff --git a/finder/transform.py b/finder/transform.py index 8c1f357..49986ba 100644 --- a/finder/transform.py +++ b/finder/transform.py @@ -205,6 +205,41 @@ def extract_full_postcode(text: str | None) -> str | None: return normalize_postcode(match.group(1)) +def extract_outcode(postcode: str | None) -> str | None: + """Return the outward code (district) of a UK postcode, e.g. 'SW1A 1AA' → 'SW1A'.""" + if not postcode: + return None + normalized = normalize_postcode(postcode) + outcode = normalized.split(" ", 1)[0] + return outcode or None + + +def resolve_listing_postcode( + extracted_postcode: str | None, inferred_postcode: str +) -> tuple[str, str]: + """Pick the authoritative postcode for a listing, returning (postcode, source). + + The address-extracted postcode is more precise than the coordinate-nearest one, + but it is only trustworthy when it agrees with the location: a stale, mistyped or + well-formed-but-fabricated postcode (e.g. 'ZZ9 9ZZ') would otherwise silently + override the spatially-correct value. Since the spatial index only supports + nearest-lookup, accept the extracted postcode only when its outcode matches the + inferred (coordinate-nearest) postcode's outcode; otherwise fall back to the + inferred one, which is always a real, plausibly-correct postcode. 
+ """ + if extracted_postcode and extract_outcode(extracted_postcode) == extract_outcode( + inferred_postcode + ): + return extracted_postcode, "address" + if extracted_postcode: + log.debug( + "Rejecting extracted postcode %s (outcode mismatch with inferred %s)", + extracted_postcode, + inferred_postcode, + ) + return inferred_postcode, "coordinates" + + def clean_listing_address(address: str | None) -> str: """Remove postcode/outcode suffixes from listing display addresses. @@ -222,10 +257,48 @@ def clean_listing_address(address: str | None) -> str: return cleaned.strip(" ,") +def build_register_address( + raw_address: str | None, number_or_name: str | None = None +) -> str: + """Build a Property Register-style address, prepending the house number/name. + + Listing display addresses are usually street-level ("South Street, Bromley") + because the portals hide the exact unit. When a scraper can recover the + property's own number or name (e.g. Zoopla detail pages expose + ``propertyNumberOrName`` = "12" or "Martham Mill"), prepend it so the address + carries the house identifier that the EPC/Price-Paid register addresses also + use — turning a fuzzy street match into a near-exact one. Falls back to the + plain cleaned address when no number/name is available. + """ + cleaned = clean_listing_address(raw_address) + if not number_or_name: + return cleaned + number_or_name = number_or_name.strip() + if not number_or_name: + return cleaned + # Avoid duplicating a number/name the display address already starts with. 
+ if cleaned.lower().startswith(number_or_name.lower()): + return cleaned + return f"{number_or_name}, {cleaned}" if cleaned else number_or_name + + def transform_property( - prop: dict, outcode: str, pc_index: PostcodeSpatialIndex + prop: dict, + outcode: str, + pc_index: PostcodeSpatialIndex, + detail_postcode: str | None = None, ) -> dict | None: - """Transform a raw Rightmove property dict into our output schema.""" + """Transform a raw Rightmove property dict into our output schema. + + ``detail_postcode`` is the property's TRUE full postcode recovered from its + detail page (see ``rightmove.parse_detail_postcode``); the search API itself + only exposes the outcode-level ``displayAddress``. When supplied and it + agrees with the coordinate-nearest postcode's outcode, it is preferred over + the coordinate guess and recorded with source ``"detail_address"``. A + detail postcode whose outcode disagrees with the location is discarded in + favour of the spatially-correct coordinate postcode, so a stale or wrong + detail value can never silently relocate a listing. + """ loc = prop.get("location") if not loc: return None @@ -268,8 +341,25 @@ def transform_property( return None raw_address = prop.get("displayAddress", "") or "" extracted_postcode = extract_full_postcode(raw_address) - postcode = extracted_postcode or inferred_postcode - postcode_source = "address" if extracted_postcode else "coordinates" + + # Prefer the detail page's true full postcode when it agrees with the + # location; otherwise fall back to the (display-address-or-coordinate) logic. 
+ detail_full = extract_full_postcode(detail_postcode) + if detail_full and extract_outcode(detail_full) == extract_outcode( + inferred_postcode + ): + postcode, postcode_source = detail_full, "detail_address" + else: + if detail_full: + log.debug( + "Rejecting Rightmove detail postcode %s (outcode mismatch with " + "inferred %s)", + detail_full, + inferred_postcode, + ) + postcode, postcode_source = resolve_listing_postcode( + extracted_postcode, inferred_postcode + ) property_url = prop.get("propertyUrl") or "" if not isinstance(property_url, str): @@ -291,6 +381,9 @@ def transform_property( "Inferred postcode": inferred_postcode, "Listing raw address": raw_address, "Address per Property Register": clean_listing_address(raw_address), + # Rightmove's displayAddress is street-level; no UPRN/house number. + "UPRN": None, + "Property number or name": None, "Leasehold/Freehold": extract_tenure(prop.get("tenure")), "Property type": map_property_type(sub_type), "Property sub-type": normalize_sub_type(sub_type), diff --git a/finder/zoopla.py b/finder/zoopla.py index d36bc21..6cffe5f 100644 --- a/finder/zoopla.py +++ b/finder/zoopla.py @@ -32,16 +32,24 @@ import httpx from constants import ( DATA_DIR, DELAY_BETWEEN_PAGES, + GLUETUN_API_KEY, + GLUETUN_CONTROL_URL, + GLUETUN_MAX_ROTATIONS, + GLUETUN_PROXY, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE, + ZOOPLA_DETAIL_GOTO_TIMEOUT_MS, ) from spatial import PostcodeSpatialIndex from transform import ( - clean_listing_address, + build_register_address, extract_full_postcode, + extract_outcode, + fix_coords, normalize_sub_type, parse_int_value, + resolve_listing_postcode, validate_floor_area, ) @@ -468,27 +476,20 @@ def _challenge_timeout_seconds() -> int: # cookies (bound to the previous IP), then reload and re-check the challenge. 
-_GLUETUN_API_KEY = "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K" - - def _gluetun_base_url() -> str: - return os.environ.get("GLUETUN_URL", "http://gluetun:8000").rstrip("/") + return GLUETUN_CONTROL_URL.rstrip("/") def _gluetun_api_key() -> str | None: - return _GLUETUN_API_KEY + return GLUETUN_API_KEY def _gluetun_max_rotations() -> int: - raw = os.environ.get("GLUETUN_MAX_ROTATIONS", "3") - try: - value = int(raw) - except ValueError as exc: - raise ValueError("GLUETUN_MAX_ROTATIONS must be an integer") from exc - return max(value, 0) + return max(GLUETUN_MAX_ROTATIONS, 0) def _gluetun_client() -> httpx.Client: + # Talks to the control server directly (not through the VPN proxy). headers = {} api_key = _gluetun_api_key() if api_key: @@ -694,10 +695,19 @@ def launch_browser(): profile_dir.mkdir(parents=True, exist_ok=True) _remove_stale_profile_locks(profile_dir) + # Route the browser through the Gluetun VPN proxy when configured. (geoip + # fingerprint alignment is intentionally not enabled: it needs the optional + # camoufox[geoip] extra and would spoof to the VPN exit's country, which + # fights the en-GB locale unless the exit is in the UK.) 
+ proxy_options: dict = {} + if GLUETUN_PROXY: + proxy_options = {"proxy": {"server": GLUETUN_PROXY}} + log.info( - "Launching Camoufox browser for Zoopla (headless=%s, profile=%s)...", + "Launching Camoufox browser for Zoopla (headless=%s, profile=%s, proxy=%s)...", headless_mode, profile_dir, + GLUETUN_PROXY or "direct", ) camoufox = Camoufox( headless=headless_mode, @@ -705,6 +715,7 @@ def launch_browser(): user_data_dir=str(profile_dir), locale=["en-GB", "en"], enable_cache=True, + **proxy_options, ) raw_browser = camoufox.__enter__() browser = _ManagedCamoufoxBrowser(camoufox, raw_browser) @@ -926,13 +937,47 @@ def _paginate( page, total_results: int, max_properties: int | None = None, + fetch_detail=None, + detail_cap: int = 0, + detail_state: dict | None = None, + detail_deadline: float | None = None, ) -> list[dict]: """Extract listings from all pages of search results. Page 1 is already loaded. For subsequent pages, follow Zoopla's rendered next link when present, otherwise advance via the pn=N URL parameter while - the advertised result count says more listings remain.""" + the advertised result count says more listings remain. + + When ``fetch_detail`` is supplied, each listing has its detail page fetched + (up to ``detail_cap`` fresh loads per outcode, counted in the shared + ``detail_state`` dict, and only until ``detail_deadline``) and the parsed + geo stored under ``listing['_detail']`` for ``transform_property``. The + detail page is the only source of the listing's UPRN, full street address + and precise postcode, so it is fetched even when the search card already + pins a full postcode. 
Cached detail results are always attached but cost + neither a cap slot nor a delay.""" + + def _maybe_fetch(listing: dict) -> None: + if fetch_detail is None or detail_state is None: + return + url = listing.get("url", "") + cached = _detail_cache_key(url) in _detail_cache + if not cached: + # Fresh loads are bounded by the per-outcode cap and the wall-clock + # deadline so detail fetching never starves the SIGALRM budget that + # also guards the search pagination for this outcode. + if detail_state["fetched"] >= detail_cap: + return + if detail_deadline is not None and time.monotonic() >= detail_deadline: + return + listing["_detail"] = fetch_detail(url) + if not cached: + detail_state["fetched"] += 1 + time.sleep(DELAY_BETWEEN_PAGES) + all_listings = _extract_listings(page) + for listing in all_listings: + _maybe_fetch(listing) if max_properties is not None and len(all_listings) >= max_properties: return all_listings[:max_properties] @@ -984,6 +1029,7 @@ def _paginate( if listing["id"] not in seen_ids: seen_ids.add(listing["id"]) all_listings.append(listing) + _maybe_fetch(listing) new_count += 1 if max_properties is not None and len(all_listings) >= max_properties: return all_listings[:max_properties] @@ -1053,6 +1099,214 @@ def _extract_outcode(text: str) -> str | None: return None +# --------------------------------------------------------------------------- +# Detail-page geocoding +# --------------------------------------------------------------------------- +# +# Zoopla search result cards only expose an outcode-level display address (e.g. +# "South Street, Bromley BR1"); the full postcode and precise coordinates exist +# only on each listing's detail page (/for-sale/details/{id}/). The detail page +# is a Next.js App Router route whose React Server Components flight stream +# embeds the property's own location object, e.g. 
+# "location":{"outcode":"NR29","coordinates":{"latitude":52.716,"longitude":1.614}, +# "uprn":"10023461458","postalCode":"NR29 4RG",...} +# plus a twin "address":{"fullAddress":...,"latitude":...,"longitude":..., +# "outcode":...,"postcode":...,"uprn":...} feeding the map widgets. +# Nearby points of interest (stations, schools, EV chargers) and comparable +# listings carry their own "coordinates" too, but never inside the property's +# own "location" / "address":{"fullAddress" wrapper — so the wrapper, not a +# loose coordinates object, is what we anchor on (see parse_detail_geo). + +# listingId -> parsed detail dict (or None). Failures are cached too, so a +# broken listing is not re-fetched within a run (the same listing reappears +# across overlapping outcode searches). +_detail_cache: dict[str, dict | None] = {} + +_LISTING_ID_RE = re.compile(r"/details/(\d+)/?") + +# The property's own location is carried by a `"location":{...}` wrapper and a +# twin `"address":{"fullAddress":...}` widget object. We anchor on those +# wrappers (and capture their full object body, which contains exactly one +# nested object — `coordinates`) rather than scanning for loose coordinate +# objects: nearby points of interest (stations/schools/EV chargers) and +# comparable/"similar" listings also embed coordinates, but never inside the +# property's own `"location"` / `"address":{"fullAddress"` wrapper, so the +# wrapper is the discriminator. Field order and an optional `uprn` are tolerated. 
+_DETAIL_LOCATION_RE = re.compile(r'"location":\{((?:[^{}]|\{[^{}]*\})*)\}') +_DETAIL_ADDRESS_RE = re.compile(r'"address":\{"fullAddress":"([^"]*)"((?:[^{}]|\{[^{}]*\})*)\}') +_DETAIL_COORDS_IN_BODY_RE = re.compile( + r'"coordinates":\{"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)\}' +) +_DETAIL_LATLNG_IN_BODY_RE = re.compile( + r'"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)' +) +_DETAIL_OUTCODE_IN_BODY_RE = re.compile(r'"outcode":"([A-Z0-9]+)"') +# The location object spells it "postalCode"; the address twin uses "postcode". +_DETAIL_POSTCODE_IN_BODY_RE = re.compile(r'"(?:postalCode|postcode)":"([A-Z0-9 ]+)"') +# The UPRN (Unique Property Reference Number) appears in both the location and +# address objects and is the linchpin for an exact listing->EPC join (EPC open +# data is ~99% UPRN-keyed). propertyNumberOrName carries the house number/name +# (e.g. "12", "Martham Mill") only in the location object. +_DETAIL_UPRN_IN_BODY_RE = re.compile(r'"uprn":"(\d+)"') +_DETAIL_NUMBER_OR_NAME_IN_BODY_RE = re.compile(r'"propertyNumberOrName":"([^"]*)"') + + +def parse_detail_geo(html: str, search_outcode: str | None = None) -> dict | None: + """Extract the property's own coordinates/postcode from a Zoopla detail page. + + Pure and browser-free: the live browser only produces the HTML string + (``page.content()``); this does the parsing so it is unit-testable. + + Returns ``{"lat", "lng", "postcode", "outcode", "source", "uprn", + "number_or_name", "full_address"}`` (every field except the coordinates may + be ``None``) or ``None`` when no property location wrapper is found. The + ``uprn`` enables an exact listing->EPC join; ``number_or_name`` (house + number/name) and ``full_address`` give a register-style address for the + Price Paid join. + Coordinates are bounds-checked to England and a postcode is kept only when + it agrees with its own object's outcode. 
``search_outcode``, when given, is + used only as a tie-break to pick the right ``location`` object on pages that + also embed comparable listings. See module docstring for the data model.""" + if not html: + return None + + # RSC flight strings are embedded as escaped JS string literals, so quotes + # and slashes arrive escaped; normalize them so the regexes match. + buf = html.replace('\\"', '"').replace("\\u002F", "/").replace("\\/", "/") + + def in_england(lat: float, lng: float) -> tuple[float, float] | None: + lat, lng = fix_coords(lat, lng) + if 49 <= lat <= 56 and -7 <= lng <= 2: + return lat, lng + return None + + def build(body: str, coords, source: str, full_address: str | None = None) -> dict: + # outcode and postcode are read from the SAME object body as the coords, + # so the postcode is self-consistent; drop it only if it somehow isn't. + outcode_match = _DETAIL_OUTCODE_IN_BODY_RE.search(body) + outcode = outcode_match.group(1) if outcode_match else None + postcode_match = _DETAIL_POSTCODE_IN_BODY_RE.search(body) + postcode = extract_full_postcode(postcode_match.group(1)) if postcode_match else None + if postcode and outcode and extract_outcode(postcode) != outcode.upper(): + postcode = None + uprn_match = _DETAIL_UPRN_IN_BODY_RE.search(body) + number_match = _DETAIL_NUMBER_OR_NAME_IN_BODY_RE.search(body) + number_or_name = number_match.group(1).strip() if number_match else None + return { + "lat": coords[0], + "lng": coords[1], + "postcode": postcode, + "outcode": outcode, + "source": source, + "uprn": uprn_match.group(1) if uprn_match else None, + "number_or_name": number_or_name or None, + "full_address": full_address, + } + + def attach_full_address(result: dict | None) -> dict | None: + # The house-numbered street address lives in the `address` map-widget + # twin, not the `location` wrapper we anchor coordinates on. 
Pull it from + # the twin that shares this property's uprn; when there is no uprn to + # disambiguate, fall back to the first twin (document order = primary + # listing), but never guess a twin when a uprn exists and none matches — + # that would risk grabbing a comparable listing's address. + if result is None or result.get("full_address"): + return result + target = result.get("uprn") + first = None + for match in _DETAIL_ADDRESS_RE.finditer(buf): + full_address = match.group(1) or None + if full_address is None: + continue + if first is None: + first = full_address + uprn_match = _DETAIL_UPRN_IN_BODY_RE.search(match.group(2)) + if target and uprn_match and uprn_match.group(1) == target: + result["full_address"] = full_address + return result + if target is None: + result["full_address"] = first + return result + + # Strategy 1 — the property's own `location` wrapper (authoritative). Take + # the first match (the primary listing precedes any comparables in the + # flight stream), but prefer one whose outcode matches the searched outcode. + first_location = None + for match in _DETAIL_LOCATION_RE.finditer(buf): + body = match.group(1) + coords_match = _DETAIL_COORDS_IN_BODY_RE.search(body) + if not coords_match: + continue + coords = in_england(float(coords_match.group(1)), float(coords_match.group(2))) + if not coords: + continue + candidate = build(body, coords, "detail_location") + if first_location is None: + first_location = candidate + if ( + search_outcode + and candidate["outcode"] + and candidate["outcode"].upper() == search_outcode.upper() + ): + return attach_full_address(candidate) + if first_location is not None: + return attach_full_address(first_location) + + # Strategy 2 — the `address` map-widget twin (same coordinates, backup). 
+ for match in _DETAIL_ADDRESS_RE.finditer(buf): + full_address = match.group(1) or None + body = match.group(2) + latlng_match = _DETAIL_LATLNG_IN_BODY_RE.search(body) + if not latlng_match: + continue + coords = in_england(float(latlng_match.group(1)), float(latlng_match.group(2))) + if coords: + return build(body, coords, "detail_address_obj", full_address=full_address) + + return None + + +def _detail_cache_key(listing_url: str) -> str: + """Cache key for a listing detail page — its numeric id when present.""" + id_match = _LISTING_ID_RE.search(listing_url) + return id_match.group(1) if id_match else listing_url + + +def _fetch_listing_detail( + detail_page, + listing_url: str, + search_outcode: str | None = None, +) -> dict | None: + """Load a listing detail page and return its parsed geo dict (or None). + + Results (including failures) are cached by listingId. Ordinary navigation + and extraction errors are swallowed so the caller can fall back to + outcode-level resolution, but TurnstileError is allowed to propagate so the + scraper's "Cloudflare ends the run" contract still holds. 
The goto timeout + is kept short so one slow detail page can't eat the per-outcode budget.""" + cache_key = _detail_cache_key(listing_url) + if cache_key in _detail_cache: + return _detail_cache[cache_key] + + url = listing_url if listing_url.startswith("http") else ZOOPLA_BASE + listing_url + result: dict | None = None + try: + detail_page.goto( + url, wait_until="domcontentloaded", timeout=ZOOPLA_DETAIL_GOTO_TIMEOUT_MS + ) + _ensure_not_challenged(detail_page) + html = detail_page.content() + result = parse_detail_geo(html, search_outcode=search_outcode) + except TurnstileError: + raise + except Exception as exc: + log.debug("Zoopla detail fetch failed %s: %s", url, _exception_detail(exc)) + result = None + + _detail_cache[cache_key] = result + return result + + def _map_property_type(raw_type: str | None) -> str: """Map Zoopla property type text to canonical type.""" if not raw_type: @@ -1109,28 +1363,64 @@ def transform_property( pc_index: PostcodeSpatialIndex, pc_coords: dict[str, tuple[float, float]], search_outcode: str | None = None, + detail: dict | None = None, ) -> dict | None: """Transform a raw Zoopla listing dict into the standard output schema. - Zoopla search cards do not include coordinates, so we resolve lat/lng - from postcodes extracted from the address text.""" + Zoopla search cards only expose an outcode-level address, so precise + location comes from the listing's detail page (see ``parse_detail_geo`` / + ``_fetch_listing_detail``), passed in as ``detail``. 
When detail-page + coordinates are available we resolve the nearest postcode via the spatial + index — mirroring rightmove/onthemarket — and only fall back to the coarse + outcode centroid when no detail location could be obtained.""" price = parse_int_value(raw.get("price")) or 0 address = raw.get("address", "") or "" - # Resolve postcode and coordinates from address extracted_postcode = extract_full_postcode(address) - postcode = extracted_postcode - postcode_source = "address" if extracted_postcode else None + detail = detail or {} + detail_postcode = extract_full_postcode(detail.get("postcode")) + # Detail-page address fields: the UPRN keys an exact EPC join, and the + # full street address / house number-or-name beat the outcode-level card + # address for the Price-Paid join. All three are absent unless the detail + # page was fetched, so every consumer must tolerate None. + detail_uprn = detail.get("uprn") or None + detail_full_address = detail.get("full_address") or None + detail_number_or_name = detail.get("number_or_name") or None + + postcode = postcode_source = inferred_postcode = None lat = lng = None - if postcode: - coords = pc_coords.get(postcode) - if coords: - lat, lng = coords + # (A) Best: detail-page coordinates -> nearest postcode (authoritative). + detail_lat, detail_lng = detail.get("lat"), detail.get("lng") + if detail_lat is not None and detail_lng is not None: + fixed_lat, fixed_lng = fix_coords(detail_lat, detail_lng) + if 49 <= fixed_lat <= 56 and -7 <= fixed_lng <= 2: + nearest = pc_index.nearest(fixed_lat, fixed_lng) + if nearest: + lat, lng, inferred_postcode = fixed_lat, fixed_lng, nearest + candidate = detail_postcode or extracted_postcode + postcode, resolved_source = resolve_listing_postcode(candidate, nearest) + postcode_source = ( + "detail_address" + if resolved_source == "address" + else "detail_coordinates" + ) + # (B) Detail-page postcode without usable coordinates -> geocode it. 
+ if lat is None and detail_postcode and detail_postcode in pc_coords: + lat, lng = pc_coords[detail_postcode] + postcode = inferred_postcode = detail_postcode + postcode_source = "detail_address" + + # (C) Full postcode in the search-card address -> geocode it. + if lat is None and extracted_postcode and extracted_postcode in pc_coords: + lat, lng = pc_coords[extracted_postcode] + postcode = extracted_postcode + postcode_source = "address" + + # (D) Last resort: coarse outcode-level centroid (loses per-listing precision). if lat is None: - # Try outcode-level fallback from address text addr_outcode = _extract_outcode(address) if addr_outcode: result = _resolve_outcode_coords(addr_outcode, pc_coords) @@ -1138,7 +1428,6 @@ def transform_property( postcode, lat, lng = result postcode_source = "address_outcode" - # Final fallback: use the outcode we know we're searching if lat is None and search_outcode: result = _resolve_outcode_coords(search_outcode, pc_coords) if result: @@ -1188,9 +1477,17 @@ def transform_property( "Postcode": postcode, "Postcode source": postcode_source or "unknown", "Extracted postcode": extracted_postcode, - "Inferred postcode": postcode if postcode_source != "address" else None, - "Listing raw address": address, - "Address per Property Register": clean_listing_address(address), + "Inferred postcode": ( + inferred_postcode + if inferred_postcode is not None + else (postcode if postcode_source != "address" else None) + ), + "Listing raw address": detail_full_address or address, + "Address per Property Register": build_register_address( + detail_full_address or address, detail_number_or_name + ), + "UPRN": detail_uprn, + "Property number or name": detail_number_or_name, "Leasehold/Freehold": raw.get("tenure") or None, "Property type": _map_property_type(raw.get("property_type")), "Property sub-type": normalize_sub_type(raw.get("property_type")), @@ -1215,6 +1512,9 @@ def search_outcode( pc_index: PostcodeSpatialIndex, pc_coords: dict[str, 
tuple[float, float]], max_properties: int | None = None, + detail_page=None, + detail_cap: int = 0, + detail_budget_seconds: float | None = None, ) -> tuple[list[dict], str | None]: """Search Zoopla for properties in one outcode. @@ -1222,6 +1522,12 @@ def search_outcode( search flow, extracts listings from rendered DOM, and transforms to the standard output schema. + When ``detail_page`` (a second browser tab) and a positive ``detail_cap`` + are supplied, up to ``detail_cap`` listings per outcode have their detail + page fetched for a precise postcode (see ``_fetch_listing_detail``). + ``detail_budget_seconds`` caps the wall-clock time spent fetching details so + the per-outcode timeout that also guards search pagination is never starved. + Returns (properties, search_url). Raises TurnstileError if Cloudflare blocks us mid-session. @@ -1231,12 +1537,25 @@ def search_outcode( total_results = _get_result_count(page) + fetch_detail = None + detail_deadline = None + if detail_page is not None and detail_cap > 0: + fetch_detail = lambda url: _fetch_listing_detail( # noqa: E731 + detail_page, url, search_outcode=outcode + ) + if detail_budget_seconds is not None: + detail_deadline = time.monotonic() + detail_budget_seconds + # Always try extraction even if result count is 0 — the count regex may # not match Zoopla's current text format, but listings may still be in DOM raw_listings = _paginate( page, total_results, max_properties=max_properties, + fetch_detail=fetch_detail, + detail_cap=detail_cap, + detail_state={"fetched": 0}, + detail_deadline=detail_deadline, ) if not raw_listings: if total_results > 0: @@ -1252,7 +1571,11 @@ def search_outcode( for raw in raw_listings: try: transformed = transform_property( - raw, pc_index, pc_coords, search_outcode=outcode + raw, + pc_index, + pc_coords, + search_outcode=outcode, + detail=raw.get("_detail"), ) except Exception as exc: log.warning( diff --git a/finder/zoopla_flaresolverr.py b/finder/zoopla_flaresolverr.py new file 
mode 100644 index 0000000..f3e6860 --- /dev/null +++ b/finder/zoopla_flaresolverr.py @@ -0,0 +1,164 @@ +"""Zoopla scraping via FlareSolverr (no browser/VNC needed). + +FlareSolverr solves Zoopla's Cloudflare and returns the rendered HTML, which +still contains the React Server Components flight stream — so the existing pure +parsers work unchanged: + - the search page yields the outcode's listing detail URLs, and + - each detail page's flight stream carries the property's location object + (postcode + coordinates) that ``parse_detail_geo`` extracts, plus the + listing fields (price/beds/baths/tenure/floor area) parsed here. + +Verified live (2026-05-30) against Zoopla through the Gluetun VPN: a warm +FlareSolverr session solves the SW9 search + detail pages and the flight data +is present (e.g. detail 73326946 -> SW9 0HD @ 51.477238,-0.116819). + +This is selected by constants.ZOOPLA_FETCHER == "flaresolverr"; the Camoufox +path in zoopla.py remains for ZOOPLA_FETCHER == "camoufox". +""" + +import logging +import re +import time + +from constants import DELAY_BETWEEN_PAGES, ZOOPLA_BASE +from flaresolverr import FlareSolverrError, FlareSolverrSession +from spatial import PostcodeSpatialIndex +from zoopla import _url_with_page, parse_detail_geo, transform_property + +log = logging.getLogger("zoopla") + +# Safety bound on how many search-result pages to walk per outcode. +_MAX_SERP_PAGES = 60 + +_DETAIL_PATH_RE = re.compile(r"/(?:for-sale|new-homes)/details/\d+/") +_LISTING_ID_RE = re.compile(r"/details/(\d+)/") + + +def _int(pattern: str, buf: str) -> int | None: + match = re.search(pattern, buf) + return int(match.group(1)) if match else None + + +def parse_detail_listing(html: str) -> dict: + """Extract the non-location listing fields from a Zoopla detail page. + + Mirrors the fields the Camoufox SERP-card extractor produced, read from the + detail page's flight stream (validated against real Zoopla detail HTML). 
+ All fields are best-effort; missing ones default to None so a listing with + a known location is still emitted.""" + buf = html.replace('\\"', '"').replace("\\/", "/") + + price = _int(r'"internalValue":(\d+)', buf) + if price is None: + price = _int(r'"priceUnformatted":(\d+)', buf) + + tenure_match = re.search(r'"tenure":"([a-zA-Z]+)"', buf) + tenure = tenure_match.group(1).title() if tenure_match else None + + # Address + property type come from the page , e.g. + # "Caldwell Street, Stockwell SW9, 4 bed property for sale, £995,000 - Zoopla" + address = None + property_type = None + title_match = re.search(r'"children":"([^"]*? for sale[^"]*?)"', buf) + if title_match: + title = title_match.group(1) + addr_match = re.match(r"(.+?),\s*\d+\s*bed", title) + if addr_match: + address = addr_match.group(1).strip() + type_match = re.search(r"\d+\s*bed\s+([\w\s-]+?)\s+for sale", title) + if type_match: + property_type = type_match.group(1).strip() + explicit_type = re.search(r'"propertyType":"([^"]+)"', buf) + if explicit_type: + property_type = explicit_type.group(1) + + return { + "price": price, + "beds": _int(r'"numBedrooms":(\d+)', buf), + "baths": _int(r'"numBaths":(\d+)', buf), + "receptions": _int(r'"numLivingRooms":(\d+)', buf), + "floor_area_sqft": _int(r'"sizeSqft":(\d+)', buf), + "tenure": tenure, + "property_type": property_type, + "address": address, + } + + +def _enumerate_detail_paths(fs: FlareSolverrSession, outcode: str, limit: int | None) -> list[str]: + """Walk the outcode's search-result pages and collect listing detail paths.""" + base = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home" + seen: list[str] = [] + seen_ids: set[str] = set() + for page_num in range(1, _MAX_SERP_PAGES + 1): + url = base if page_num == 1 else _url_with_page(base, page_num) + html = fs.get(url) + new = 0 + for path in _DETAIL_PATH_RE.findall(html): + id_match = _LISTING_ID_RE.search(path) + listing_id = id_match.group(1) if id_match 
else path + if listing_id in seen_ids: + continue + seen_ids.add(listing_id) + seen.append(path) + new += 1 + if limit is not None and len(seen) >= limit: + return seen + if new == 0: + break + time.sleep(DELAY_BETWEEN_PAGES) + return seen + + +def search_outcode( + outcode: str, + pc_index: PostcodeSpatialIndex, + pc_coords: dict[str, tuple[float, float]], + fs: FlareSolverrSession, + max_properties: int | None = None, + detail_cap: int = 0, + detail_budget_seconds: float | None = None, +) -> tuple[list[dict], str | None]: + """Scrape one outcode via FlareSolverr. Returns (properties, search_url). + + Every listing's detail page is fetched (that is where the postcode lives), + so the effective listing count is bounded by both ``max_properties`` and + ``detail_cap``; ``detail_budget_seconds`` caps wall-clock time on details.""" + limit = detail_cap if detail_cap and detail_cap > 0 else None + if max_properties is not None: + limit = max_properties if limit is None else min(limit, max_properties) + + base = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home" + paths = _enumerate_detail_paths(fs, outcode, limit) + if not paths: + return [], base + + deadline = (time.monotonic() + detail_budget_seconds) if detail_budget_seconds else None + properties: list[dict] = [] + dropped = 0 + for path in paths: + if deadline is not None and time.monotonic() >= deadline: + log.info("Zoopla %s: detail-fetch budget reached after %d", outcode, len(properties)) + break + id_match = _LISTING_ID_RE.search(path) + listing_id = id_match.group(1) if id_match else path + try: + html = fs.get(ZOOPLA_BASE + path) + geo = parse_detail_geo(html, search_outcode=outcode) + raw = {"id": listing_id, "url": path, **parse_detail_listing(html)} + prop = transform_property( + raw, pc_index, pc_coords, search_outcode=outcode, detail=geo + ) + except FlareSolverrError as exc: + log.warning("Zoopla %s detail %s fetch failed: %s", outcode, listing_id, exc) + prop = None + 
except Exception as exc: # noqa: BLE001 - never let one listing kill the outcode + log.warning("Zoopla %s detail %s transform failed: %s", outcode, listing_id, exc) + prop = None + if prop: + properties.append(prop) + else: + dropped += 1 + time.sleep(DELAY_BETWEEN_PAGES) + + log.info("Zoopla %s: %d listings (%d dropped)", outcode, len(properties), dropped) + return properties, base diff --git a/frontend/src/components/map/Map.tsx b/frontend/src/components/map/Map.tsx index 91a487f..f1b81dc 100644 --- a/frontend/src/components/map/Map.tsx +++ b/frontend/src/components/map/Map.tsx @@ -606,12 +606,13 @@ function OverlayTileLayers({ const showTrees = activeOverlays.has('trees-outside-woodlands'); const showPropertyBorders = activeOverlays.has('property-borders'); - // Restrict the heatmap to the selected crime types. When every type is - // selected we omit the filter entirely so all features contribute. - const crimeFilter = - activeCrimeTypes.size >= CRIME_TYPE_VALUES.length - ? undefined - : ['in', ['get', 'crime_type'], ['literal', Array.from(activeCrimeTypes)]]; + // Restrict the heatmap to the selected crime types. This must always be a + // concrete expression: passing `filter={undefined}` makes react-map-gl call + // map.addLayer({filter: undefined}), which MapLibre rejects at validation + // ("filter: array expected, undefined found"), so the layer is never created + // and the heatmap stays blank until a later setFilter call. An `in` over the + // selected types matches everything when all 14 are selected. 
+ const crimeFilter = ['in', ['get', 'crime_type'], ['literal', Array.from(activeCrimeTypes)]]; return ( <> diff --git a/frontend/src/components/map/MobileDrawer.test.tsx b/frontend/src/components/map/MobileDrawer.test.tsx new file mode 100644 index 0000000..a7a626c --- /dev/null +++ b/frontend/src/components/map/MobileDrawer.test.tsx @@ -0,0 +1,107 @@ +import { cleanup, fireEvent, render, screen } from '@testing-library/react'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; + +import MobileDrawer from './MobileDrawer'; + +vi.mock('react-i18next', () => ({ + useTranslation: () => ({ + t: (key: string) => key, + }), +})); + +const originalSetPointerCapture = HTMLElement.prototype.setPointerCapture; + +function renderDrawer(onClose = vi.fn()) { + const view = render( + <MobileDrawer + onClose={onClose} + renderArea={() => <div>Area content</div>} + renderProperties={() => <div>Properties content</div>} + tab="area" + onTabChange={vi.fn()} + /> + ); + const handle = view.container.querySelector('[data-mobile-drawer-drag-handle]'); + const root = view.container.querySelector('[data-tutorial="right-pane"]'); + const panel = view.container.querySelector('[data-tutorial="right-pane"] > div:last-child'); + + if (!(handle instanceof HTMLElement)) throw new Error('Expected drawer drag handle'); + if (!(root instanceof HTMLElement)) throw new Error('Expected drawer root'); + if (!(panel instanceof HTMLElement)) throw new Error('Expected drawer panel'); + + return { ...view, handle, onClose, panel, root }; +} + +describe('MobileDrawer', () => { + beforeEach(() => { + HTMLElement.prototype.setPointerCapture = vi.fn(); + }); + + afterEach(() => { + cleanup(); + Object.defineProperty(HTMLElement.prototype, 'setPointerCapture', { + configurable: true, + value: originalSetPointerCapture, + }); + }); + + it('lowers and stays open when swiped down from the handle', () => { + const { handle, onClose, panel } = renderDrawer(); + + 
fireEvent.pointerDown(handle, { pointerId: 1, clientY: 120 }); + fireEvent.pointerMove(handle, { pointerId: 1, clientY: 230 }); + fireEvent.pointerUp(handle, { pointerId: 1, clientY: 230 }); + + expect(onClose).not.toHaveBeenCalled(); + expect(panel.style.transform).toBe('translateY(110px)'); + }); + + it('can be raised again after being lowered', () => { + const { handle, onClose, panel } = renderDrawer(); + + fireEvent.pointerDown(handle, { pointerId: 1, clientY: 120 }); + fireEvent.pointerMove(handle, { pointerId: 1, clientY: 230 }); + fireEvent.pointerUp(handle, { pointerId: 1, clientY: 230 }); + + fireEvent.pointerDown(handle, { pointerId: 2, clientY: 230 }); + fireEvent.pointerMove(handle, { pointerId: 2, clientY: 170 }); + fireEvent.pointerUp(handle, { pointerId: 2, clientY: 170 }); + + expect(onClose).not.toHaveBeenCalled(); + expect(panel.style.transform).toBe('translateY(50px)'); + }); + + it('keeps the close control reachable when dragged down far', () => { + const { handle, panel } = renderDrawer(); + + Object.defineProperty(panel, 'offsetHeight', { + configurable: true, + value: 200, + }); + + fireEvent.pointerDown(handle, { pointerId: 1, clientY: 120 }); + fireEvent.pointerMove(handle, { pointerId: 1, clientY: 420 }); + fireEvent.pointerUp(handle, { pointerId: 1, clientY: 420 }); + + expect(panel.style.transform).toBe('translateY(96px)'); + }); + + it('leaves the rest of the mobile map usable while the panel is open', () => { + const { panel, root } = renderDrawer(); + const spacer = root.firstElementChild; + + if (!(spacer instanceof HTMLElement)) throw new Error('Expected drawer spacer'); + + expect(root.className).toContain('pointer-events-none'); + expect(panel.className).toContain('pointer-events-auto'); + expect(spacer.className).not.toContain('bg-black'); + }); + + it('closes from the close button', () => { + const { onClose } = renderDrawer(); + + fireEvent.click(screen.getByLabelText('mobileDrawer.closeDrawer')); + + 
expect(onClose).toHaveBeenCalledTimes(1); + }); +}); diff --git a/frontend/src/lib/color-opacity.ts b/frontend/src/lib/color-opacity.ts new file mode 100644 index 0000000..bac9ebf --- /dev/null +++ b/frontend/src/lib/color-opacity.ts @@ -0,0 +1,11 @@ +export const DEFAULT_COLOR_OPACITY = 1; +export const MIN_COLOR_OPACITY = 0.1; + +export function normalizeColorOpacity(value: number | null | undefined): number { + if (value == null || !Number.isFinite(value)) return DEFAULT_COLOR_OPACITY; + return Math.min(1, Math.max(MIN_COLOR_OPACITY, value)); +} + +export function colorOpacityToPercent(value: number): number { + return Math.round(normalizeColorOpacity(value) * 100); +} diff --git a/frontend/src/lib/crime-types.ts b/frontend/src/lib/crime-types.ts new file mode 100644 index 0000000..ff1f8d9 --- /dev/null +++ b/frontend/src/lib/crime-types.ts @@ -0,0 +1,35 @@ +// Street-crime categories carried by the `crime_hotspots` vector tiles in the +// `crime_type` feature property. The `value` strings must match the police.uk +// "Crime type" values exactly (see pipeline/transform/crime_hotspot_tiles.py), +// because they are used directly in the MapLibre heatmap `filter` expression. +// `label` is a shorter, human-friendly name for the overlay-selector checkboxes. 
+ +export interface CrimeTypeDef { + value: string; + label: string; +} + +export const CRIME_TYPES: readonly CrimeTypeDef[] = [ + { value: 'Violence and sexual offences', label: 'Violence & sexual offences' }, + { value: 'Anti-social behaviour', label: 'Anti-social behaviour' }, + { value: 'Criminal damage and arson', label: 'Criminal damage & arson' }, + { value: 'Public order', label: 'Public order' }, + { value: 'Shoplifting', label: 'Shoplifting' }, + { value: 'Vehicle crime', label: 'Vehicle crime' }, + { value: 'Burglary', label: 'Burglary' }, + { value: 'Other theft', label: 'Other theft' }, + { value: 'Theft from the person', label: 'Theft from the person' }, + { value: 'Bicycle theft', label: 'Bicycle theft' }, + { value: 'Drugs', label: 'Drugs' }, + { value: 'Robbery', label: 'Robbery' }, + { value: 'Possession of weapons', label: 'Possession of weapons' }, + { value: 'Other crime', label: 'Other crime' }, +] as const; + +export const CRIME_TYPE_VALUES: readonly string[] = CRIME_TYPES.map((c) => c.value); + +const CRIME_TYPE_VALUE_SET = new Set<string>(CRIME_TYPE_VALUES); + +export function isCrimeTypeValue(value: string): boolean { + return CRIME_TYPE_VALUE_SET.has(value); +} diff --git a/pipeline/download/inspire.py b/pipeline/download/inspire.py index c32e531..64981e2 100644 --- a/pipeline/download/inspire.py +++ b/pipeline/download/inspire.py @@ -4,7 +4,10 @@ Downloads GML files for all local authorities from the INSPIRE download page. Each ZIP contains a GML file with title extent polygons for that authority. Source: https://use-land-property-data.service.gov.uk/datasets/inspire/download -License: INSPIRE End User Licence +License: Open Government Licence v3.0 (since 1 July 2020, under the PSGA). + Requires HM Land Registry + Ordnance Survey (AC0000851063) attribution; see + the conditions page at the source URL. Boundaries are indicative "general + boundaries", not the legal extent of title. 
""" import argparse diff --git a/pipeline/download/satellite_highres.py b/pipeline/download/satellite_highres.py new file mode 100644 index 0000000..06141df --- /dev/null +++ b/pipeline/download/satellite_highres.py @@ -0,0 +1,505 @@ +"""Build a high-resolution England aerial PMTiles archive from EA Vertical Aerial Photography. + +The Environment Agency / Defra Vertical Aerial Photography (VAP) archive is open +(OGL v3.0) RGB orthophotography at 10-50 cm, distributed as 5 km ECW tiles on the +British National Grid. There is no public imagery tile service, so we mirror the +Sentinel-2 ``satellite.pmtiles`` approach: query the Defra survey download API for +an area of interest, pick the best RGB capture per OS tile, download and decode the +ECW rasters, re-tile them into Web-Mercator raster tiles, and bake a single PMTiles +archive that the server stacks *over* the Sentinel-2 base where coverage exists. + +ECW decoding needs a GDAL build that includes the (free, read-only) ERDAS ECW/JP2 +SDK, which is not present in the rasterio wheel. The mosaic + tiling step therefore +runs inside a GDAL-with-ECW Docker image (see ``docker/gdal-ecw/Dockerfile``); the +rest of the pipeline is plain Python plus the ``pmtiles`` CLI. +""" + +from __future__ import annotations + +import argparse +import json +import re +import shutil +import sqlite3 +import subprocess +import tempfile +import urllib.error +import urllib.request +import zipfile +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass +from pathlib import Path + +from pipeline.download.tiles import ensure_pmtiles_cli +from pipeline.local_temp import local_tmp_dir + +# Defra Data Services Platform survey download API (reverse-engineered from the +# environment.data.gov.uk/survey front-end; no official API is documented). 
+SEARCH_URL = ( + "https://environment.data.gov.uk/backend/catalog/api/tiles/collections/survey/search" +) +SURVEY_PAGE_URL = "https://environment.data.gov.uk/survey" +# Static public key baked into the survey page JS. May rotate -- we try to scrape a +# fresh one from the page and only fall back to this literal. +DEFAULT_SUBSCRIPTION_KEY = "dspui" +SUBSCRIPTION_KEY_RE = re.compile(r"subscription-key=([A-Za-z0-9]+)") + +# True-colour RGB product only (skip IRRGB near-infra-red and Night Time variants). +VAP_RGB_PRODUCT = "vertical_aerial_photography_tiles_rgb" + +# Greater London bounding box (lon/lat). The API only returns tiles where coverage +# exists, so a generous bbox is fine -- it does not force blank downloads. +DEFAULT_AOI: dict = { + "type": "Polygon", + "coordinates": [ + [ + [-0.55, 51.25], + [0.30, 51.25], + [0.30, 51.70], + [-0.55, 51.70], + [-0.55, 51.25], + ] + ], +} + +DEFAULT_MIN_ZOOM = 14 +DEFAULT_MAX_ZOOM = 19 +# GDAL image with the ECW driver. The official OSGeo image does not ship ECW, so +# this defaults to the locally-built image from docker/gdal-ecw/Dockerfile. +DEFAULT_GDAL_IMAGE = "perfect-postcode/gdal-ecw:latest" +USER_AGENT = "perfect-postcode-satellite-highres/1.0" +ATTRIBUTION_TEMPLATE = ( + "Environment Agency Vertical Aerial Photography - " + "© Environment Agency copyright and/or database right {year}. " + "All rights reserved. Licensed under the Open Government Licence v3.0." 
+) + + +@dataclass(frozen=True) +class VapTile: + """One survey download record from the Defra search API.""" + + product_id: str + year: int + resolution_m: float + os_tile_id: str + uri: str + label: str + + +def parse_search_results(payload: dict) -> list[VapTile]: + """Turn a raw search-API JSON payload into typed records.""" + tiles: list[VapTile] = [] + for result in payload.get("results", []): + try: + tiles.append( + VapTile( + product_id=result["product"]["id"], + year=int(result["year"]["id"]), + resolution_m=float(result["resolution"]["id"]), + os_tile_id=result["tile"]["id"], + uri=result["uri"], + label=result.get("label", ""), + ) + ) + except (KeyError, TypeError, ValueError): + # Skip malformed records rather than failing the whole search. + continue + return tiles + + +def select_best_rgb_tiles(tiles: list[VapTile]) -> list[VapTile]: + """Pick one RGB capture per OS tile: finest resolution, then latest year. + + Pure function -- the unit test exercises this against a real-shaped payload. 
+ """ + best: dict[str, VapTile] = {} + for tile in tiles: + if tile.product_id != VAP_RGB_PRODUCT: + continue + current = best.get(tile.os_tile_id) + if current is None or _is_better(tile, current): + best[tile.os_tile_id] = tile + return [best[key] for key in sorted(best)] + + +def _is_better(candidate: VapTile, incumbent: VapTile) -> bool: + """Finer resolution wins; ties broken by the most recent survey year.""" + if candidate.resolution_m != incumbent.resolution_m: + return candidate.resolution_m < incumbent.resolution_m + return candidate.year > incumbent.year + + +def _http_get(url: str, timeout: float) -> bytes: + req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT}) + with urllib.request.urlopen(req, timeout=timeout) as response: + return response.read() + + +def resolve_subscription_key(explicit: str | None, timeout: float = 30.0) -> str: + """Use an explicit key, else scrape the survey page JS, else the known default.""" + if explicit: + return explicit + try: + page = _http_get(SURVEY_PAGE_URL, timeout).decode("utf-8", "ignore") + match = SUBSCRIPTION_KEY_RE.search(page) + if match: + return match.group(1) + # The key usually lives in a referenced JS chunk; scan the largest one. 
+ for chunk in re.findall(r'src="(/_next/static/[^"]+\.js)"', page): + js = _http_get(f"https://environment.data.gov.uk{chunk}", timeout) + match = SUBSCRIPTION_KEY_RE.search(js.decode("utf-8", "ignore")) + if match: + return match.group(1) + except (urllib.error.URLError, TimeoutError, ConnectionError) as err: + print(f"Could not scrape subscription key ({err}); using default", flush=True) + return DEFAULT_SUBSCRIPTION_KEY + + +def search_vap_tiles(aoi: dict, timeout: float = 60.0) -> list[VapTile]: + """POST the area-of-interest polygon and return the RGB tiles to download.""" + body = json.dumps(aoi).encode("utf-8") + req = urllib.request.Request( + SEARCH_URL, + data=body, + headers={ + "Content-Type": "application/geo+json", + "Referer": SURVEY_PAGE_URL, + "User-Agent": USER_AGENT, + }, + method="POST", + ) + with urllib.request.urlopen(req, timeout=timeout) as response: + payload = json.load(response) + selected = select_best_rgb_tiles(parse_search_results(payload)) + print( + f"Search returned {payload.get('count', 0)} records; " + f"selected {len(selected)} RGB tile(s)", + flush=True, + ) + return selected + + +def _download_and_extract( + tile: VapTile, ecw_dir: Path, key: str, timeout: float, retries: int +) -> list[Path]: + """Download one survey zip and extract its ECW raster(s).""" + url = f"{tile.uri}?subscription-key={key}" + zip_path = ecw_dir / f"{tile.os_tile_id}.zip" + last_error: Exception | None = None + for attempt in range(retries + 1): + try: + with urllib.request.urlopen( + urllib.request.Request(url, headers={"User-Agent": USER_AGENT}), + timeout=timeout, + ) as response, zip_path.open("wb") as out: + shutil.copyfileobj(response, out, length=1 << 20) + break + except (urllib.error.URLError, TimeoutError, ConnectionError) as err: + last_error = err + if attempt == retries: + raise RuntimeError(f"Failed to download {url}: {err}") from err + extracted: list[Path] = [] + with zipfile.ZipFile(zip_path) as archive: + for member in 
archive.infolist(): + if member.is_dir() or not member.filename.lower().endswith(".ecw"): + continue + target = ecw_dir / f"{tile.os_tile_id}_{Path(member.filename).name}" + with archive.open(member) as src, target.open("wb") as dst: + shutil.copyfileobj(src, dst, length=1 << 20) + extracted.append(target) + zip_path.unlink(missing_ok=True) + if not extracted: + print(f" {tile.os_tile_id}: no ECW in archive (skipped)", flush=True) + return extracted + + +def download_tiles( + tiles: list[VapTile], + ecw_dir: Path, + key: str, + max_workers: int, + timeout: float, + retries: int, +) -> list[Path]: + """Download every selected tile concurrently; return all extracted ECW paths.""" + ecw_dir.mkdir(parents=True, exist_ok=True) + ecw_paths: list[Path] = [] + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit( + _download_and_extract, tile, ecw_dir, key, timeout, retries + ): tile + for tile in tiles + } + done = 0 + for future in as_completed(futures): + tile = futures[future] + ecw_paths.extend(future.result()) + done += 1 + print( + f"Downloaded {done}/{len(tiles)} tiles " + f"(latest: {tile.os_tile_id} {tile.resolution_m}m {tile.year})", + flush=True, + ) + return ecw_paths + + +def _build_tiles_with_gdal( + work_dir: Path, + gdal_image: str, + min_zoom: int, + max_zoom: int, + jobs: int, + webp_quality: int, +) -> Path: + """Mosaic the ECW rasters and emit XYZ WebP tiles inside the GDAL-with-ECW image. + + Returns the host path of the generated ``xyz`` directory. We use lossy WebP with + an alpha channel: ~6x smaller than lossless PNG for photographic imagery while + keeping transparency, so coverage gaps stay see-through and the Sentinel-2 base + shows through them. + """ + xyz_dir = work_dir / "xyz" + # EA "RGB" ECWs are 4-band RGBA (band 4 is a constant-255 validity/alpha mask), + # so we build a plain 4-band VRT (no -addalpha, which would make a 5th band and + # exceed PNG's 4-band limit). 
We then: + # * force EPSG:27700 -- the pixels are already British National Grid, and the + # EPSG code lets PROJ apply the OSTN15 datum shift (grid ships in the image) + # for metre-accurate reprojection to Web Mercator; + # * label band 4 as alpha so gdal2tiles writes transparent PNGs. Inter-block + # gaps the VRT fills with 0 then read as alpha=0 (transparent), letting the + # Sentinel-2 base show through wherever VAP coverage is missing. + script = ( + "set -euo pipefail; " + "cd /work; " + "gdalbuildvrt -resolution highest mosaic.vrt ecw/*.ecw; " + "gdal_edit.py -a_srs EPSG:27700 " + "-colorinterp_1 red -colorinterp_2 green -colorinterp_3 blue " + "-colorinterp_4 alpha mosaic.vrt; " + f"gdal2tiles.py --xyz --zoom={min_zoom}-{max_zoom} " + f"--processes={jobs} --resampling=average --webviewer=none " + f"--tiledriver=WEBP --webp-quality={webp_quality} " + "mosaic.vrt xyz" + ) + subprocess.run( + [ + "docker", + "run", + "--rm", + "-v", + f"{work_dir.resolve()}:/work", + gdal_image, + "bash", + "-c", + script, + ], + check=True, + ) + if not xyz_dir.exists(): + raise RuntimeError("gdal2tiles produced no output directory") + return xyz_dir + + +def _pack_xyz_to_mbtiles( + xyz_dir: Path, + mbtiles_path: Path, + bounds: tuple[float, float, float, float], + min_zoom: int, + max_zoom: int, + attribution: str, +) -> int: + """Pack a gdal2tiles XYZ WebP directory into an MBTiles SQLite file (TMS rows).""" + if mbtiles_path.exists(): + mbtiles_path.unlink() + conn = sqlite3.connect(mbtiles_path) + try: + conn.execute("PRAGMA journal_mode = WAL") + conn.execute("PRAGMA synchronous = NORMAL") + conn.execute("CREATE TABLE metadata (name TEXT, value TEXT)") + conn.execute( + "CREATE TABLE tiles (zoom_level INTEGER, tile_column INTEGER, " + "tile_row INTEGER, tile_data BLOB)" + ) + conn.execute( + "CREATE UNIQUE INDEX tile_index ON tiles " + "(zoom_level, tile_column, tile_row)" + ) + conn.executemany( + "INSERT INTO metadata (name, value) VALUES (?, ?)", + [ + ("name", "EA 
Vertical Aerial Photography"), + ("type", "overlay"), + ("version", "1"), + ("description", "Environment Agency high-resolution aerial imagery"), + ("format", "webp"), + ("attribution", attribution), + ("bounds", ",".join(f"{value:.6f}" for value in bounds)), + ("minzoom", str(min_zoom)), + ("maxzoom", str(max_zoom)), + ], + ) + inserted = 0 + for zoom_dir in sorted(xyz_dir.iterdir()): + if not zoom_dir.is_dir() or not zoom_dir.name.isdigit(): + continue + zoom = int(zoom_dir.name) + for col_dir in zoom_dir.iterdir(): + if not col_dir.is_dir() or not col_dir.name.isdigit(): + continue + col = int(col_dir.name) + for tile_file in col_dir.glob("*.webp"): + if not tile_file.stem.isdigit(): + continue + row = int(tile_file.stem) + tms_row = (1 << zoom) - 1 - row + conn.execute( + "INSERT OR REPLACE INTO tiles VALUES (?, ?, ?, ?)", + (zoom, col, tms_row, tile_file.read_bytes()), + ) + inserted += 1 + if inserted % 5000 == 0: + conn.commit() + print(f" packed {inserted:,} tiles", flush=True) + conn.commit() + finally: + conn.close() + return inserted + + +def build_satellite_highres_tiles( + output_path: Path, + pmtiles_bin: Path, + pmtiles_version: str, + aoi: dict, + min_zoom: int, + max_zoom: int, + gdal_image: str, + subscription_key: str | None, + max_workers: int, + timeout: float, + retries: int, + jobs: int, + webp_quality: int, +) -> None: + if min_zoom > max_zoom: + raise ValueError("--min-zoom must be <= --max-zoom") + + output_path.parent.mkdir(parents=True, exist_ok=True) + ensure_pmtiles_cli(pmtiles_bin, pmtiles_version) + + tiles = search_vap_tiles(aoi) + if not tiles: + raise RuntimeError("No RGB Vertical Aerial Photography tiles for the AOI") + key = resolve_subscription_key(subscription_key) + attribution = ATTRIBUTION_TEMPLATE.format(year=max(tile.year for tile in tiles)) + + with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp: + work_dir = Path(tmp) + ecw_dir = work_dir / "ecw" + ecw_paths = download_tiles( + tiles, ecw_dir, key, max_workers, 
timeout, retries + ) + if not ecw_paths: + raise RuntimeError("No ECW rasters were extracted from the downloads") + + xyz_dir = _build_tiles_with_gdal( + work_dir, gdal_image, min_zoom, max_zoom, jobs, webp_quality + ) + + mbtiles_path = work_dir / "satellite_highres.mbtiles" + bounds = _aoi_bounds(aoi) + inserted = _pack_xyz_to_mbtiles( + xyz_dir, mbtiles_path, bounds, min_zoom, max_zoom, attribution + ) + if inserted == 0: + raise RuntimeError("Tiling produced no tiles to pack") + print(f"Packed {inserted:,} tiles into MBTiles", flush=True) + + subprocess.run( + [str(pmtiles_bin), "convert", str(mbtiles_path), str(output_path), "--force"], + check=True, + ) + + size_mb = output_path.stat().st_size / (1024 * 1024) + print(f"Wrote {output_path} ({size_mb:.1f} MB) -- {attribution}", flush=True) + + +def _aoi_bounds(aoi: dict) -> tuple[float, float, float, float]: + coords = [point for ring in aoi["coordinates"] for point in ring] + lons = [point[0] for point in coords] + lats = [point[1] for point in coords] + return min(lons), min(lats), max(lons), max(lats) + + +def _load_aoi(path: Path | None) -> dict: + if path is None: + return DEFAULT_AOI + data = json.loads(path.read_text()) + if data.get("type") == "FeatureCollection": + return data["features"][0]["geometry"] + if data.get("type") == "Feature": + return data["geometry"] + return data + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--output", type=Path, required=True) + parser.add_argument("--pmtiles-bin", type=Path, default=Path("property-data/pmtiles")) + parser.add_argument("--pmtiles-version", default="1.22.3") + parser.add_argument( + "--aoi-geojson", + type=Path, + default=None, + help="GeoJSON Polygon/Feature/FeatureCollection for the area of interest " + "(default: Greater London)", + ) + parser.add_argument("--min-zoom", type=int, default=DEFAULT_MIN_ZOOM) + parser.add_argument("--max-zoom", type=int, default=DEFAULT_MAX_ZOOM) + 
parser.add_argument( + "--gdal-image", + default=DEFAULT_GDAL_IMAGE, + help="Docker image with a GDAL that has the ECW driver", + ) + parser.add_argument( + "--subscription-key", + default=None, + help="Override the Defra survey API key (default: scrape, then 'dspui')", + ) + parser.add_argument("--max-workers", type=int, default=4) + parser.add_argument("--timeout", type=float, default=600.0) + parser.add_argument("--retries", type=int, default=3) + parser.add_argument( + "--jobs", + type=int, + default=8, + help="Parallel processes for gdal2tiles", + ) + parser.add_argument( + "--webp-quality", + type=int, + default=85, + help="WebP tile quality (1-100); lower is smaller", + ) + args = parser.parse_args() + + build_satellite_highres_tiles( + output_path=args.output, + pmtiles_bin=args.pmtiles_bin, + pmtiles_version=args.pmtiles_version, + aoi=_load_aoi(args.aoi_geojson), + min_zoom=args.min_zoom, + max_zoom=args.max_zoom, + gdal_image=args.gdal_image, + subscription_key=args.subscription_key, + max_workers=max(1, args.max_workers), + timeout=args.timeout, + retries=max(0, args.retries), + jobs=max(1, args.jobs), + webp_quality=args.webp_quality, + ) + + +if __name__ == "__main__": + main() diff --git a/pipeline/download/test_satellite_highres.py b/pipeline/download/test_satellite_highres.py new file mode 100644 index 0000000..dc84fa3 --- /dev/null +++ b/pipeline/download/test_satellite_highres.py @@ -0,0 +1,97 @@ +from pipeline.download import satellite_highres +from pipeline.download.satellite_highres import ( + VapTile, + parse_search_results, + select_best_rgb_tiles, +) + + +def _result(product: str, year: str, resolution: str, tile: str) -> dict: + """One search-API record in the real response shape.""" + return { + "product": {"id": product, "label": product}, + "year": {"id": year, "label": year}, + "resolution": {"id": resolution, "label": f"{resolution}m"}, + "tile": {"id": tile, "label": tile}, + "label": f"{product}-{year}-{resolution}m-{tile}", + 
"uri": ( + "https://environment.data.gov.uk/tiles/collections/survey/" + f"{product}/{year}/{resolution}/{tile}" + ), + } + + +# Mirrors a real Greater-London response: RGB at 0.4m (2008) and 0.1m (2011), +# plus Night Time and LIDAR products that must be ignored. +SAMPLE_PAYLOAD = { + "count": 6, + "results": [ + _result("vertical_aerial_photography_tiles_rgb", "2008", "0.4", "TQ2575"), + _result("vertical_aerial_photography_tiles_night_time", "2012", "0.2", "TQ2575"), + _result("lidar_composite_dtm", "2022", "1", "TQ2575"), + # TQ3080 has two RGB captures: a finer-but-older and a coarser-but-newer. + _result("vertical_aerial_photography_tiles_rgb", "2008", "0.1", "TQ3080"), + _result("vertical_aerial_photography_tiles_rgb", "2011", "0.25", "TQ3080"), + _result("vertical_aerial_photography_tiles_irrgb", "2012", "0.5", "TQ3080"), + ], +} + + +def test_parse_search_results_skips_malformed_records() -> None: + payload = { + "results": [ + _result("vertical_aerial_photography_tiles_rgb", "2008", "0.4", "TQ2575"), + {"product": {"id": "broken"}}, # missing year/resolution/tile/uri + ] + } + tiles = parse_search_results(payload) + assert len(tiles) == 1 + assert tiles[0] == VapTile( + product_id="vertical_aerial_photography_tiles_rgb", + year=2008, + resolution_m=0.4, + os_tile_id="TQ2575", + uri="https://environment.data.gov.uk/tiles/collections/survey/" + "vertical_aerial_photography_tiles_rgb/2008/0.4/TQ2575", + label="vertical_aerial_photography_tiles_rgb-2008-0.4m-TQ2575", + ) + + +def test_select_best_rgb_filters_non_rgb_products() -> None: + selected = select_best_rgb_tiles(parse_search_results(SAMPLE_PAYLOAD)) + assert {tile.product_id for tile in selected} == { + satellite_highres.VAP_RGB_PRODUCT + } + + +def test_select_best_rgb_one_tile_per_os_square() -> None: + selected = select_best_rgb_tiles(parse_search_results(SAMPLE_PAYLOAD)) + assert sorted(tile.os_tile_id for tile in selected) == ["TQ2575", "TQ3080"] + + +def 
test_select_best_rgb_prefers_finest_resolution_then_latest_year() -> None: + selected = { + tile.os_tile_id: tile + for tile in select_best_rgb_tiles(parse_search_results(SAMPLE_PAYLOAD)) + } + # TQ2575: only one RGB capture. + assert selected["TQ2575"].resolution_m == 0.4 + # TQ3080: finest resolution (0.1m) wins even though it is the older survey. + assert selected["TQ3080"].resolution_m == 0.1 + assert selected["TQ3080"].year == 2008 + + +def test_select_best_rgb_breaks_resolution_ties_by_year() -> None: + tiles = [ + VapTile(satellite_highres.VAP_RGB_PRODUCT, 2009, 0.25, "TQ0101", "u", "a"), + VapTile(satellite_highres.VAP_RGB_PRODUCT, 2018, 0.25, "TQ0101", "u", "b"), + VapTile(satellite_highres.VAP_RGB_PRODUCT, 2015, 0.25, "TQ0101", "u", "c"), + ] + selected = select_best_rgb_tiles(tiles) + assert len(selected) == 1 + assert selected[0].year == 2018 + + +def test_select_best_rgb_empty_when_no_rgb() -> None: + payload = {"results": [_result("lidar_composite_dtm", "2022", "1", "TQ2575")]} + assert select_best_rgb_tiles(parse_search_results(payload)) == [] diff --git a/pipeline/test_validate_outputs.py b/pipeline/test_validate_outputs.py index e61b9f9..4b7d4ce 100644 --- a/pipeline/test_validate_outputs.py +++ b/pipeline/test_validate_outputs.py @@ -1,12 +1,25 @@ from __future__ import annotations import zipfile +import json import polars as pl from pipeline.validate_outputs import main +def write_boundary(path, postcodes): + units = path / "units" + units.mkdir(parents=True) + features = [ + {"type": "Feature", "properties": {"postcodes": postcode}, "geometry": None} + for postcode in postcodes + ] + (units / "AA1.geojson").write_text( + json.dumps({"type": "FeatureCollection", "features": features}) + ) + + def test_validates_parquet_file_and_zip(tmp_path, monkeypatch): parquet_path = tmp_path / "data.parquet" file_path = tmp_path / "plain.txt" @@ -59,3 +72,42 @@ def test_rejects_missing_and_empty_outputs(tmp_path, monkeypatch, capsys): assert "empty file" in 
stderr assert "missing" in stderr assert "no files matched" in stderr + + +def test_validates_postcode_boundary_matches(tmp_path, monkeypatch): + postcodes_path = tmp_path / "postcodes.parquet" + boundaries_path = tmp_path / "postcode_boundaries" + pl.DataFrame({"postcode": ["AA1 1AA", "AA1 1AB"]}).write_parquet(postcodes_path) + write_boundary(boundaries_path, ["AA1 1AA", "AA1 1AB"]) + + monkeypatch.setattr( + "sys.argv", + [ + "validate_outputs", + "--postcode-boundary-match", + f"{postcodes_path}::{boundaries_path}", + ], + ) + + assert main() == 0 + + +def test_rejects_postcode_boundary_mismatch(tmp_path, monkeypatch, capsys): + postcodes_path = tmp_path / "postcodes.parquet" + boundaries_path = tmp_path / "postcode_boundaries" + pl.DataFrame({"postcode": ["AA1 1AA", "AA1 1AB"]}).write_parquet(postcodes_path) + write_boundary(boundaries_path, ["AA1 1AA", "AA1 1AC"]) + + monkeypatch.setattr( + "sys.argv", + [ + "validate_outputs", + "--postcode-boundary-match", + f"{postcodes_path}::{boundaries_path}", + ], + ) + + assert main() == 1 + stderr = capsys.readouterr().err + assert "missing boundaries" in stderr + assert "boundary postcodes are absent" in stderr diff --git a/pipeline/transform/crime_spatial.py b/pipeline/transform/crime_spatial.py new file mode 100644 index 0000000..c72a6b5 --- /dev/null +++ b/pipeline/transform/crime_spatial.py @@ -0,0 +1,358 @@ +"""Aggregate police.uk street crime to postcodes by 50m spatial proximity. + +Instead of attributing each incident to its published LSOA code, this transform +counts the anonymised incident *points* that fall within 50m of each postcode's +boundary polygon (the polygon buffered outward by 50m). A point inside several +overlapping buffers counts for each postcode -- the same multiplicity the +tree-density filter uses for features near more than one postcode. + +The metric is a raw annualised count ("incidents/year within 50m"); there is no +per-capita denominator. 
Outputs mirror the old LSOA transform's shape but are +keyed on ``postcode`` instead of ``LSOA code``: + +* ``crime_by_postcode.parquet`` -- ``postcode`` + ``"{type} (avg/yr)"`` columns. +* ``crime_by_postcode_by_year.parquet`` -- ``postcode`` + ``"{type} (by year)"`` + nested ``list[struct{year, count}]`` columns, with Serious/Minor rollups. + +Caveat: police.uk coordinates are snapped to a fixed set of anonymous "map +points", not true locations, and a share of rows have no coordinate at all +(dropped here). Spatial totals are therefore lower than, and fuzzier than, the +old LSOA-tagged counts -- by design, not a regression. +""" + +from __future__ import annotations + +import argparse +import re +from pathlib import Path + +import numpy as np +import polars as pl +import shapely +from pyproj import Transformer + +from pipeline.transform.crime import ( + MINOR_CRIME_TYPES, + SERIOUS_CRIME_TYPES, + find_street_crime_csvs, +) +from pipeline.transform.postcode_boundaries.loader import load_postcode_polygons + +# Serious types first so column order is stable and self-documenting. +ALL_CRIME_TYPES: tuple[str, ...] = SERIOUS_CRIME_TYPES + MINOR_CRIME_TYPES + +DEFAULT_BUFFER_M = 50.0 +MONTH_DIR_RE = re.compile(r"^\d{4}-\d{2}$") + +# Generous GB bounds; points outside fall in no English postcode anyway, but +# filtering first keeps the WGS84->BNG transform out of its undefined region. +LON_BOUNDS = (-9.5, 2.5) +LAT_BOUNDS = (49.0, 61.5) + +# Read CSVs in chunks of files to bound peak memory while keeping the STRtree +# query vectorised over a useful number of points. +_CSV_BATCH = 64 + + +def _month_calendar(csvs: list[Path]) -> tuple[list[int], dict[int, int], int]: + """Derive annualisation denominators from the monthly directory names. + + Each police.uk file lives under ``{crime_dir}/{YYYY-MM}/...`` and holds that + month's incidents, so the set of month directories is the set of observed + months. 
Returns the sorted distinct years, months-observed-per-year, and the + total month count (the avg/yr denominator). + """ + months = sorted( + {path.parent.name for path in csvs if MONTH_DIR_RE.fullmatch(path.parent.name)} + ) + if not months: + raise ValueError("No valid YYYY-MM month directories found among crime CSVs") + + months_in_year: dict[int, int] = {} + for month in months: + year = int(month[:4]) + months_in_year[year] = months_in_year.get(year, 0) + 1 + + years = sorted(months_in_year) + return years, months_in_year, len(months) + + +def _build_tree( + polygons: np.ndarray, buffer_m: float +) -> tuple[np.ndarray, shapely.STRtree]: + """Buffer postcode polygons outward by ``buffer_m`` and index them. + + Buffer index == postcode index. Geometries that fail to buffer are replaced + with an empty polygon so the index stays aligned; they simply never match. + """ + buffers = shapely.buffer(polygons, buffer_m, quad_segs=8) + broken = shapely.is_missing(buffers) | ~shapely.is_valid(buffers) + if broken.any(): + print(f" {int(broken.sum()):,} postcode buffers unusable; left empty") + buffers[broken] = shapely.from_wkt("POLYGON EMPTY") + return buffers, shapely.STRtree(buffers) + + +def _accumulate_counts( + csvs: list[Path], + tree: shapely.STRtree, + type_to_idx: dict[str, int], + year_to_idx: dict[int, int], + transformer: Transformer, + counts: np.ndarray, +) -> None: + """Stream the crime CSVs, counting points-in-buffer per (postcode, type, year).""" + schema = { + "Longitude": pl.Float64, + "Latitude": pl.Float64, + "Month": pl.Utf8, + "Crime type": pl.Utf8, + } + known_types = list(type_to_idx) + total_points = 0 + total_matches = 0 + total_dropped = 0 + + for start in range(0, len(csvs), _CSV_BATCH): + batch = csvs[start : start + _CSV_BATCH] + frame = ( + pl.scan_csv( + batch, + schema_overrides=schema, + ignore_errors=True, + ) + .select("Longitude", "Latitude", "Month", "Crime type") + .with_columns(pl.col("Month").str.slice(0, 
4).cast(pl.Int32).alias("year")) + .filter( + pl.col("Longitude").is_not_null() + & pl.col("Latitude").is_not_null() + & pl.col("Longitude").is_between(*LON_BOUNDS) + & pl.col("Latitude").is_between(*LAT_BOUNDS) + & pl.col("Crime type").is_in(known_types) + & pl.col("year").is_in(list(year_to_idx)) + ) + .with_columns( + pl.col("Crime type") + .replace_strict(type_to_idx, return_dtype=pl.Int32) + .alias("tidx"), + pl.col("year") + .replace_strict(year_to_idx, return_dtype=pl.Int32) + .alias("yidx"), + ) + .select("Longitude", "Latitude", "tidx", "yidx") + .collect(engine="streaming") + ) + + rows_in = frame.height + if rows_in == 0: + continue + + lon = frame["Longitude"].to_numpy() + lat = frame["Latitude"].to_numpy() + tidx = frame["tidx"].to_numpy() + yidx = frame["yidx"].to_numpy() + + x, y = transformer.transform(lon, lat) + finite = np.isfinite(x) & np.isfinite(y) + total_dropped += int((~finite).sum()) + if not finite.any(): + continue + x, y, tidx, yidx = x[finite], y[finite], tidx[finite], yidx[finite] + total_points += x.size + + points = shapely.points(x, y) + point_index, postcode_index = tree.query(points, predicate="intersects") + if point_index.size: + np.add.at( + counts, + (postcode_index, tidx[point_index], yidx[point_index]), + 1, + ) + total_matches += point_index.size + + print( + f" files {start + len(batch):,}/{len(csvs):,}: " + f"{total_points:,} located points, {total_matches:,} postcode matches" + ) + + if total_dropped: + print(f"Dropped {total_dropped:,} points outside the BNG transform domain") + + +def _rollup_long( + long: pl.DataFrame, types: tuple[str, ...], rollup_name: str +) -> pl.DataFrame: + """Sum per-year annualised counts across ``types`` into a single rollup.""" + return ( + long.filter(pl.col("Crime type").is_in(list(types))) + .group_by("postcode", "year") + .agg(pl.col("count").sum().round(1).alias("count")) + .with_columns(pl.lit(rollup_name).alias("Crime type")) + .select("postcode", "Crime type", "year", "count") + ) 
+ + +def _write_avg_yr( + postcodes: np.ndarray, + counts: np.ndarray, + valid_month_count: int, + output_path: Path, +) -> None: + """Write ``postcode`` + ``"{type} (avg/yr)"`` annualised totals.""" + totals = counts.sum(axis=2) # (n_postcodes, n_types) + avg = np.round(totals / valid_month_count * 12.0, 1).astype(np.float32) + + data: dict[str, np.ndarray] = {"postcode": postcodes} + for type_idx, name in enumerate(ALL_CRIME_TYPES): + data[f"{name} (avg/yr)"] = avg[:, type_idx] + + output_path.parent.mkdir(parents=True, exist_ok=True) + pl.DataFrame(data).write_parquet(output_path, compression="zstd") + print(f"Wrote postcode crime averages: {output_path}") + + +def _write_by_year( + postcodes: np.ndarray, + counts: np.ndarray, + years: list[int], + months_in_year: dict[int, int], + output_path: Path, +) -> None: + """Write nested ``"{type} (by year)"`` series plus Serious/Minor rollups.""" + months = np.array([months_in_year[year] for year in years], dtype=np.float64) + annual = np.round(counts.astype(np.float64) * 12.0 / months[None, None, :], 1) + + pc_i, ty_i, yr_i = np.nonzero(counts) + if pc_i.size == 0: + raise ValueError("No crime points matched any postcode buffer") + + type_names = np.array(ALL_CRIME_TYPES, dtype=object) + year_values = np.array(years, dtype=np.int32) + long = pl.DataFrame( + { + "postcode": postcodes[pc_i], + "Crime type": type_names[ty_i], + "year": year_values[yr_i], + "count": annual[pc_i, ty_i, yr_i].astype(np.float32), + } + ) + + serious = _rollup_long(long, SERIOUS_CRIME_TYPES, "Serious crime") + minor = _rollup_long(long, MINOR_CRIME_TYPES, "Minor crime") + combined = pl.concat([long, serious, minor]) + + by_type = ( + combined.sort("year") + .group_by("postcode", "Crime type") + .agg(pl.struct("year", "count").alias("series")) + ) + wide = by_type.pivot(on="Crime type", index="postcode", values="series") + type_cols = [c for c in wide.columns if c != "postcode"] + wide = wide.rename({col: f"{col} (by year)" for col in 
type_cols}) + + output_path.parent.mkdir(parents=True, exist_ok=True) + wide.write_parquet(output_path, compression="zstd") + print(f"Wrote postcode crime by-year series: {output_path} {wide.shape}") + + +def transform_crime_spatial( + crime_dir: Path, + boundaries_dir: Path, + output_path: Path, + by_year_output_path: Path, + buffer_m: float = DEFAULT_BUFFER_M, + max_postcodes: int | None = None, + max_files: int | None = None, +) -> None: + csvs, ignored_csv_count = find_street_crime_csvs(crime_dir) + if not csvs: + raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}") + if max_files is not None: + csvs = csvs[:max_files] + + years, months_in_year, valid_month_count = _month_calendar(csvs) + print( + f"Found {len(csvs):,} street crime CSVs across {valid_month_count} months " + f"({years[0]}-{years[-1]})" + + (f" (ignored {ignored_csv_count} non-street CSVs)" if ignored_csv_count else "") + ) + + postcodes, polygons = load_postcode_polygons(boundaries_dir, max_postcodes) + print(f"Buffering {len(postcodes):,} postcode polygons by {buffer_m:g}m...") + _buffers, tree = _build_tree(polygons, buffer_m) + + type_to_idx = {name: idx for idx, name in enumerate(ALL_CRIME_TYPES)} + year_to_idx = {year: idx for idx, year in enumerate(years)} + counts = np.zeros((len(postcodes), len(ALL_CRIME_TYPES), len(years)), dtype=np.int32) + + transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True) + _accumulate_counts(csvs, tree, type_to_idx, year_to_idx, transformer, counts) + + _write_avg_yr(postcodes, counts, valid_month_count, output_path) + _write_by_year(postcodes, counts, years, months_in_year, by_year_output_path) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Count police.uk crime points within 50m of each postcode boundary" + ) + parser.add_argument( + "--input", + type=Path, + default=Path("property-data/crime"), + help="Directory containing police.uk street crime CSVs", + ) + parser.add_argument( + 
"--boundaries", + type=Path, + default=Path("property-data/postcode_boundaries/units"), + help="Directory of per-district postcode boundary GeoJSONs", + ) + parser.add_argument( + "--output", + type=Path, + required=True, + help="Output parquet: postcode + '{type} (avg/yr)' columns", + ) + parser.add_argument( + "--output-by-year", + type=Path, + required=True, + help="Output parquet: postcode + nested '{type} (by year)' columns", + ) + parser.add_argument( + "--buffer-m", + type=float, + default=DEFAULT_BUFFER_M, + help="Outward buffer (metres) added to each postcode boundary", + ) + parser.add_argument( + "--max-postcodes", + type=int, + default=None, + help="Testing only: process the first N postcodes", + ) + parser.add_argument( + "--max-files", + type=int, + default=None, + help="Testing only: process the first N monthly CSV files", + ) + args = parser.parse_args() + + if args.buffer_m <= 0: + raise SystemExit("--buffer-m must be greater than zero") + + transform_crime_spatial( + crime_dir=args.input, + boundaries_dir=args.boundaries, + output_path=args.output, + by_year_output_path=args.output_by_year, + buffer_m=args.buffer_m, + max_postcodes=args.max_postcodes, + max_files=args.max_files, + ) + + +if __name__ == "__main__": + main() diff --git a/pipeline/transform/join_epc_pp.py b/pipeline/transform/join_epc_pp.py index 7ceccce..ebc0c35 100644 --- a/pipeline/transform/join_epc_pp.py +++ b/pipeline/transform/join_epc_pp.py @@ -26,6 +26,7 @@ MIN_PRICE = 50_000 EPC_SOURCE_COLUMNS = [ "address", "postcode", + "uprn", "current_energy_rating", "potential_energy_rating", "property_type", @@ -57,6 +58,8 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame: raw.select( _clean_string("address").alias("epc_address"), _clean_string("postcode").str.to_uppercase().alias("epc_postcode"), + # UPRN keys an exact listing->EPC join downstream (~99% populated). 
+ _clean_string("uprn").alias("uprn"), _clean_string("current_energy_rating") .str.to_uppercase() .alias("current_energy_rating"), diff --git a/pipeline/transform/merge.py b/pipeline/transform/merge.py index 5dc21b0..2001140 100644 --- a/pipeline/transform/merge.py +++ b/pipeline/transform/merge.py @@ -48,7 +48,7 @@ _AREA_COLUMNS = [ "lon", # Runtime provenance for deciding whether missing coordinates are skippable. "ctry25cd", - # Keyed lookup for postcode-level side tables (e.g. crime time series). + # Join key for LSOA-level side tables (e.g. median age). "lsoa21", # Deprivation "Income Score", @@ -81,8 +81,6 @@ _AREA_COLUMNS = [ "Other crime (avg/yr)", "Serious crime (avg/yr)", "Minor crime (avg/yr)", - "Serious crime per 1k residents (avg/yr)", - "Minor crime per 1k residents (avg/yr)", # Amenities "Number of restaurants within 2km", "Number of grocery shops and supermarkets within 2km", @@ -742,16 +740,13 @@ _PROPERTY_TYPE_VALUES = [ "Other", ] _EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"] -_PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0 -_PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0 -_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS = 82 -_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS = 96 -_PROPERTY_MATCH_MIN_MARGIN = 4.0 -_DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0 -_DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0 -_DIRECT_EPC_MATCH_MIN_MARGIN = 4.0 -_DIRECT_EPC_NEARBY_RADIUS_M = 500.0 -_DIRECT_EPC_NEAREST_POSTCODES = 40 +# Listings are matched to EPC certificates and Price-Paid properties first by +# UPRN (exact) and otherwise by fuzzy street-address similarity within the same +# postcode. A house number in the listing address is the strong disambiguator, +# so a numbered listing may match on a lower street-similarity score than a +# number-less one (which must match the street almost exactly to be trusted). 
+_LISTING_MATCH_MIN_SCORE_WITH_NUMBERS = 82 +_LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 90 _DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = ( ("_direct_epc_address", pl.Utf8), ("_direct_current_energy_rating", pl.Utf8), @@ -764,7 +759,7 @@ _DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = ( ("_direct_was_council_house", pl.Utf8), ("_direct_epc_match_status", pl.Utf8), ("_direct_epc_match_score", pl.Float32), - ("_direct_epc_match_margin", pl.Float32), + ("_direct_epc_match_method", pl.Utf8), ) _DIRECT_EPC_RAW_COLUMN_MAP = { "epc_address": "_direct_epc_address", @@ -840,46 +835,6 @@ def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr: ) -def _ratio_bonus( - left: float | int | None, right: float | int | None, pct: float, cap: float -) -> float: - if left is None or right is None: - return 0.0 - try: - left_f = float(left) - right_f = float(right) - except (TypeError, ValueError): - return 0.0 - if left_f <= 0 or right_f <= 0: - return 0.0 - rel = abs(left_f - right_f) / max(left_f, right_f) - if rel > pct: - return 0.0 - return cap * (1.0 - rel / pct) - - -def _rooms_bonus(left: int | None, right: int | None) -> float: - if left is None or right is None: - return 0.0 - try: - diff = abs(int(left) - int(right)) - except (TypeError, ValueError): - return 0.0 - if diff == 0: - return 4.0 - if diff == 1: - return 2.0 - return 0.0 - - -def _enum_bonus( - left: str | None, right: str | None, *, exact: float, mismatch: float -) -> float: - if not left or not right: - return 0.0 - return exact if left == right else mismatch - - def _address_score(query: str, candidate: str | None) -> int: if not candidate: return 0 @@ -893,6 +848,85 @@ def _has_number(address: str | None) -> bool: return bool(address and _NUMBER_RE.search(address)) +def _normalize_uprn(value: object) -> str | None: + """Canonical UPRN string (digits only) or None. 
+ + UPRNs arrive as strings or ints from the scraper / EPC register; normalise + so a listing UPRN and an EPC/property UPRN compare equal regardless of dtype + or stray whitespace. A float (e.g. a NaN-bearing column read as Float) is + rejected unless it is an exact integer, so "123.0"/"1.5e11" can never be + silently mangled into a bogus all-digits key. + """ + if value is None: + return None + if isinstance(value, float): + if not value.is_integer(): + return None + value = int(value) + digits = re.sub(r"\D", "", str(value)) + return digits or None + + +def _best_listing_match( + listing_uprn: str | None, + query: str | None, + uprn_index: dict[str, dict], + bucket_candidates: list[dict], + addressed_fields: list[str], +) -> tuple[dict, float, str, str | None] | None: + """Pick the best candidate for a listing. + + Matching is, in order: (1) an exact UPRN equality against the global + ``uprn_index`` (postcode-independent, so it is robust even when the + listing's postcode is slightly off); (2) failing that, the highest + fuzzy street-address similarity within the listing's own postcode bucket. + No property-attribute heuristics are used — a house number in the listing + address gates the fuzzy match (`_numbers_compatible`) and lowers the score + threshold; a number-less address must match the street almost exactly. + + ``addressed_fields`` names the candidate columns to fuzzy-match against (a + candidate may carry both a register and an EPC address). Returns + ``(candidate, score, method, matched_field)`` or None. ``method`` is + "uprn" or "address"; ``matched_field`` is the winning address column (or + None for a UPRN match). 
+ """ + if listing_uprn: + hit = uprn_index.get(listing_uprn) + if hit is not None: + return hit, 100.0, "uprn", None + + if not query: + return None + + listing_has_numbers = _has_number(query) + best: dict | None = None + best_score = 0 + best_field: str | None = None + for candidate in bucket_candidates: + for field in addressed_fields: + address = candidate.get(field) + if not address: + continue + if listing_has_numbers and not _numbers_compatible(query, address): + continue + score = _address_score(query, address) + if score > best_score: + best_score = score + best = candidate + best_field = field + + if best is None: + return None + threshold = ( + _LISTING_MATCH_MIN_SCORE_WITH_NUMBERS + if listing_has_numbers + else _LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS + ) + if best_score < threshold: + return None + return best, float(best_score), "address", best_field + + def _load_listings_for_merge( listings_path: Path, arcgis_path: Path ) -> pl.DataFrame: @@ -908,6 +942,20 @@ def _load_listings_for_merge( raw = pl.scan_parquet(listings_path).with_row_index("_listing_idx") postcode_mapping = build_postcode_mapping(arcgis_path).lazy() + # UPRN is only present on scraped listings that carry it (Zoopla detail + # pages); tolerate its absence so older parquets and test fixtures still + # load. Digits-only so it compares equal to the EPC register's UPRN. + if "UPRN" in raw.collect_schema().names(): + uprn_digits = pl.col("UPRN").cast(pl.Utf8).str.replace_all(r"\D", "") + listing_uprn_expr = ( + pl.when(uprn_digits.str.len_chars() > 0) + .then(uprn_digits) + .otherwise(None) + .alias("_listing_uprn") + ) + else: + listing_uprn_expr = pl.lit(None, dtype=pl.Utf8).alias("_listing_uprn") + # Listings parquets occasionally carry Float NaNs (e.g. floor area). 
Polars # treats NaN as distinct from null and the downstream `latest_price / # total_floor_area` cast to Int32 explodes on a NaN, so we normalise floats @@ -936,12 +984,14 @@ def _load_listings_for_merge( "postcode" ), pl.col("Address per Property Register").alias("pp_address"), + listing_uprn_expr, *overlay, ) .select( "_listing_idx", "postcode", "pp_address", + "_listing_uprn", *[dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES], ) .collect(engine="streaming") @@ -972,7 +1022,6 @@ def _empty_direct_epc_matches() -> pl.DataFrame: def _load_direct_epc_candidates( epc_path: Path, - arcgis_path: Path, listing_outcodes: list[str], temp_dir: Path, ) -> pl.DataFrame: @@ -982,8 +1031,7 @@ def _load_direct_epc_candidates( "_direct_epc_match_postcode": pl.Utf8, "_direct_epc_outcode": pl.Utf8, "_direct_epc_canonical_property_type": pl.Utf8, - "_direct_epc_east": pl.Float64, - "_direct_epc_north": pl.Float64, + "_direct_epc_uprn": pl.Utf8, **{column: dtype for column, dtype in _DIRECT_EPC_COLUMNS if column.startswith("_direct_")}, } if not listing_outcodes: @@ -1016,12 +1064,6 @@ def _load_direct_epc_candidates( .with_columns(pl.lit("Yes").alias("_direct_was_council_house")) ) - arcgis = pl.scan_parquet(arcgis_path).select( - normalize_postcode_key(pl.col("pcds")).alias("_direct_epc_match_postcode"), - pl.col("east1m").alias("_direct_epc_east"), - pl.col("north1m").alias("_direct_epc_north"), - ) - return ( epc_base.sort("inspection_date", descending=True) .group_by("_direct_epc_match_address", "_direct_epc_match_postcode") @@ -1031,7 +1073,6 @@ def _load_direct_epc_candidates( on=["_direct_epc_match_address", "_direct_epc_match_postcode"], how="left", ) - .join(arcgis, on="_direct_epc_match_postcode", how="left") .with_columns( _canonical_epc_property_type_expr().alias( "_direct_epc_canonical_property_type" @@ -1046,6 +1087,7 @@ def _load_direct_epc_candidates( .otherwise(None) .alias("_direct_potential_energy_rating"), pl.col("epc_address").alias("_direct_epc_address"), 
+ pl.col("uprn").alias("_direct_epc_uprn"), pl.col("total_floor_area").alias("_direct_total_floor_area"), pl.col("number_habitable_rooms").alias( "_direct_number_habitable_rooms" @@ -1066,8 +1108,7 @@ def _load_direct_epc_candidates( "_direct_epc_match_postcode", "_direct_epc_outcode", "_direct_epc_canonical_property_type", - "_direct_epc_east", - "_direct_epc_north", + "_direct_epc_uprn", "_direct_epc_address", "_direct_current_energy_rating", "_direct_potential_energy_rating", @@ -1083,7 +1124,14 @@ def _load_direct_epc_candidates( def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame: - match = listings.with_columns( + """Add the normalised address/postcode/outcode keys used to match listings. + + Listings are matched to EPC certificates and properties by UPRN and by + fuzzy street address within their (now accurate, detail-page-sourced) + postcode — never by coordinate proximity — so no projected easting/northing + is computed here. `_listing_uprn` flows through from the loaded listings. 
+ """ + return listings.with_columns( normalize_address_key(pl.col("pp_address")).alias("_listing_match_address"), normalize_postcode_key(pl.col("postcode")).alias("_listing_match_postcode"), ).with_columns( @@ -1092,21 +1140,6 @@ def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame: .alias("_listing_outcode") ) - if match.is_empty(): - return match.with_columns( - pl.Series("_listing_east", [], dtype=pl.Float64), - pl.Series("_listing_north", [], dtype=pl.Float64), - ) - - transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True) - east, north = transformer.transform( - match["_actual_lon"].to_numpy(), match["_actual_lat"].to_numpy() - ) - return match.with_columns( - pl.Series("_listing_east", east, dtype=pl.Float64), - pl.Series("_listing_north", north, dtype=pl.Float64), - ) - def _optional_lazy_col( schema: pl.Schema, column: str, dtype: pl.DataType @@ -1122,8 +1155,7 @@ def _listing_property_match_schema() -> dict[str, pl.DataType]: "_matched_postcode": pl.Utf8, "_matched_pp_address": pl.Utf8, "_property_match_score": pl.Float32, - "_property_match_address_score": pl.Int32, - "_property_match_margin": pl.Float32, + "_property_match_method": pl.Utf8, "_property_match_field": pl.Utf8, } @@ -1139,11 +1171,8 @@ def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame: pl.col("postcode").cast(pl.Utf8).alias("postcode"), pl.col("pp_address").cast(pl.Utf8).alias("pp_address"), _optional_lazy_col(schema, "epc_address", pl.Utf8), - _optional_lazy_col(schema, "pp_property_type", pl.Utf8), - _optional_lazy_col(schema, "duration", pl.Utf8), - _optional_lazy_col(schema, "total_floor_area", pl.Float64), - _optional_lazy_col(schema, "number_habitable_rooms", pl.Int16), - _optional_lazy_col(schema, "latest_price", pl.Int64), + # UPRN keys the exact match; present once epc_pp is rebuilt with it. 
+ _optional_lazy_col(schema, "uprn", pl.Utf8), ) .with_row_index("_property_row") .with_columns( @@ -1167,110 +1196,52 @@ def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame: ) -def _property_candidates_by_postcode( - candidates: pl.DataFrame, -) -> dict[str, list[dict]]: +def _index_candidates( + candidates: pl.DataFrame, postcode_key: str, uprn_key: str +) -> tuple[dict[str, list[dict]], dict[str, dict]]: + """Index candidate rows for matching, in a single pass over the frame. + + Returns ``(postcode_buckets, uprn_index)``. The postcode buckets drive the + fuzzy street-address match; the UPRN index drives the exact match and is + postcode-independent, so it still resolves when a listing's postcode is + slightly off. + """ buckets: dict[str, list[dict]] = {} + uprn_index: dict[str, dict] = {} for row in candidates.iter_rows(named=True): - postcode = row.get("_property_match_postcode") + postcode = row.get(postcode_key) if postcode: buckets.setdefault(postcode, []).append(row) - return buckets + uprn = _normalize_uprn(row.get(uprn_key)) + if uprn and uprn not in uprn_index: + uprn_index[uprn] = row + return buckets, uprn_index def _best_listing_property_candidate( - listing: dict, candidates: list[dict] + listing: dict, uprn_index: dict[str, dict], candidates: list[dict] ) -> dict | None: - query = listing.get("_listing_match_address") - if not query: - return None - - listing_has_numbers = _has_number(query) - scored: list[tuple[float, int, dict, str]] = [] - for candidate in candidates: - register_address = candidate.get("_property_match_address") - epc_address = candidate.get("_property_epc_match_address") - register_numbers_compatible = bool( - register_address and _numbers_compatible(query, register_address) - ) - epc_numbers_compatible = bool( - epc_address and _numbers_compatible(query, epc_address) - ) - if not (register_numbers_compatible or epc_numbers_compatible): - continue - - register_score = _address_score(query, register_address) 
- epc_score = _address_score(query, epc_address) - base_score = max(register_score, epc_score) - if base_score == 0: - continue - - score = float(base_score) - score += _enum_bonus( - listing.get("_actual_property_type"), - candidate.get("pp_property_type"), - exact=7.0, - mismatch=-8.0, - ) - score += _enum_bonus( - listing.get("_actual_leasehold_freehold"), - candidate.get("duration"), - exact=3.0, - mismatch=-3.0, - ) - score += _ratio_bonus( - listing.get("_actual_total_floor_area"), - candidate.get("total_floor_area"), - pct=0.15, - cap=8.0, - ) - score += _rooms_bonus( - listing.get("_actual_number_habitable_rooms"), - candidate.get("number_habitable_rooms"), - ) - score += _ratio_bonus( - listing.get("_actual_asking_price"), - candidate.get("latest_price"), - pct=0.25, - cap=3.0, - ) - matched_field = ( - "pp_address" if register_score >= epc_score else "epc_address" - ) - scored.append((score, base_score, candidate, matched_field)) - - if not scored: - return None - scored.sort(key=lambda item: item[0], reverse=True) - top = scored[0] - runner_up = scored[1][0] if len(scored) > 1 else None - margin = top[0] - runner_up if runner_up is not None else top[0] - score_threshold = ( - _PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS - if listing_has_numbers - else _PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS + result = _best_listing_match( + listing.get("_listing_uprn"), + listing.get("_listing_match_address"), + uprn_index, + candidates, + ["_property_match_address", "_property_epc_match_address"], ) - address_threshold = ( - _PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS - if listing_has_numbers - else _PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS - ) - if ( - top[0] < score_threshold - or top[1] < address_threshold - or margin < _PROPERTY_MATCH_MIN_MARGIN - ): + if result is None: return None - - candidate = top[2] + candidate, score, method, field = result + matched_field = { + "_property_match_address": "pp_address", + "_property_epc_match_address": "epc_address", + 
}.get(field, method) return { "_listing_idx": listing["_listing_idx"], "_matched_postcode": candidate.get("postcode"), "_matched_pp_address": candidate.get("pp_address"), - "_property_match_score": round(top[0], 1), - "_property_match_address_score": top[1], - "_property_match_margin": round(margin, 1), - "_property_match_field": top[3], + "_property_match_score": round(score, 1), + "_property_match_method": method, + "_property_match_field": matched_field, } @@ -1280,23 +1251,32 @@ def _match_listing_properties( if listing_matches.is_empty() or property_candidates.is_empty(): return _empty_listing_property_matches() - buckets = _property_candidates_by_postcode(property_candidates) + buckets, uprn_index = _index_candidates( + property_candidates, "_property_match_postcode", "uprn" + ) best_matches = [] for listing in listing_matches.iter_rows(named=True): postcode = listing.get("_listing_match_postcode") - if not postcode: - continue - match = _best_listing_property_candidate(listing, buckets.get(postcode, [])) + bucket = buckets.get(postcode, []) if postcode else [] + match = _best_listing_property_candidate(listing, uprn_index, bucket) if match is not None: best_matches.append(match) if not best_matches: return _empty_listing_property_matches() + # When two listings claim the same property, keep the most authoritative + # match: an exact UPRN match always wins over a fuzzy address match (both can + # score 100, so method must break the tie before score and listing index). 
matches = pl.DataFrame(best_matches, schema=_listing_property_match_schema()) return ( matches.sort( - ["_property_match_score", "_listing_idx"], descending=[True, False] + [ + pl.col("_property_match_method") == "uprn", + "_property_match_score", + "_listing_idx", + ], + descending=[True, True, False], ) .unique( ["_matched_postcode", "_matched_pp_address"], @@ -1307,133 +1287,19 @@ def _match_listing_properties( ) -def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, list[dict]]: - buckets: dict[str, list[dict]] = {} - for row in candidates.iter_rows(named=True): - postcode = row.get("_direct_epc_match_postcode") - if postcode: - buckets.setdefault(postcode, []).append(row) - return buckets - - -def _epc_postcode_tree( - candidates: pl.DataFrame, -) -> tuple[cKDTree | None, list[str]]: - postcode_points = ( - candidates.select( - "_direct_epc_match_postcode", - "_direct_epc_east", - "_direct_epc_north", - ) - .drop_nulls() - .filter( - pl.col("_direct_epc_east").is_finite() - & pl.col("_direct_epc_north").is_finite() - ) - .unique("_direct_epc_match_postcode") +def _best_direct_epc_candidate( + listing: dict, uprn_index: dict[str, dict], candidates: list[dict] +) -> dict | None: + result = _best_listing_match( + listing.get("_listing_uprn"), + listing.get("_listing_match_address"), + uprn_index, + candidates, + ["_direct_epc_match_address"], ) - if postcode_points.is_empty(): - return None, [] - coords = np.column_stack( - [ - postcode_points["_direct_epc_east"].to_numpy(), - postcode_points["_direct_epc_north"].to_numpy(), - ] - ) - return cKDTree(coords), postcode_points["_direct_epc_match_postcode"].to_list() - - -def _candidate_postcodes_for_listing( - listing: dict, - postcode_tree: cKDTree | None, - postcode_values: list[str], -) -> list[str]: - postcodes: list[str] = [] - exact = listing.get("_listing_match_postcode") - if exact: - postcodes.append(exact) - - if postcode_tree is None: - return postcodes - - east = 
listing.get("_listing_east") - north = listing.get("_listing_north") - try: - east_f = float(east) - north_f = float(north) - except (TypeError, ValueError): - return postcodes - if not np.isfinite(east_f) or not np.isfinite(north_f): - return postcodes - - k = min(_DIRECT_EPC_NEAREST_POSTCODES, len(postcode_values)) - distances, indices = postcode_tree.query( - [east_f, north_f], - k=k, - distance_upper_bound=_DIRECT_EPC_NEARBY_RADIUS_M, - ) - distances = np.atleast_1d(distances) - indices = np.atleast_1d(indices) - seen = set(postcodes) - for distance, idx in zip(distances, indices, strict=False): - if not np.isfinite(distance) or idx >= len(postcode_values): - continue - postcode = postcode_values[int(idx)] - if postcode not in seen: - postcodes.append(postcode) - seen.add(postcode) - return postcodes - - -def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None: - query = listing.get("_listing_match_address") - if not query: + if result is None: return None - - listing_has_numbers = _has_number(query) - scored: list[tuple[float, int, dict]] = [] - for candidate in candidates: - address = candidate.get("_direct_epc_match_address") - if listing_has_numbers and not _numbers_compatible(query, address or ""): - continue - base_score = _address_score(query, address) - if base_score == 0: - continue - - score = float(base_score) - score += _enum_bonus( - listing.get("_actual_property_type"), - candidate.get("_direct_epc_canonical_property_type"), - exact=6.0, - mismatch=-6.0, - ) - score += _ratio_bonus( - listing.get("_actual_total_floor_area"), - candidate.get("_direct_total_floor_area"), - pct=0.12, - cap=8.0, - ) - score += _rooms_bonus( - listing.get("_actual_number_habitable_rooms"), - candidate.get("_direct_number_habitable_rooms"), - ) - scored.append((score, base_score, candidate)) - - if not scored: - return None - scored.sort(key=lambda item: item[0], reverse=True) - top = scored[0] - runner_up = scored[1][0] if len(scored) > 1 
else None - margin = top[0] - runner_up if runner_up is not None else top[0] - threshold = ( - _DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS - if listing_has_numbers - else _DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS - ) - if top[0] < threshold or margin < _DIRECT_EPC_MATCH_MIN_MARGIN: - return None - - candidate = top[2] + candidate, score, method, _field = result return { "_listing_idx": listing["_listing_idx"], "_direct_epc_address": candidate.get("_direct_epc_address"), @@ -1452,8 +1318,8 @@ def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict | ), "_direct_was_council_house": candidate.get("_direct_was_council_house"), "_direct_epc_match_status": "matched", - "_direct_epc_match_score": round(top[0], 1), - "_direct_epc_match_margin": round(margin, 1), + "_direct_epc_match_score": round(score, 1), + "_direct_epc_match_method": method, } @@ -1463,25 +1329,14 @@ def _match_direct_epc( if listing_matches.is_empty() or epc_candidates.is_empty(): return _empty_direct_epc_matches() - buckets = _epc_candidates_by_postcode(epc_candidates) - postcode_tree, postcode_values = _epc_postcode_tree(epc_candidates) - + buckets, uprn_index = _index_candidates( + epc_candidates, "_direct_epc_match_postcode", "_direct_epc_uprn" + ) matches = [] for listing in listing_matches.iter_rows(named=True): - candidate_postcodes = _candidate_postcodes_for_listing( - listing, postcode_tree, postcode_values - ) - candidate_rows: list[dict] = [] - seen_rows: set[int] = set() - for postcode in candidate_postcodes: - for candidate in buckets.get(postcode, []): - row = candidate.get("_direct_epc_row") - if row in seen_rows: - continue - candidate_rows.append(candidate) - if row is not None: - seen_rows.add(row) - match = _best_direct_epc_candidate(listing, candidate_rows) + postcode = listing.get("_listing_match_postcode") + bucket = buckets.get(postcode, []) if postcode else [] + match = _best_direct_epc_candidate(listing, uprn_index, bucket) if match is not None: 
matches.append(match) @@ -1493,7 +1348,6 @@ def _match_direct_epc( def _enrich_listings_with_direct_epc( listings: pl.DataFrame, epc_path: Path | None, - arcgis_path: Path, ) -> pl.DataFrame: if epc_path is None: return _ensure_direct_epc_columns(listings) @@ -1513,7 +1367,7 @@ def _enrich_listings_with_direct_epc( prefix="direct_listing_epc_", dir=local_tmp_dir() ) as tmpdir: epc_candidates = _load_direct_epc_candidates( - epc_path, arcgis_path, listing_outcodes, Path(tmpdir) + epc_path, listing_outcodes, Path(tmpdir) ) print(f"Direct listing EPC candidates: {epc_candidates.height}") direct_matches = _match_direct_epc(listing_matches, epc_candidates) @@ -1604,7 +1458,7 @@ def _integrate_listings( """ listings = _load_listings_for_merge(listings_path, arcgis_path) print(f"Listings loaded: {listings.height}") - listings = _enrich_listings_with_direct_epc(listings, epc_path, arcgis_path) + listings = _enrich_listings_with_direct_epc(listings, epc_path) overlay_columns = [dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES] listing_attachment_columns = [ @@ -1660,6 +1514,14 @@ def _finalize_listings(df: pl.DataFrame) -> pl.DataFrame: """Project the post-rename wide frame down to enriched-listing rows.""" df = df.filter(pl.col(_LISTING_FLAG_COLUMN).is_not_null()) + # A matched listing's overlay attaches to every wide row sharing its + # (postcode, pp_address). The terminated-postcode remap can collapse several + # distinct wide rows onto one such key, which would otherwise emit one duplicate + # listing per collapsed row. Each listing matches exactly one (postcode, + # pp_address) and each seed row carries a unique URL, so keeping a single row per + # listing URL collapses only that fan-out and never merges distinct listings. 
+ df = df.unique(subset=[_LISTING_FLAG_COLUMN], keep="first", maintain_order=True) + df = df.with_columns( pl.col("_actual_listing_url").alias("Listing URL"), pl.col("_actual_listing_date").alias("Listing date"), @@ -1750,7 +1612,6 @@ def _build( broadband_path: Path, conservation_areas_path: Path, rental_prices_path: Path, - lsoa_population_path: Path, median_age_path: Path, election_results_path: Path, tree_density_postcodes_path: Path | None = None, @@ -1881,8 +1742,10 @@ def _build( how="left", ) + # Crime is counted spatially per postcode (incidents within 50m of the + # postcode boundary), so it joins on postcode rather than LSOA. crime = pl.scan_parquet(crime_path) - wide = wide.join(crime, left_on="lsoa21", right_on="LSOA code", how="left") + wide = wide.join(crime, on="postcode", how="left") wide = wide.with_columns( pl.sum_horizontal( @@ -1905,17 +1768,6 @@ def _build( ).alias("minor_crime_avg_yr"), ) - lsoa_pop = pl.scan_parquet(lsoa_population_path) - wide = wide.join(lsoa_pop, on="lsoa21", how="left") - wide = wide.with_columns( - pl.when(pl.col("population") > 0) - .then((pl.col("serious_crime_avg_yr") / pl.col("population") * 1000).round(1)) - .alias("serious_crime_per_1k"), - pl.when(pl.col("population") > 0) - .then((pl.col("minor_crime_avg_yr") / pl.col("population") * 1000).round(1)) - .alias("minor_crime_per_1k"), - ).drop("population") - median_age = pl.scan_parquet(median_age_path) wide = wide.join(median_age, on="lsoa21", how="left") @@ -2082,8 +1934,6 @@ def _build( "max_download_speed": "Max available download speed (Mbps)", "serious_crime_avg_yr": "Serious crime (avg/yr)", "minor_crime_avg_yr": "Minor crime (avg/yr)", - "serious_crime_per_1k": "Serious crime per 1k residents (avg/yr)", - "minor_crime_per_1k": "Minor crime per 1k residents (avg/yr)", "mean_monthly_rent": "Estimated monthly rent", "floor_height": "Interior height (m)", "was_council_house": "Former council house", @@ -2189,12 +2039,6 @@ def main(): required=True, help="ONS 
rental prices by LA and bedroom count parquet file", ) - parser.add_argument( - "--lsoa-population", - type=Path, - required=True, - help="Census 2021 population by LSOA parquet file", - ) parser.add_argument( "--median-age", type=Path, @@ -2279,7 +2123,6 @@ def main(): broadband_path=args.broadband, conservation_areas_path=args.conservation_areas, rental_prices_path=args.rental_prices, - lsoa_population_path=args.lsoa_population, median_age_path=args.median_age, election_results_path=args.election_results, tree_density_postcodes_path=args.tree_density_postcodes, diff --git a/pipeline/transform/noise_overlay_tiles.py b/pipeline/transform/noise_overlay_tiles.py index 6ca8a40..b242fad 100644 --- a/pipeline/transform/noise_overlay_tiles.py +++ b/pipeline/transform/noise_overlay_tiles.py @@ -376,7 +376,7 @@ def main() -> None: "--pmtiles-bin", type=Path, default=Path("property-data/pmtiles") ) parser.add_argument("--pmtiles-version", default="1.22.3") - parser.add_argument("--min-zoom", type=int, default=13) + parser.add_argument("--min-zoom", type=int, default=12) parser.add_argument("--max-zoom", type=int, default=14) parser.add_argument("--tile-size", type=int, default=256) args = parser.parse_args() diff --git a/pipeline/transform/postcode_boundaries/__main__.py b/pipeline/transform/postcode_boundaries/__main__.py index 9e2450c..70a70dd 100644 --- a/pipeline/transform/postcode_boundaries/__main__.py +++ b/pipeline/transform/postcode_boundaries/__main__.py @@ -22,6 +22,12 @@ def main() -> None: description="Generate postcode boundary polygons from OA + INSPIRE + UPRN data" ) parser.add_argument("--uprn", type=Path, required=True, help="UPRN lookup parquet") + parser.add_argument( + "--arcgis", + type=Path, + default=None, + help="Optional ArcGIS postcode parquet used to remap terminated postcodes", + ) parser.add_argument( "--oa-boundaries", type=Path, required=True, help="OA boundaries GeoPackage" ) @@ -46,7 +52,7 @@ def main() -> None: print("=" * 60) oa_geoms = 
load_oa_boundaries(args.oa_boundaries) - uprn_df, uprn_offsets = load_uprns(args.uprn) + uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis) # Phase 2: Parse/load INSPIRE print() diff --git a/pipeline/transform/postcode_boundaries/loader.py b/pipeline/transform/postcode_boundaries/loader.py new file mode 100644 index 0000000..c495fac --- /dev/null +++ b/pipeline/transform/postcode_boundaries/loader.py @@ -0,0 +1,105 @@ +"""Load per-district postcode boundary GeoJSONs as EPSG:27700 polygons. + +The postcode-boundary pipeline (:mod:`output`) writes one WGS84 GeoJSON per +postcode district under ``units/{district}.geojson``, each feature carrying a +``postcodes`` (full unit string, e.g. "AL1 1AG") property. Spatial transforms +that test points against postcode geometry want those polygons back in British +National Grid (EPSG:27700) so buffers/distances are in metres. + +:func:`load_postcode_polygons` reads the files, reprojects WGS84→27700, repairs +invalid rings, and returns parallel ``(postcodes, polygons)`` arrays sorted by +postcode so callers can use the array index as a stable postcode id -- the same +"buffer index == postcode index" convention used by ``tree_density``. 
+""" + +from __future__ import annotations + +import json +from pathlib import Path + +import numpy as np +import shapely +from pyproj import Transformer + + +def _read_district( + path: Path, transformer: Transformer +) -> tuple[np.ndarray, np.ndarray]: + """Return (postcodes, polygons_27700) for one district GeoJSON.""" + with path.open() as file: + collection = json.load(file) + + features = collection.get("features", []) + if not features: + return np.empty(0, dtype=object), np.empty(0, dtype=object) + + postcodes = np.array( + [feature["properties"]["postcodes"] for feature in features], dtype=object + ) + geom_json = np.array( + [json.dumps(feature["geometry"]) for feature in features], dtype=object + ) + geoms = shapely.from_geojson(geom_json) + + # Reproject every vertex in a single pyproj call, then rebuild the polygons. + coords = shapely.get_coordinates(geoms) + if coords.size: + x, y = transformer.transform(coords[:, 0], coords[:, 1]) + geoms = shapely.set_coordinates(geoms, np.column_stack([x, y])) + + invalid = ~shapely.is_valid(geoms) + if invalid.any(): + geoms[invalid] = shapely.make_valid(geoms[invalid]) + + return postcodes, geoms + + +def load_postcode_polygons( + units_dir: Path, max_postcodes: int | None = None +) -> tuple[np.ndarray, np.ndarray]: + """Load all postcode polygons under ``units_dir`` reprojected to EPSG:27700. + + Returns ``(postcodes, polygons)`` parallel object arrays sorted by postcode. + ``max_postcodes`` (testing) keeps only the lexicographically-first N + postcodes, reading just enough district files to reach the cap. 
+ """ + units_dir = Path(units_dir) + files = sorted(units_dir.glob("*.geojson")) + if not files: + raise FileNotFoundError(f"No postcode-boundary GeoJSONs found in {units_dir}") + + transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True) + postcode_chunks: list[np.ndarray] = [] + geom_chunks: list[np.ndarray] = [] + total = 0 + for path in files: + postcodes, geoms = _read_district(path, transformer) + if len(postcodes) == 0: + continue + postcode_chunks.append(postcodes) + geom_chunks.append(geoms) + total += len(postcodes) + if max_postcodes is not None and total >= max_postcodes: + break + + if not postcode_chunks: + raise ValueError(f"No postcode features found in {units_dir}") + + postcodes = np.concatenate(postcode_chunks) + geoms = np.concatenate(geom_chunks) + + # Stable postcode order makes "index == postcode id" deterministic; dedupe + # defensively (a postcode lives in exactly one district file). + order = np.argsort(postcodes, kind="stable") + postcodes = postcodes[order] + geoms = geoms[order] + _, first = np.unique(postcodes, return_index=True) + postcodes = postcodes[first] + geoms = geoms[first] + + if max_postcodes is not None and len(postcodes) > max_postcodes: + postcodes = postcodes[:max_postcodes] + geoms = geoms[:max_postcodes] + + print(f"Loaded {len(postcodes):,} postcode polygons from {units_dir}") + return postcodes, geoms diff --git a/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py b/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py index 99df1ce..7d83b80 100644 --- a/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py +++ b/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py @@ -121,6 +121,50 @@ class TestWhitespacePostcodes: loaded_df, _ = load_uprns(path) assert len(loaded_df) == 0 + def test_non_english_oas_excluded(self, tmp_path): + df = pl.DataFrame( + { + "GRIDGB1E": [500010, 300010], + "GRIDGB1N": [180010, 220010], + "PCDS": ["AA1 1AA", "CF1 
1AA"], + "OA21CD": ["E00000001", "W00000001"], + } + ) + path = tmp_path / "uprn.parquet" + df.write_parquet(path) + + loaded_df, offsets = load_uprns(path) + + assert set(offsets) == {"E00000001"} + assert loaded_df["PCDS"].to_list() == ["AA1 1AA"] + + def test_terminated_postcodes_are_remapped(self, tmp_path): + uprns = pl.DataFrame( + { + "GRIDGB1E": [500010], + "GRIDGB1N": [180010], + "PCDS": ["aa1 1aa"], + "OA21CD": ["E00000001"], + } + ) + uprn_path = tmp_path / "uprn.parquet" + uprns.write_parquet(uprn_path) + arcgis = pl.DataFrame( + { + "pcds": ["AA1 1AA", "AA1 1AB"], + "east1m": [500010, 500030], + "north1m": [180010, 180020], + "doterm": ["2020-01-01", None], + "ctry25cd": ["E92000001", "E92000001"], + } + ) + arcgis_path = tmp_path / "arcgis.parquet" + arcgis.write_parquet(arcgis_path) + + loaded_df, _offsets = load_uprns(uprn_path, arcgis_path) + + assert loaded_df["PCDS"].to_list() == ["AA1 1AB"] + # --------------------------------------------------------------------------- # Bug 3: Voronoi deduplication is first-seen-wins diff --git a/pipeline/transform/postcode_boundaries/uprn.py b/pipeline/transform/postcode_boundaries/uprn.py index a1f4bdc..d4c7557 100644 --- a/pipeline/transform/postcode_boundaries/uprn.py +++ b/pipeline/transform/postcode_boundaries/uprn.py @@ -4,11 +4,18 @@ import numpy as np import polars as pl from pipeline.local_temp import local_tmp_dir +from pipeline.utils.postcode_mapping import build_postcode_mapping from .memory import release_memory -def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]: +def _canonical_postcode_expr(name: str) -> pl.Expr: + return pl.col(name).str.strip_chars().str.to_uppercase() + + +def load_uprns( + uprn_path: Path, arcgis_path: Path | None = None +) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]: """Load UPRNs as a sorted polars DataFrame with OA offset lookup. Returns (df, offsets) where offsets[oa_code] = (start_row, end_row). 
@@ -17,29 +24,46 @@ def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int] import tempfile print("Loading UPRN lookup...") + mapping = None + if arcgis_path is not None: + mapping = ( + build_postcode_mapping(arcgis_path) + .with_columns( + _canonical_postcode_expr("old_postcode").alias("old_postcode"), + _canonical_postcode_expr("new_postcode").alias("new_postcode"), + ) + .unique("old_postcode") + ) # Sort via streaming sink to avoid polars doubling memory during in-memory sort with tempfile.NamedTemporaryFile( suffix=".parquet", delete=False, dir=local_tmp_dir() ) as tmp: tmp_path = Path(tmp.name) - ( + uprns = ( pl.scan_parquet(uprn_path) .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD") - .filter(~pl.col("OA21CD").str.starts_with("S")) + .filter(pl.col("OA21CD").str.starts_with("E")) .filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null()) - .with_columns(pl.col("PCDS").str.strip_chars()) + .with_columns(_canonical_postcode_expr("PCDS").alias("PCDS")) .filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != "")) - .sort("OA21CD") - .sink_parquet(tmp_path) ) + + if mapping is not None and mapping.height > 0: + uprns = ( + uprns.join(mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left") + .with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS")) + .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD") + ) + + uprns.sort("OA21CD").sink_parquet(tmp_path) release_memory() # Read the sorted data — only one copy in memory (~2GB) df = pl.read_parquet(tmp_path) tmp_path.unlink() n = len(df) - print(f" Loaded {n:,} UPRNs (England & Wales)") + print(f" Loaded {n:,} UPRNs (England)") # Compute OA group offsets using polars (avoids 37M Python string creation) boundary_df = ( diff --git a/pipeline/transform/property_border_tiles.py b/pipeline/transform/property_border_tiles.py new file mode 100644 index 0000000..3ed764d --- /dev/null +++ b/pipeline/transform/property_border_tiles.py @@ -0,0 +1,138 @@ +"""Build 
PMTiles polygon tiles for the INSPIRE property-border overlay. + +Reads the HM Land Registry INSPIRE Index Polygons (per-local-authority GML ZIPs +in EPSG:27700), reprojects each parcel to WGS84, and tiles the outlines with +tippecanoe. The dashboard serves the resulting archive through +``/api/overlays/property-borders`` and renders it as thin outlines only at the +postcode zoom level. + +The same ZIPs are already downloaded for postcode-boundary generation; this +target re-uses :func:`parse_inspire_zip` to stay self-contained and is wired to +the ``$(INSPIRE_STAMP)`` make dependency rather than the boundary cache. + +Data: HM Land Registry INSPIRE Index Polygons, Open Government Licence v3.0. +Boundaries are indicative "general boundaries", not the legal extent of title. +""" + +from __future__ import annotations + +import argparse +import shutil +import subprocess +import tempfile +from pathlib import Path + +import numpy as np +import shapely +from pyproj import Transformer +from shapely.geometry import Polygon +from tqdm import tqdm + +from pipeline.local_temp import local_tmp_dir +from pipeline.transform.postcode_boundaries.inspire import parse_inspire_zip + + +def _require_tippecanoe() -> str: + executable = shutil.which("tippecanoe") + if executable is None: + raise RuntimeError( + "tippecanoe is required to build property border PMTiles. " + "Install tippecanoe and rerun this target." + ) + return executable + + +def _write_property_geojsonseq(inspire_dir: Path, output_path: Path) -> int: + """Stream INSPIRE parcels to a WGS84 GeoJSONSeq file, one feature per line. + + Features carry no properties — the overlay only draws outlines, so dropping + attributes keeps the tiles as small as possible. Reprojection and GeoJSON + encoding are vectorised per ZIP (one local authority) to bound memory while + staying in shapely's C path. 
+ """ + to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True) + zip_files = sorted(inspire_dir.glob("*.zip")) + if not zip_files: + raise RuntimeError(f"No INSPIRE ZIP files found in {inspire_dir}") + + feature_count = 0 + with output_path.open("w") as file: + for zip_path in tqdm(zip_files, desc="INSPIRE ZIPs", unit="file"): + rings = parse_inspire_zip(zip_path) # list of Nx2 (easting, northing) + if not rings: + continue + + geoms = np.array([Polygon(coords) for coords in rings], dtype=object) + # interleaved=False → transform(x, y) called once with full arrays. + geoms = shapely.transform(geoms, to_wgs84.transform, interleaved=False) + + for geometry_json in shapely.to_geojson(geoms): + file.write('{"type":"Feature","properties":{},"geometry":') + file.write(geometry_json) + file.write("}\n") + feature_count += 1 + + return feature_count + + +def build_property_border_tiles( + inspire_dir: Path, + output_path: Path, + min_zoom: int, + max_zoom: int, +) -> None: + tippecanoe = _require_tippecanoe() + output_path.parent.mkdir(parents=True, exist_ok=True) + + with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp: + ndjson_path = Path(tmp) / "property_borders.geojsonseq" + feature_count = _write_property_geojsonseq(inspire_dir, ndjson_path) + print(f"Writing {feature_count:,} INSPIRE parcel polygons") + + subprocess.run( + [ + tippecanoe, + "--force", + "--output", + str(output_path), + "--layer", + "property_borders", + "--minimum-zoom", + str(min_zoom), + "--maximum-zoom", + str(max_zoom), + # Borders are only meaningful at street level; thin the densest + # tiles at low zoom but keep full geometry at max zoom. 
+ "--drop-smallest-as-needed", + "--simplify-only-low-zooms", + "--extend-zooms-if-still-dropping", + "--temporary-directory", + tmp, + str(ndjson_path), + ], + check=True, + ) + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--inspire", type=Path, required=True, help="INSPIRE ZIP directory" + ) + parser.add_argument( + "--output", type=Path, required=True, help="Output .pmtiles path" + ) + parser.add_argument("--min-zoom", type=int, default=12) + parser.add_argument("--max-zoom", type=int, default=16) + args = parser.parse_args() + + build_property_border_tiles( + inspire_dir=args.inspire, + output_path=args.output, + min_zoom=args.min_zoom, + max_zoom=args.max_zoom, + ) + + +if __name__ == "__main__": + main() diff --git a/pipeline/transform/test_crime_spatial.py b/pipeline/transform/test_crime_spatial.py new file mode 100644 index 0000000..b180008 --- /dev/null +++ b/pipeline/transform/test_crime_spatial.py @@ -0,0 +1,147 @@ +import json + +import polars as pl +from pyproj import Transformer + +from pipeline.transform.crime_spatial import transform_crime_spatial + +_TO_WGS84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True) + +_CSV_HEADER = ( + "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location," + "LSOA code,LSOA name,Crime type,Last outcome category,Context" +) + + +def _bng_to_wgs84(x: float, y: float) -> tuple[float, float]: + lon, lat = _TO_WGS84.transform(x, y) + return lon, lat + + +def _square_feature(postcode: str, x0: float, y0: float, x1: float, y1: float) -> dict: + ring = [(x0, y0), (x1, y0), (x1, y1), (x0, y1), (x0, y0)] + coords = [list(_bng_to_wgs84(x, y)) for x, y in ring] + return { + "type": "Feature", + "properties": {"postcodes": postcode, "mapit_code": postcode.replace(" ", "")}, + "geometry": {"type": "Polygon", "coordinates": [coords]}, + } + + +def _write_boundaries(units_dir, features_by_district: dict[str, list[dict]]) -> None: + 
units_dir.mkdir(parents=True) + for district, features in features_by_district.items(): + collection = {"type": "FeatureCollection", "features": features} + (units_dir / f"{district}.geojson").write_text(json.dumps(collection)) + + +def _crime_row(month: str, x, y, crime_type: str) -> str: + if x is None or y is None: + lon, lat = "", "" + else: + lon, lat = _bng_to_wgs84(x, y) + return f",{month},F,F,{lon},{lat},On or near X,E01000001,L,{crime_type},U," + + +def _write_month(crime_dir, month: str, rows: list[str]) -> None: + month_dir = crime_dir / month + month_dir.mkdir(parents=True) + body = "\n".join([_CSV_HEADER, *rows]) + "\n" + (month_dir / f"{month}-test-force-street.csv").write_text(body) + + +def test_buffer_overlap_counts_for_each_postcode(tmp_path): + units = tmp_path / "units" + # A and B sit 70m apart; their +50m buffers overlap in x in [1030, 1060]. + _write_boundaries( + units, + { + "AB1": [ + _square_feature("AB1 1AA", 1000, 1000, 1010, 1010), + _square_feature("AB1 1AB", 1080, 1000, 1090, 1010), + _square_feature("AB1 1AC", 5000, 5000, 5010, 5010), + ] + }, + ) + + crime = tmp_path / "crime" + _write_month( + crime, + "2024-01", + [ + # In the overlap: 35m east of A, 35m west of B -> counts for both. + _crime_row("2024-01", 1045, 1005, "Burglary"), + # 49m east of C's edge -> inside C's buffer. + _crime_row("2024-01", 5059, 5005, "Robbery"), + # 51m east of C's edge -> outside every buffer. + _crime_row("2024-01", 5061, 5005, "Robbery"), + # No coordinate -> dropped entirely. + _crime_row("2024-01", None, None, "Anti-social behaviour"), + ], + ) + + output = tmp_path / "crime_by_postcode.parquet" + by_year = tmp_path / "crime_by_postcode_by_year.parquet" + transform_crime_spatial(crime, units, output, by_year) + + rows = { + r["postcode"]: r + for r in pl.read_parquet(output).to_dicts() + } + # Single month -> annualised x12. 
+ assert rows["AB1 1AA"]["Burglary (avg/yr)"] == 12.0 + assert rows["AB1 1AB"]["Burglary (avg/yr)"] == 12.0 + assert rows["AB1 1AA"]["Robbery (avg/yr)"] == 0.0 + # Only the 49m robbery counts for C; the 51m one and the blank row do not. + assert rows["AB1 1AC"]["Robbery (avg/yr)"] == 12.0 + assert rows["AB1 1AC"]["Burglary (avg/yr)"] == 0.0 + # Anti-social behaviour had no coordinate -> nobody gets it. + assert all(r["Anti-social behaviour (avg/yr)"] == 0.0 for r in rows.values()) + + +def test_by_year_annualises_and_rolls_up(tmp_path): + units = tmp_path / "units" + _write_boundaries( + units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]} + ) + + crime = tmp_path / "crime" + # Point at the centre of AB1 1AA, well inside its buffer. + _write_month( + crime, + "2023-01", + [ + _crime_row("2023-01", 1005, 1005, "Burglary"), + _crime_row("2023-01", 1005, 1005, "Robbery"), + ], + ) + _write_month(crime, "2024-01", [_crime_row("2024-01", 1005, 1005, "Burglary")]) + _write_month( + crime, + "2024-02", + [ + _crime_row("2024-02", 1005, 1005, "Burglary"), + _crime_row("2024-02", 1005, 1005, "Anti-social behaviour"), + ], + ) + + output = tmp_path / "crime_by_postcode.parquet" + by_year = tmp_path / "crime_by_postcode_by_year.parquet" + transform_crime_spatial(crime, units, output, by_year) + + by_year_df = pl.read_parquet(by_year) + assert by_year_df.height == 1 + cols = set(by_year_df.columns) + assert {"Burglary (by year)", "Serious crime (by year)", "Minor crime (by year)"} <= cols + + row = by_year_df.row(0, named=True) + burglary = sorted(row["Burglary (by year)"], key=lambda r: r["year"]) + # 2023: 1 burglary in 1 month -> 12/yr; 2024: 2 in 2 months -> 12/yr. + assert burglary == [ + {"year": 2023, "count": 12.0}, + {"year": 2024, "count": 12.0}, + ] + serious = {p["year"]: p["count"] for p in row["Serious crime (by year)"]} + # 2023 serious = Burglary(12) + Robbery(12) = 24; 2024 = Burglary(12). 
+ assert serious[2023] == 24.0 + assert serious[2024] == 12.0 diff --git a/pipeline/transform/test_join_epc_pp.py b/pipeline/transform/test_join_epc_pp.py index f4a52b3..67c0752 100644 --- a/pipeline/transform/test_join_epc_pp.py +++ b/pipeline/transform/test_join_epc_pp.py @@ -24,6 +24,7 @@ def _row(**overrides: str) -> dict[str, str]: row = { "address": "1 Example Street", "postcode": " aa1 1aa ", + "uprn": "100012345678", "current_energy_rating": "c", "potential_energy_rating": "b", "property_type": "House", @@ -52,6 +53,7 @@ def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path): { "epc_address": "1 Example Street", "epc_postcode": "AA1 1AA", + "uprn": "100012345678", "current_energy_rating": "C", "potential_energy_rating": "B", "epc_property_type": "House", diff --git a/pipeline/transform/test_merge.py b/pipeline/transform/test_merge.py index 2597730..702c425 100644 --- a/pipeline/transform/test_merge.py +++ b/pipeline/transform/test_merge.py @@ -15,6 +15,8 @@ from pipeline.transform.merge import ( _finalize_listings, _integrate_listings, _match_direct_epc, + _match_listing_properties, + _normalize_uprn, _is_dynamic_poi_metric_column, _less_deprived_percentile_expr, _load_conservation_area_geometries, @@ -68,6 +70,15 @@ def test_conservation_area_feature_is_area_level() -> None: assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS +def test_crime_columns_are_spatial_counts_not_per_capita() -> None: + # Crime is now a raw spatial count per postcode; the per-1k-residents + # variants were dropped along with the LSOA population denominator. 
+ assert "Serious crime (avg/yr)" in _AREA_COLUMNS + assert "Minor crime (avg/yr)" in _AREA_COLUMNS + assert "Serious crime per 1k residents (avg/yr)" not in _AREA_COLUMNS + assert "Minor crime per 1k residents (avg/yr)" not in _AREA_COLUMNS + + def test_listed_building_feature_is_property_level() -> None: assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS @@ -471,71 +482,166 @@ def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks( assert seed["was_council_house"].to_list() == ["No"] -def test_match_direct_epc_considers_nearby_postcodes() -> None: - listing_matches = pl.DataFrame( - { - "_listing_idx": [0], - "_listing_match_address": ["1 EXAMPLE ROAD"], - "_listing_match_postcode": ["AA11AA"], - "_listing_east": [1000.0], - "_listing_north": [1000.0], - "_actual_property_type": ["Terraced"], - "_actual_total_floor_area": [100.0], - "_actual_number_habitable_rooms": [4], - }, - schema={ - "_listing_idx": pl.UInt32, - "_listing_match_address": pl.Utf8, - "_listing_match_postcode": pl.Utf8, - "_listing_east": pl.Float64, - "_listing_north": pl.Float64, - "_actual_property_type": pl.Utf8, - "_actual_total_floor_area": pl.Float64, - "_actual_number_habitable_rooms": pl.Int16, - }, - ) - epc_candidates = pl.DataFrame( - { - "_direct_epc_row": [0], - "_direct_epc_match_address": ["1 EXAMPLE ROAD"], - "_direct_epc_match_postcode": ["BB11BB"], - "_direct_epc_east": [1020.0], - "_direct_epc_north": [1010.0], - "_direct_epc_canonical_property_type": ["Terraced"], - "_direct_epc_address": ["1, Example Road"], - "_direct_current_energy_rating": ["C"], - "_direct_potential_energy_rating": ["B"], - "_direct_total_floor_area": [101.0], - "_direct_number_habitable_rooms": [4], - "_direct_floor_height": [2.5], - "_direct_construction_age_band": [1930], - "_direct_is_construction_date_approximate": [1], - "_direct_was_council_house": ["No"], - }, - schema={ - "_direct_epc_row": pl.UInt32, - "_direct_epc_match_address": pl.Utf8, - "_direct_epc_match_postcode": 
pl.Utf8, - "_direct_epc_east": pl.Float64, - "_direct_epc_north": pl.Float64, - "_direct_epc_canonical_property_type": pl.Utf8, - "_direct_epc_address": pl.Utf8, - "_direct_current_energy_rating": pl.Utf8, - "_direct_potential_energy_rating": pl.Utf8, - "_direct_total_floor_area": pl.Float64, - "_direct_number_habitable_rooms": pl.Int16, - "_direct_floor_height": pl.Float64, - "_direct_construction_age_band": pl.UInt16, - "_direct_is_construction_date_approximate": pl.UInt8, - "_direct_was_council_house": pl.Utf8, - }, +_DIRECT_EPC_CANDIDATE_SCHEMA = { + "_direct_epc_row": pl.UInt32, + "_direct_epc_match_address": pl.Utf8, + "_direct_epc_match_postcode": pl.Utf8, + "_direct_epc_outcode": pl.Utf8, + "_direct_epc_canonical_property_type": pl.Utf8, + "_direct_epc_uprn": pl.Utf8, + "_direct_epc_address": pl.Utf8, + "_direct_current_energy_rating": pl.Utf8, + "_direct_potential_energy_rating": pl.Utf8, + "_direct_total_floor_area": pl.Float64, + "_direct_number_habitable_rooms": pl.Int16, + "_direct_floor_height": pl.Float64, + "_direct_construction_age_band": pl.UInt16, + "_direct_is_construction_date_approximate": pl.UInt8, + "_direct_was_council_house": pl.Utf8, +} + +_LISTING_MATCH_SCHEMA = { + "_listing_idx": pl.UInt32, + "_listing_match_address": pl.Utf8, + "_listing_match_postcode": pl.Utf8, + "_listing_uprn": pl.Utf8, +} + + +def _direct_epc_candidates(rows: list[dict]) -> pl.DataFrame: + base = { + "_direct_epc_row": 0, + "_direct_epc_match_address": "1 EXAMPLE ROAD", + "_direct_epc_match_postcode": "AA11AA", + "_direct_epc_outcode": "AA1", + "_direct_epc_canonical_property_type": "Terraced", + "_direct_epc_uprn": None, + "_direct_epc_address": "1, Example Road", + "_direct_current_energy_rating": "C", + "_direct_potential_energy_rating": "B", + "_direct_total_floor_area": 101.0, + "_direct_number_habitable_rooms": 4, + "_direct_floor_height": 2.5, + "_direct_construction_age_band": 1930, + "_direct_is_construction_date_approximate": 1, + 
"_direct_was_council_house": "No", + } + return pl.DataFrame( + [{**base, **row} for row in rows], schema=_DIRECT_EPC_CANDIDATE_SCHEMA ) - matches = _match_direct_epc(listing_matches, epc_candidates) + +def _listing_matches(rows: list[dict]) -> pl.DataFrame: + base = { + "_listing_idx": 0, + "_listing_match_address": "1 EXAMPLE ROAD", + "_listing_match_postcode": "AA11AA", + "_listing_uprn": None, + } + return pl.DataFrame([{**base, **row} for row in rows], schema=_LISTING_MATCH_SCHEMA) + + +def test_match_direct_epc_matches_by_uprn_across_postcodes() -> None: + # UPRN is matched globally (not within a postcode bucket), so a listing + # whose detail-page postcode is slightly off still resolves to the right + # EPC certificate by its UPRN. + matches = _match_direct_epc( + _listing_matches( + [{"_listing_uprn": "100000000001", "_listing_match_postcode": "ZZ99ZZ"}] + ), + _direct_epc_candidates( + [{"_direct_epc_uprn": "100000000001", "_direct_epc_match_postcode": "AA11AA"}] + ), + ) assert matches.height == 1 - assert matches["_listing_idx"].to_list() == [0] assert matches["_direct_epc_address"].to_list() == ["1, Example Road"] + assert matches["_direct_epc_match_method"].to_list() == ["uprn"] + + +def test_match_direct_epc_matches_by_address_in_same_postcode() -> None: + matches = _match_direct_epc( + _listing_matches([{"_listing_match_address": "1 EXAMPLE ROAD"}]), + _direct_epc_candidates([{"_direct_epc_match_address": "1 EXAMPLE ROAD"}]), + ) + + assert matches.height == 1 + assert matches["_direct_epc_address"].to_list() == ["1, Example Road"] + assert matches["_direct_epc_match_method"].to_list() == ["address"] + + +def test_normalize_uprn_handles_types_and_floats() -> None: + assert _normalize_uprn(None) is None + assert _normalize_uprn("") is None + assert _normalize_uprn(" 100012345678 ") == "100012345678" + assert _normalize_uprn(100012345678) == "100012345678" + # An integral float normalises to its digits, NOT "1230". 
+ assert _normalize_uprn(123.0) == "123" + # Non-integral / NaN floats are rejected rather than mangled. + assert _normalize_uprn(1.5) is None + assert _normalize_uprn(float("nan")) is None + + +def _property_candidates(rows: list[dict]) -> pl.DataFrame: + base = { + "postcode": "AA1 1AA", + "pp_address": "1 Example Road", + "_property_match_postcode": "AA11AA", + "_property_match_address": "1 EXAMPLE ROAD", + "_property_epc_match_address": "1 EXAMPLE ROAD", + "uprn": None, + } + return pl.DataFrame( + [{**base, **row} for row in rows], + schema={ + "postcode": pl.Utf8, + "pp_address": pl.Utf8, + "_property_match_postcode": pl.Utf8, + "_property_match_address": pl.Utf8, + "_property_epc_match_address": pl.Utf8, + "uprn": pl.Utf8, + }, + ) + + +def test_match_listing_properties_uprn_wins_dedup_tie() -> None: + # Two listings claim the same property: one by UPRN, one by exact address + # (both score 100). The UPRN match must win even though it has the higher + # _listing_idx (which would otherwise break the tie the wrong way). + listings = _listing_matches( + [ + { + "_listing_idx": 5, + "_listing_uprn": "100000000001", + "_listing_match_address": "SOMETHING ELSE", + }, + { + "_listing_idx": 1, + "_listing_uprn": None, + "_listing_match_address": "1 EXAMPLE ROAD", + }, + ] + ) + matches = _match_listing_properties( + listings, _property_candidates([{"uprn": "100000000001"}]) + ) + + assert matches.height == 1 + assert matches["_listing_idx"].to_list() == [5] + assert matches["_property_match_method"].to_list() == ["uprn"] + + +def test_match_direct_epc_does_not_match_other_postcode_without_uprn() -> None: + # Matching is by postcode/UPRN/street — never by coordinate proximity — so a + # same-street EPC in a different postcode with no shared UPRN is skipped. 
+ matches = _match_direct_epc( + _listing_matches([{"_listing_match_postcode": "AA11AA"}]), + _direct_epc_candidates( + [{"_direct_epc_match_postcode": "BB22BB", "_direct_epc_uprn": None}] + ), + ) + + assert matches.height == 0 def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -> None: @@ -588,11 +694,72 @@ def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) - assert other["_actual_listing_url"].to_list() == [None] -def test_integrate_listings_rejects_low_confidence_no_number_match(tmp_path) -> None: +def test_integrate_listings_matches_by_uprn_over_address(tmp_path) -> None: + # The listing's address deliberately does not match the property's, but the + # shared UPRN drives an exact match anyway (UPRN beats fuzzy street). listings_path = tmp_path / "listings.parquet" arcgis_path = tmp_path / "arcgis.parquet" _sample_listings_frame().with_columns( - pl.lit("Rose Cottage High Street").alias("Address per Property Register"), + pl.lit("Totally Different Road").alias("Address per Property Register"), + pl.lit("100000000009").alias("UPRN"), + ).write_parquet(listings_path) + _stub_arcgis(arcgis_path) + wide = pl.DataFrame( + { + "postcode": ["SW1A 1AA"], + "pp_address": ["1 Example Road"], + "uprn": ["100000000009"], + "pp_property_type": ["Terraced"], + "duration": ["Freehold"], + "total_floor_area": [90.0], + "number_habitable_rooms": [4], + "latest_price": [600_000], + "epc_address": ["1 Example Road"], + "current_energy_rating": ["C"], + "potential_energy_rating": ["B"], + "floor_height": [2.4], + "construction_age_band": [1930], + "is_construction_date_approximate": [1], + "was_council_house": ["No"], + }, + schema={ + "postcode": pl.Utf8, + "pp_address": pl.Utf8, + "uprn": pl.Utf8, + "pp_property_type": pl.Utf8, + "duration": pl.Utf8, + "total_floor_area": pl.Float64, + "number_habitable_rooms": pl.Int16, + "latest_price": pl.Int64, + "epc_address": pl.Utf8, + "current_energy_rating": pl.Utf8, + 
"potential_energy_rating": pl.Utf8, + "floor_height": pl.Float64, + "construction_age_band": pl.UInt16, + "is_construction_date_approximate": pl.UInt8, + "was_council_house": pl.Utf8, + }, + ) + + integrated = _integrate_listings( + wide.lazy(), listings_path, arcgis_path, epc_path=None + ).collect() + + matched = integrated.filter(pl.col("pp_address") == "1 Example Road") + # The listing overlay attached to the UPRN-matched property row. + assert matched["_actual_listing_url"].to_list() == ["https://example.test/abc"] + # No spurious seed row for the listing's (non-matching) address. + assert "Totally Different Road" not in integrated["pp_address"].to_list() + + +def test_integrate_listings_seeds_listing_with_unmatched_street(tmp_path) -> None: + # A number-less listing whose street is not the property's street (and which + # shares no UPRN) must not be force-matched onto it; it becomes its own seed + # row instead of stamping the wrong property's overlay. + listings_path = tmp_path / "listings.parquet" + arcgis_path = tmp_path / "arcgis.parquet" + _sample_listings_frame().with_columns( + pl.lit("Juniper Crescent").alias("Address per Property Register"), ).write_parquet(listings_path) _stub_arcgis(arcgis_path) wide = pl.DataFrame( @@ -635,7 +802,7 @@ def test_integrate_listings_rejects_low_confidence_no_number_match(tmp_path) -> ).collect() existing = integrated.filter(pl.col("pp_address") == "Old Cottage High Street") - seed = integrated.filter(pl.col("pp_address") == "Rose Cottage High Street") + seed = integrated.filter(pl.col("pp_address") == "Juniper Crescent") assert existing["_actual_listing_url"].to_list() == [None] assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"] @@ -731,3 +898,77 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows( # Overlay scaffolding is dropped. 
for src, dst, _dt in _LISTING_OVERLAY_SOURCES: assert dst not in finalized.columns, src + + +def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None: + # The terminated-postcode remap can collapse two distinct wide rows onto the same + # (postcode, pp_address), so a single matched listing attaches to both. Finalize + # must emit one row per listing URL, not one per collapsed wide row. + df = pl.DataFrame( + { + "Postcode": ["SW1A 1AA", "SW1A 1AA"], + "Address per Property Register": ["1 Example Road", "1 Example Road"], + "Address per EPC": ["1 Example Road", "1 Example Road"], + "Date of last transaction": [1990.0, 1995.0], + "lat": [51.5, 51.5], + "lon": [-0.1, -0.1], + "Total floor area (sqm)": [100.0, 95.0], + "Number of bedrooms & living rooms": [3, 3], + "Property type": ["Terraced", "Terraced"], + "Leasehold/Freehold": ["Leasehold", "Leasehold"], + "Last known price": [500_000, 480_000], + "Street tree density percentile": [42.0, 42.0], + # Same listing URL on both collapsed rows — the fan-out to fix. 
+ "_actual_listing_url": ["url0", "url0"], + "_actual_asking_price": [600_000, 600_000], + "_actual_asking_price_per_sqm": [5_000, 5_000], + "_actual_listing_date": [None, None], + "_actual_listing_status": ["For sale", "For sale"], + "_actual_listing_features": [["Garden"], ["Garden"]], + "_actual_bedrooms": [3, 3], + "_actual_bathrooms": [1, 1], + "_actual_price_qualifier": ["", ""], + "_actual_property_sub_type": ["Mid-Terrace", "Mid-Terrace"], + "_actual_lat": [51.51, 51.51], + "_actual_lon": [-0.11, -0.11], + "_actual_total_floor_area": [110.0, 110.0], + "_actual_number_habitable_rooms": [4, 4], + "_actual_property_type": ["Terraced", "Terraced"], + "_actual_leasehold_freehold": ["Freehold", "Freehold"], + }, + schema={ + "Postcode": pl.Utf8, + "Address per Property Register": pl.Utf8, + "Address per EPC": pl.Utf8, + "Date of last transaction": pl.Float64, + "lat": pl.Float64, + "lon": pl.Float64, + "Total floor area (sqm)": pl.Float64, + "Number of bedrooms & living rooms": pl.Int16, + "Property type": pl.Utf8, + "Leasehold/Freehold": pl.Utf8, + "Last known price": pl.Int64, + "Street tree density percentile": pl.Float32, + "_actual_listing_url": pl.Utf8, + "_actual_asking_price": pl.Int64, + "_actual_asking_price_per_sqm": pl.Int32, + "_actual_listing_date": pl.Datetime("us"), + "_actual_listing_status": pl.Utf8, + "_actual_listing_features": pl.List(pl.Utf8), + "_actual_bedrooms": pl.Int32, + "_actual_bathrooms": pl.Int32, + "_actual_price_qualifier": pl.Utf8, + "_actual_property_sub_type": pl.Utf8, + "_actual_lat": pl.Float64, + "_actual_lon": pl.Float64, + "_actual_total_floor_area": pl.Float64, + "_actual_number_habitable_rooms": pl.Int16, + "_actual_property_type": pl.Utf8, + "_actual_leasehold_freehold": pl.Utf8, + }, + ) + + finalized = _finalize_listings(df) + + assert finalized.height == 1 + assert finalized["Listing URL"].to_list() == ["url0"] diff --git a/pipeline/transform/test_tree_density.py b/pipeline/transform/test_tree_density.py index 
a6e1b06..f4dd48f 100644 --- a/pipeline/transform/test_tree_density.py +++ b/pipeline/transform/test_tree_density.py @@ -1,19 +1,83 @@ +import math from pathlib import Path +import numpy as np import polars as pl import pytest +import shapely from pipeline.transform.tree_density import ( STREET_TREE_COVERAGE_COL, STREET_TREE_DENSITY_COL, + _add_nfi_batch, _coverage_percentile_expr, _metric_columns, + _postcode_buffers, _postcode_density_percentile_col, _with_postcode_density_percentiles, _write_street_rollups, ) +def test_nfi_accumulation_adds_only_clipped_overlap_area() -> None: + radius_m = 50 + points = pl.DataFrame({"postcode": ["A", "B"], "x": [0.0, 1000.0], "y": [0.0, 0.0]}) + circles, tree = _postcode_buffers(points, radius_m) + buffer_area = math.pi * radius_m * radius_m + + # A large woodland square centred on postcode A fully covers A's circle. + canopy_area = np.zeros(2) + feature_count = np.zeros(2, dtype=np.uint32) + big = shapely.box(-500, -500, 500, 500) # 1,000,000 sqm parcel + _add_nfi_batch( + np.array([big], dtype=object), + np.array(["Woodland"], dtype=object), + circles, + tree, + canopy_area, + feature_count, + radius_m, + ) + # Only the clipped circle area is added (the 32-gon buffer approximates the + # circle to ~1%), NOT the full 1,000,000 sqm polygon. + assert canopy_area[0] == pytest.approx(buffer_area, rel=1e-2) + assert canopy_area[0] <= buffer_area # never exceeds the buffer area + assert canopy_area[1] == 0.0 # postcode B is 1km away, no overlap + assert feature_count.tolist() == [1, 0] + + # A large parcel that only slivers into B's circle must add only the sliver, + # not its full area -- the failure mode the old centroid path could not avoid. 
+ canopy_area = np.zeros(2) + feature_count = np.zeros(2, dtype=np.uint32) + sliver = shapely.box(1040, -500, 2000, 500) # left edge 10m inside B's circle + _add_nfi_batch( + np.array([sliver], dtype=object), + np.array(["Woodland"], dtype=object), + circles, + tree, + canopy_area, + feature_count, + radius_m, + ) + assert canopy_area[0] == 0.0 + assert 0.0 < canopy_area[1] < buffer_area # tiny segment, far below 1M sqm + + # Non-woodland categories contribute nothing. + canopy_area = np.zeros(2) + feature_count = np.zeros(2, dtype=np.uint32) + _add_nfi_batch( + np.array([big], dtype=object), + np.array(["Non woodland"], dtype=object), + circles, + tree, + canopy_area, + feature_count, + radius_m, + ) + assert canopy_area.tolist() == [0.0, 0.0] + assert feature_count.tolist() == [0, 0] + + def test_coverage_percentile_expr_ranks_higher_coverage_higher() -> None: df = pl.DataFrame({"coverage": [0.0, 5.0, 10.0, None]}) diff --git a/pipeline/transform/tree_density.py b/pipeline/transform/tree_density.py index 146e7d2..06a16a7 100644 --- a/pipeline/transform/tree_density.py +++ b/pipeline/transform/tree_density.py @@ -1,10 +1,16 @@ -"""Derive street-scale tree density metrics from Forest Research TOW data. +"""Derive street-scale tree density metrics from Forest Research TOW + NFI data. The Forest Research Trees Outside Woodland release is an Esri File Geodatabase inside property-data/FR_TOW_V1_ALL.zip. This transformer computes a compact postcode-level metric from the tree polygons, then optionally rolls that up to Price Paid street names so the dashboard can answer "what is this address's street like?" without loading the full geodatabase at runtime. + +TOW only covers trees *outside* woodland, so the National Forest Inventory (NFI) +woodland layer is optionally unioned in. 
TOW canopy is accumulated by centroid +proximity (tiny crowns), while large NFI woodland parcels are accumulated by +true buffer-clipped intersection area so they cannot saturate a postcode from +mere centroid proximity. """ from __future__ import annotations @@ -22,7 +28,6 @@ import shapely from scipy.spatial import cKDTree -DEFAULT_TOW_TYPES = ("Lone Tree", "Group of Trees") TOW_GDB_NAME = "FR_TOW_V1_ALL.gdb" STREET_TREE_DENSITY_COL = "Street tree density percentile" STREET_TREE_COVERAGE_COL = "Street tree coverage (%)" @@ -32,6 +37,14 @@ POSTCODE_AREA_COL = "Tree canopy area within {radius}m (sqm)" POSTCODE_COUNT_COL = "Tree features within {radius}m" POSTCODE_HEIGHT_COL = "Mean TOW height within {radius}m (m)" +# National Forest Inventory (NFI) woodland — the geometric complement of TOW. +# NFI ships as a zipped shapefile of woodland parcels (>=0.5 ha) in EPSG:27700. +# Field names are from the NFI Woodland England 2022 release; re-check on bumps. +NFI_CATEGORY_COL = "CATEGORY" +NFI_WOODLAND_VALUE = "Woodland" +NFI_TYPE_COL = "IFT_IOA" +NFI_AREA_HA_COL = "Area_ha" + def _safe_extract_zip(zip_path: Path, extract_dir: Path, force: bool) -> Path: """Extract the TOW zip and return the extracted .gdb path.""" @@ -83,12 +96,60 @@ def _tow_dataset_path( return str(_safe_extract_zip(zip_path, extract_dir, force_extract)) -def _where_for_tow_types(tow_types: tuple[str, ...] 
| None) -> str | None: - if not tow_types: - return None - escaped = [tow_type.replace("'", "''") for tow_type in tow_types] - values = ", ".join(f"'{tow_type}'" for tow_type in escaped) - return f"Woodland_Type IN ({values})" +def _safe_extract_zip_dir(zip_path: Path, extract_dir: Path, force: bool) -> Path: + """Extract an arbitrary zip into extract_dir and return the directory.""" + if extract_dir.exists() and not force: + print(f"Using existing extraction directory: {extract_dir}") + return extract_dir + if extract_dir.exists(): + shutil.rmtree(extract_dir) + + tmp_dir = extract_dir.with_name(f".{extract_dir.name}.tmp") + if tmp_dir.exists(): + shutil.rmtree(tmp_dir) + tmp_dir.mkdir(parents=True) + + root = tmp_dir.resolve() + print(f"Extracting {zip_path} to {extract_dir}...") + with zipfile.ZipFile(zip_path) as archive: + for member in archive.infolist(): + target = (tmp_dir / member.filename).resolve() + if root != target and root not in target.parents: + raise ValueError(f"Unsafe path in zip archive: {member.filename}") + if member.is_dir(): + target.mkdir(parents=True, exist_ok=True) + continue + target.parent.mkdir(parents=True, exist_ok=True) + with archive.open(member) as source, target.open("wb") as dest: + shutil.copyfileobj(source, dest, length=1024 * 1024) + + tmp_dir.rename(extract_dir) + print(f"Extracted archive: {extract_dir}") + return extract_dir + + +def _nfi_dataset_path( + zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool +) -> str: + """Resolve the NFI woodland shapefile path, extracting the zip if needed.""" + if use_vsizip: + return f"/vsizip/{zip_path.resolve()}" + extracted = _safe_extract_zip_dir(zip_path, extract_dir, force_extract) + shapefiles = sorted(extracted.rglob("*.shp")) + if not shapefiles: + raise FileNotFoundError(f"No .shp found inside {zip_path}") + return str(shapefiles[0]) + + +def _geometry_column(metadata: dict, column_names: list[str]) -> str: + """Resolve the geometry column name from 
pyogrio Arrow metadata.""" + geometry_name = metadata.get("geometry_name") + if geometry_name: + return str(geometry_name) + for name in ("wkb_geometry", "geometry", "geom"): + if name in column_names: + return name + return column_names[-1] def _postcode_points(arcgis_path: Path, max_postcodes: int | None) -> pl.DataFrame: @@ -172,26 +233,20 @@ def _accumulate_tree_metrics( dataset_path: str, points: pl.DataFrame, radius_m: int, - tow_types: tuple[str, ...] | None, batch_size: int, layer_names: tuple[str, ...] | None, max_features_per_layer: int | None, workers: int, -) -> pl.DataFrame: + canopy_area: np.ndarray, + feature_count: np.ndarray, + height_weighted_sum: np.ndarray, + height_weight: np.ndarray, +) -> None: xy = points.select("x", "y").to_numpy() tree = cKDTree(xy) - n_points = points.height - canopy_area = np.zeros(n_points, dtype=np.float64) - feature_count = np.zeros(n_points, dtype=np.uint32) - height_weighted_sum = np.zeros(n_points, dtype=np.float64) - height_weight = np.zeros(n_points, dtype=np.float64) - - where = _where_for_tow_types(tow_types) layers = _layers(dataset_path, layer_names) print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}") - if where: - print(f"TOW type filter: {where}") columns = ["Woodland_Type", "TOW_Area_M", "MEANHT"] total_features_seen = 0 @@ -206,7 +261,6 @@ def _accumulate_tree_metrics( dataset_path, layer=layer, columns=columns, - where=where, batch_size=batch_size, use_pyarrow=True, ) as (_meta, reader): @@ -297,6 +351,132 @@ def _accumulate_tree_metrics( f"{total_features_used:,} features with usable centroids" ) + +def _postcode_buffers( + points: pl.DataFrame, radius_m: int +) -> tuple[np.ndarray, shapely.STRtree]: + """Build a radius-r circle for every postcode plus an STRtree over them. + + Circle index == postcode index, matching the order used by the cKDTree path. 
+ """ + xy = points.select("x", "y").to_numpy() + circles = shapely.buffer(shapely.points(xy), radius_m, quad_segs=8) + return circles, shapely.STRtree(circles) + + +def _add_nfi_batch( + geoms: np.ndarray, + category: np.ndarray, + circles: np.ndarray, + tree: shapely.STRtree, + canopy_area: np.ndarray, + feature_count: np.ndarray, + radius_m: int, +) -> None: + """Add NFI woodland into the shared arrays by true buffer-clipped area. + + Unlike the TOW centroid path, this clips each woodland polygon to each + nearby postcode circle and adds only area(polygon ∩ circle); a large parcel + therefore cannot saturate a postcode from mere centroid proximity, and a + buffer-filling parcel whose centroid is outside the radius is not missed. + """ + keep = (category == NFI_WOODLAND_VALUE) & ~shapely.is_missing(geoms) + geoms = geoms[keep] + if geoms.size: + geoms = geoms[~shapely.is_empty(geoms)] + if geoms.size == 0: + return + + # dwithin(polygon, point, r) is true iff the radius-r circle around the + # point intersects the polygon -- exactly the candidate set we want. 
+ nfi_index, postcode_index = tree.query( + geoms, predicate="dwithin", distance=radius_m + ) + if nfi_index.size == 0: + return + + clipped_area = shapely.area( + shapely.intersection(geoms[nfi_index], circles[postcode_index]) + ) + positive = clipped_area > 0 + postcode_index = postcode_index[positive] + clipped_area = clipped_area[positive] + + np.add.at(canopy_area, postcode_index, clipped_area) + np.add.at(feature_count, postcode_index, 1) + + +def _accumulate_nfi_metrics( + dataset_path: str, + circles: np.ndarray, + tree: shapely.STRtree, + canopy_area: np.ndarray, + feature_count: np.ndarray, + radius_m: int, + batch_size: int, + max_nfi_features: int | None, +) -> None: + layers = _layers(dataset_path, None) + print(f"Processing {len(layers)} NFI layer(s): {', '.join(layers)}") + + # Density only needs the woodland flag + geometry; area is clipped from the + # postcode buffer, not read from the file. + columns = [NFI_CATEGORY_COL] + features_seen = 0 + + for layer in layers: + with pyogrio.open_arrow( + dataset_path, + layer=layer, + columns=columns, + batch_size=batch_size, + use_pyarrow=True, + ) as (meta, reader): + for batch_index, batch in enumerate(reader, start=1): + if max_nfi_features is not None: + remaining = max_nfi_features - features_seen + if remaining <= 0: + break + if batch.num_rows > remaining: + batch = batch.slice(0, remaining) + + features_seen += batch.num_rows + names = batch.schema.names + geometry_column = _geometry_column(meta, names) + category = np.asarray( + batch.column(names.index(NFI_CATEGORY_COL)).to_numpy( + zero_copy_only=False + ), + dtype=object, + ) + geometry = np.asarray( + batch.column(names.index(geometry_column)).to_numpy( + zero_copy_only=False + ), + dtype=object, + ) + _add_nfi_batch( + shapely.from_wkb(geometry), + category, + circles, + tree, + canopy_area, + feature_count, + radius_m, + ) + if batch_index == 1 or batch_index % 25 == 0: + print(f" NFI batch {batch_index:,}: {features_seen:,} rows read") + + 
+def _finalize_metrics( + points: pl.DataFrame, + canopy_area: np.ndarray, + feature_count: np.ndarray, + height_weighted_sum: np.ndarray, + height_weight: np.ndarray, + radius_m: int, +) -> pl.DataFrame: + n_points = points.height density_col, area_col, count_col, height_col = _metric_columns(radius_m) buffer_area = math.pi * radius_m * radius_m density_pct = np.minimum(canopy_area / buffer_area * 100.0, 100.0) @@ -518,6 +698,18 @@ def main() -> None: action="store_true", help="Read the geodatabase directly from the zip instead of extracting it", ) + parser.add_argument( + "--nfi-zip", + type=Path, + default=Path("property-data/NFI_WOODLAND_ENGLAND.zip"), + help="Optional NFI woodland shapefile zip to union with TOW (skipped if absent)", + ) + parser.add_argument( + "--nfi-extract-dir", + type=Path, + default=Path("property-data/nfi_woodland_england"), + help="Directory where the NFI zip is extracted", + ) parser.add_argument( "--arcgis", type=Path, @@ -554,11 +746,6 @@ def main() -> None: default=50, help="Radius around each postcode centroid used as the street-scale buffer", ) - parser.add_argument( - "--tow-types", - default=",".join(DEFAULT_TOW_TYPES), - help='Comma-separated Woodland_Type values to include, or "all"', - ) parser.add_argument( "--layers", default=None, @@ -588,6 +775,12 @@ def main() -> None: default=None, help="Testing only: process at most N TOW features per layer", ) + parser.add_argument( + "--max-nfi-features", + type=int, + default=None, + help="Testing only: process at most N NFI woodland features", + ) args = parser.parse_args() if (args.output_streets or args.output_addresses) and args.price_paid is None: @@ -600,18 +793,53 @@ def main() -> None: args.tow_zip, args.extract_dir, args.force_extract, args.use_vsizip ) points = _postcode_points(args.arcgis, args.max_postcodes) - tow_types = _parse_csv_arg(args.tow_types) layer_names = _parse_csv_arg(args.layers) - postcode_metrics = _accumulate_tree_metrics( + n_points = points.height + 
canopy_area = np.zeros(n_points, dtype=np.float64) + feature_count = np.zeros(n_points, dtype=np.uint32) + height_weighted_sum = np.zeros(n_points, dtype=np.float64) + height_weight = np.zeros(n_points, dtype=np.float64) + + _accumulate_tree_metrics( dataset_path=dataset_path, points=points, radius_m=args.radius_m, - tow_types=tow_types, batch_size=args.batch_size, layer_names=layer_names, max_features_per_layer=args.max_features_per_layer, workers=args.workers, + canopy_area=canopy_area, + feature_count=feature_count, + height_weighted_sum=height_weighted_sum, + height_weight=height_weight, + ) + + if args.nfi_zip is not None and args.nfi_zip.exists(): + nfi_path = _nfi_dataset_path( + args.nfi_zip, args.nfi_extract_dir, args.force_extract, args.use_vsizip + ) + circles, nfi_tree = _postcode_buffers(points, args.radius_m) + _accumulate_nfi_metrics( + dataset_path=nfi_path, + circles=circles, + tree=nfi_tree, + canopy_area=canopy_area, + feature_count=feature_count, + radius_m=args.radius_m, + batch_size=args.batch_size, + max_nfi_features=args.max_nfi_features, + ) + elif args.nfi_zip is not None: + print(f"NFI zip not found, skipping woodland union: {args.nfi_zip}") + + postcode_metrics = _finalize_metrics( + points, + canopy_area, + feature_count, + height_weighted_sum, + height_weight, + args.radius_m, ) postcode_metrics = _with_postcode_density_percentiles( postcode_metrics, args.radius_m diff --git a/pipeline/transform/tree_overlay_tiles.py b/pipeline/transform/tree_overlay_tiles.py index 56ef6ee..b301066 100644 --- a/pipeline/transform/tree_overlay_tiles.py +++ b/pipeline/transform/tree_overlay_tiles.py @@ -1,4 +1,4 @@ -"""Build PMTiles polygon tiles for the Trees Outside Woodland overlay.""" +"""Build PMTiles polygon tiles for the Trees Outside Woodland + NFI overlay.""" from __future__ import annotations @@ -16,10 +16,14 @@ from pyproj import Transformer from pipeline.local_temp import local_tmp_dir from pipeline.transform.tree_density import ( - 
DEFAULT_TOW_TYPES, + NFI_AREA_HA_COL, + NFI_CATEGORY_COL, + NFI_TYPE_COL, + NFI_WOODLAND_VALUE, + _geometry_column, _layers, + _nfi_dataset_path, _tow_dataset_path, - _where_for_tow_types, ) @@ -55,17 +59,13 @@ def _number_or_none(value) -> float | int | None: def _write_tree_geojsonseq( dataset_path: str, output_path: Path, - tow_types: tuple[str, ...], batch_size: int, layer_names: tuple[str, ...] | None, max_features_per_layer: int | None, ) -> int: to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True) - where = _where_for_tow_types(tow_types) layers = _layers(dataset_path, layer_names) print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}") - if where: - print(f"TOW type filter: {where}") columns = [ "TOW_ID", @@ -88,7 +88,6 @@ def _write_tree_geojsonseq( dataset_path, layer=layer, columns=columns, - where=where, batch_size=batch_size, use_pyarrow=True, ) as (_meta, reader): @@ -136,6 +135,7 @@ def _write_tree_geojsonseq( for idx, geometry_json in zip(valid_indexes, geometries_json): properties = { + "source": "tow", "tow_id": str(tow_id[idx]) if tow_id is not None else "", "woodland_type": ( str(woodland_type[idx]) @@ -176,11 +176,105 @@ def _write_tree_geojsonseq( return feature_count +def _append_nfi_geojsonseq( + dataset_path: str, + output_path: Path, + batch_size: int, + max_nfi_features: int | None, +) -> int: + """Append NFI woodland polygons to the same GeoJSONSeq as the TOW features.""" + to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True) + layers = _layers(dataset_path, None) + print(f"Processing {len(layers)} NFI layer(s): {', '.join(layers)}") + + columns = [NFI_CATEGORY_COL, NFI_TYPE_COL, NFI_AREA_HA_COL] + feature_count = 0 + features_seen = 0 + + with output_path.open("a") as file: + for layer in layers: + with pyogrio.open_arrow( + dataset_path, + layer=layer, + columns=columns, + batch_size=batch_size, + use_pyarrow=True, + ) as (meta, reader): + for batch in reader: + if 
max_nfi_features is not None: + remaining = max_nfi_features - features_seen + if remaining <= 0: + break + if batch.num_rows > remaining: + batch = batch.slice(0, remaining) + + features_seen += batch.num_rows + names = batch.schema.names + geometry_column = _geometry_column(meta, names) + category = np.asarray( + batch.column(names.index(NFI_CATEGORY_COL)).to_numpy( + zero_copy_only=False + ), + dtype=object, + ) + geometry = np.asarray( + batch.column(names.index(geometry_column)).to_numpy( + zero_copy_only=False + ), + dtype=object, + ) + valid = category == NFI_WOODLAND_VALUE + if not valid.any(): + continue + + woodland_type = _column_or_none(batch, names, NFI_TYPE_COL) + area_ha = _column_or_none(batch, names, NFI_AREA_HA_COL) + + geometries = shapely.from_wkb(geometry[valid]) + geometries = shapely.transform( + geometries, + to_wgs84.transform, + interleaved=False, + ) + geometries_json = shapely.to_geojson(geometries) + valid_indexes = np.flatnonzero(valid) + + for idx, geometry_json in zip(valid_indexes, geometries_json): + area_sqm = ( + _number_or_none(area_ha[idx] * 10000.0) + if area_ha is not None + else None + ) + properties = { + "source": "nfi", + "tow_id": "", + "woodland_type": ( + str(woodland_type[idx]) + if woodland_type is not None + else "" + ), + "area_sqm": area_sqm, + "mean_height_m": None, + "min_height_m": None, + "max_height_m": None, + "lidar_year": None, + "source_layer": layer, + } + feature = { + "type": "Feature", + "geometry": json.loads(geometry_json), + "properties": properties, + } + file.write(json.dumps(feature, separators=(",", ":")) + "\n") + feature_count += 1 + + return feature_count + + def build_tree_overlay_tiles( tow_zip: Path, output_path: Path, extract_dir: Path, - tow_types: tuple[str, ...], batch_size: int, layer_names: tuple[str, ...] 
| None, max_features_per_layer: int | None, @@ -188,6 +282,9 @@ def build_tree_overlay_tiles( max_zoom: int, force_extract: bool, use_vsizip: bool, + nfi_zip: Path | None = None, + nfi_extract_dir: Path = Path("property-data/nfi_woodland_england"), + max_nfi_features: int | None = None, ) -> None: tippecanoe = _require_tippecanoe() dataset_path = _tow_dataset_path(tow_zip, extract_dir, force_extract, use_vsizip) @@ -198,13 +295,26 @@ def build_tree_overlay_tiles( feature_count = _write_tree_geojsonseq( dataset_path, ndjson_path, - tow_types, batch_size, layer_names, max_features_per_layer, ) print(f"Writing {feature_count:,} TOW polygon features") + if nfi_zip is not None and nfi_zip.exists(): + nfi_path = _nfi_dataset_path( + nfi_zip, nfi_extract_dir, force_extract, use_vsizip + ) + nfi_count = _append_nfi_geojsonseq( + nfi_path, + ndjson_path, + batch_size, + max_nfi_features, + ) + print(f"Writing {nfi_count:,} NFI woodland polygon features") + elif nfi_zip is not None: + print(f"NFI zip not found, skipping woodland union: {nfi_zip}") + subprocess.run( [ tippecanoe, @@ -237,26 +347,32 @@ def main() -> None: default=Path("property-data/fr_tow_v1_all"), help="Directory used to extract the FileGDB", ) - parser.add_argument( - "--tow-type", - action="append", - dest="tow_types", - help="Woodland_Type to include; repeatable. 
Defaults to TOW outside-woodland classes.", - ) parser.add_argument("--batch-size", type=int, default=50_000) parser.add_argument("--layer", action="append", dest="layers") parser.add_argument("--max-features-per-layer", type=int) - parser.add_argument("--min-zoom", type=int, default=15) + parser.add_argument("--min-zoom", type=int, default=12) parser.add_argument("--max-zoom", type=int, default=17) parser.add_argument("--force-extract", action="store_true") parser.add_argument("--use-vsizip", action="store_true") + parser.add_argument( + "--nfi-zip", + type=Path, + default=None, + help="Optional NFI woodland shapefile zip to union into the overlay", + ) + parser.add_argument( + "--nfi-extract-dir", + type=Path, + default=Path("property-data/nfi_woodland_england"), + help="Directory used to extract the NFI zip", + ) + parser.add_argument("--max-nfi-features", type=int) args = parser.parse_args() build_tree_overlay_tiles( tow_zip=args.tow_zip, output_path=args.output, extract_dir=args.extract_dir, - tow_types=tuple(args.tow_types or DEFAULT_TOW_TYPES), batch_size=args.batch_size, layer_names=tuple(args.layers) if args.layers else None, max_features_per_layer=args.max_features_per_layer, @@ -264,6 +380,9 @@ def main() -> None: max_zoom=args.max_zoom, force_extract=args.force_extract, use_vsizip=args.use_vsizip, + nfi_zip=args.nfi_zip, + nfi_extract_dir=args.nfi_extract_dir, + max_nfi_features=args.max_nfi_features, ) diff --git a/pipeline/validate_outputs.py b/pipeline/validate_outputs.py index 031a24d..69ed76e 100644 --- a/pipeline/validate_outputs.py +++ b/pipeline/validate_outputs.py @@ -3,6 +3,7 @@ from __future__ import annotations import argparse +import json import sys import zipfile from pathlib import Path @@ -76,6 +77,24 @@ def _split_glob(spec: str) -> tuple[Path, str]: return Path(base), pattern +def _split_pair(spec: str, label: str) -> tuple[Path, Path]: + if "::" not in spec: + raise argparse.ArgumentTypeError( + f"{spec!r} must use LEFT::RIGHT for 
{label}" + ) + left, right = spec.split("::", 1) + if not left or not right: + raise argparse.ArgumentTypeError(f"{spec!r} must include both paths") + return Path(left), Path(right) + + +def _canonical_postcode(value: object) -> str: + compact = "".join(str(value).split()).upper() + if len(compact) >= 5: + return f"{compact[:-3]} {compact[-3:]}" + return compact + + def _matched_files(spec: str) -> tuple[Path, str, list[Path]]: base, pattern = _split_glob(spec) if not base.exists(): @@ -105,6 +124,79 @@ def _failures_for_zip_glob(spec: str) -> list[str]: return failures +def _postcode_column(columns: list[str]) -> str | None: + for name in ("postcode", "Postcode", "pcds", "PCDS"): + if name in columns: + return name + return None + + +def _parquet_postcodes(path: Path) -> set[str]: + schema = pl.scan_parquet(path).collect_schema() + column = _postcode_column(schema.names()) + if column is None: + raise ValueError(f"{path}: missing postcode column") + values = ( + pl.scan_parquet(path) + .select(pl.col(column).drop_nulls().unique()) + .collect() + .get_column(column) + .to_list() + ) + return {_canonical_postcode(value) for value in values if _canonical_postcode(value)} + + +def _boundary_postcodes(path: Path) -> set[str]: + units_dir = path / "units" if (path / "units").is_dir() else path + postcodes: set[str] = set() + for geojson_path in sorted(units_dir.glob("*.geojson")): + with geojson_path.open("r", encoding="utf-8") as handle: + data = json.load(handle) + for feature in data.get("features", []): + properties = feature.get("properties") or {} + value = properties.get("postcodes") + if value is not None: + postcode = _canonical_postcode(value) + if postcode: + postcodes.add(postcode) + return postcodes + + +def _sample(values: set[str]) -> str: + return ", ".join(sorted(values)[:10]) + + +def _failures_for_postcode_boundary_match(spec: str) -> list[str]: + parquet_path, boundaries_path = _split_pair(spec, "postcode boundary matching") + failures = 
_failures_for_parquet(parquet_path) + _failures_for_dir(boundaries_path) + if failures: + return failures + + try: + parquet_postcodes = _parquet_postcodes(parquet_path) + boundary_postcodes = _boundary_postcodes(boundaries_path) + except Exception as exc: + return [f"{parquet_path} / {boundaries_path}: postcode match check failed: {exc}"] + + failures = [] + if not boundary_postcodes: + failures.append(f"{boundaries_path}: no boundary postcodes found") + + missing_boundaries = parquet_postcodes - boundary_postcodes + orphan_boundaries = boundary_postcodes - parquet_postcodes + if missing_boundaries: + failures.append( + f"{boundaries_path}: {len(missing_boundaries):,} postcodes from {parquet_path} " + f"are missing boundaries; sample: {_sample(missing_boundaries)}" + ) + if orphan_boundaries: + failures.append( + f"{boundaries_path}: {len(orphan_boundaries):,} boundary postcodes are absent from " + f"{parquet_path}; sample: {_sample(orphan_boundaries)}" + ) + return failures + + def main() -> int: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--file", action="append", default=[], type=Path) @@ -123,6 +215,12 @@ def main() -> int: default=[], help="Require at least one readable zip matching BASE::PATTERN", ) + parser.add_argument( + "--postcode-boundary-match", + action="append", + default=[], + help="Require postcode parquet keys to exactly match boundary GeoJSON postcodes: PARQUET::DIR", + ) args = parser.parse_args() failures: list[str] = [] @@ -138,6 +236,8 @@ def main() -> int: failures.extend(_failures_for_glob(spec)) for spec in args.zip_glob: failures.extend(_failures_for_zip_glob(spec)) + for spec in args.postcode_boundary_match: + failures.extend(_failures_for_postcode_boundary_match(spec)) if failures: print("Output validation failed:", file=sys.stderr)