scraping and data

This commit is contained in:
Andras Schmelczer 2026-05-31 15:36:33 +01:00
parent d98819b569
commit 8688b7475e
43 changed files with 4920 additions and 531 deletions

2
.gitignore vendored
View file

@ -22,6 +22,8 @@ video/auth.*
*.jpeg *.jpeg
*.mp4 *.mp4
**/*.log
r5-java/tmp r5-java/tmp
property-data property-data
property-data2 property-data2

25
finder/Dockerfile Normal file
View file

@ -0,0 +1,25 @@
# Finder scraper image. Runs via docker-compose sharing the media_gluetun VPN
# network namespace; the source tree is bind-mounted at runtime, so this image
# only needs the Python deps. The venv lives OUTSIDE the bind-mount target
# (/opt/venv) so the mount doesn't shadow it.
FROM python:3.12-slim
ENV UV_PROJECT_ENVIRONMENT=/opt/venv \
UV_COMPILE_BYTECODE=1 \
UV_LINK_MODE=copy \
PYTHONUNBUFFERED=1
RUN apt-get update \
&& apt-get install -y --no-install-recommends ca-certificates curl \
&& rm -rf /var/lib/apt/lists/*
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
WORKDIR /app/finder
# Install dependencies into /opt/venv (cached layer; project code is mounted at runtime).
COPY pyproject.toml uv.lock ./
RUN uv sync --no-install-project --frozen
# Source is bind-mounted over /app/finder by compose. `uv run` uses /opt/venv.
CMD ["sleep", "infinity"]

View file

@ -6,7 +6,9 @@ REPO_DIR = FINDER_DIR.parent
DATA_DIR = Path(os.environ.get("DATA_DIR", str(FINDER_DIR / "data"))) DATA_DIR = Path(os.environ.get("DATA_DIR", str(FINDER_DIR / "data")))
ARCGIS_PATH = Path( ARCGIS_PATH = Path(
os.environ.get("ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet")) os.environ.get(
"ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet")
)
) )
PAGE_SIZE = 24 PAGE_SIZE = 24
DELAY_BETWEEN_PAGES = 0.3 DELAY_BETWEEN_PAGES = 0.3
@ -19,6 +21,19 @@ MAX_BEDROOMS = 20 # sanity cap — values above this are almost certainly parsi
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead" TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search" SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
RIGHTMOVE_BASE = "https://www.rightmove.co.uk" RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
# Detail page (plain HTTPS GET, no Cloudflare). Its window.__PAGE_MODEL embeds
# propertyData.address.{outcode,incode}, which together form the property's TRUE
# full postcode — the search API only exposes the outcode. {id} is the numeric
# listing id from the search response.
RIGHTMOVE_DETAIL_URL = "https://www.rightmove.co.uk/properties/{id}"
# The Rightmove search API gives only an outcode-level display address, so the
# true full postcode is recovered from each listing's detail page (see
# finder/rightmove.py::parse_detail_postcode). One extra GET per listing is a
# big increase in request volume over the ~1000-result-per-outcode search, so
# detail fetching is gated and capped per outcode (mirrors ZOOPLA_* below).
# Default ON.
RIGHTMOVE_FETCH_DETAILS = True # fetch detail pages for true per-listing postcodes
RIGHTMOVE_MAX_DETAILS_PER_OUTCODE = 4000 # max detail-page fetches per outcode
# OnTheMarket # OnTheMarket
ONTHEMARKET_BASE = "https://www.onthemarket.com" ONTHEMARKET_BASE = "https://www.onthemarket.com"
@ -26,6 +41,41 @@ ONTHEMARKET_BASE = "https://www.onthemarket.com"
# Zoopla # Zoopla
ZOOPLA_BASE = "https://www.zoopla.co.uk" ZOOPLA_BASE = "https://www.zoopla.co.uk"
# Zoopla search cards only carry an outcode-level address, so the full postcode
# and precise coordinates are scraped from each listing's detail page. These
# bound that extra work (see finder/zoopla.py and finder/scraper.py).
ZOOPLA_FETCH_DETAILS = True # fetch detail pages for precise per-listing postcodes
ZOOPLA_MAX_DETAILS_PER_OUTCODE = 4000 # max detail-page fetches per outcode
ZOOPLA_DETAIL_GOTO_TIMEOUT_MS = 1500000 # per detail-page navigation timeout
# Fraction of a single outcode's wall-clock budget (ZOOPLA_OUTCODE_TIMEOUT_SECONDS)
# spent fetching details; the remainder is reserved for search pagination so
# detail fetches can never trip the timeout and discard collected listings.
ZOOPLA_DETAIL_BUDGET_FRACTION = 0.6
# Gluetun VPN. Network endpoints are env-overridable because they are
# deployment-specific: when finder runs in a SEPARATE container they use the
# `gluetun` hostname (defaults below); when finder SHARES gluetun's network
# namespace (docker-compose.yml, network_mode container:media_gluetun) they
# become localhost and GLUETUN_PROXY is empty (the shared netns already tunnels
# all traffic, so no HTTP proxy is needed).
# GLUETUN_PROXY="" (empty) => direct connection (no proxy); used in shared-netns.
# An explicitly empty GLUETUN_PROXY collapses to None, i.e. a direct connection.
GLUETUN_PROXY = os.environ.get("GLUETUN_PROXY", "http://gluetun:8888") or None
GLUETUN_CONTROL_URL = os.environ.get("GLUETUN_CONTROL_URL", "http://gluetun:8000")
# SECURITY: the key is deployment-specific like the endpoints above, so it is
# read from the environment. The literal default is kept only so existing
# deployments keep working; it is committed to source control and should be
# treated as compromised — rotate it and supply GLUETUN_API_KEY via the
# environment instead.
GLUETUN_API_KEY = os.environ.get(
    "GLUETUN_API_KEY", "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K"
)
# Egress-IP rotations to try per Cloudflare challenge. Keep at 0 for Zoopla:
# rotating among Gluetun's datacenter IPs doesn't clear Cloudflare and would
# rotate away from the IP a cleared Cloudflare session was bound to, voiding it.
# Raise only with residential IPs where rotation helps.
GLUETUN_MAX_ROTATIONS = 0 # max egress-IP rotations per Cloudflare challenge
# Zoopla fetcher: "flaresolverr" (default) solves Cloudflare via the FlareSolverr
# sidecar (docker-compose.yml) and needs no display/VNC — verified to return the
# RSC flight stream with postcode + coordinates; "camoufox" drives a local
# anti-fingerprint browser (needs an interactive solve on datacenter IPs).
ZOOPLA_FETCHER = os.environ.get("ZOOPLA_FETCHER", "flaresolverr")
FLARESOLVERR_URL = os.environ.get("FLARESOLVERR_URL", "http://gluetun:8191/v1")
FLARESOLVERR_MAX_TIMEOUT_MS = 120000 # per-request solve budget; first solve is slow
# Greater London-ish postcode areas. This intentionally uses broad area # Greater London-ish postcode areas. This intentionally uses broad area
# prefixes so a manual scrape can include central/inner London plus common # prefixes so a manual scrape can include central/inner London plus common
# outer-London and near-London outcodes without maintaining a long borough list. # outer-London and near-London outcodes without maintaining a long borough list.

57
finder/docker-compose.yml Normal file
View file

@ -0,0 +1,57 @@
# Finder scraper + FlareSolverr, both sharing the EXISTING media_gluetun VPN
# container's network namespace. Everything egresses through the VPN, and
# FlareSolverr solves Zoopla's Cloudflare automatically (no VNC needed).
#
# Prerequisites:
# - The `media_gluetun` container (qmcgaw/gluetun) is running on this host.
# It is managed by a different compose; it is referenced here as external
# via network_mode "container:media_gluetun".
# - Because these services share gluetun's netns, they reach each other and
# gluetun on localhost (flaresolverr :8191, gluetun control :8000) and need
# NO published ports (which is exactly why this avoids the dev-container
# port-forwarding pain).
#
# Usage:
# cd finder
# docker compose up -d --build flaresolverr finder # start the sidecars
# docker compose exec finder uv run python main.py --source zoopla --outcodes SW9 --test
# docker compose exec finder uv run python main.py --source all # full run
# docker compose down
#
# NOTE: a manually-started `finder_flaresolverr` container from testing must be
# removed first (`docker rm -f finder_flaresolverr`) to avoid a name clash.
services:
flaresolverr:
image: ghcr.io/flaresolverr/flaresolverr:latest
container_name: finder_flaresolverr
network_mode: "container:media_gluetun"
environment:
LOG_LEVEL: info
TZ: Europe/London
restart: unless-stopped
finder:
build:
context: .
dockerfile: Dockerfile
image: finder-scraper:latest
container_name: finder_scraper
network_mode: "container:media_gluetun"
depends_on:
- flaresolverr
volumes:
- .:/app/finder # live-mounted finder source
- ../property-data:/app/property-data:ro # ARCGIS postcode data
working_dir: /app/finder
environment:
# Shared netns: sidecars are on localhost, and the netns already tunnels
# all traffic through the VPN, so no HTTP proxy is used.
ZOOPLA_FETCHER: flaresolverr
FLARESOLVERR_URL: http://localhost:8191/v1
GLUETUN_CONTROL_URL: http://localhost:8000
GLUETUN_PROXY: "" # empty => direct (shared netns already tunnels)
DATA_DIR: /app/finder/data
ARCGIS_PATH: /app/property-data/arcgis_data.parquet
restart: "no"
command: ["sleep", "infinity"] # stays up; run scrapes via `docker compose exec`

91
finder/flaresolverr.py Normal file
View file

@ -0,0 +1,91 @@
"""FlareSolverr client — fetch Cloudflare-protected pages as rendered HTML.
FlareSolverr (https://github.com/FlareSolverr/FlareSolverr) drives an
undetected browser to pass Cloudflare's challenge and returns the fully
rendered HTML. It runs as a sidecar service (see docker-compose.yml) sharing
the Gluetun VPN network namespace, so its browser egresses through the VPN.
Verified working against Zoopla's managed Turnstile on a datacenter VPN IP,
provided a reused session and a generous maxTimeout (~120s) are used: the first
challenge solve is slow, subsequent requests on the warm session are fast.
"""
import logging
import httpx
from constants import FLARESOLVERR_MAX_TIMEOUT_MS, FLARESOLVERR_URL
log = logging.getLogger("flaresolverr")
class FlareSolverrError(Exception):
    """Signals that FlareSolverr was unable to fetch or solve a URL."""
class FlareSolverrSession:
    """A reusable FlareSolverr browser session (context manager).

    Reusing one session keeps the cleared Cloudflare cookies warm across
    requests, so only the first fetch pays the full challenge-solve cost.

    The underlying HTTP client is created in ``__init__`` and closed when the
    ``with`` block exits (or when entering it fails), so an instance is
    single-use.
    """

    def __init__(
        self,
        url: str = FLARESOLVERR_URL,
        session: str = "finder",
        max_timeout_ms: int = FLARESOLVERR_MAX_TIMEOUT_MS,
    ) -> None:
        """Store the endpoint/session settings and open the HTTP client.

        Args:
            url: FlareSolverr API endpoint that commands are POSTed to.
            session: FlareSolverr session name to create and reuse.
            max_timeout_ms: ``maxTimeout`` sent with each ``request.get``.
        """
        self._url = url
        self._session = session
        self._max_timeout = max_timeout_ms
        # Read timeout must comfortably exceed maxTimeout (FlareSolverr blocks
        # for up to maxTimeout while solving before responding).
        self._client = httpx.Client(
            timeout=httpx.Timeout(self._max_timeout / 1000 + 30)
        )
        self._active = False

    def _post(self, payload: dict) -> dict:
        """POST one command and return the decoded JSON object.

        Raises:
            FlareSolverrError: on transport/HTTP errors, an undecodable body,
                a body that is not a JSON object, or a status other than "ok".
        """
        try:
            resp = self._client.post(self._url, json=payload)
            resp.raise_for_status()
            data = resp.json()
        except (httpx.HTTPError, ValueError) as exc:
            raise FlareSolverrError(
                f"FlareSolverr request to {self._url} failed: {exc}"
            ) from exc
        # A non-object body (list, string, null) has no .get(); report it as a
        # FlareSolverr failure instead of leaking an AttributeError.
        if not isinstance(data, dict):
            raise FlareSolverrError(
                f"FlareSolverr {payload.get('cmd')} failed: "
                f"unexpected response {data!r}"
            )
        if data.get("status") != "ok":
            raise FlareSolverrError(
                f"FlareSolverr {payload.get('cmd')} failed: {data.get('message')}"
            )
        return data

    def __enter__(self) -> "FlareSolverrSession":
        """Recreate the named session from scratch and return ``self``."""
        # Start from a clean session (ignore destroy errors for a fresh name).
        try:
            self._post({"cmd": "sessions.destroy", "session": self._session})
        except FlareSolverrError as exc:
            log.debug("FlareSolverr pre-create destroy ignored: %s", exc)
        try:
            self._post({"cmd": "sessions.create", "session": self._session})
        except BaseException:
            # __exit__ is never called when __enter__ raises, so release the
            # HTTP client here rather than leaking its connection pool.
            self._client.close()
            raise
        self._active = True
        log.info("FlareSolverr session %r ready at %s", self._session, self._url)
        return self

    def get(self, url: str) -> str:
        """Fetch a URL through FlareSolverr; return the solved HTML.

        Returns an empty string when the reply carries no solution/response.
        Raises FlareSolverrError when the request itself fails.
        """
        data = self._post(
            {
                "cmd": "request.get",
                "session": self._session,
                "url": url,
                "maxTimeout": self._max_timeout,
            }
        )
        solution = data.get("solution") or {}
        return solution.get("response", "") or ""

    def __exit__(self, *exc_info) -> None:
        """Destroy the remote session (best effort) and close the client."""
        try:
            if self._active:
                # Clear the flag first so a repeated exit doesn't try to talk
                # through the already-closed client.
                self._active = False
                try:
                    self._post(
                        {"cmd": "sessions.destroy", "session": self._session}
                    )
                except FlareSolverrError as exc:
                    log.debug("FlareSolverr session destroy failed: %s", exc)
        finally:
            # Always release the connection pool, whatever happened above.
            self._client.close()

View file

@ -0,0 +1,53 @@
# GDAL with ECW (read) support, for decoding Environment Agency Vertical Aerial
# Photography in the satellite-highres pipeline (pipeline/download/satellite_highres.py).
#
# EA VAP ships as ECW **v2** rasters, which are readable by the open-source
# libecwj2 3.3 SDK -- the same library the official OSGeo image uses when built
# with WITH_ECW=yes. We therefore avoid the proprietary, login-gated Hexagon
# ERDAS ECW/JP2 SDK (which is only needed for ECW v3) and its licensing
# restrictions entirely.
#
# We build only the ECW driver as a GDAL *plugin* on top of the official runtime
# image (no full GDAL rebuild). The plugin's GDAL sources are pinned to the exact
# commit reported by the base image so libgdal and the plugin stay ABI-compatible.
#
# Build: docker build -t perfect-postcode/gdal-ecw:latest docker/gdal-ecw
# Verify: docker run --rm perfect-postcode/gdal-ecw:latest gdalinfo --formats | grep -i ECW
FROM ghcr.io/osgeo/gdal:ubuntu-full-latest
ARG LIBECWJ2_URL=https://github.com/rouault/libecwj2-3.3-builds/releases/download/v1/install-libecwj2-3.3-ubuntu-20.04.tar.gz
RUN apt-get update && apt-get install -y --no-install-recommends \
cmake g++ make git curl ca-certificates \
&& rm -rf /var/lib/apt/lists/*
# Open-source ECW v2 SDK (extracts to /opt/libecwj2-3.3) + make its libs loadable.
RUN curl --retry 3 --retry-all-errors --retry-delay 3 -fsSL -o /tmp/libecwj2.tar.gz "$LIBECWJ2_URL" \
&& tar -C / -xzf /tmp/libecwj2.tar.gz \
&& rm -f /tmp/libecwj2.tar.gz \
&& (cd /opt/libecwj2-3.3/lib && for so in *.so*; do \
ln -sf "/opt/libecwj2-3.3/lib/$so" "/usr/lib/x86_64-linux-gnu/$so"; \
done) \
&& ldconfig
# Build the ECW driver plugin against the base image's exact GDAL sources.
RUN set -eux; \
GDAL_COMMIT="$(gdalinfo --version | sed -nE 's/.*-([0-9a-f]{8,}).*/\1/p')"; \
test -n "$GDAL_COMMIT"; \
echo "Building ECW plugin for GDAL commit ${GDAL_COMMIT}"; \
mkdir -p /tmp/gdal && cd /tmp/gdal && git init -q; \
git fetch --depth 1 -q https://github.com/OSGeo/gdal.git "$GDAL_COMMIT"; \
git checkout -q FETCH_HEAD; \
cmake -S frmts/ecw -B /tmp/ecw-build \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_PREFIX_PATH=/usr \
-DECW_ROOT=/opt/libecwj2-3.3; \
cmake --build /tmp/ecw-build -j"$(nproc)"; \
PLUGIN_DIR=/usr/lib/x86_64-linux-gnu/gdalplugins; \
mkdir -p "$PLUGIN_DIR"; \
find /tmp/ecw-build -name 'gdal_ECW*.so' -exec cp {} "$PLUGIN_DIR/" \; ; \
rm -rf /tmp/gdal /tmp/ecw-build
# Fail the build if the driver is not actually available.
RUN gdalinfo --formats | grep -iq 'ECW.*rw' && echo "ECW driver OK"

View file

@ -5,7 +5,7 @@ import time
import httpx import httpx
from fake_useragent import UserAgent from fake_useragent import UserAgent
from constants import MAX_RETRIES, RETRY_BASE_DELAY from constants import GLUETUN_PROXY, MAX_RETRIES, RETRY_BASE_DELAY
log = logging.getLogger("rightmove") log = logging.getLogger("rightmove")
@ -15,10 +15,12 @@ _ua = UserAgent(
def make_client() -> httpx.Client: def make_client() -> httpx.Client:
# Route through the Gluetun HTTP proxy (VPN egress) when configured.
return httpx.Client( return httpx.Client(
timeout=30, timeout=30,
headers={"User-Agent": _ua.random, "Accept": "application/json"}, headers={"User-Agent": _ua.random, "Accept": "application/json"},
follow_redirects=True, follow_redirects=True,
proxy=GLUETUN_PROXY or None,
) )

View file

@ -57,6 +57,16 @@ def parse_args() -> argparse.Namespace:
default=DATA_DIR, default=DATA_DIR,
help=f"Directory for parquet output. Defaults to {DATA_DIR}.", help=f"Directory for parquet output. Defaults to {DATA_DIR}.",
) )
parser.add_argument(
"--outcodes",
type=str,
default=None,
help=(
"Comma-separated outcodes to scrape (e.g. 'SW9' or 'SW9,E14,BR1') "
"instead of the full Greater London set. Must fall within the "
"London-ish areas; takes precedence over --test/--limit-outcodes."
),
)
parser.add_argument( parser.add_argument(
"--limit-outcodes", "--limit-outcodes",
type=int, type=int,
@ -116,17 +126,32 @@ def main() -> int:
from scraper import ( from scraper import (
build_postcode_coords, build_postcode_coords,
build_postcode_index, build_postcode_index,
filter_londonish_outcodes,
load_outcodes, load_outcodes,
run_scrape, run_scrape,
) )
outcodes = load_outcodes() if args.outcodes is not None:
if args.test and args.limit_outcodes is None: requested = [code.strip().upper() for code in args.outcodes.split(",") if code.strip()]
preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)] if not requested:
if preferred: raise SystemExit("--outcodes was empty")
outcodes = preferred outcodes = filter_londonish_outcodes(requested)
if args.limit_outcodes is not None: dropped = sorted(set(requested) - set(outcodes))
outcodes = outcodes[: args.limit_outcodes] if dropped:
log.warning("Ignoring outcodes outside the Greater London-ish areas: %s", ", ".join(dropped))
if not outcodes:
raise SystemExit(
"None of the requested outcodes are within the Greater London-ish areas "
f"({', '.join(requested)})."
)
else:
outcodes = load_outcodes()
if args.test and args.limit_outcodes is None:
preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)]
if preferred:
outcodes = preferred
if args.limit_outcodes is not None:
outcodes = outcodes[: args.limit_outcodes]
if not outcodes: if not outcodes:
raise SystemExit("No Greater London-ish outcodes loaded; nothing to scrape.") raise SystemExit("No Greater London-ish outcodes loaded; nothing to scrape.")

View file

@ -10,6 +10,30 @@ Each rendered page contains 30 listings under
`humanised-property-type`, `features` (a list where the first element is `humanised-property-type`, `features` (a list where the first element is
typically `"Tenure: <value>"`), and `details-url`. Pagination is via typically `"Tenure: <value>"`), and `details-url`. Pagination is via
`?page=N`; the loop terminates when `paginationControls.next` is null. `?page=N`; the loop terminates when `paginationControls.next` is null.
Postcodes
---------
The search card exposes only an *outcode*-level address (e.g. "Padfield Road,
London, SE5") and a map pin, so the old behaviour derived the postcode from the
nearest postcode to that pin — a guess that frequently lands on a neighbouring
unit (the pin can sit on the wrong side of a street boundary).
Each *detail* page (`/details/{id}/`) is a plain HTTPS GET whose `__NEXT_DATA__`
embeds the property's analytics dataLayer at
`props.initialReduxState.metadata.dataLayer`, which carries the property's own
`postcode` (full unit postcode, e.g. "SE5 9AA") keyed to this listing by
`property-id`. Crucially this is NOT the agent's office postcode — that lives
separately at `property.agent.postcode` ("SE5 8RS" for the same listing) and
is the classic trap when blindly scanning the page for a postcode. We read the
dataLayer postcode, verify `property-id` matches the listing, and accept it only
when its outcode agrees with the coordinate-nearest postcode (via
``resolve_listing_postcode``) — exactly the trust rule the other scrapers use.
Measured over a sample of real listings this yields a trustworthy, usually
exact-unit postcode for ~11/12 listings; the rest safely fall back to the
coordinate-nearest postcode.
Detail fetching costs one extra HTTPS GET per listing, so it is gated behind
``OTM_FETCH_DETAILS`` and capped at ``OTM_MAX_DETAILS_PER_OUTCODE`` per outcode.
""" """
import json import json
@ -31,14 +55,26 @@ from spatial import PostcodeSpatialIndex
from transform import ( from transform import (
clean_listing_address, clean_listing_address,
extract_full_postcode, extract_full_postcode,
extract_outcode,
fix_coords, fix_coords,
map_property_type, map_property_type,
normalize_sub_type, normalize_sub_type,
parse_display_size, parse_display_size,
resolve_listing_postcode,
) )
log = logging.getLogger("rightmove") log = logging.getLogger("rightmove")
# Detail-page postcode recovery (see module docstring). When enabled, each
# listing's detail page is fetched so its analytics dataLayer postcode — the
# property's own full unit postcode — can replace the coordinate-nearest guess.
# Bounded per outcode so a large outcode can't balloon into unbounded extra
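# HTTPS GETs. Intended to be at parity with the Rightmove/Zoopla detail caps so
# a typical outcode's listings all get their real postcode rather than a
# coordinate-nearest guess. NOTE(review): RIGHTMOVE_MAX_DETAILS_PER_OUTCODE and
# ZOOPLA_MAX_DETAILS_PER_OUTCODE are set to 4000 in constants while this cap is
# 400 — confirm which value is intended and bring the three back in sync.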
OTM_FETCH_DETAILS = True
OTM_MAX_DETAILS_PER_OUTCODE = 400
_NEXT_DATA_RE = re.compile( _NEXT_DATA_RE = re.compile(
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>', r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
re.DOTALL, re.DOTALL,
@ -51,6 +87,11 @@ _HTML_HEADERS = {
"Accept-Language": "en-GB,en;q=0.9", "Accept-Language": "en-GB,en;q=0.9",
} }
# listingId -> recovered full postcode (or None). Failures are cached too so a
# broken or postcode-less detail page is not re-fetched within a run (the same
# listing can reappear across overlapping outcode searches).
_detail_postcode_cache: dict[str, str | None] = {}
def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict | None: def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict | None:
"""GET one search-results page and return the embedded __NEXT_DATA__ JSON. """GET one search-results page and return the embedded __NEXT_DATA__ JSON.
@ -119,6 +160,116 @@ def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict
return None return None
def parse_detail_postcode(html: str, listing_id: str | None = None) -> str | None:
"""Extract the property's own full postcode from an OnTheMarket detail page.
Pure and network-free so it is unit-testable: callers pass `page.content()`
/ the GET body and this does the parsing.
The postcode lives in the analytics dataLayer embedded in `__NEXT_DATA__` at
``props.initialReduxState.metadata.dataLayer.postcode`` and is the
property's own unit postcode (e.g. "SE5 9AA"). It is deliberately NOT the
agent's office postcode, which sits separately at
``property.agent.postcode`` the trap when scanning a detail page for "a"
postcode. When ``listing_id`` is given, the dataLayer's ``property-id`` must
match it, guaranteeing we read this listing's postcode and not a stray one.
Returns a normalized full postcode (e.g. "SE5 9AA") or ``None`` when the
page has no usable property postcode. Trust (outcode-vs-coordinates
agreement) is enforced later in ``transform_property``.
"""
if not html:
return None
match = _NEXT_DATA_RE.search(html)
if not match:
return None
try:
data = json.loads(match.group(1))
except json.JSONDecodeError:
return None
try:
data_layer = data["props"]["initialReduxState"]["metadata"]["dataLayer"]
except (KeyError, TypeError):
return None
if not isinstance(data_layer, dict):
return None
# Guard against reading a different listing's postcode: the dataLayer is the
# property's own analytics payload, so its property-id must match.
if listing_id is not None:
page_id = data_layer.get("property-id")
if page_id is not None and str(page_id) != str(listing_id):
return None
raw_postcode = data_layer.get("postcode")
if not isinstance(raw_postcode, str):
return None
return extract_full_postcode(raw_postcode)
def _fetch_detail_postcode(
    client: httpx.Client, details_url: str, listing_id: str
) -> str | None:
    """GET one listing's detail page and return its dataLayer postcode (or None).

    Results (including failures) are cached by listing id so a listing that
    reappears across overlapping outcode searches is fetched at most once.
    A relative ``details_url`` is resolved against ``ONTHEMARKET_BASE``; an
    empty one skips the request entirely.

    Transport-level failures (connect/read/write/pool timeouts, connection and
    protocol errors) and HTTP 429/5xx responses are retried with exponential
    backoff up to ``MAX_RETRIES`` attempts. Any other httpx request error or
    HTTP status ends the attempt immediately. In every failure case the result
    is ``None`` so the caller falls back to the coordinate-nearest postcode —
    no httpx exception escapes this function.
    """
    if listing_id in _detail_postcode_cache:
        return _detail_postcode_cache[listing_id]

    if details_url and not details_url.startswith("http"):
        full_url = ONTHEMARKET_BASE + details_url
    else:
        full_url = details_url

    result: str | None = None
    if full_url:
        for attempt in range(MAX_RETRIES):
            try:
                resp = client.get(
                    full_url, headers=_HTML_HEADERS, follow_redirects=True
                )
            except httpx.TransportError as exc:
                # Covers every timeout flavour (incl. ConnectTimeout) plus
                # network/protocol/proxy errors — all plausibly transient.
                failure = type(exc).__name__
            except httpx.HTTPError as exc:
                # Non-transport request errors (e.g. too many redirects) won't
                # be cured by retrying; degrade to "no postcode".
                log.debug(
                    "OnTheMarket detail %s failed with %s (no postcode)",
                    listing_id, type(exc).__name__,
                )
                break
            else:
                if resp.status_code == 200:
                    result = parse_detail_postcode(resp.text, listing_id)
                    break
                if resp.status_code not in (429, 500, 502, 503, 504):
                    log.debug(
                        "OnTheMarket detail %s returned HTTP %d (no postcode)",
                        listing_id, resp.status_code,
                    )
                    break
                failure = f"HTTP {resp.status_code}"

            # Retryable failure: back off, unless this was the last attempt —
            # sleeping then would only delay the fallback.
            if attempt + 1 >= MAX_RETRIES:
                log.warning(
                    "%s from %s, giving up after %d attempts",
                    failure, full_url, MAX_RETRIES,
                )
                break
            delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
            log.warning(
                "%s from %s, retry %d/%d in %.1fs",
                failure, full_url, attempt + 1, MAX_RETRIES, delay,
            )
            time.sleep(delay)

    _detail_postcode_cache[listing_id] = result
    return result
def _parse_price(price_value) -> int: def _parse_price(price_value) -> int:
"""Parse a formatted price string like '£450,000' into an integer. """Parse a formatted price string like '£450,000' into an integer.
Returns 0 for POA/auction/null values.""" Returns 0 for POA/auction/null values."""
@ -166,9 +317,19 @@ def _extract_floor_area(features: list) -> float | None:
def transform_property( def transform_property(
raw: dict, pc_index: PostcodeSpatialIndex raw: dict,
pc_index: PostcodeSpatialIndex,
detail_postcode: str | None = None,
) -> dict | None: ) -> dict | None:
"""Transform a raw OnTheMarket listing dict into our output schema.""" """Transform a raw OnTheMarket listing dict into our output schema.
``detail_postcode`` is the property's own full postcode recovered from its
detail page (see ``parse_detail_postcode`` / ``_fetch_detail_postcode``),
or ``None`` when no detail fetch was done / no postcode was found. When
present and trustworthy (its outcode agrees with the coordinate-nearest
postcode) it supersedes the coordinate guess and is labelled
``"detail_address"``.
"""
loc = raw.get("location") or {} loc = raw.get("location") or {}
raw_lat = loc.get("lat") raw_lat = loc.get("lat")
raw_lng = loc.get("lon") raw_lng = loc.get("lon")
@ -184,8 +345,29 @@ def transform_property(
return None return None
raw_address = raw.get("address", "") or "" raw_address = raw.get("address", "") or ""
extracted_postcode = extract_full_postcode(raw_address) extracted_postcode = extract_full_postcode(raw_address)
postcode = extracted_postcode or inferred_postcode
postcode_source = "address" if extracted_postcode else "coordinates" # Prefer the property's own detail-page postcode when we have one and it is
# trustworthy. The detail postcode is a full unit postcode (better than the
# coordinate-nearest guess and than the usually outcode-only card address),
# but a stale/mislabelled value would silently override the spatially
# correct one, so apply the same outcode-agreement trust rule the address
# postcode uses: keep it only when its outcode matches the
# coordinate-nearest postcode's outcode.
detail_postcode = extract_full_postcode(detail_postcode)
if detail_postcode and extract_outcode(detail_postcode) == extract_outcode(
inferred_postcode
):
postcode, postcode_source = detail_postcode, "detail_address"
else:
if detail_postcode:
log.debug(
"OnTheMarket %s: rejecting detail postcode %s "
"(outcode mismatch with inferred %s)",
raw.get("id", "?"), detail_postcode, inferred_postcode,
)
postcode, postcode_source = resolve_listing_postcode(
extracted_postcode, inferred_postcode
)
raw_beds = raw.get("bedrooms") or 0 raw_beds = raw.get("bedrooms") or 0
raw_baths = raw.get("bathrooms") or 0 raw_baths = raw.get("bathrooms") or 0
@ -223,6 +405,10 @@ def transform_property(
"Inferred postcode": inferred_postcode, "Inferred postcode": inferred_postcode,
"Listing raw address": raw_address, "Listing raw address": raw_address,
"Address per Property Register": clean_listing_address(raw_address), "Address per Property Register": clean_listing_address(raw_address),
# OnTheMarket search JSON exposes only a street-level address; no UPRN
# or house number/name is available without a detail-page fetch.
"UPRN": None,
"Property number or name": None,
"Leasehold/Freehold": _extract_tenure(features), "Leasehold/Freehold": _extract_tenure(features),
"Property type": map_property_type(sub_type), "Property type": map_property_type(sub_type),
"Property sub-type": normalize_sub_type(sub_type), "Property sub-type": normalize_sub_type(sub_type),
@ -242,10 +428,17 @@ def search_outcode(
pc_index: PostcodeSpatialIndex, pc_index: PostcodeSpatialIndex,
max_properties: int | None = None, max_properties: int | None = None,
) -> list[dict]: ) -> list[dict]:
"""Paginate through OnTheMarket sale results for one outcode.""" """Paginate through OnTheMarket sale results for one outcode.
When ``OTM_FETCH_DETAILS`` is enabled, up to
``OTM_MAX_DETAILS_PER_OUTCODE`` listings per outcode have their detail page
fetched for the property's own postcode (see ``_fetch_detail_postcode``);
the rest fall back to the coordinate-nearest postcode.
"""
properties: list[dict] = [] properties: list[dict] = []
seen_ids: set[str] = set() seen_ids: set[str] = set()
page_num = 1 page_num = 1
details_fetched = 0
while True: while True:
data = _fetch_page_json(client, outcode, page_num) data = _fetch_page_json(client, outcode, page_num)
@ -269,8 +462,22 @@ def search_outcode(
if listing_id and listing_id in seen_ids: if listing_id and listing_id in seen_ids:
continue continue
seen_ids.add(listing_id) seen_ids.add(listing_id)
detail_postcode = None
if OTM_FETCH_DETAILS and listing_id:
# Cached lookups are free; only fresh GETs count toward the cap
# and incur the inter-request delay.
cached = listing_id in _detail_postcode_cache
if cached or details_fetched < OTM_MAX_DETAILS_PER_OUTCODE:
detail_postcode = _fetch_detail_postcode(
client, raw.get("details-url") or "", listing_id
)
if not cached:
details_fetched += 1
time.sleep(DELAY_BETWEEN_PAGES)
try: try:
transformed = transform_property(raw, pc_index) transformed = transform_property(raw, pc_index, detail_postcode)
except Exception as exc: except Exception as exc:
log.warning( log.warning(
"OnTheMarket %s property %s failed to transform: %s", "OnTheMarket %s property %s failed to transform: %s",

View file

@ -1,4 +1,6 @@
import json
import logging import logging
import re
import time import time
import httpx import httpx
@ -6,12 +8,15 @@ import httpx
from constants import ( from constants import (
PAGE_SIZE, PAGE_SIZE,
DELAY_BETWEEN_PAGES, DELAY_BETWEEN_PAGES,
RIGHTMOVE_DETAIL_URL,
RIGHTMOVE_FETCH_DETAILS,
RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
SEARCH_URL, SEARCH_URL,
TYPEAHEAD_URL, TYPEAHEAD_URL,
) )
from http_client import fetch_with_retry from http_client import fetch_with_retry
from spatial import PostcodeSpatialIndex from spatial import PostcodeSpatialIndex
from transform import transform_property from transform import extract_full_postcode, normalize_postcode, transform_property
log = logging.getLogger("rightmove") log = logging.getLogger("rightmove")
@ -23,6 +28,176 @@ outcode_cache: dict[str, str] = {}
_MAX_INDEX = 1008 _MAX_INDEX = 1008
# ---------------------------------------------------------------------------
# Detail-page postcode extraction
# ---------------------------------------------------------------------------
#
# The search API (_paginate) only returns an outcode-level `displayAddress`
# (e.g. "Akerman Road, Brixton, London, SW9") — never the full postcode. Each
# listing's detail page, however, embeds the property's OWN full postcode in a
# `window.__PAGE_MODEL` script as `propertyData.address.{outcode, incode}`
# (e.g. outcode "SW9" + incode "0HD" → "SW9 0HD"), independently corroborated by
# `propertyData.propertyUrls.similarPropertiesUrl` ("/property-for-sale/SW9-0HD.html").
# This is the property's own postcode, NOT a nearest station/school: the
# `nearestStations`/`nearestAirports` arrays carry only names + distances, no
# postcodes, and the address outcode always matches the searched outcode.
# Recon over 24 live listings across SW9/E1/M1/LS6/E20 (incl. APPROXIMATE_POINT
# new-builds) found the full postcode present 100% of the time. There is no
# UPRN or house-number field anywhere in propertyData, so those stay None.
#
# __PAGE_MODEL is a "devalue"-style flattened object graph: its `data` field is
# a JSON STRING holding a flat array where every integer inside a container is
# an index reference into that same array (so the graph can dedupe). We
# brace-match the (large, deeply-nested) object literal — a non-greedy regex
# cannot — then rehydrate the reference graph before reading the address.
_PAGE_MODEL_RE = re.compile(r"window\.__PAGE_MODEL\s*=\s*")
def _extract_page_model_literal(html: str) -> str | None:
"""Return the `{...}` object literal assigned to window.__PAGE_MODEL.
Brace-matches with string/escape awareness so embedded braces and quotes in
string values don't end the match early. Returns None when absent."""
marker = _PAGE_MODEL_RE.search(html)
if not marker:
return None
start = marker.end()
if start >= len(html) or html[start] != "{":
return None
depth = 0
in_str = False
esc = False
for j in range(start, len(html)):
ch = html[j]
if in_str:
if esc:
esc = False
elif ch == "\\":
esc = True
elif ch == '"':
in_str = False
elif ch == '"':
in_str = True
elif ch == "{":
depth += 1
elif ch == "}":
depth -= 1
if depth == 0:
return html[start : j + 1]
return None
def _rehydrate(flat: list) -> object:
"""Resolve a devalue-style flattened reference array into a nested object.
Index 0 is the root; every int inside a dict/list is an index back into
``flat``. Memoised so shared/cyclic references resolve once."""
cache: dict[int, object] = {}
def resolve(idx: int) -> object:
if not isinstance(idx, int) or idx < 0 or idx >= len(flat):
return None
if idx in cache:
return cache[idx]
node = flat[idx]
if isinstance(node, dict):
out: dict = {}
cache[idx] = out
for key, value in node.items():
out[key] = resolve(value) if isinstance(value, int) else value
return out
if isinstance(node, list):
arr: list = []
cache[idx] = arr
for value in node:
arr.append(resolve(value) if isinstance(value, int) else value)
return arr
cache[idx] = node
return node
return resolve(0)
def parse_detail_postcode(html: str) -> str | None:
    """Extract a Rightmove property's full postcode from its detail-page HTML.

    Pure and network-free so it is unit-testable: callers pass the page HTML.
    Reads ``propertyData.address.outcode`` and ``.incode`` from the rehydrated
    ``window.__PAGE_MODEL`` and joins them as ``"<outcode> <incode>"``.

    Returns:
        Whatever ``extract_full_postcode(normalize_postcode(...))`` yields for
        the joined pair (e.g. "SW9 0HD"), or None when the page model is
        absent or undecodable, or when the address, outcode or incode is
        missing, non-string or blank. On None the caller keeps its
        coordinate fallback.
    """
    if not html:
        return None
    literal = _extract_page_model_literal(html)
    if not literal:
        return None
    try:
        # The outer literal is JSON whose "data" field is itself a JSON string.
        flat = json.loads(json.loads(literal)["data"])
    except (ValueError, KeyError, TypeError):
        return None
    if not (isinstance(flat, list) and flat):
        return None

    # Walk root -> propertyData -> address, bailing out on any non-dict hop.
    node = _rehydrate(flat)
    for key in ("propertyData", "address"):
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    if not isinstance(node, dict):
        return None

    parts = [node.get("outcode"), node.get("incode")]
    if not all(isinstance(part, str) for part in parts):
        return None
    outward, inward = (part.strip() for part in parts)
    if not (outward and inward):
        return None
    # Round-trip through the shared postcode helpers so spacing is
    # canonicalised and a pair that is not a well-formed postcode is rejected.
    return extract_full_postcode(normalize_postcode(f"{outward} {inward}"))
# listingId -> true full postcode (or None when unavailable). Failures are
# cached too, so a broken/duplicate listing is fetched at most once per run (the
# same listing can reappear across overlapping outcode searches).
_detail_postcode_cache: dict[str, str | None] = {}
def _fetch_detail_postcode(client: httpx.Client, property_id: str) -> str | None:
    """GET a listing's detail page and return its full postcode (or None).

    The outcome is stored in ``_detail_postcode_cache`` under the listing id,
    including None for failures, so each listing is requested at most once per
    run. A non-200 response or an ``httpx.HTTPError`` is logged at debug level
    and yields None, leaving the caller on its coordinate fallback.
    """
    if not property_id:
        return None
    try:
        return _detail_postcode_cache[property_id]
    except KeyError:
        pass  # Not fetched yet in this run.

    url = RIGHTMOVE_DETAIL_URL.format(id=property_id)
    result: str | None = None
    try:
        response = client.get(url, headers={"Accept": "text/html"})
        if response.status_code != 200:
            log.debug(
                "Rightmove detail %s returned HTTP %d", url, response.status_code
            )
        else:
            result = parse_detail_postcode(response.text)
    except httpx.HTTPError as exc:
        log.debug("Rightmove detail fetch failed %s: %s", url, exc)

    # Cache failures too, so a broken listing is not re-requested.
    _detail_postcode_cache[property_id] = result
    return result
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None: def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
"""Look up Rightmove's internal ID for an outcode via typeahead API.""" """Look up Rightmove's internal ID for an outcode via typeahead API."""
if outcode in outcode_cache: if outcode in outcode_cache:
@ -44,6 +219,31 @@ def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
return None return None
def _detail_postcode_for(
    client: httpx.Client,
    prop: dict,
    fetch_details: bool,
    detail_budget: dict,
) -> str | None:
    """Return a listing's detail-page postcode within the per-outcode budget.

    Args:
        client: HTTP client used for the detail-page GET.
        prop: Raw search-result listing; only its ``id`` is read.
        fetch_details: When false, nothing is looked up and None is returned.
        detail_budget: Mutable ``{"remaining": int}`` counter, decremented once
            per fresh fetch.

    Returns:
        The cached or freshly fetched postcode, or None when fetching is
        disabled, the listing has no id, or the budget is exhausted. Cached
        listings are served without spending budget or issuing a request.
    """
    listing_id = str(prop.get("id") or "") if fetch_details else ""
    if not listing_id:
        return None
    try:
        return _detail_postcode_cache[listing_id]
    except KeyError:
        pass  # Needs a fresh fetch, budget permitting.
    if detail_budget["remaining"] <= 0:
        return None

    detail_budget["remaining"] -= 1
    result = _fetch_detail_postcode(client, listing_id)
    # Pause after every real request, using the search-pagination delay.
    time.sleep(DELAY_BETWEEN_PAGES)
    return result
def _paginate( def _paginate(
client: httpx.Client, client: httpx.Client,
outcode_id: str, outcode_id: str,
@ -51,11 +251,19 @@ def _paginate(
channel_cfg: dict, channel_cfg: dict,
pc_index: PostcodeSpatialIndex, pc_index: PostcodeSpatialIndex,
max_properties: int | None = None, max_properties: int | None = None,
fetch_details: bool = False,
detail_cap: int = 0,
) -> tuple[list[dict], int]: ) -> tuple[list[dict], int]:
"""Paginate through search results. Returns (properties, result_count).""" """Paginate through search results. Returns (properties, result_count).
When ``fetch_details`` is set, up to ``detail_cap`` listings per outcode have
their detail page fetched for the property's TRUE full postcode (see
``parse_detail_postcode``); the rest fall back to coordinate-derived
postcodes."""
properties = [] properties = []
index = 0 index = 0
result_count = 0 result_count = 0
detail_budget = {"remaining": detail_cap}
while True: while True:
params = { params = {
@ -82,7 +290,12 @@ def _paginate(
for prop in raw_props: for prop in raw_props:
try: try:
transformed = transform_property(prop, outcode, pc_index) detail_postcode = _detail_postcode_for(
client, prop, fetch_details, detail_budget
)
transformed = transform_property(
prop, outcode, pc_index, detail_postcode=detail_postcode
)
except Exception as exc: except Exception as exc:
log.warning( log.warning(
"Rightmove %s/%s property %s failed to transform: %s", "Rightmove %s/%s property %s failed to transform: %s",
@ -127,7 +340,12 @@ def search_outcode(
pc_index: PostcodeSpatialIndex, pc_index: PostcodeSpatialIndex,
max_properties: int | None = None, max_properties: int | None = None,
) -> list[dict]: ) -> list[dict]:
"""Paginate through unfiltered sale results for one outcode+channel.""" """Paginate through unfiltered sale results for one outcode+channel.
Each listing's detail page is fetched for the property's TRUE full postcode
(gated by ``RIGHTMOVE_FETCH_DETAILS`` and capped per outcode by
``RIGHTMOVE_MAX_DETAILS_PER_OUTCODE``); listings beyond the cap keep the
coordinate-derived postcode."""
properties, _ = _paginate( properties, _ = _paginate(
client, client,
outcode_id, outcode_id,
@ -135,6 +353,8 @@ def search_outcode(
channel_cfg, channel_cfg,
pc_index, pc_index,
max_properties=max_properties, max_properties=max_properties,
fetch_details=RIGHTMOVE_FETCH_DETAILS,
detail_cap=RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
) )
if max_properties is not None and len(properties) >= max_properties: if max_properties is not None and len(properties) >= max_properties:

View file

@ -15,6 +15,10 @@ from constants import (
DATA_DIR, DATA_DIR,
DELAY_BETWEEN_OUTCODES, DELAY_BETWEEN_OUTCODES,
LONDON_OUTCODE_PREFIXES, LONDON_OUTCODE_PREFIXES,
ZOOPLA_DETAIL_BUDGET_FRACTION,
ZOOPLA_FETCH_DETAILS,
ZOOPLA_FETCHER,
ZOOPLA_MAX_DETAILS_PER_OUTCODE,
) )
from http_client import make_client from http_client import make_client
@ -371,6 +375,36 @@ def _zoopla_outcode_timeout_seconds() -> int:
return timeout return timeout
def _zoopla_detail_cap() -> int:
    """Return the per-outcode limit on detail-page fetches (0 = disabled).

    Zoopla search cards only expose an outcode-level address, so the full
    postcode/coordinates come from each listing's detail page. The cap bounds
    those extra page loads so an outcode stays within
    ZOOPLA_OUTCODE_TIMEOUT_SECONDS (the per-outcode SIGALRM budget covers the
    detail fetches too). Controlled by ZOOPLA_FETCH_DETAILS and
    ZOOPLA_MAX_DETAILS_PER_OUTCODE in constants.py.
    """
    if not ZOOPLA_FETCH_DETAILS:
        return 0
    return ZOOPLA_MAX_DETAILS_PER_OUTCODE
def _open_zoopla_detail_tab(page, detail_cap: int):
"""Open a second tab on the same context for detail-page fetches.
Sharing the persistent context means the detail tab inherits the search
tab's Cloudflare clearance cookies. Returns None when detail fetching is
disabled or the tab cannot be created (the scrape then degrades to
outcode-level postcodes rather than failing)."""
if detail_cap <= 0:
return None
try:
return page.context.new_page()
except Exception as exc:
log.warning(
"Zoopla detail tab unavailable (%s); using outcode-level postcodes",
_exception_detail(exc),
)
return None
@contextmanager @contextmanager
def _wall_clock_timeout(seconds: int, label: str): def _wall_clock_timeout(seconds: int, label: str):
"""SIGALRM-based wall-clock guard (POSIX). Raises OutcodeTimeout on expiry. """SIGALRM-based wall-clock guard (POSIX). Raises OutcodeTimeout on expiry.
@ -438,6 +472,50 @@ def _close_zoopla_browser(browser, label: str) -> None:
log.warning("%s browser force-close failed: %s", label, _exception_detail(exc)) log.warning("%s browser force-close failed: %s", label, _exception_detail(exc))
def _scrape_zoopla_flaresolverr(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    results: dict[str, list[dict]],
    errors: list[str],
    max_properties_per_source: int | None,
) -> None:
    """Scrape Zoopla via the FlareSolverr sidecar (no browser/VNC).

    Listings are appended to ``results`` under "zoopla" and failures to
    ``errors``. If the FlareSolverr session cannot be started, Zoopla is
    skipped with one recorded error. A failure on a single outcode is recorded
    and the loop moves on to the next outcode.
    """
    from flaresolverr import FlareSolverrError, FlareSolverrSession
    from zoopla_flaresolverr import search_outcode as fs_search_outcode

    # Entered by hand so that only a start-up failure is reported as
    # "FlareSolverr unavailable"; the matching __exit__ is in the finally below.
    try:
        session = FlareSolverrSession(session="zoopla")
        session.__enter__()
    except FlareSolverrError as exc:
        errors.append(f"zoopla: FlareSolverr unavailable: {exc}")
        log.warning("Zoopla skipped: FlareSolverr unavailable: %s", exc)
        return

    try:
        # Go through _zoopla_detail_cap() like the browser path does, so that
        # ZOOPLA_FETCH_DETAILS=False disables detail fetches here as well.
        detail_cap = _zoopla_detail_cap()
        for outcode in outcodes:
            remaining = _source_remaining(results, "zoopla", max_properties_per_source)
            if remaining == 0:
                log.info("Zoopla cap reached")
                return
            try:
                props, _ = fs_search_outcode(
                    outcode,
                    pc_index,
                    pc_coords,
                    session,
                    max_properties=remaining,
                    detail_cap=detail_cap,
                )
                added = _store_properties(results, "zoopla", props, max_properties_per_source)
                log.info("Zoopla %s: +%d", outcode, added)
            except Exception as exc:  # noqa: BLE001 - one outcode must not kill the run
                _record_error(errors, "zoopla", outcode, exc)
            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
        session.__exit__(None, None, None)
def _scrape_zoopla( def _scrape_zoopla(
outcodes: list[str], outcodes: list[str],
pc_index: PostcodeSpatialIndex, pc_index: PostcodeSpatialIndex,
@ -446,6 +524,12 @@ def _scrape_zoopla(
errors: list[str], errors: list[str],
max_properties_per_source: int | None, max_properties_per_source: int | None,
) -> None: ) -> None:
if ZOOPLA_FETCHER == "flaresolverr":
_scrape_zoopla_flaresolverr(
outcodes, pc_index, pc_coords, results, errors, max_properties_per_source
)
return
try: try:
browser, page = _launch_zoopla_with_retries() browser, page = _launch_zoopla_with_retries()
except Exception as exc: except Exception as exc:
@ -454,6 +538,12 @@ def _scrape_zoopla(
return return
outcode_timeout = _zoopla_outcode_timeout_seconds() outcode_timeout = _zoopla_outcode_timeout_seconds()
detail_cap = _zoopla_detail_cap()
detail_page = _open_zoopla_detail_tab(page, detail_cap)
# Spend at most a fraction of each outcode's budget on detail fetches so the
# SIGALRM guard never trips mid-outcode and discards already-collected
# search listings; the rest is left for search pagination and transform.
detail_budget_seconds = max(10.0, outcode_timeout * ZOOPLA_DETAIL_BUDGET_FRACTION)
try: try:
for outcode in outcodes: for outcode in outcodes:
@ -470,6 +560,9 @@ def _scrape_zoopla(
pc_index, pc_index,
pc_coords, pc_coords,
max_properties=None, max_properties=None,
detail_page=detail_page,
detail_cap=detail_cap,
detail_budget_seconds=detail_budget_seconds,
) )
added = _store_properties( added = _store_properties(
results, results,
@ -496,6 +589,8 @@ def _scrape_zoopla(
_close_zoopla_browser(browser, f"zoopla {outcode}") _close_zoopla_browser(browser, f"zoopla {outcode}")
try: try:
browser, page = _launch_zoopla_with_retries() browser, page = _launch_zoopla_with_retries()
# The old context (and its detail tab) is gone; reopen one.
detail_page = _open_zoopla_detail_tab(page, detail_cap)
log.info("Zoopla %s retrying with fresh browser", outcode) log.info("Zoopla %s retrying with fresh browser", outcode)
except Exception as relaunch_exc: except Exception as relaunch_exc:
_record_error(errors, "zoopla", outcode, relaunch_exc) _record_error(errors, "zoopla", outcode, relaunch_exc)
@ -503,6 +598,11 @@ def _scrape_zoopla(
time.sleep(DELAY_BETWEEN_OUTCODES) time.sleep(DELAY_BETWEEN_OUTCODES)
finally: finally:
if detail_page is not None:
try:
detail_page.close()
except Exception:
pass
_close_zoopla_browser(browser, "zoopla final") _close_zoopla_browser(browser, "zoopla final")

View file

@ -126,6 +126,14 @@ def write_parquet(properties: list[dict], path: Path) -> None:
"Address per Property Register": [ "Address per Property Register": [
p["Address per Property Register"] for p in properties p["Address per Property Register"] for p in properties
], ],
# UPRN (when the scraper recovered it) keys an exact listing->EPC
# join; Property number or name is the house identifier for the
# Price-Paid address join. Both are None for sources/listings without
# a detail-page fetch.
"UPRN": [p.get("UPRN") for p in properties],
"Property number or name": [
p.get("Property number or name") for p in properties
],
"Leasehold/Freehold": [p["Leasehold/Freehold"] for p in properties], "Leasehold/Freehold": [p["Leasehold/Freehold"] for p in properties],
"Property type": [p["Property type"] for p in properties], "Property type": [p["Property type"] for p in properties],
"Property sub-type": [p["Property sub-type"] for p in properties], "Property sub-type": [p["Property sub-type"] for p in properties],
@ -149,6 +157,8 @@ def write_parquet(properties: list[dict], path: Path) -> None:
"Inferred postcode": pl.Utf8, "Inferred postcode": pl.Utf8,
"Listing raw address": pl.Utf8, "Listing raw address": pl.Utf8,
"Address per Property Register": pl.Utf8, "Address per Property Register": pl.Utf8,
"UPRN": pl.Utf8,
"Property number or name": pl.Utf8,
"Leasehold/Freehold": pl.Utf8, "Leasehold/Freehold": pl.Utf8,
"Property type": pl.Utf8, "Property type": pl.Utf8,
"Property sub-type": pl.Utf8, "Property sub-type": pl.Utf8,

206
finder/test_onthemarket.py Normal file
View file

@ -0,0 +1,206 @@
"""Tests for the OnTheMarket scraper's detail-page postcode recovery.
`parse_detail_postcode` is pure (takes the detail-page HTML, returns a postcode
or None), so these tests use a trimmed but faithful copy of a real OnTheMarket
detail page's `__NEXT_DATA__` payload. The fixture mirrors the live structure:
the property's own postcode lives in the analytics dataLayer
(`props.initialReduxState.metadata.dataLayer.postcode`) while the agent's office
postcode sits separately under `property.agent.postcode` the trap we must not
fall into.
"""
import json
import onthemarket
from onthemarket import parse_detail_postcode, transform_property
class _StubIndex:
"""Minimal stand-in for PostcodeSpatialIndex returning a fixed postcode."""
def __init__(self, postcode: str | None):
self._postcode = postcode
def nearest(self, lat: float, lng: float) -> str | None:
return self._postcode
def _detail_html(
*,
property_id: int = 19522441,
datalayer_postcode: str = "SE5 9AA",
agent_postcode: str = "SE5 8RS",
) -> str:
"""Build detail-page HTML with a real-shaped __NEXT_DATA__ payload."""
next_data = {
"props": {
"initialReduxState": {
"metadata": {
"dataLayer": {
"page-type": "details-section",
"property-type": "homes",
# The property's own unit postcode.
"postcode": datalayer_postcode,
"property-id": property_id,
"price": "275,000",
"addressline_2": "Padfield Road",
}
},
"property": {
"displayAddress": "Padfield Road, London, SE5",
"location": {"lon": -0.100233, "lat": 51.466129},
# The agent block carries the AGENT'S office postcode — the
# trap. parse_detail_postcode must not return this.
"agent": {
"address": "29 Denmark Hill, Camberwell\nLondon\nSE5 8RS",
"postcode": agent_postcode,
},
},
}
}
}
payload = json.dumps(next_data)
return (
"<html><body>"
'<script id="__NEXT_DATA__" type="application/json">'
f"{payload}"
"</script></body></html>"
)
# ---------------------------------------------------------------------------
# parse_detail_postcode
# ---------------------------------------------------------------------------
def test_parse_returns_property_postcode_not_agent():
html = _detail_html(datalayer_postcode="SE5 9AA", agent_postcode="SE5 8RS")
assert parse_detail_postcode(html, "19522441") == "SE5 9AA"
def test_parse_normalizes_spacing():
html = _detail_html(datalayer_postcode="se59aa")
assert parse_detail_postcode(html, "19522441") == "SE5 9AA"
def test_parse_ignores_mismatched_property_id():
# dataLayer postcode belongs to property 19522441; asking for a different
# listing id must refuse to return it.
html = _detail_html(property_id=19522441)
assert parse_detail_postcode(html, "99999999") is None
def test_parse_accepts_when_no_listing_id_given():
html = _detail_html(datalayer_postcode="SE5 9AA")
assert parse_detail_postcode(html, None) == "SE5 9AA"
def test_parse_handles_missing_postcode():
html = _detail_html(datalayer_postcode="")
assert parse_detail_postcode(html, "19522441") is None
def test_parse_handles_no_next_data():
assert parse_detail_postcode("<html><body>no script here</body></html>", "1") is None
def test_parse_handles_empty_html():
assert parse_detail_postcode("", "1") is None
def test_parse_handles_malformed_json():
html = (
'<script id="__NEXT_DATA__" type="application/json">{not json}</script>'
)
assert parse_detail_postcode(html, "1") is None
def test_parse_handles_missing_datalayer():
next_data = {"props": {"initialReduxState": {"metadata": {}}}}
html = (
'<script id="__NEXT_DATA__" type="application/json">'
f"{json.dumps(next_data)}</script>"
)
assert parse_detail_postcode(html, "1") is None
# ---------------------------------------------------------------------------
# transform_property — detail postcode wiring + trust rule
# ---------------------------------------------------------------------------
_RAW_LISTING = {
"id": "19522441",
"address": "Padfield Road, London, SE5",
"location": {"lon": -0.100233, "lat": 51.466129},
"bedrooms": 2,
"bathrooms": 1,
"price": "£275,000",
"humanised-property-type": "Apartment",
"features": ["Tenure: Leasehold (99 years remaining)"],
"details-url": "/details/19522441/",
}
def test_transform_uses_trusted_detail_postcode():
# Detail postcode SE5 9AA, coordinate-nearest SE5 1AA: same outcode -> trust
# the (more precise) detail postcode and label it detail_address.
index = _StubIndex("SE5 1AA")
out = transform_property(_RAW_LISTING, index, detail_postcode="SE5 9AA")
assert out is not None
assert out["Postcode"] == "SE5 9AA"
assert out["Postcode source"] == "detail_address"
def test_transform_rejects_detail_postcode_on_outcode_mismatch():
# Detail postcode SW9 6BZ but coordinate-nearest is SE5 1AA: different
# outcode -> reject the detail postcode, fall back to coordinate logic.
index = _StubIndex("SE5 1AA")
out = transform_property(_RAW_LISTING, index, detail_postcode="SW9 6BZ")
assert out is not None
assert out["Postcode"] == "SE5 1AA"
assert out["Postcode source"] == "coordinates"
def test_transform_without_detail_postcode_uses_coordinates():
index = _StubIndex("SE5 1AA")
out = transform_property(_RAW_LISTING, index, detail_postcode=None)
assert out is not None
assert out["Postcode"] == "SE5 1AA"
assert out["Postcode source"] == "coordinates"
# No UPRN / house number is recoverable from OnTheMarket.
assert out["UPRN"] is None
assert out["Property number or name"] is None
def test_transform_detail_postcode_via_search_address_outcode():
# When the card address already carries a full postcode that agrees with the
# coordinates, the existing "address" source still wins absent a detail
# postcode — detail recovery never regresses that path.
raw = dict(_RAW_LISTING, address="Padfield Road, London, SE5 1AA")
index = _StubIndex("SE5 1AA")
out = transform_property(raw, index, detail_postcode=None)
assert out["Postcode"] == "SE5 1AA"
assert out["Postcode source"] == "address"
# ---------------------------------------------------------------------------
# _fetch_detail_postcode caching (no real network)
# ---------------------------------------------------------------------------
def test_fetch_detail_postcode_is_cached(monkeypatch):
    """A cached listing id is served without touching the HTTP client."""
    # monkeypatch.setitem is undone automatically at teardown, so the
    # module-level cache is restored even if the assertion below fails, and
    # entries belonging to other tests are left untouched.
    monkeypatch.setitem(onthemarket._detail_postcode_cache, "19522441", "SE5 9AA")

    def _boom(*args, **kwargs):  # pragma: no cover - must never be called
        raise AssertionError("network was hit despite a cached value")

    # Any httpx use would explode; the cache hit must short-circuit first.
    result = onthemarket._fetch_detail_postcode(
        client=type("C", (), {"get": _boom})(),
        details_url="/details/19522441/",
        listing_id="19522441",
    )
    assert result == "SE5 9AA"

113
finder/test_rightmove.py Normal file
View file

@ -0,0 +1,113 @@
"""Tests for the Rightmove detail-page postcode extractor.
The search API only returns an outcode-level ``displayAddress``; the property's
TRUE full postcode lives on its detail page inside ``window.__PAGE_MODEL`` as
``propertyData.address.{outcode, incode}``. ``parse_detail_postcode`` recovers
it. These tests build a faithful __PAGE_MODEL: a devalue-style flattened object
graph whose ``data`` field is a JSON STRING of a flat array where every integer
inside a container is an index reference into that same array.
"""
import json
from rightmove import _extract_page_model_literal, parse_detail_postcode
def _page_model_html(flat: list, *, encoding: str = "json") -> str:
"""Wrap a flattened object-graph array in a realistic detail-page <script>.
Mirrors the live page: ``window.__PAGE_MODEL = {"data": "<json array>"}``
where the array is itself JSON-encoded (so its quotes arrive escaped)."""
outer = {"data": json.dumps(flat, separators=(",", ":")), "encoding": encoding}
return (
"<html><head></head><body>\n"
"<script>\n"
" window.__PAGE_MODEL = " + json.dumps(outer, separators=(",", ":")) + ";\n"
"</script>\n"
"</body></html>"
)
# A faithful slice of a real listing: root -> propertyData -> address, with a
# decoy nearestStations array (which carries NO postcodes on the live page) to
# prove the parser anchors on the property's own address, not a nearby POI.
_FLAT_SW9 = [
{"propertyData": 1}, # 0: root
{
"id": "89089584",
"address": 2,
"location": 4,
"nearestStations": 6,
}, # 1: propertyData
{
"displayAddress": "Caldwell Street, Stockwell",
"countryCode": "GB",
"ukCountry": "England",
"outcode": "SW9",
"incode": "0HD",
}, # 2: address
None, # 3: filler
{
"latitude": 51.477238,
"longitude": -0.116819,
"pinType": "ACCURATE_POINT",
}, # 4: location
None, # 5: filler
[7, 8], # 6: nearestStations (references)
{"name": "Oval Station", "distance": 0.36}, # 7: station, no postcode
{"name": "Stockwell Station", "distance": 0.41}, # 8: station, no postcode
]
def test_parses_full_postcode_from_outcode_and_incode() -> None:
html = _page_model_html(_FLAT_SW9)
assert parse_detail_postcode(html) == "SW9 0HD"
def test_extract_page_model_literal_brace_matches_nested_object() -> None:
# The literal must include the whole nested object, not stop at the first
# closing brace inside the escaped data string.
html = _page_model_html(_FLAT_SW9)
literal = _extract_page_model_literal(html)
assert literal is not None
assert literal.startswith("{") and literal.endswith("}")
# Round-trips back to a dict with the expected top-level keys.
assert set(json.loads(literal)) == {"data", "encoding"}
def test_normalises_unspaced_incode() -> None:
flat = [dict(node) if isinstance(node, dict) else node for node in _FLAT_SW9]
flat[2] = {**_FLAT_SW9[2], "outcode": "e20", "incode": "1fh"}
assert parse_detail_postcode(_page_model_html(flat)) == "E20 1FH"
def test_returns_none_when_address_missing() -> None:
# The location wrapper can be empty/absent on some listings; the caller then
# keeps the coordinate fallback, so we must return None (not raise).
flat = [
{"propertyData": 1},
{"id": "1", "location": 2},
{"latitude": 51.5, "longitude": -0.1},
]
assert parse_detail_postcode(_page_model_html(flat)) is None
def test_returns_none_when_incode_blank() -> None:
flat = [dict(node) if isinstance(node, dict) else node for node in _FLAT_SW9]
flat[2] = {**_FLAT_SW9[2], "incode": ""}
assert parse_detail_postcode(_page_model_html(flat)) is None
def test_returns_none_for_non_postcode_pair() -> None:
# A structurally-invalid outcode/incode pair is rejected by the validator.
flat = [dict(node) if isinstance(node, dict) else node for node in _FLAT_SW9]
flat[2] = {**_FLAT_SW9[2], "outcode": "NOTAPC", "incode": "ZZ"}
assert parse_detail_postcode(_page_model_html(flat)) is None
def test_returns_none_without_page_model() -> None:
assert parse_detail_postcode("") is None
assert parse_detail_postcode("<html><body>no model</body></html>") is None
# Malformed JSON in the data field degrades gracefully.
broken = '<script>window.__PAGE_MODEL = {"data":"[not json"};</script>'
assert parse_detail_postcode(broken) is None

View file

@ -1,13 +1,19 @@
from transform import ( from transform import (
build_register_address,
clean_listing_address, clean_listing_address,
extract_full_postcode, extract_full_postcode,
extract_outcode,
resolve_listing_postcode,
transform_property, transform_property,
) )
class StubPostcodeIndex: class StubPostcodeIndex:
def __init__(self, postcode: str = "SW1A 9ZZ") -> None:
self._postcode = postcode
def nearest(self, lat: float, lng: float) -> str: def nearest(self, lat: float, lng: float) -> str:
return "SW1A 9ZZ" return self._postcode
def test_extract_full_postcode_normalizes_spacing() -> None: def test_extract_full_postcode_normalizes_spacing() -> None:
@ -24,6 +30,46 @@ def test_clean_listing_address_removes_postcode_and_outcode_suffixes() -> None:
assert clean_listing_address("Kings Avenue, Bromley") == "Kings Avenue, Bromley" assert clean_listing_address("Kings Avenue, Bromley") == "Kings Avenue, Bromley"
def test_build_register_address_prepends_house_number_or_name() -> None:
# House number/name prepended, with the trailing outcode/postcode stripped.
assert (
build_register_address("South Street, Bromley BR1", "12")
== "12, South Street, Bromley"
)
assert (
build_register_address("Riverside, Martham NR29", "Martham Mill")
== "Martham Mill, Riverside, Martham"
)
# No number/name -> identical to the plain cleaned address.
assert build_register_address("Kings Avenue, Bromley", None) == "Kings Avenue, Bromley"
# Already starts with the number/name -> no duplication.
assert (
build_register_address("12 South Street, Bromley", "12")
== "12 South Street, Bromley"
)
# Empty/whitespace number/name is ignored.
assert build_register_address("Kings Avenue, Bromley", " ") == "Kings Avenue, Bromley"
def test_extract_outcode() -> None:
assert extract_outcode("SW1A 2AA") == "SW1A"
assert extract_outcode("n4 2ha") == "N4"
assert extract_outcode("SW1A2AA") == "SW1A"
assert extract_outcode(None) is None
assert extract_outcode("") is None
def test_resolve_listing_postcode() -> None:
# Outcode matches -> trust the more precise extracted postcode.
assert resolve_listing_postcode("SW1A 2AA", "SW1A 9ZZ") == ("SW1A 2AA", "address")
# Outcode mismatch -> fall back to the spatially-correct inferred postcode.
assert resolve_listing_postcode("E14 9SS", "SW1A 9ZZ") == ("SW1A 9ZZ", "coordinates")
# Well-formed but fabricated postcode in a different outcode is rejected.
assert resolve_listing_postcode("ZZ9 9ZZ", "SW1A 9ZZ") == ("SW1A 9ZZ", "coordinates")
# No extracted postcode -> inferred is authoritative.
assert resolve_listing_postcode(None, "SW1A 9ZZ") == ("SW1A 9ZZ", "coordinates")
def test_rightmove_transform_prefers_postcode_from_display_address() -> None: def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
prop = { prop = {
"id": "123", "id": "123",
@ -46,3 +92,84 @@ def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
assert result["Inferred postcode"] == "SW1A 9ZZ" assert result["Inferred postcode"] == "SW1A 9ZZ"
assert result["Listing raw address"] == "Flat 2, 10 Downing Street, SW1A 2AA" assert result["Listing raw address"] == "Flat 2, 10 Downing Street, SW1A 2AA"
assert result["Address per Property Register"] == "Flat 2, 10 Downing Street" assert result["Address per Property Register"] == "Flat 2, 10 Downing Street"
def test_rightmove_transform_rejects_postcode_from_wrong_outcode() -> None:
prop = {
"id": "124",
"location": {"latitude": 51.5, "longitude": -0.1},
"price": {"amount": 750000, "displayPrices": []},
"propertySubType": "Terraced",
"bedrooms": 3,
"bathrooms": 1,
"keyFeatures": [],
"propertyUrl": "/properties/124",
# Address postcode is in a different outcode than the coordinate-nearest one.
"displayAddress": "10 Downing Street, E14 9SS",
}
result = transform_property(prop, "SW1A", StubPostcodeIndex())
assert result is not None
# The spatially-correct inferred postcode wins over the mismatching extracted one.
assert result["Postcode"] == "SW1A 9ZZ"
assert result["Postcode source"] == "coordinates"
assert result["Extracted postcode"] == "E14 9SS"
def _rightmove_prop() -> dict:
return {
"id": "200",
"location": {"latitude": 51.5, "longitude": -0.1},
"price": {"amount": 750000, "displayPrices": []},
"propertySubType": "Terraced",
"bedrooms": 3,
"bathrooms": 1,
"keyFeatures": [],
"propertyUrl": "/properties/200",
# Search API only ever exposes the outcode in the display address.
"displayAddress": "Caldwell Street, Stockwell, SW9",
}
def test_rightmove_transform_prefers_detail_postcode() -> None:
# The detail page's true full postcode (same outcode as the location) is
# preferred over the coordinate-nearest guess.
result = transform_property(
_rightmove_prop(),
"SW9",
StubPostcodeIndex("SW9 7AA"),
detail_postcode="SW9 0HD",
)
assert result is not None
assert result["Postcode"] == "SW9 0HD"
assert result["Postcode source"] == "detail_address"
# The coordinate inference is still surfaced separately.
assert result["Inferred postcode"] == "SW9 7AA"
def test_rightmove_transform_rejects_detail_postcode_from_wrong_outcode() -> None:
# A detail postcode whose outcode disagrees with the location must not
# relocate the listing; the coordinate postcode wins instead.
result = transform_property(
_rightmove_prop(),
"SW9",
StubPostcodeIndex("SW9 7AA"),
detail_postcode="E14 9SS",
)
assert result is not None
assert result["Postcode"] == "SW9 7AA"
assert result["Postcode source"] == "coordinates"
def test_rightmove_transform_without_detail_keeps_coordinate_logic() -> None:
    """Omitting ``detail_postcode`` leaves the coordinate-nearest behaviour unchanged."""
    nearest_stub = StubPostcodeIndex("SW9 7AA")
    transformed = transform_property(_rightmove_prop(), "SW9", nearest_stub)
    assert transformed is not None
    assert transformed["Postcode"] == "SW9 7AA"
    assert transformed["Postcode source"] == "coordinates"

288
finder/test_zoopla.py Normal file
View file

@ -0,0 +1,288 @@
from zoopla import _detail_cache_key, parse_detail_geo, transform_property
def test_detail_cache_key_uses_listing_id() -> None:
    """The cache key is the numeric listing id for relative and absolute URLs alike."""
    relative_url = "/for-sale/details/59888978/"
    absolute_url = "https://www.zoopla.co.uk/for-sale/details/59888978/"
    assert _detail_cache_key(relative_url) == "59888978"
    assert _detail_cache_key(absolute_url) == "59888978"
    # No id in the URL -> fall back to the URL itself as the key.
    url_without_id = "/for-sale/property/br1/"
    assert _detail_cache_key(url_without_id) == "/for-sale/property/br1/"
class StubPostcodeIndex:
    """Stand-in for the spatial index: every nearest-lookup yields one fixed postcode."""

    def __init__(self, postcode: str = "BR1 2AB") -> None:
        # Remembered once; nearest() hands it back whatever the coordinates are.
        self._postcode = postcode

    def nearest(self, lat: float, lng: float) -> str:
        """Ignore ``lat``/``lng`` and return the postcode given at construction."""
        fixed_postcode = self._postcode
        return fixed_postcode
# London-ish postcodes with coordinates, plus the Norfolk sample used by the
# verified detail-page snippet (well inside the England bounds check).
# Maps a full postcode to its (latitude, longitude) pair; the transform tests
# below pass it as the ``pc_coords`` argument of ``transform_property``.
PC_COORDS: dict[str, tuple[float, float]] = {
    "BR1 2AB": (51.40, 0.01),
    "SW1A 1AA": (51.50, -0.14),
    "NR29 4RG": (52.716014, 1.614495),
}
# Verified RSC `location` object (listing 59888978), as it appears escaped inside
# a self.__next_f flight chunk in page.content().
# Every double quote of the embedded JSON is backslash-escaped here, so this
# fixture exercises the escaped form of the input to parse_detail_geo.
_LOCATION_ESCAPED: str = (
    '<script>self.__next_f.push([1,"...'
    '\\"location\\":{\\"outcode\\":\\"NR29\\",'
    '\\"coordinates\\":{\\"latitude\\":52.716014,\\"longitude\\":1.614495},'
    '\\"uprn\\":\\"10023461458\\",\\"postalCode\\":\\"NR29 4RG\\",'
    '\\"propertyNumberOrName\\":\\"Martham Mill\\"}'
    '..."])</script>'
)
def test_parse_detail_geo_location_object_escaped() -> None:
    """The escaped flight-chunk form of the location object parses to the full geo dict."""
    expected = {
        "lat": 52.716014,
        "lng": 1.614495,
        "postcode": "NR29 4RG",
        "outcode": "NR29",
        "source": "detail_location",
        "uprn": "10023461458",
        "number_or_name": "Martham Mill",
        # No `address` twin in this snippet, so there is no full street address.
        "full_address": None,
    }
    parsed = parse_detail_geo(_LOCATION_ESCAPED, search_outcode="NR29")
    assert parsed == expected
def test_parse_detail_geo_location_object_unescaped() -> None:
    """A location object whose quotes are not escaped is parsed just the same."""
    page_source = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    parsed = parse_detail_geo(page_source)
    assert parsed is not None
    assert parsed["source"] == "detail_location"
    assert parsed["postcode"] == "NR29 4RG"
def test_parse_detail_geo_address_twin() -> None:
    """With only the `address` twin present, it supplies coordinates, postcode and address."""
    page_source = (
        '"address":{"fullAddress":"Riverside, Martham NR29",'
        '"latitude":52.716014,"longitude":1.614495,'
        '"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}'
    )
    parsed = parse_detail_geo(page_source)
    assert parsed is not None
    assert parsed["source"] == "detail_address_obj"
    located = (parsed["lat"], parsed["lng"], parsed["postcode"])
    assert located == (52.716014, 1.614495, "NR29 4RG")
    assert parsed["uprn"] == "10023461458"
    assert parsed["full_address"] == "Riverside, Martham NR29"
def test_parse_detail_geo_merges_location_uprn_with_address_full_address() -> None:
    """The address twin sharing the location's uprn contributes its fullAddress."""
    # Real detail pages carry both wrappers: the `location` object holds the
    # uprn + house number/name, the `address` twin holds the full street
    # address. They share a uprn, so the twin's fullAddress is attached.
    location_wrapper = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG",'
        '"propertyNumberOrName":"Martham Mill"}'
    )
    address_twin = (
        '"address":{"fullAddress":"Riverside, Martham NR29",'
        '"latitude":52.716014,"longitude":1.614495,'
        '"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}'
    )
    parsed = parse_detail_geo(location_wrapper + address_twin)
    assert parsed is not None
    assert parsed["source"] == "detail_location"
    assert parsed["uprn"] == "10023461458"
    assert parsed["number_or_name"] == "Martham Mill"
    assert parsed["full_address"] == "Riverside, Martham NR29"
def test_parse_detail_geo_does_not_borrow_comparable_full_address() -> None:
    """An `address` twin carrying a different uprn is never used for full_address."""
    # The only `address` twin on the page belongs to a different uprn (a
    # comparable listing). With a uprn to match on, an unrelated twin is never
    # borrowed: full_address stays None rather than grabbing the wrong street.
    location_wrapper = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    comparable_twin = (
        '"address":{"fullAddress":"Some Comparable, Elsewhere EN2",'
        '"latitude":51.65,"longitude":-0.08,"uprn":"99999999"}'
    )
    parsed = parse_detail_geo(location_wrapper + comparable_twin)
    assert parsed is not None
    assert parsed["uprn"] == "10023461458"
    assert parsed["full_address"] is None
def test_parse_detail_geo_ignores_poi_coordinates() -> None:
    """Coordinates outside a `location` wrapper (a POI) are not taken for the property."""
    # A charger POI (its coordinates NOT wrapped in a "location" object) followed
    # by the property's own "location" wrapper. Anchoring on the wrapper means
    # the POI's coordinates are ignored and the property's are returned.
    charger_poi = (
        '"name":"Martham Community Centre","numberOfConnectors":2,'
        '"postcode":"NR29 4SN","coordinates":{"latitude":52.699379,"longitude":1.62921}'
    )
    property_location = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    parsed = parse_detail_geo(charger_poi + property_location)
    assert parsed is not None
    assert parsed["source"] == "detail_location"
    # The property's coords win, not the community centre's.
    assert (parsed["lat"], parsed["lng"]) == (52.716014, 1.614495)
    assert parsed["postcode"] == "NR29 4RG"
def test_parse_detail_geo_prefers_location_matching_search_outcode() -> None:
    """The search outcode picks between several location objects; without it the first wins."""
    # Page embeds two location objects (e.g. a comparable then the property).
    comparable = (
        '"location":{"outcode":"EN2",'
        '"coordinates":{"latitude":51.65,"longitude":-0.08},'
        '"postalCode":"EN2 6SN"}'
    )
    target = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"postalCode":"NR29 4RG"}'
    )
    page_source = comparable + target

    # With a search outcode, the object in that outcode is preferred.
    matched = parse_detail_geo(page_source, search_outcode="NR29")
    assert matched is not None and matched["postcode"] == "NR29 4RG"

    # Without one, the first (document order = primary listing) is returned.
    first_seen = parse_detail_geo(page_source)
    assert first_seen is not None and first_seen["postcode"] == "EN2 6SN"
def test_parse_detail_geo_rejects_out_of_england() -> None:
    """A location whose coordinates fail the England bounds check yields no result."""
    page_source = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":10.0,"longitude":10.0},'
        '"uprn":"1","postalCode":"NR29 4RG"}'
    )
    parsed = parse_detail_geo(page_source)
    assert parsed is None
def test_parse_detail_geo_drops_inconsistent_postcode() -> None:
    """A postalCode that contradicts the object's own outcode is discarded."""
    # postalCode outcode (AB12) disagrees with the object's own outcode (NR29):
    # keep the coordinates, drop the untrustworthy postcode.
    page_source = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"1","postalCode":"AB12 3CD"}'
    )
    parsed = parse_detail_geo(page_source)
    assert parsed is not None
    assert parsed["lat"] == 52.716014
    assert parsed["postcode"] is None
def test_parse_detail_geo_returns_none_for_garbage() -> None:
    """Inputs without a property location/address wrapper all parse to None."""
    unusable_inputs = (
        "<html><body>no data here</body></html>",
        "",
        # Coordinates that are not inside a property location/address wrapper
        # (e.g. only an unwrapped POI) yield nothing, so callers degrade
        # safely to the outcode.
        '"name":"X","coordinates":{"latitude":51.5,"longitude":-0.1}',
    )
    for page_source in unusable_inputs:
        assert parse_detail_geo(page_source) is None
def _raw(**overrides) -> dict:
raw = {
"id": "123",
"url": "/for-sale/details/123/",
"address": "South Street, Bromley BR1",
"price": 500000,
"beds": 2,
"baths": 1,
"property_type": "Flat",
}
raw.update(overrides)
return raw
def test_transform_uses_detail_coordinates_with_agreeing_postcode() -> None:
    """A detail postcode in the coordinate-nearest outcode is trusted, with detail coords."""
    detail_geo = {"lat": 51.401, "lng": 0.011, "postcode": "BR1 3CD", "outcode": "BR1"}
    nearest_stub = StubPostcodeIndex("BR1 2AB")
    transformed = transform_property(
        _raw(), nearest_stub, PC_COORDS, search_outcode="BR1", detail=detail_geo
    )
    assert transformed is not None
    # Extracted detail postcode agrees with the coordinate-nearest outcode -> trusted.
    assert transformed["Postcode"] == "BR1 3CD"
    assert transformed["Postcode source"] == "detail_address"
    assert transformed["Inferred postcode"] == "BR1 2AB"
    assert (transformed["lat"], transformed["lon"]) == (51.401, 0.011)
def test_transform_uses_nearest_when_detail_postcode_mismatches() -> None:
    """A detail postcode from another outcode loses to the spatially nearest postcode."""
    detail_geo = {"lat": 51.401, "lng": 0.011, "postcode": "E14 9SS", "outcode": "E14"}
    nearest_stub = StubPostcodeIndex("BR1 2AB")
    transformed = transform_property(
        _raw(), nearest_stub, PC_COORDS, search_outcode="BR1", detail=detail_geo
    )
    assert transformed is not None
    # Mismatching detail postcode is rejected in favour of the spatial value.
    assert transformed["Postcode"] == "BR1 2AB"
    assert transformed["Postcode source"] == "detail_coordinates"
def test_transform_geocodes_detail_postcode_without_coordinates() -> None:
    """A detail postcode lacking coordinates is geocoded through the ``pc_coords`` table."""
    detail_geo = {"lat": None, "lng": None, "postcode": "SW1A 1AA", "outcode": "SW1A"}
    transformed = transform_property(
        _raw(), StubPostcodeIndex(), PC_COORDS, search_outcode="BR1", detail=detail_geo
    )
    assert transformed is not None
    assert transformed["Postcode"] == "SW1A 1AA"
    assert transformed["Postcode source"] == "detail_address"
    # Coordinates come from the postcode's entry in the lookup table.
    assert (transformed["lat"], transformed["lon"]) == PC_COORDS["SW1A 1AA"]
def test_transform_without_detail_falls_back_to_search_outcode() -> None:
    """Without detail data or an address outcode, the search outcode is the last resort."""
    # No detail, address has no recognizable outcode -> coarse search-outcode centroid.
    card = _raw(address="A street with no postcode")
    transformed = transform_property(
        card,
        StubPostcodeIndex(),
        PC_COORDS,
        search_outcode="BR1",
        detail=None,
    )
    assert transformed is not None
    assert transformed["Postcode"] == "BR1 2AB"
    assert transformed["Postcode source"] == "search_outcode"
    # No detail page -> no UPRN / house number recovered.
    assert transformed["UPRN"] is None
    assert transformed["Property number or name"] is None
def test_transform_emits_uprn_and_house_numbered_address_from_detail() -> None:
    """Detail-page uprn, house number and full address all reach the output row."""
    detail_geo = {
        "lat": 51.401,
        "lng": 0.011,
        "postcode": "BR1 3CD",
        "outcode": "BR1",
        "uprn": "100023461458",
        "number_or_name": "12",
        "full_address": "South Street, Bromley BR1",
    }
    nearest_stub = StubPostcodeIndex("BR1 2AB")
    transformed = transform_property(
        _raw(), nearest_stub, PC_COORDS, search_outcode="BR1", detail=detail_geo
    )
    assert transformed is not None
    assert transformed["UPRN"] == "100023461458"
    assert transformed["Property number or name"] == "12"
    # The detail full address replaces the outcode-level card address, and the
    # house number is prepended for a near-exact Property Register match.
    assert transformed["Listing raw address"] == "South Street, Bromley BR1"
    assert transformed["Address per Property Register"] == "12, South Street, Bromley"
def test_transform_ignores_out_of_england_detail_coords() -> None:
    """Detail coordinates outside England are dropped in favour of the address outcode."""
    detail_geo = {"lat": 10.0, "lng": 10.0, "postcode": "ZZ9 9ZZ", "outcode": "ZZ9"}
    nearest_stub = StubPostcodeIndex("BR1 2AB")
    transformed = transform_property(
        _raw(), nearest_stub, PC_COORDS, search_outcode="BR1", detail=detail_geo
    )
    assert transformed is not None
    # Bad detail coords are discarded; falls through to the address outcode (BR1).
    assert transformed["Postcode source"] == "address_outcode"
    assert 49 <= transformed["lat"] <= 56

View file

@ -205,6 +205,41 @@ def extract_full_postcode(text: str | None) -> str | None:
return normalize_postcode(match.group(1)) return normalize_postcode(match.group(1))
def extract_outcode(postcode: str | None) -> str | None:
    """Return the outward code (district) of a UK postcode, e.g. 'SW1A 1AA' -> 'SW1A'.

    The input is passed through ``normalize_postcode`` and everything before the
    first space is returned. A falsy ``postcode`` or an empty outward part
    yields ``None``.
    """
    if not postcode:
        return None
    outward, _, _ = normalize_postcode(postcode).partition(" ")
    return outward if outward else None
def resolve_listing_postcode(
    extracted_postcode: str | None, inferred_postcode: str
) -> tuple[str, str]:
    """Choose a listing's authoritative postcode and say where it came from.

    Returns ``(postcode, source)`` where ``source`` is ``"address"`` when the
    address-extracted postcode is used and ``"coordinates"`` when the
    coordinate-nearest one is.

    The extracted postcode is the more precise of the two, but it is only
    accepted when its outcode equals the outcode of the inferred postcode. A
    stale, mistyped or well-formed-but-fabricated value (e.g. 'ZZ9 9ZZ') would
    otherwise silently override the spatially-correct one. On a mismatch the
    rejection is logged at debug level and the inferred postcode is returned.
    """
    fallback = (inferred_postcode, "coordinates")

    # Nothing was extracted from the address: only the coordinate guess exists.
    if not extracted_postcode:
        return fallback

    extracted_outcode = extract_outcode(extracted_postcode)
    inferred_outcode = extract_outcode(inferred_postcode)
    if extracted_outcode == inferred_outcode:
        return extracted_postcode, "address"

    log.debug(
        "Rejecting extracted postcode %s (outcode mismatch with inferred %s)",
        extracted_postcode,
        inferred_postcode,
    )
    return fallback
def clean_listing_address(address: str | None) -> str: def clean_listing_address(address: str | None) -> str:
"""Remove postcode/outcode suffixes from listing display addresses. """Remove postcode/outcode suffixes from listing display addresses.
@ -222,10 +257,48 @@ def clean_listing_address(address: str | None) -> str:
return cleaned.strip(" ,") return cleaned.strip(" ,")
def build_register_address(
    raw_address: str | None, number_or_name: str | None = None
) -> str:
    """Build a Property Register-style address, prepending the house number/name.

    Listing display addresses are usually street-level ("South Street, Bromley")
    because the portals hide the exact unit. When a scraper can recover the
    property's own number or name (e.g. Zoopla detail pages expose
    ``propertyNumberOrName`` = "12" or "Martham Mill"), prepend it so the address
    carries the house identifier that the EPC/Price-Paid register addresses also
    use, turning a fuzzy street match into a near-exact one.

    Args:
        raw_address: The listing's display address; cleaned with
            ``clean_listing_address`` before use.
        number_or_name: The house number or name, if known. ``None``, empty or
            whitespace-only values are treated as absent.

    Returns:
        The cleaned address, prefixed with ``"<number_or_name>, "`` unless the
        identifier is absent or the cleaned address already begins with it as a
        whole token. If the cleaned address is empty, the identifier alone.
    """
    cleaned = clean_listing_address(raw_address)
    if not number_or_name:
        return cleaned
    number_or_name = number_or_name.strip()
    if not number_or_name:
        return cleaned

    # Avoid duplicating a number/name the display address already starts with.
    # The match must end on a token boundary: a bare prefix test would treat
    # house number "1" as already present in "10 High Street" or "1st Avenue"
    # and silently drop the identifier.
    lowered = cleaned.lower()
    identifier = number_or_name.lower()
    if lowered.startswith(identifier):
        remainder = lowered[len(identifier):]
        if not remainder or not remainder[0].isalnum():
            return cleaned
    return f"{number_or_name}, {cleaned}" if cleaned else number_or_name
def transform_property( def transform_property(
prop: dict, outcode: str, pc_index: PostcodeSpatialIndex prop: dict,
outcode: str,
pc_index: PostcodeSpatialIndex,
detail_postcode: str | None = None,
) -> dict | None: ) -> dict | None:
"""Transform a raw Rightmove property dict into our output schema.""" """Transform a raw Rightmove property dict into our output schema.
``detail_postcode`` is the property's TRUE full postcode recovered from its
detail page (see ``rightmove.parse_detail_postcode``); the search API itself
only exposes the outcode-level ``displayAddress``. When supplied and it
agrees with the coordinate-nearest postcode's outcode, it is preferred over
the coordinate guess and recorded with source ``"detail_address"``. A
detail postcode whose outcode disagrees with the location is discarded in
favour of the spatially-correct coordinate postcode, so a stale or wrong
detail value can never silently relocate a listing.
"""
loc = prop.get("location") loc = prop.get("location")
if not loc: if not loc:
return None return None
@ -268,8 +341,25 @@ def transform_property(
return None return None
raw_address = prop.get("displayAddress", "") or "" raw_address = prop.get("displayAddress", "") or ""
extracted_postcode = extract_full_postcode(raw_address) extracted_postcode = extract_full_postcode(raw_address)
postcode = extracted_postcode or inferred_postcode
postcode_source = "address" if extracted_postcode else "coordinates" # Prefer the detail page's true full postcode when it agrees with the
# location; otherwise fall back to the (display-address-or-coordinate) logic.
detail_full = extract_full_postcode(detail_postcode)
if detail_full and extract_outcode(detail_full) == extract_outcode(
inferred_postcode
):
postcode, postcode_source = detail_full, "detail_address"
else:
if detail_full:
log.debug(
"Rejecting Rightmove detail postcode %s (outcode mismatch with "
"inferred %s)",
detail_full,
inferred_postcode,
)
postcode, postcode_source = resolve_listing_postcode(
extracted_postcode, inferred_postcode
)
property_url = prop.get("propertyUrl") or "" property_url = prop.get("propertyUrl") or ""
if not isinstance(property_url, str): if not isinstance(property_url, str):
@ -291,6 +381,9 @@ def transform_property(
"Inferred postcode": inferred_postcode, "Inferred postcode": inferred_postcode,
"Listing raw address": raw_address, "Listing raw address": raw_address,
"Address per Property Register": clean_listing_address(raw_address), "Address per Property Register": clean_listing_address(raw_address),
# Rightmove's displayAddress is street-level; no UPRN/house number.
"UPRN": None,
"Property number or name": None,
"Leasehold/Freehold": extract_tenure(prop.get("tenure")), "Leasehold/Freehold": extract_tenure(prop.get("tenure")),
"Property type": map_property_type(sub_type), "Property type": map_property_type(sub_type),
"Property sub-type": normalize_sub_type(sub_type), "Property sub-type": normalize_sub_type(sub_type),

View file

@ -32,16 +32,24 @@ import httpx
from constants import ( from constants import (
DATA_DIR, DATA_DIR,
DELAY_BETWEEN_PAGES, DELAY_BETWEEN_PAGES,
GLUETUN_API_KEY,
GLUETUN_CONTROL_URL,
GLUETUN_MAX_ROTATIONS,
GLUETUN_PROXY,
MAX_BEDROOMS, MAX_BEDROOMS,
PROPERTY_TYPE_MAP, PROPERTY_TYPE_MAP,
ZOOPLA_BASE, ZOOPLA_BASE,
ZOOPLA_DETAIL_GOTO_TIMEOUT_MS,
) )
from spatial import PostcodeSpatialIndex from spatial import PostcodeSpatialIndex
from transform import ( from transform import (
clean_listing_address, build_register_address,
extract_full_postcode, extract_full_postcode,
extract_outcode,
fix_coords,
normalize_sub_type, normalize_sub_type,
parse_int_value, parse_int_value,
resolve_listing_postcode,
validate_floor_area, validate_floor_area,
) )
@ -468,27 +476,20 @@ def _challenge_timeout_seconds() -> int:
# cookies (bound to the previous IP), then reload and re-check the challenge. # cookies (bound to the previous IP), then reload and re-check the challenge.
_GLUETUN_API_KEY = "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K"
def _gluetun_base_url() -> str: def _gluetun_base_url() -> str:
return os.environ.get("GLUETUN_URL", "http://gluetun:8000").rstrip("/") return GLUETUN_CONTROL_URL.rstrip("/")
def _gluetun_api_key() -> str | None: def _gluetun_api_key() -> str | None:
return _GLUETUN_API_KEY return GLUETUN_API_KEY
def _gluetun_max_rotations() -> int: def _gluetun_max_rotations() -> int:
raw = os.environ.get("GLUETUN_MAX_ROTATIONS", "3") return max(GLUETUN_MAX_ROTATIONS, 0)
try:
value = int(raw)
except ValueError as exc:
raise ValueError("GLUETUN_MAX_ROTATIONS must be an integer") from exc
return max(value, 0)
def _gluetun_client() -> httpx.Client: def _gluetun_client() -> httpx.Client:
# Talks to the control server directly (not through the VPN proxy).
headers = {} headers = {}
api_key = _gluetun_api_key() api_key = _gluetun_api_key()
if api_key: if api_key:
@ -694,10 +695,19 @@ def launch_browser():
profile_dir.mkdir(parents=True, exist_ok=True) profile_dir.mkdir(parents=True, exist_ok=True)
_remove_stale_profile_locks(profile_dir) _remove_stale_profile_locks(profile_dir)
# Route the browser through the Gluetun VPN proxy when configured. (geoip
# fingerprint alignment is intentionally not enabled: it needs the optional
# camoufox[geoip] extra and would spoof to the VPN exit's country, which
# fights the en-GB locale unless the exit is in the UK.)
proxy_options: dict = {}
if GLUETUN_PROXY:
proxy_options = {"proxy": {"server": GLUETUN_PROXY}}
log.info( log.info(
"Launching Camoufox browser for Zoopla (headless=%s, profile=%s)...", "Launching Camoufox browser for Zoopla (headless=%s, profile=%s, proxy=%s)...",
headless_mode, headless_mode,
profile_dir, profile_dir,
GLUETUN_PROXY or "direct",
) )
camoufox = Camoufox( camoufox = Camoufox(
headless=headless_mode, headless=headless_mode,
@ -705,6 +715,7 @@ def launch_browser():
user_data_dir=str(profile_dir), user_data_dir=str(profile_dir),
locale=["en-GB", "en"], locale=["en-GB", "en"],
enable_cache=True, enable_cache=True,
**proxy_options,
) )
raw_browser = camoufox.__enter__() raw_browser = camoufox.__enter__()
browser = _ManagedCamoufoxBrowser(camoufox, raw_browser) browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
@ -926,13 +937,47 @@ def _paginate(
page, page,
total_results: int, total_results: int,
max_properties: int | None = None, max_properties: int | None = None,
fetch_detail=None,
detail_cap: int = 0,
detail_state: dict | None = None,
detail_deadline: float | None = None,
) -> list[dict]: ) -> list[dict]:
"""Extract listings from all pages of search results. """Extract listings from all pages of search results.
Page 1 is already loaded. For subsequent pages, follow Zoopla's rendered Page 1 is already loaded. For subsequent pages, follow Zoopla's rendered
next link when present, otherwise advance via the pn=N URL parameter while next link when present, otherwise advance via the pn=N URL parameter while
the advertised result count says more listings remain.""" the advertised result count says more listings remain.
When ``fetch_detail`` is supplied, each listing has its detail page fetched
(up to ``detail_cap`` fresh loads per outcode, counted in the shared
``detail_state`` dict, and only until ``detail_deadline``) and the parsed
geo stored under ``listing['_detail']`` for ``transform_property``. The
detail page is the only source of the listing's UPRN, full street address
and precise postcode, so it is fetched even when the search card already
pins a full postcode. Cached detail results are always attached but cost
neither a cap slot nor a delay."""
def _maybe_fetch(listing: dict) -> None:
if fetch_detail is None or detail_state is None:
return
url = listing.get("url", "")
cached = _detail_cache_key(url) in _detail_cache
if not cached:
# Fresh loads are bounded by the per-outcode cap and the wall-clock
# deadline so detail fetching never starves the SIGALRM budget that
# also guards the search pagination for this outcode.
if detail_state["fetched"] >= detail_cap:
return
if detail_deadline is not None and time.monotonic() >= detail_deadline:
return
listing["_detail"] = fetch_detail(url)
if not cached:
detail_state["fetched"] += 1
time.sleep(DELAY_BETWEEN_PAGES)
all_listings = _extract_listings(page) all_listings = _extract_listings(page)
for listing in all_listings:
_maybe_fetch(listing)
if max_properties is not None and len(all_listings) >= max_properties: if max_properties is not None and len(all_listings) >= max_properties:
return all_listings[:max_properties] return all_listings[:max_properties]
@ -984,6 +1029,7 @@ def _paginate(
if listing["id"] not in seen_ids: if listing["id"] not in seen_ids:
seen_ids.add(listing["id"]) seen_ids.add(listing["id"])
all_listings.append(listing) all_listings.append(listing)
_maybe_fetch(listing)
new_count += 1 new_count += 1
if max_properties is not None and len(all_listings) >= max_properties: if max_properties is not None and len(all_listings) >= max_properties:
return all_listings[:max_properties] return all_listings[:max_properties]
@ -1053,6 +1099,214 @@ def _extract_outcode(text: str) -> str | None:
return None return None
# ---------------------------------------------------------------------------
# Detail-page geocoding
# ---------------------------------------------------------------------------
#
# Zoopla search result cards only expose an outcode-level display address (e.g.
# "South Street, Bromley BR1"); the full postcode and precise coordinates exist
# only on each listing's detail page (/for-sale/details/{id}/). The detail page
# is a Next.js App Router route whose React Server Components flight stream
# embeds the property's own location object, e.g.
# "location":{"outcode":"NR29","coordinates":{"latitude":52.716,"longitude":1.614},
# "uprn":"10023461458","postalCode":"NR29 4RG",...}
# plus a twin "address":{"fullAddress":...,"latitude":...,"longitude":...,
# "outcode":...,"postcode":...,"uprn":...} feeding the map widgets.
# Nearby points of interest (stations, schools, EV chargers) and comparable
# listings carry their own "coordinates" too, but never inside the property's
# own "location" / "address":{"fullAddress" wrapper — so the wrapper, not a
# loose coordinates object, is what we anchor on (see parse_detail_geo).
# listingId -> parsed detail dict (or None). Failures are cached too, so a
# broken listing is not re-fetched within a run (the same listing reappears
# across overlapping outcode searches).
_detail_cache: dict[str, dict | None] = {}
# Captures the numeric listing id from a ".../details/<digits>/" URL segment.
_LISTING_ID_RE = re.compile(r"/details/(\d+)/?")
# The property's own location is carried by a `"location":{...}` wrapper and a
# twin `"address":{"fullAddress":...}` widget object. We anchor on those
# wrappers (and capture their full object body, which contains exactly one
# nested object — `coordinates`) rather than scanning for loose coordinate
# objects: nearby points of interest (stations/schools/EV chargers) and
# comparable/"similar" listings also embed coordinates, but never inside the
# property's own `"location"` / `"address":{"fullAddress"` wrapper, so the
# wrapper is the discriminator. Field order and an optional `uprn` are tolerated.
# NOTE: the body patterns allow brace-free text plus objects nested one level
# deep; a body with deeper nesting will not match.
_DETAIL_LOCATION_RE = re.compile(r'"location":\{((?:[^{}]|\{[^{}]*\})*)\}')
# Group 1 is the fullAddress string, group 2 the remainder of the object body.
_DETAIL_ADDRESS_RE = re.compile(r'"address":\{"fullAddress":"([^"]*)"((?:[^{}]|\{[^{}]*\})*)\}')
# Latitude/longitude as they appear inside the location object's nested
# `coordinates` object. Both numbers must contain a decimal point to match.
_DETAIL_COORDS_IN_BODY_RE = re.compile(
    r'"coordinates":\{"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)\}'
)
# Latitude/longitude as flat, adjacent fields (the address twin's layout).
_DETAIL_LATLNG_IN_BODY_RE = re.compile(
    r'"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)'
)
# Upper-case letters and digits only, so a lower-case outcode will not match.
_DETAIL_OUTCODE_IN_BODY_RE = re.compile(r'"outcode":"([A-Z0-9]+)"')
# The location object spells it "postalCode"; the address twin uses "postcode".
_DETAIL_POSTCODE_IN_BODY_RE = re.compile(r'"(?:postalCode|postcode)":"([A-Z0-9 ]+)"')
# The UPRN (Unique Property Reference Number) appears in both the location and
# address objects and is the linchpin for an exact listing->EPC join (EPC open
# data is ~99% UPRN-keyed). propertyNumberOrName carries the house number/name
# (e.g. "12", "Martham Mill") only in the location object.
_DETAIL_UPRN_IN_BODY_RE = re.compile(r'"uprn":"(\d+)"')
_DETAIL_NUMBER_OR_NAME_IN_BODY_RE = re.compile(r'"propertyNumberOrName":"([^"]*)"')
def parse_detail_geo(html: str, search_outcode: str | None = None) -> dict | None:
    """Extract the property's own coordinates/postcode from a Zoopla detail page.

    Pure and browser-free: it takes the page source as a string, so it can be
    unit-tested without a live browser.

    Returns a dict with keys ``lat``, ``lng``, ``postcode``, ``outcode``,
    ``source``, ``uprn``, ``number_or_name`` and ``full_address`` (all but the
    coordinates and ``source`` may be ``None``), or ``None`` when ``html`` is
    empty or no usable property location wrapper is found.

    ``source`` is ``"detail_location"`` when the ``location`` wrapper supplied
    the coordinates and ``"detail_address_obj"`` when the ``address`` twin did.
    Coordinates must pass the England bounds check. A postcode is kept only if
    it agrees with the outcode found in the same object. ``search_outcode``
    only selects between several ``location`` objects: the first one in that
    outcode wins, otherwise the first valid one in document order.
    """
    if not html:
        return None

    # RSC flight strings are embedded as escaped JS string literals, so quotes
    # and slashes arrive escaped; normalize them so the regexes match.
    buf = html.replace('\\"', '"')
    buf = buf.replace("\\u002F", "/")
    buf = buf.replace("\\/", "/")

    def _bounded(lat: float, lng: float) -> tuple[float, float] | None:
        # Pass the pair through fix_coords, then apply the bounding box.
        lat, lng = fix_coords(lat, lng)
        inside = 49 <= lat <= 56 and -7 <= lng <= 2
        return (lat, lng) if inside else None

    def _assemble(
        body: str, point, source: str, full_address: str | None = None
    ) -> dict:
        # outcode and postcode are read from the SAME object body as the
        # coordinates; the postcode is dropped if the two disagree.
        outcode_hit = _DETAIL_OUTCODE_IN_BODY_RE.search(body)
        outcode = outcode_hit.group(1) if outcode_hit else None

        postcode = None
        postcode_hit = _DETAIL_POSTCODE_IN_BODY_RE.search(body)
        if postcode_hit:
            postcode = extract_full_postcode(postcode_hit.group(1))
        if postcode and outcode and extract_outcode(postcode) != outcode.upper():
            postcode = None

        uprn_hit = _DETAIL_UPRN_IN_BODY_RE.search(body)
        name_hit = _DETAIL_NUMBER_OR_NAME_IN_BODY_RE.search(body)
        number_or_name = name_hit.group(1).strip() if name_hit else None

        lat, lng = point[0], point[1]
        return {
            "lat": lat,
            "lng": lng,
            "postcode": postcode,
            "outcode": outcode,
            "source": source,
            "uprn": uprn_hit.group(1) if uprn_hit else None,
            "number_or_name": number_or_name or None,
            "full_address": full_address,
        }

    def _with_full_address(record: dict | None) -> dict | None:
        # The full street address lives in the `address` twin, not in the
        # `location` wrapper. Take it from the twin sharing this record's uprn.
        # With no uprn to match on, fall back to the first twin that has an
        # address. When a uprn exists but no twin matches, leave it unset
        # rather than risk borrowing a comparable listing's address.
        if record is None or record.get("full_address"):
            return record
        wanted_uprn = record.get("uprn")
        first_seen = None
        for twin in _DETAIL_ADDRESS_RE.finditer(buf):
            street = twin.group(1) or None
            if street is None:
                continue
            if first_seen is None:
                first_seen = street
            if not wanted_uprn:
                continue
            twin_uprn = _DETAIL_UPRN_IN_BODY_RE.search(twin.group(2))
            if twin_uprn and twin_uprn.group(1) == wanted_uprn:
                record["full_address"] = street
                return record
        if wanted_uprn is None:
            record["full_address"] = first_seen
        return record

    # Strategy 1: the property's own `location` wrapper (authoritative).
    # Remember the first valid one, but return early for one whose outcode
    # equals the searched outcode.
    primary = None
    for wrapper in _DETAIL_LOCATION_RE.finditer(buf):
        body = wrapper.group(1)
        coords_hit = _DETAIL_COORDS_IN_BODY_RE.search(body)
        if not coords_hit:
            continue
        point = _bounded(float(coords_hit.group(1)), float(coords_hit.group(2)))
        if not point:
            continue
        candidate = _assemble(body, point, "detail_location")
        if primary is None:
            primary = candidate
        candidate_outcode = candidate["outcode"]
        if (
            search_outcode
            and candidate_outcode
            and candidate_outcode.upper() == search_outcode.upper()
        ):
            return _with_full_address(candidate)
    if primary is not None:
        return _with_full_address(primary)

    # Strategy 2: the `address` map-widget twin, used as a backup.
    for twin in _DETAIL_ADDRESS_RE.finditer(buf):
        street = twin.group(1) or None
        body = twin.group(2)
        latlng_hit = _DETAIL_LATLNG_IN_BODY_RE.search(body)
        if not latlng_hit:
            continue
        point = _bounded(float(latlng_hit.group(1)), float(latlng_hit.group(2)))
        if point:
            return _assemble(body, point, "detail_address_obj", full_address=street)

    return None
def _detail_cache_key(listing_url: str) -> str:
    """Return the detail-cache key for a listing URL.

    The key is the numeric id from the ``/details/<id>/`` segment when the URL
    has one; otherwise the URL itself is the key.
    """
    found = _LISTING_ID_RE.search(listing_url)
    if found is None:
        return listing_url
    return found.group(1)
def _fetch_listing_detail(
    detail_page,
    listing_url: str,
    search_outcode: str | None = None,
) -> dict | None:
    """Load a listing detail page and return its parsed geo dict (or None).

    Outcomes, failures included, are memoised in ``_detail_cache`` under the
    listing's cache key, so a listing is loaded at most once per run. Relative
    URLs are resolved against ``ZOOPLA_BASE``.

    Ordinary navigation and parsing errors are logged at debug level and turned
    into ``None`` so the caller can fall back to outcode-level resolution.
    ``TurnstileError`` is re-raised untouched so a challenge still ends the
    run. Navigation uses ``ZOOPLA_DETAIL_GOTO_TIMEOUT_MS`` as its timeout.
    """
    key = _detail_cache_key(listing_url)
    if key in _detail_cache:
        return _detail_cache[key]

    if listing_url.startswith("http"):
        url = listing_url
    else:
        url = ZOOPLA_BASE + listing_url

    try:
        detail_page.goto(
            url, wait_until="domcontentloaded", timeout=ZOOPLA_DETAIL_GOTO_TIMEOUT_MS
        )
        _ensure_not_challenged(detail_page)
        parsed = parse_detail_geo(detail_page.content(), search_outcode=search_outcode)
    except TurnstileError:
        # A challenge must propagate; it is never cached as a failed fetch.
        raise
    except Exception as exc:
        log.debug("Zoopla detail fetch failed %s: %s", url, _exception_detail(exc))
        parsed = None

    _detail_cache[key] = parsed
    return parsed
def _map_property_type(raw_type: str | None) -> str: def _map_property_type(raw_type: str | None) -> str:
"""Map Zoopla property type text to canonical type.""" """Map Zoopla property type text to canonical type."""
if not raw_type: if not raw_type:
@ -1109,28 +1363,64 @@ def transform_property(
pc_index: PostcodeSpatialIndex, pc_index: PostcodeSpatialIndex,
pc_coords: dict[str, tuple[float, float]], pc_coords: dict[str, tuple[float, float]],
search_outcode: str | None = None, search_outcode: str | None = None,
detail: dict | None = None,
) -> dict | None: ) -> dict | None:
"""Transform a raw Zoopla listing dict into the standard output schema. """Transform a raw Zoopla listing dict into the standard output schema.
Zoopla search cards do not include coordinates, so we resolve lat/lng Zoopla search cards only expose an outcode-level address, so precise
from postcodes extracted from the address text.""" location comes from the listing's detail page (see ``parse_detail_geo`` /
``_fetch_listing_detail``), passed in as ``detail``. When detail-page
coordinates are available we resolve the nearest postcode via the spatial
index mirroring rightmove/onthemarket and only fall back to the coarse
outcode centroid when no detail location could be obtained."""
price = parse_int_value(raw.get("price")) or 0 price = parse_int_value(raw.get("price")) or 0
address = raw.get("address", "") or "" address = raw.get("address", "") or ""
# Resolve postcode and coordinates from address
extracted_postcode = extract_full_postcode(address) extracted_postcode = extract_full_postcode(address)
postcode = extracted_postcode detail = detail or {}
postcode_source = "address" if extracted_postcode else None detail_postcode = extract_full_postcode(detail.get("postcode"))
# Detail-page address fields: the UPRN keys an exact EPC join, and the
# full street address / house number-or-name beat the outcode-level card
# address for the Price-Paid join. All three are absent unless the detail
# page was fetched, so every consumer must tolerate None.
detail_uprn = detail.get("uprn") or None
detail_full_address = detail.get("full_address") or None
detail_number_or_name = detail.get("number_or_name") or None
postcode = postcode_source = inferred_postcode = None
lat = lng = None lat = lng = None
if postcode: # (A) Best: detail-page coordinates -> nearest postcode (authoritative).
coords = pc_coords.get(postcode) detail_lat, detail_lng = detail.get("lat"), detail.get("lng")
if coords: if detail_lat is not None and detail_lng is not None:
lat, lng = coords fixed_lat, fixed_lng = fix_coords(detail_lat, detail_lng)
if 49 <= fixed_lat <= 56 and -7 <= fixed_lng <= 2:
nearest = pc_index.nearest(fixed_lat, fixed_lng)
if nearest:
lat, lng, inferred_postcode = fixed_lat, fixed_lng, nearest
candidate = detail_postcode or extracted_postcode
postcode, resolved_source = resolve_listing_postcode(candidate, nearest)
postcode_source = (
"detail_address"
if resolved_source == "address"
else "detail_coordinates"
)
# (B) Detail-page postcode without usable coordinates -> geocode it.
if lat is None and detail_postcode and detail_postcode in pc_coords:
lat, lng = pc_coords[detail_postcode]
postcode = inferred_postcode = detail_postcode
postcode_source = "detail_address"
# (C) Full postcode in the search-card address -> geocode it.
if lat is None and extracted_postcode and extracted_postcode in pc_coords:
lat, lng = pc_coords[extracted_postcode]
postcode = extracted_postcode
postcode_source = "address"
# (D) Last resort: coarse outcode-level centroid (loses per-listing precision).
if lat is None: if lat is None:
# Try outcode-level fallback from address text
addr_outcode = _extract_outcode(address) addr_outcode = _extract_outcode(address)
if addr_outcode: if addr_outcode:
result = _resolve_outcode_coords(addr_outcode, pc_coords) result = _resolve_outcode_coords(addr_outcode, pc_coords)
@ -1138,7 +1428,6 @@ def transform_property(
postcode, lat, lng = result postcode, lat, lng = result
postcode_source = "address_outcode" postcode_source = "address_outcode"
# Final fallback: use the outcode we know we're searching
if lat is None and search_outcode: if lat is None and search_outcode:
result = _resolve_outcode_coords(search_outcode, pc_coords) result = _resolve_outcode_coords(search_outcode, pc_coords)
if result: if result:
@ -1188,9 +1477,17 @@ def transform_property(
"Postcode": postcode, "Postcode": postcode,
"Postcode source": postcode_source or "unknown", "Postcode source": postcode_source or "unknown",
"Extracted postcode": extracted_postcode, "Extracted postcode": extracted_postcode,
"Inferred postcode": postcode if postcode_source != "address" else None, "Inferred postcode": (
"Listing raw address": address, inferred_postcode
"Address per Property Register": clean_listing_address(address), if inferred_postcode is not None
else (postcode if postcode_source != "address" else None)
),
"Listing raw address": detail_full_address or address,
"Address per Property Register": build_register_address(
detail_full_address or address, detail_number_or_name
),
"UPRN": detail_uprn,
"Property number or name": detail_number_or_name,
"Leasehold/Freehold": raw.get("tenure") or None, "Leasehold/Freehold": raw.get("tenure") or None,
"Property type": _map_property_type(raw.get("property_type")), "Property type": _map_property_type(raw.get("property_type")),
"Property sub-type": normalize_sub_type(raw.get("property_type")), "Property sub-type": normalize_sub_type(raw.get("property_type")),
@ -1215,6 +1512,9 @@ def search_outcode(
pc_index: PostcodeSpatialIndex, pc_index: PostcodeSpatialIndex,
pc_coords: dict[str, tuple[float, float]], pc_coords: dict[str, tuple[float, float]],
max_properties: int | None = None, max_properties: int | None = None,
detail_page=None,
detail_cap: int = 0,
detail_budget_seconds: float | None = None,
) -> tuple[list[dict], str | None]: ) -> tuple[list[dict], str | None]:
"""Search Zoopla for properties in one outcode. """Search Zoopla for properties in one outcode.
@ -1222,6 +1522,12 @@ def search_outcode(
search flow, extracts listings from rendered DOM, and transforms to the search flow, extracts listings from rendered DOM, and transforms to the
standard output schema. standard output schema.
When ``detail_page`` (a second browser tab) and a positive ``detail_cap``
are supplied, up to ``detail_cap`` listings per outcode have their detail
page fetched for a precise postcode (see ``_fetch_listing_detail``).
``detail_budget_seconds`` caps the wall-clock time spent fetching details so
the per-outcode timeout that also guards search pagination is never starved.
Returns (properties, search_url). Returns (properties, search_url).
Raises TurnstileError if Cloudflare blocks us mid-session. Raises TurnstileError if Cloudflare blocks us mid-session.
@ -1231,12 +1537,25 @@ def search_outcode(
total_results = _get_result_count(page) total_results = _get_result_count(page)
fetch_detail = None
detail_deadline = None
if detail_page is not None and detail_cap > 0:
fetch_detail = lambda url: _fetch_listing_detail( # noqa: E731
detail_page, url, search_outcode=outcode
)
if detail_budget_seconds is not None:
detail_deadline = time.monotonic() + detail_budget_seconds
# Always try extraction even if result count is 0 — the count regex may # Always try extraction even if result count is 0 — the count regex may
# not match Zoopla's current text format, but listings may still be in DOM # not match Zoopla's current text format, but listings may still be in DOM
raw_listings = _paginate( raw_listings = _paginate(
page, page,
total_results, total_results,
max_properties=max_properties, max_properties=max_properties,
fetch_detail=fetch_detail,
detail_cap=detail_cap,
detail_state={"fetched": 0},
detail_deadline=detail_deadline,
) )
if not raw_listings: if not raw_listings:
if total_results > 0: if total_results > 0:
@ -1252,7 +1571,11 @@ def search_outcode(
for raw in raw_listings: for raw in raw_listings:
try: try:
transformed = transform_property( transformed = transform_property(
raw, pc_index, pc_coords, search_outcode=outcode raw,
pc_index,
pc_coords,
search_outcode=outcode,
detail=raw.get("_detail"),
) )
except Exception as exc: except Exception as exc:
log.warning( log.warning(

View file

@ -0,0 +1,164 @@
"""Zoopla scraping via FlareSolverr (no browser/VNC needed).
FlareSolverr solves Zoopla's Cloudflare and returns the rendered HTML, which
still contains the React Server Components flight stream so the existing pure
parsers work unchanged:
- the search page yields the outcode's listing detail URLs, and
- each detail page's flight stream carries the property's location object
(postcode + coordinates) that ``parse_detail_geo`` extracts, plus the
listing fields (price/beds/baths/tenure/floor area) parsed here.
Verified live (2026-05-30) against Zoopla through the Gluetun VPN: a warm
FlareSolverr session solves the SW9 search + detail pages and the flight data
is present (e.g. detail 73326946 -> SW9 0HD @ 51.477238,-0.116819).
This is selected by constants.ZOOPLA_FETCHER == "flaresolverr"; the Camoufox
path in zoopla.py remains for ZOOPLA_FETCHER == "camoufox".
"""
import logging
import re
import time
from constants import DELAY_BETWEEN_PAGES, ZOOPLA_BASE
from flaresolverr import FlareSolverrError, FlareSolverrSession
from spatial import PostcodeSpatialIndex
from zoopla import _url_with_page, parse_detail_geo, transform_property
log = logging.getLogger("zoopla")
# Safety bound on how many search-result pages to walk per outcode.
_MAX_SERP_PAGES = 60
_DETAIL_PATH_RE = re.compile(r"/(?:for-sale|new-homes)/details/\d+/")
_LISTING_ID_RE = re.compile(r"/details/(\d+)/")
def _int(pattern: str, buf: str) -> int | None:
match = re.search(pattern, buf)
return int(match.group(1)) if match else None
def parse_detail_listing(html: str) -> dict:
    """Pull the non-location listing fields out of a Zoopla detail page.

    The page text is first un-escaped (``\\"`` -> ``"`` and ``\\/`` -> ``/``)
    and every field is then read with a regular expression. Each field is
    best-effort: when its pattern does not match, the value is ``None``.

    Returns:
        A dict with the keys ``price``, ``beds``, ``baths``, ``receptions``,
        ``floor_area_sqft``, ``tenure``, ``property_type`` and ``address``.
    """
    text = html.replace('\\"', '"').replace("\\/", "/")

    # Prefer "internalValue"; fall back to "priceUnformatted" when it is absent.
    price = _int(r'"internalValue":(\d+)', text)
    if price is None:
        price = _int(r'"priceUnformatted":(\d+)', text)

    tenure = None
    tenure_found = re.search(r'"tenure":"([a-zA-Z]+)"', text)
    if tenure_found:
        tenure = tenure_found.group(1).title()

    # Address and property type are carved out of a "children" string that
    # contains " for sale" (per the original author, the page title), e.g.
    # "Caldwell Street, Stockwell SW9, 4 bed property for sale, £995,000 - Zoopla"
    address = None
    property_type = None
    title_found = re.search(r'"children":"([^"]*? for sale[^"]*?)"', text)
    if title_found:
        title = title_found.group(1)
        address_found = re.match(r"(.+?),\s*\d+\s*bed", title)
        if address_found:
            address = address_found.group(1).strip()
        kind_found = re.search(r"\d+\s*bed\s+([\w\s-]+?)\s+for sale", title)
        if kind_found:
            property_type = kind_found.group(1).strip()

    # An explicit "propertyType" field overrides whatever the title yielded.
    explicit = re.search(r'"propertyType":"([^"]+)"', text)
    if explicit:
        property_type = explicit.group(1)

    return {
        "price": price,
        "beds": _int(r'"numBedrooms":(\d+)', text),
        "baths": _int(r'"numBaths":(\d+)', text),
        "receptions": _int(r'"numLivingRooms":(\d+)', text),
        "floor_area_sqft": _int(r'"sizeSqft":(\d+)', text),
        "tenure": tenure,
        "property_type": property_type,
        "address": address,
    }
def _enumerate_detail_paths(fs: FlareSolverrSession, outcode: str, limit: int | None) -> list[str]:
    """Collect listing detail paths by walking an outcode's search-result pages.

    Pages are fetched through ``fs.get`` starting at page 1, for at most
    ``_MAX_SERP_PAGES`` pages. Paths are de-duplicated by listing id (or by
    the path itself when no id can be extracted) and kept in first-seen order.
    The walk stops as soon as ``limit`` paths have been collected, or when a
    page contributes no new listing. ``limit=None`` means no cap.
    """
    base = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"
    paths: list[str] = []
    known_ids: set[str] = set()

    for page_num in range(1, _MAX_SERP_PAGES + 1):
        page_url = _url_with_page(base, page_num) if page_num != 1 else base
        markup = fs.get(page_url)
        count_before = len(paths)

        for path in _DETAIL_PATH_RE.findall(markup):
            id_found = _LISTING_ID_RE.search(path)
            key = id_found.group(1) if id_found else path
            if key not in known_ids:
                known_ids.add(key)
                paths.append(path)
                if limit is not None and len(paths) >= limit:
                    return paths

        # A page that added nothing new ends the walk.
        if len(paths) == count_before:
            break
        time.sleep(DELAY_BETWEEN_PAGES)

    return paths
def search_outcode(
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    fs: FlareSolverrSession,
    max_properties: int | None = None,
    detail_cap: int = 0,
    detail_budget_seconds: float | None = None,
) -> tuple[list[dict], str | None]:
    """Scrape one outcode via FlareSolverr. Returns (properties, search_url).

    Every listing's detail page is fetched (that is where the postcode lives),
    so the effective listing count is bounded by both ``max_properties`` and
    ``detail_cap``; ``detail_budget_seconds`` caps wall-clock time on details.

    Args:
        outcode: Outcode to search; also passed to the parsers as ``search_outcode``.
        pc_index: Forwarded to ``transform_property``.
        pc_coords: Forwarded to ``transform_property``.
        fs: Session whose ``get(url)`` returns the page HTML.
        max_properties: Optional cap on listings; ``None`` means no cap.
        detail_cap: Cap on listings when positive; ``0`` (or less) means no cap.
        detail_budget_seconds: Seconds allowed for the detail-fetch loop.
            ``None`` means unlimited. Any number, including ``0``, is a real
            budget, matching the Camoufox ``search_outcode`` in ``zoopla.py``.

    Returns:
        The transformed listings and the search URL. Listings whose fetch or
        transform failed, or whose transform returned nothing, are dropped.
    """
    limit = detail_cap if detail_cap and detail_cap > 0 else None
    if max_properties is not None:
        limit = max_properties if limit is None else min(limit, max_properties)

    base = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"
    paths = _enumerate_detail_paths(fs, outcode, limit)
    if not paths:
        return [], base

    # ``None`` is the only "no budget" sentinel. A truthiness test here would
    # turn a budget of 0 into "unlimited" while 0.001 meant "fetch nothing".
    deadline = None
    if detail_budget_seconds is not None:
        deadline = time.monotonic() + detail_budget_seconds

    properties: list[dict] = []
    dropped = 0
    for path in paths:
        if deadline is not None and time.monotonic() >= deadline:
            log.info("Zoopla %s: detail-fetch budget reached after %d", outcode, len(properties))
            break
        id_match = _LISTING_ID_RE.search(path)
        listing_id = id_match.group(1) if id_match else path
        try:
            html = fs.get(ZOOPLA_BASE + path)
            geo = parse_detail_geo(html, search_outcode=outcode)
            raw = {"id": listing_id, "url": path, **parse_detail_listing(html)}
            prop = transform_property(
                raw, pc_index, pc_coords, search_outcode=outcode, detail=geo
            )
        except FlareSolverrError as exc:
            log.warning("Zoopla %s detail %s fetch failed: %s", outcode, listing_id, exc)
            prop = None
        except Exception as exc:  # noqa: BLE001 - never let one listing kill the outcode
            log.warning("Zoopla %s detail %s transform failed: %s", outcode, listing_id, exc)
            prop = None
        if prop:
            properties.append(prop)
        else:
            dropped += 1
        time.sleep(DELAY_BETWEEN_PAGES)

    log.info("Zoopla %s: %d listings (%d dropped)", outcode, len(properties), dropped)
    return properties, base

View file

@ -606,12 +606,13 @@ function OverlayTileLayers({
const showTrees = activeOverlays.has('trees-outside-woodlands'); const showTrees = activeOverlays.has('trees-outside-woodlands');
const showPropertyBorders = activeOverlays.has('property-borders'); const showPropertyBorders = activeOverlays.has('property-borders');
// Restrict the heatmap to the selected crime types. When every type is // Restrict the heatmap to the selected crime types. This must always be a
// selected we omit the filter entirely so all features contribute. // concrete expression: passing `filter={undefined}` makes react-map-gl call
const crimeFilter = // map.addLayer({filter: undefined}), which MapLibre rejects at validation
activeCrimeTypes.size >= CRIME_TYPE_VALUES.length // ("filter: array expected, undefined found"), so the layer is never created
? undefined // and the heatmap stays blank until a later setFilter call. An `in` over the
: ['in', ['get', 'crime_type'], ['literal', Array.from(activeCrimeTypes)]]; // selected types matches everything when all 14 are selected.
const crimeFilter = ['in', ['get', 'crime_type'], ['literal', Array.from(activeCrimeTypes)]];
return ( return (
<> <>

View file

@ -0,0 +1,107 @@
import { cleanup, fireEvent, render, screen } from '@testing-library/react';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import MobileDrawer from './MobileDrawer';
vi.mock('react-i18next', () => ({
useTranslation: () => ({
t: (key: string) => key,
}),
}));
const originalSetPointerCapture = HTMLElement.prototype.setPointerCapture;
// Renders a MobileDrawer on the "area" tab and returns the render result together
// with the drag handle, the drawer root, the panel (the root's last child <div>),
// and the onClose spy. Throws if any of those three elements is not found.
function renderDrawer(onClose = vi.fn()) {
  const view = render(
    <MobileDrawer
      onClose={onClose}
      renderArea={() => <div>Area content</div>}
      renderProperties={() => <div>Properties content</div>}
      tab="area"
      onTabChange={vi.fn()}
    />
  );

  // Looks up one element in the rendered container, failing loudly when it is absent.
  const requireElement = (selector: string, message: string): HTMLElement => {
    const found = view.container.querySelector(selector);
    if (found instanceof HTMLElement) return found;
    throw new Error(message);
  };

  const handle = requireElement('[data-mobile-drawer-drag-handle]', 'Expected drawer drag handle');
  const root = requireElement('[data-tutorial="right-pane"]', 'Expected drawer root');
  const panel = requireElement(
    '[data-tutorial="right-pane"] > div:last-child',
    'Expected drawer panel'
  );

  return { ...view, handle, onClose, panel, root };
}
describe('MobileDrawer', () => {
  // One complete drag gesture on the handle: press at `fromY`, move to `toY`,
  // then release at `toY`, all with the same pointer id.
  const dragHandle = (target: HTMLElement, pointerId: number, fromY: number, toY: number) => {
    fireEvent.pointerDown(target, { pointerId, clientY: fromY });
    fireEvent.pointerMove(target, { pointerId, clientY: toY });
    fireEvent.pointerUp(target, { pointerId, clientY: toY });
  };

  beforeEach(() => {
    // Stub setPointerCapture for the duration of each test.
    HTMLElement.prototype.setPointerCapture = vi.fn();
  });

  afterEach(() => {
    cleanup();
    // Put back whatever setPointerCapture was before the suite stubbed it.
    Object.defineProperty(HTMLElement.prototype, 'setPointerCapture', {
      configurable: true,
      value: originalSetPointerCapture,
    });
  });

  it('lowers and stays open when swiped down from the handle', () => {
    const drawer = renderDrawer();

    dragHandle(drawer.handle, 1, 120, 230);

    expect(drawer.onClose).not.toHaveBeenCalled();
    expect(drawer.panel.style.transform).toBe('translateY(110px)');
  });

  it('can be raised again after being lowered', () => {
    const drawer = renderDrawer();

    dragHandle(drawer.handle, 1, 120, 230);
    dragHandle(drawer.handle, 2, 230, 170);

    expect(drawer.onClose).not.toHaveBeenCalled();
    expect(drawer.panel.style.transform).toBe('translateY(50px)');
  });

  it('keeps the close control reachable when dragged down far', () => {
    const drawer = renderDrawer();
    // Pin the panel height so the expected offset below is deterministic.
    Object.defineProperty(drawer.panel, 'offsetHeight', {
      configurable: true,
      value: 200,
    });

    dragHandle(drawer.handle, 1, 120, 420);

    expect(drawer.panel.style.transform).toBe('translateY(96px)');
  });

  it('leaves the rest of the mobile map usable while the panel is open', () => {
    const drawer = renderDrawer();
    const spacer = drawer.root.firstElementChild;
    if (!(spacer instanceof HTMLElement)) throw new Error('Expected drawer spacer');

    expect(drawer.root.className).toContain('pointer-events-none');
    expect(drawer.panel.className).toContain('pointer-events-auto');
    expect(spacer.className).not.toContain('bg-black');
  });

  it('closes from the close button', () => {
    const drawer = renderDrawer();

    fireEvent.click(screen.getByLabelText('mobileDrawer.closeDrawer'));

    expect(drawer.onClose).toHaveBeenCalledTimes(1);
  });
});

View file

@ -0,0 +1,11 @@
// Opacity returned by normalizeColorOpacity when the input is null, undefined,
// or not a finite number.
export const DEFAULT_COLOR_OPACITY = 1;
// Lower bound that normalizeColorOpacity clamps to (the upper bound is 1).
export const MIN_COLOR_OPACITY = 0.1;
/**
 * Coerces an opacity into the range [MIN_COLOR_OPACITY, 1].
 *
 * Missing values (null / undefined) and non-finite numbers (NaN, ±Infinity)
 * yield DEFAULT_COLOR_OPACITY; everything else is clamped into the range.
 */
export function normalizeColorOpacity(value: number | null | undefined): number {
  if (value === null || value === undefined) return DEFAULT_COLOR_OPACITY;
  if (!Number.isFinite(value)) return DEFAULT_COLOR_OPACITY;

  const raised = value < MIN_COLOR_OPACITY ? MIN_COLOR_OPACITY : value;
  return raised > 1 ? 1 : raised;
}
/** Normalizes `value` with normalizeColorOpacity and returns it as a whole-number percentage. */
export function colorOpacityToPercent(value: number): number {
  const normalized = normalizeColorOpacity(value);
  return Math.round(normalized * 100);
}

View file

@ -0,0 +1,35 @@
// Street-crime categories carried by the `crime_hotspots` vector tiles in the
// `crime_type` feature property. The `value` strings must match the police.uk
// "Crime type" values exactly (see pipeline/transform/crime_hotspot_tiles.py),
// because they are used directly in the MapLibre heatmap `filter` expression.
// `label` is a shorter, human-friendly name for the overlay-selector checkboxes.
// One selectable street-crime category.
export interface CrimeTypeDef {
  // Exact `crime_type` string carried by the tiles; used verbatim in the heatmap filter.
  value: string;
  // Shorter, human-friendly name shown next to the overlay-selector checkbox.
  label: string;
}
// The 14 street-crime categories. `value` is the string matched against the
// `crime_type` feature property; `label` is the display name.
export const CRIME_TYPES: readonly CrimeTypeDef[] = [
  { value: 'Violence and sexual offences', label: 'Violence & sexual offences' },
  { value: 'Anti-social behaviour', label: 'Anti-social behaviour' },
  { value: 'Criminal damage and arson', label: 'Criminal damage & arson' },
  { value: 'Public order', label: 'Public order' },
  { value: 'Shoplifting', label: 'Shoplifting' },
  { value: 'Vehicle crime', label: 'Vehicle crime' },
  { value: 'Burglary', label: 'Burglary' },
  { value: 'Other theft', label: 'Other theft' },
  { value: 'Theft from the person', label: 'Theft from the person' },
  { value: 'Bicycle theft', label: 'Bicycle theft' },
  { value: 'Drugs', label: 'Drugs' },
  { value: 'Robbery', label: 'Robbery' },
  { value: 'Possession of weapons', label: 'Possession of weapons' },
  { value: 'Other crime', label: 'Other crime' },
] as const;
// The `value` string of every CRIME_TYPES entry, in the same order.
export const CRIME_TYPE_VALUES: readonly string[] = CRIME_TYPES.map((c) => c.value);
// Set form of CRIME_TYPE_VALUES, backing the membership test in isCrimeTypeValue.
const CRIME_TYPE_VALUE_SET = new Set<string>(CRIME_TYPE_VALUES);
/** Reports whether `value` is exactly one of the `value` strings listed in CRIME_TYPES. */
export function isCrimeTypeValue(value: string): boolean {
  const known = CRIME_TYPE_VALUE_SET.has(value);
  return known;
}

View file

@ -4,7 +4,10 @@ Downloads GML files for all local authorities from the INSPIRE download page.
Each ZIP contains a GML file with title extent polygons for that authority. Each ZIP contains a GML file with title extent polygons for that authority.
Source: https://use-land-property-data.service.gov.uk/datasets/inspire/download Source: https://use-land-property-data.service.gov.uk/datasets/inspire/download
License: INSPIRE End User Licence License: Open Government Licence v3.0 (since 1 July 2020, under the PSGA).
Requires HM Land Registry + Ordnance Survey (AC0000851063) attribution; see
the conditions page at the source URL. Boundaries are indicative "general
boundaries", not the legal extent of title.
""" """
import argparse import argparse

View file

@ -0,0 +1,505 @@
"""Build a high-resolution England aerial PMTiles archive from EA Vertical Aerial Photography.
The Environment Agency / Defra Vertical Aerial Photography (VAP) archive is open
(OGL v3.0) RGB orthophotography at 10-50 cm, distributed as 5 km ECW tiles on the
British National Grid. There is no public imagery tile service, so we mirror the
Sentinel-2 ``satellite.pmtiles`` approach: query the Defra survey download API for
an area of interest, pick the best RGB capture per OS tile, download and decode the
ECW rasters, re-tile them into Web-Mercator raster tiles, and bake a single PMTiles
archive that the server stacks *over* the Sentinel-2 base where coverage exists.
ECW decoding needs a GDAL build that includes the (free, read-only) ERDAS ECW/JP2
SDK, which is not present in the rasterio wheel. The mosaic + tiling step therefore
runs inside a GDAL-with-ECW Docker image (see ``docker/gdal-ecw/Dockerfile``); the
rest of the pipeline is plain Python plus the ``pmtiles`` CLI.
"""
from __future__ import annotations
import argparse
import json
import re
import shutil
import sqlite3
import subprocess
import tempfile
import urllib.error
import urllib.request
import zipfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from pipeline.download.tiles import ensure_pmtiles_cli
from pipeline.local_temp import local_tmp_dir
# Defra Data Services Platform survey download API (reverse-engineered from the
# environment.data.gov.uk/survey front-end; no official API is documented).
SEARCH_URL = (
"https://environment.data.gov.uk/backend/catalog/api/tiles/collections/survey/search"
)
SURVEY_PAGE_URL = "https://environment.data.gov.uk/survey"
# Static public key baked into the survey page JS. May rotate -- we try to scrape a
# fresh one from the page and only fall back to this literal.
DEFAULT_SUBSCRIPTION_KEY = "dspui"
SUBSCRIPTION_KEY_RE = re.compile(r"subscription-key=([A-Za-z0-9]+)")
# True-colour RGB product only (skip IRRGB near-infra-red and Night Time variants).
VAP_RGB_PRODUCT = "vertical_aerial_photography_tiles_rgb"
# Greater London bounding box (lon/lat). The API only returns tiles where coverage
# exists, so a generous bbox is fine -- it does not force blank downloads.
DEFAULT_AOI: dict = {
"type": "Polygon",
"coordinates": [
[
[-0.55, 51.25],
[0.30, 51.25],
[0.30, 51.70],
[-0.55, 51.70],
[-0.55, 51.25],
]
],
}
DEFAULT_MIN_ZOOM = 14
DEFAULT_MAX_ZOOM = 19
# GDAL image with the ECW driver. The official OSGeo image does not ship ECW, so
# this defaults to the locally-built image from docker/gdal-ecw/Dockerfile.
DEFAULT_GDAL_IMAGE = "perfect-postcode/gdal-ecw:latest"
USER_AGENT = "perfect-postcode-satellite-highres/1.0"
ATTRIBUTION_TEMPLATE = (
"Environment Agency Vertical Aerial Photography - "
"© Environment Agency copyright and/or database right {year}. "
"All rights reserved. Licensed under the Open Government Licence v3.0."
)
@dataclass(frozen=True)
class VapTile:
    """One survey download record from the Defra search API."""

    # result["product"]["id"]; compared with VAP_RGB_PRODUCT to keep RGB captures only.
    product_id: str
    # result["year"]["id"] as an int; the later year wins a resolution tie.
    year: int
    # result["resolution"]["id"] as a float; smaller values are preferred.
    # Presumably metres, going by the field name -- confirm against the API.
    resolution_m: float
    # result["tile"]["id"]; the key tiles are grouped by, also used in download filenames.
    os_tile_id: str
    # Download URL; the subscription key is appended to it as a query parameter.
    uri: str
    # result["label"], or "" when the record has none.
    label: str
def parse_search_results(payload: dict) -> list[VapTile]:
    """Convert the search API's JSON payload into ``VapTile`` records.

    Reads ``payload["results"]`` (treated as empty when the key is missing).
    A record that lacks a required field, or whose year / resolution cannot
    be converted to a number, is skipped instead of aborting the whole parse.
    """
    records: list[VapTile] = []
    for raw in payload.get("results", []):
        try:
            record = VapTile(
                product_id=raw["product"]["id"],
                year=int(raw["year"]["id"]),
                resolution_m=float(raw["resolution"]["id"]),
                os_tile_id=raw["tile"]["id"],
                uri=raw["uri"],
                label=raw.get("label", ""),
            )
        except (KeyError, TypeError, ValueError):
            # Malformed record: drop it and carry on with the rest.
            continue
        records.append(record)
    return records
def select_best_rgb_tiles(tiles: list[VapTile]) -> list[VapTile]:
    """Keep a single RGB capture for each OS tile.

    Records whose product is not ``VAP_RGB_PRODUCT`` are ignored. Among the
    remaining records sharing an ``os_tile_id`` the winner is chosen by
    ``_is_better`` (finest resolution, then latest year). The result is
    ordered by ``os_tile_id``.
    """
    winners: dict[str, VapTile] = {}
    for candidate in tiles:
        if candidate.product_id == VAP_RGB_PRODUCT:
            holder = winners.get(candidate.os_tile_id)
            if holder is None or _is_better(candidate, holder):
                winners[candidate.os_tile_id] = candidate
    return [winners[tile_id] for tile_id in sorted(winners)]
def _is_better(candidate: VapTile, incumbent: VapTile) -> bool:
    """Return True when ``candidate`` should replace ``incumbent``.

    A strictly smaller ``resolution_m`` wins; when the resolutions are equal,
    a strictly later ``year`` wins.
    """
    if candidate.resolution_m == incumbent.resolution_m:
        return candidate.year > incumbent.year
    return candidate.resolution_m < incumbent.resolution_m
def _http_get(url: str, timeout: float) -> bytes:
    """GET ``url`` with the module's ``USER_AGENT`` header and return the raw body bytes."""
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    response = urllib.request.urlopen(request, timeout=timeout)
    with response:
        return response.read()
def resolve_subscription_key(explicit: str | None, timeout: float = 30.0) -> str:
    """Work out which subscription key to use for survey downloads.

    Order of preference:
      1. ``explicit`` when it is non-empty (no network access happens);
      2. the first ``subscription-key=...`` value found in the survey page HTML;
      3. the first such value found in any ``/_next/static/*.js`` chunk the
         page references, trying the chunks in the order they appear;
      4. ``DEFAULT_SUBSCRIPTION_KEY``.

    URL, timeout and connection errors during scraping are reported on stdout
    and lead to the default key instead of an exception.
    """
    if explicit:
        return explicit

    try:
        page_html = _http_get(SURVEY_PAGE_URL, timeout).decode("utf-8", "ignore")
        found = SUBSCRIPTION_KEY_RE.search(page_html)
        if found:
            return found.group(1)

        # Not inline in the page: try each referenced JS chunk until one has it.
        for chunk_path in re.findall(r'src="(/_next/static/[^"]+\.js)"', page_html):
            chunk_bytes = _http_get(f"https://environment.data.gov.uk{chunk_path}", timeout)
            found = SUBSCRIPTION_KEY_RE.search(chunk_bytes.decode("utf-8", "ignore"))
            if found:
                return found.group(1)
    except (urllib.error.URLError, TimeoutError, ConnectionError) as err:
        print(f"Could not scrape subscription key ({err}); using default", flush=True)

    return DEFAULT_SUBSCRIPTION_KEY
def search_vap_tiles(aoi: dict, timeout: float = 60.0) -> list[VapTile]:
    """Query the survey search API for ``aoi`` and return the tiles worth downloading.

    ``aoi`` is serialised to JSON and POSTed to ``SEARCH_URL`` with a
    ``application/geo+json`` content type. The JSON response is parsed with
    ``parse_search_results`` and narrowed with ``select_best_rgb_tiles``.
    A one-line summary is printed before returning.
    """
    request_headers = {
        "Content-Type": "application/geo+json",
        "Referer": SURVEY_PAGE_URL,
        "User-Agent": USER_AGENT,
    }
    request = urllib.request.Request(
        SEARCH_URL,
        data=json.dumps(aoi).encode("utf-8"),
        headers=request_headers,
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        payload = json.load(response)

    parsed = parse_search_results(payload)
    selected = select_best_rgb_tiles(parsed)
    print(
        f"Search returned {payload.get('count', 0)} records; "
        f"selected {len(selected)} RGB tile(s)",
        flush=True,
    )
    return selected
def _download_and_extract(
    tile: VapTile, ecw_dir: Path, key: str, timeout: float, retries: int
) -> list[Path]:
    """Download one survey zip and extract its ECW raster(s).

    The zip is fetched from ``tile.uri`` with the subscription key appended as
    a query parameter and written to ``ecw_dir/<os_tile_id>.zip``. Every
    ``.ecw`` member is then extracted to ``ecw_dir/<os_tile_id>_<basename>``;
    only the member's base name is used, so directory components inside the
    archive never influence where the file is written.

    The temporary zip is removed whether the function succeeds or raises, so a
    failed download or a corrupt archive leaves no partial zip behind.

    Args:
        tile: Record to download.
        ecw_dir: Existing directory that receives the zip and the rasters.
        key: Subscription key appended to the download URL.
        timeout: Timeout in seconds passed to ``urlopen``.
        retries: Extra attempts after the first failed one.

    Returns:
        Paths of the extracted rasters (empty when the archive has no ECW).

    Raises:
        RuntimeError: when the final download attempt fails with a URL,
            timeout or connection error.
    """
    url = f"{tile.uri}?subscription-key={key}"
    zip_path = ecw_dir / f"{tile.os_tile_id}.zip"
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})

    extracted: list[Path] = []
    try:
        for attempt in range(retries + 1):
            try:
                with urllib.request.urlopen(request, timeout=timeout) as response, zip_path.open(
                    "wb"
                ) as out:
                    shutil.copyfileobj(response, out, length=1 << 20)
                break
            except (urllib.error.URLError, TimeoutError, ConnectionError) as err:
                # Earlier failures are retried silently; only the last one surfaces.
                if attempt == retries:
                    raise RuntimeError(f"Failed to download {url}: {err}") from err

        with zipfile.ZipFile(zip_path) as archive:
            for member in archive.infolist():
                if member.is_dir() or not member.filename.lower().endswith(".ecw"):
                    continue
                target = ecw_dir / f"{tile.os_tile_id}_{Path(member.filename).name}"
                with archive.open(member) as src, target.open("wb") as dst:
                    shutil.copyfileobj(src, dst, length=1 << 20)
                extracted.append(target)
    finally:
        # The zip is only a staging file; never leave it behind, even on failure.
        zip_path.unlink(missing_ok=True)

    if not extracted:
        print(f"  {tile.os_tile_id}: no ECW in archive (skipped)", flush=True)
    return extracted
def download_tiles(
    tiles: list[VapTile],
    ecw_dir: Path,
    key: str,
    max_workers: int,
    timeout: float,
    retries: int,
) -> list[Path]:
    """Fetch all ``tiles`` on a thread pool and return every extracted ECW path.

    ``ecw_dir`` is created when missing. Each tile is handed to
    ``_download_and_extract``; results are gathered in completion order and a
    progress line is printed per finished tile. An exception raised by any
    download propagates to the caller.
    """
    ecw_dir.mkdir(parents=True, exist_ok=True)
    collected: list[Path] = []

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for tile in tiles:
            job = pool.submit(_download_and_extract, tile, ecw_dir, key, timeout, retries)
            pending[job] = tile

        for done, job in enumerate(as_completed(pending), start=1):
            tile = pending[job]
            collected.extend(job.result())
            print(
                f"Downloaded {done}/{len(tiles)} tiles "
                f"(latest: {tile.os_tile_id} {tile.resolution_m}m {tile.year})",
                flush=True,
            )

    return collected
def _build_tiles_with_gdal(
    work_dir: Path,
    gdal_image: str,
    min_zoom: int,
    max_zoom: int,
    jobs: int,
    webp_quality: int,
) -> Path:
    """Mosaic the ECW rasters and cut XYZ WebP tiles inside the GDAL docker image.

    ``work_dir`` is mounted at ``/work`` in a throwaway ``gdal_image``
    container; the rasters are read from ``/work/ecw`` and the tiles are
    written to ``/work/xyz``. Tiles are lossy WebP with an alpha channel, so
    coverage gaps stay see-through and the layer underneath shows through.

    Returns:
        The host path of the generated ``xyz`` directory.

    Raises:
        subprocess.CalledProcessError: when the container exits non-zero.
        RuntimeError: when the run succeeded but no ``xyz`` directory exists.
    """
    # Per the original notes, EA "RGB" ECWs are 4-band RGBA (band 4 is a
    # constant-255 validity/alpha mask), so the VRT is built without -addalpha,
    # which would add a fifth band. The script then:
    #   * forces EPSG:27700 -- the pixels are already British National Grid, and
    #     the EPSG code lets PROJ apply the OSTN15 datum shift for the
    #     reprojection to Web Mercator;
    #   * labels band 4 as alpha so gdal2tiles writes transparent tiles. Gaps the
    #     VRT fills with 0 then read as alpha=0 (transparent).
    steps = [
        "set -euo pipefail",
        "cd /work",
        "gdalbuildvrt -resolution highest mosaic.vrt ecw/*.ecw",
        "gdal_edit.py -a_srs EPSG:27700 "
        "-colorinterp_1 red -colorinterp_2 green -colorinterp_3 blue "
        "-colorinterp_4 alpha mosaic.vrt",
        f"gdal2tiles.py --xyz --zoom={min_zoom}-{max_zoom} "
        f"--processes={jobs} --resampling=average --webviewer=none "
        f"--tiledriver=WEBP --webp-quality={webp_quality} "
        "mosaic.vrt xyz",
    ]
    script = "; ".join(steps)

    command = [
        "docker",
        "run",
        "--rm",
        "-v",
        f"{work_dir.resolve()}:/work",
        gdal_image,
        "bash",
        "-c",
        script,
    ]
    subprocess.run(command, check=True)

    xyz_dir = work_dir / "xyz"
    if not xyz_dir.exists():
        raise RuntimeError("gdal2tiles produced no output directory")
    return xyz_dir
def _pack_xyz_to_mbtiles(
    xyz_dir: Path,
    mbtiles_path: Path,
    bounds: tuple[float, float, float, float],
    min_zoom: int,
    max_zoom: int,
    attribution: str,
) -> int:
    """Write a gdal2tiles XYZ WebP tree into a fresh MBTiles SQLite file.

    Any existing file at ``mbtiles_path`` is deleted first. Only
    ``<zoom>/<column>/<row>.webp`` entries whose three path parts are all
    digits are packed; everything else in the tree is ignored. XYZ rows are
    flipped to TMS rows, as MBTiles stores them.

    Returns the number of tiles inserted.
    """
    metadata_rows = [
        ("name", "EA Vertical Aerial Photography"),
        ("type", "overlay"),
        ("version", "1"),
        ("description", "Environment Agency high-resolution aerial imagery"),
        ("format", "webp"),
        ("attribution", attribution),
        ("bounds", ",".join(f"{value:.6f}" for value in bounds)),
        ("minzoom", str(min_zoom)),
        ("maxzoom", str(max_zoom)),
    ]

    def _is_numbered_dir(candidate: Path) -> bool:
        return candidate.is_dir() and candidate.name.isdigit()

    def _walk_tiles():
        """Yield ``(zoom, column, xyz_row, file)`` for every packable tile."""
        for zoom_dir in sorted(xyz_dir.iterdir()):
            if not _is_numbered_dir(zoom_dir):
                continue
            for col_dir in zoom_dir.iterdir():
                if not _is_numbered_dir(col_dir):
                    continue
                for tile_file in col_dir.glob("*.webp"):
                    if tile_file.stem.isdigit():
                        yield (
                            int(zoom_dir.name),
                            int(col_dir.name),
                            int(tile_file.stem),
                            tile_file,
                        )

    if mbtiles_path.exists():
        mbtiles_path.unlink()

    conn = sqlite3.connect(mbtiles_path)
    try:
        for pragma in ("PRAGMA journal_mode = WAL", "PRAGMA synchronous = NORMAL"):
            conn.execute(pragma)
        conn.execute("CREATE TABLE metadata (name TEXT, value TEXT)")
        conn.execute(
            "CREATE TABLE tiles (zoom_level INTEGER, tile_column INTEGER, "
            "tile_row INTEGER, tile_data BLOB)"
        )
        conn.execute(
            "CREATE UNIQUE INDEX tile_index ON tiles "
            "(zoom_level, tile_column, tile_row)"
        )
        conn.executemany(
            "INSERT INTO metadata (name, value) VALUES (?, ?)", metadata_rows
        )

        inserted = 0
        for inserted, (zoom, col, xyz_row, tile_file) in enumerate(
            _walk_tiles(), start=1
        ):
            # XYZ counts rows from the top, TMS from the bottom.
            tms_row = (1 << zoom) - 1 - xyz_row
            conn.execute(
                "INSERT OR REPLACE INTO tiles VALUES (?, ?, ?, ?)",
                (zoom, col, tms_row, tile_file.read_bytes()),
            )
            # Commit in batches so one transaction never grows unbounded.
            if inserted % 5000 == 0:
                conn.commit()
                print(f"  packed {inserted:,} tiles", flush=True)
        conn.commit()
    finally:
        conn.close()
    return inserted
def build_satellite_highres_tiles(
    output_path: Path,
    pmtiles_bin: Path,
    pmtiles_version: str,
    aoi: dict,
    min_zoom: int,
    max_zoom: int,
    gdal_image: str,
    subscription_key: str | None,
    max_workers: int,
    timeout: float,
    retries: int,
    jobs: int,
    webp_quality: int,
) -> None:
    """Build the high-resolution aerial overlay for ``aoi`` as a PMTiles file.

    Steps: search for aerial-photography tiles covering the AOI, download the
    ECW rasters, tile them with GDAL in Docker, pack the tiles into MBTiles and
    convert that to PMTiles at ``output_path``. All intermediates live in a
    temporary directory that is removed afterwards.

    Raises:
        ValueError: ``min_zoom`` is greater than ``max_zoom``.
        RuntimeError: the search, the downloads or the tiling produced nothing.
    """
    if min_zoom > max_zoom:
        raise ValueError("--min-zoom must be <= --max-zoom")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    ensure_pmtiles_cli(pmtiles_bin, pmtiles_version)

    vap_tiles = search_vap_tiles(aoi)
    if not vap_tiles:
        raise RuntimeError("No RGB Vertical Aerial Photography tiles for the AOI")

    api_key = resolve_subscription_key(subscription_key)
    # The attribution carries the most recent survey year among the tiles.
    newest_year = max(tile.year for tile in vap_tiles)
    attribution = ATTRIBUTION_TEMPLATE.format(year=newest_year)

    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as scratch:
        work_dir = Path(scratch)

        rasters = download_tiles(
            vap_tiles, work_dir / "ecw", api_key, max_workers, timeout, retries
        )
        if not rasters:
            raise RuntimeError("No ECW rasters were extracted from the downloads")

        xyz_dir = _build_tiles_with_gdal(
            work_dir, gdal_image, min_zoom, max_zoom, jobs, webp_quality
        )

        mbtiles_path = work_dir / "satellite_highres.mbtiles"
        tile_count = _pack_xyz_to_mbtiles(
            xyz_dir, mbtiles_path, _aoi_bounds(aoi), min_zoom, max_zoom, attribution
        )
        if tile_count == 0:
            raise RuntimeError("Tiling produced no tiles to pack")
        print(f"Packed {tile_count:,} tiles into MBTiles", flush=True)

        convert_cmd = [
            str(pmtiles_bin),
            "convert",
            str(mbtiles_path),
            str(output_path),
            "--force",
        ]
        subprocess.run(convert_cmd, check=True)

        size_mb = output_path.stat().st_size / (1024 * 1024)
        print(f"Wrote {output_path} ({size_mb:.1f} MB) -- {attribution}", flush=True)
def _aoi_bounds(aoi: dict) -> tuple[float, float, float, float]:
    """Return ``(min_lon, min_lat, max_lon, max_lat)`` for a GeoJSON geometry.

    The ``coordinates`` member is flattened to its individual positions
    whatever its nesting depth, so Polygon and MultiPolygon (and shallower
    geometries) all work. Only the first two values of each position
    (longitude, latitude) are used.

    Raises:
        KeyError: ``aoi`` has no ``coordinates`` member.
        ValueError: the geometry contains no positions.
    """
    positions = []
    pending = [aoi["coordinates"]]
    while pending:
        node = pending.pop()
        if not node:
            continue
        # A position is a sequence of numbers; anything else is a container
        # of rings / polygons that still needs unpacking.
        if isinstance(node[0], (int, float)):
            positions.append(node)
        else:
            pending.extend(node)
    if not positions:
        raise ValueError("AOI geometry has no coordinates")

    lons = [position[0] for position in positions]
    lats = [position[1] for position in positions]
    return min(lons), min(lats), max(lons), max(lats)
def _load_aoi(path: Path | None) -> dict:
if path is None:
return DEFAULT_AOI
data = json.loads(path.read_text())
if data.get("type") == "FeatureCollection":
return data["features"][0]["geometry"]
if data.get("type") == "Feature":
return data["geometry"]
return data
def main() -> None:
    """Parse the command line and build the high-resolution aerial tiles.

    Worker, retry and job counts are clamped to sane minimums. The WebP
    quality is validated up front because an out-of-range value would
    otherwise only fail inside the tiling step, after every raster has
    already been downloaded.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--output", type=Path, required=True)
    parser.add_argument("--pmtiles-bin", type=Path, default=Path("property-data/pmtiles"))
    parser.add_argument("--pmtiles-version", default="1.22.3")
    parser.add_argument(
        "--aoi-geojson",
        type=Path,
        default=None,
        help="GeoJSON Polygon/Feature/FeatureCollection for the area of interest "
        "(default: Greater London)",
    )
    parser.add_argument("--min-zoom", type=int, default=DEFAULT_MIN_ZOOM)
    parser.add_argument("--max-zoom", type=int, default=DEFAULT_MAX_ZOOM)
    parser.add_argument(
        "--gdal-image",
        default=DEFAULT_GDAL_IMAGE,
        help="Docker image with a GDAL that has the ECW driver",
    )
    parser.add_argument(
        "--subscription-key",
        default=None,
        help="Override the Defra survey API key (default: scrape, then 'dspui')",
    )
    parser.add_argument("--max-workers", type=int, default=4)
    parser.add_argument("--timeout", type=float, default=600.0)
    parser.add_argument("--retries", type=int, default=3)
    parser.add_argument(
        "--jobs",
        type=int,
        default=8,
        help="Parallel processes for gdal2tiles",
    )
    parser.add_argument(
        "--webp-quality",
        type=int,
        default=85,
        help="WebP tile quality (1-100); lower is smaller",
    )
    args = parser.parse_args()

    # Fail fast: exits with a usage error instead of hours into the run.
    if not 1 <= args.webp_quality <= 100:
        parser.error("--webp-quality must be between 1 and 100")

    build_satellite_highres_tiles(
        output_path=args.output,
        pmtiles_bin=args.pmtiles_bin,
        pmtiles_version=args.pmtiles_version,
        aoi=_load_aoi(args.aoi_geojson),
        min_zoom=args.min_zoom,
        max_zoom=args.max_zoom,
        gdal_image=args.gdal_image,
        subscription_key=args.subscription_key,
        max_workers=max(1, args.max_workers),
        timeout=args.timeout,
        retries=max(0, args.retries),
        jobs=max(1, args.jobs),
        webp_quality=args.webp_quality,
    )


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,97 @@
from pipeline.download import satellite_highres
from pipeline.download.satellite_highres import (
VapTile,
parse_search_results,
select_best_rgb_tiles,
)
def _result(product: str, year: str, resolution: str, tile: str) -> dict:
    """Build one search-API record in the shape the real response uses."""

    def _facet(identifier: str, label: str) -> dict:
        return {"id": identifier, "label": label}

    base_uri = "https://environment.data.gov.uk/tiles/collections/survey/"
    record = {
        "product": _facet(product, product),
        "year": _facet(year, year),
        "resolution": _facet(resolution, f"{resolution}m"),
        "tile": _facet(tile, tile),
    }
    record["label"] = f"{product}-{year}-{resolution}m-{tile}"
    record["uri"] = base_uri + f"{product}/{year}/{resolution}/{tile}"
    return record
# Mirrors a real Greater-London response. RGB captures: TQ2575 at 0.4m (2008),
# TQ3080 at 0.1m (2008) and at 0.25m (2011). The Night Time, IRRGB and LIDAR
# records are non-RGB products that the selection tests expect to be ignored.
SAMPLE_PAYLOAD = {
    "count": 6,
    "results": [
        _result("vertical_aerial_photography_tiles_rgb", "2008", "0.4", "TQ2575"),
        _result("vertical_aerial_photography_tiles_night_time", "2012", "0.2", "TQ2575"),
        _result("lidar_composite_dtm", "2022", "1", "TQ2575"),
        # TQ3080 has two RGB captures: a finer-but-older and a coarser-but-newer.
        _result("vertical_aerial_photography_tiles_rgb", "2008", "0.1", "TQ3080"),
        _result("vertical_aerial_photography_tiles_rgb", "2011", "0.25", "TQ3080"),
        _result("vertical_aerial_photography_tiles_irrgb", "2012", "0.5", "TQ3080"),
    ],
}
def test_parse_search_results_skips_malformed_records() -> None:
    """A record missing required fields is dropped; the valid one is parsed."""
    well_formed = _result(
        "vertical_aerial_photography_tiles_rgb", "2008", "0.4", "TQ2575"
    )
    malformed = {"product": {"id": "broken"}}  # missing year/resolution/tile/uri
    expected = VapTile(
        product_id="vertical_aerial_photography_tiles_rgb",
        year=2008,
        resolution_m=0.4,
        os_tile_id="TQ2575",
        uri="https://environment.data.gov.uk/tiles/collections/survey/"
        "vertical_aerial_photography_tiles_rgb/2008/0.4/TQ2575",
        label="vertical_aerial_photography_tiles_rgb-2008-0.4m-TQ2575",
    )

    tiles = parse_search_results({"results": [well_formed, malformed]})

    assert len(tiles) == 1
    assert tiles[0] == expected
def test_select_best_rgb_filters_non_rgb_products() -> None:
    """Every selected tile belongs to the RGB aerial-photography product."""
    chosen = select_best_rgb_tiles(parse_search_results(SAMPLE_PAYLOAD))
    product_ids = {tile.product_id for tile in chosen}
    assert product_ids == {satellite_highres.VAP_RGB_PRODUCT}
def test_select_best_rgb_one_tile_per_os_square() -> None:
    """Selection yields exactly one tile for each OS grid square."""
    parsed = parse_search_results(SAMPLE_PAYLOAD)
    os_squares = sorted(tile.os_tile_id for tile in select_best_rgb_tiles(parsed))
    assert os_squares == ["TQ2575", "TQ3080"]
def test_select_best_rgb_prefers_finest_resolution_then_latest_year() -> None:
    """The finest resolution wins a square even when its survey is older."""
    by_square = {}
    for tile in select_best_rgb_tiles(parse_search_results(SAMPLE_PAYLOAD)):
        by_square[tile.os_tile_id] = tile

    # TQ2575: only one RGB capture.
    assert by_square["TQ2575"].resolution_m == 0.4

    # TQ3080: finest resolution (0.1m) wins even though it is the older survey.
    tq3080 = by_square["TQ3080"]
    assert tq3080.resolution_m == 0.1
    assert tq3080.year == 2008
def test_select_best_rgb_breaks_resolution_ties_by_year() -> None:
    """Among equal-resolution captures of one square, the newest year wins."""
    captures = ((2009, "a"), (2018, "b"), (2015, "c"))
    tiles = [
        VapTile(satellite_highres.VAP_RGB_PRODUCT, year, 0.25, "TQ0101", "u", label)
        for year, label in captures
    ]

    selected = select_best_rgb_tiles(tiles)

    assert len(selected) == 1
    assert selected[0].year == 2018
def test_select_best_rgb_empty_when_no_rgb() -> None:
    """A payload with no RGB records selects nothing."""
    lidar_only = {"results": [_result("lidar_composite_dtm", "2022", "1", "TQ2575")]}
    parsed = parse_search_results(lidar_only)
    assert select_best_rgb_tiles(parsed) == []

View file

@ -1,12 +1,25 @@
from __future__ import annotations from __future__ import annotations
import zipfile import zipfile
import json
import polars as pl import polars as pl
from pipeline.validate_outputs import main from pipeline.validate_outputs import main
def write_boundary(path, postcodes):
    """Write ``path/units/AA1.geojson`` with one geometry-less feature per postcode.

    Creates ``path/units`` (and any missing parents); fails if it already exists.
    """
    units_dir = path / "units"
    units_dir.mkdir(parents=True)

    collection = {"type": "FeatureCollection", "features": []}
    for postcode in postcodes:
        collection["features"].append(
            {"type": "Feature", "properties": {"postcodes": postcode}, "geometry": None}
        )

    target = units_dir / "AA1.geojson"
    target.write_text(json.dumps(collection))
def test_validates_parquet_file_and_zip(tmp_path, monkeypatch): def test_validates_parquet_file_and_zip(tmp_path, monkeypatch):
parquet_path = tmp_path / "data.parquet" parquet_path = tmp_path / "data.parquet"
file_path = tmp_path / "plain.txt" file_path = tmp_path / "plain.txt"
@ -59,3 +72,42 @@ def test_rejects_missing_and_empty_outputs(tmp_path, monkeypatch, capsys):
assert "empty file" in stderr assert "empty file" in stderr
assert "missing" in stderr assert "missing" in stderr
assert "no files matched" in stderr assert "no files matched" in stderr
def test_validates_postcode_boundary_matches(tmp_path, monkeypatch):
    """Validation passes when the parquet and boundary postcodes are the same."""
    postcodes_path = tmp_path / "postcodes.parquet"
    boundaries_path = tmp_path / "postcode_boundaries"
    pl.DataFrame({"postcode": ["AA1 1AA", "AA1 1AB"]}).write_parquet(postcodes_path)
    write_boundary(boundaries_path, ["AA1 1AA", "AA1 1AB"])

    argv = [
        "validate_outputs",
        "--postcode-boundary-match",
        f"{postcodes_path}::{boundaries_path}",
    ]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 0
def test_rejects_postcode_boundary_mismatch(tmp_path, monkeypatch, capsys):
    """Validation fails and reports both directions of a postcode mismatch."""
    postcodes_path = tmp_path / "postcodes.parquet"
    boundaries_path = tmp_path / "postcode_boundaries"
    pl.DataFrame({"postcode": ["AA1 1AA", "AA1 1AB"]}).write_parquet(postcodes_path)
    # "AA1 1AB" has no boundary and "AA1 1AC" has no postcode row.
    write_boundary(boundaries_path, ["AA1 1AA", "AA1 1AC"])

    argv = [
        "validate_outputs",
        "--postcode-boundary-match",
        f"{postcodes_path}::{boundaries_path}",
    ]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 1

    stderr = capsys.readouterr().err
    for expected_fragment in ("missing boundaries", "boundary postcodes are absent"):
        assert expected_fragment in stderr

View file

@ -0,0 +1,358 @@
"""Aggregate police.uk street crime to postcodes by 50m spatial proximity.
Instead of attributing each incident to its published LSOA code, this transform
counts the anonymised incident *points* that fall within 50m of each postcode's
boundary polygon (the polygon buffered outward by 50m). A point inside several
overlapping buffers counts for each postcode -- the same multiplicity the
tree-density filter uses for features near more than one postcode.
The metric is a raw annualised count ("incidents/year within 50m"); there is no
per-capita denominator. Outputs mirror the old LSOA transform's shape but are
keyed on ``postcode`` instead of ``LSOA code``:
* ``crime_by_postcode.parquet`` -- ``postcode`` + ``"{type} (avg/yr)"`` columns.
* ``crime_by_postcode_by_year.parquet`` -- ``postcode`` + ``"{type} (by year)"``
nested ``list[struct{year, count}]`` columns, with Serious/Minor rollups.
Caveat: police.uk coordinates are snapped to a fixed set of anonymous "map
points", not true locations, and a share of rows have no coordinate at all
(dropped here). Spatial totals are therefore lower than, and fuzzier than, the
old LSOA-tagged counts -- by design, not a regression.
"""
from __future__ import annotations
import argparse
import re
from pathlib import Path
import numpy as np
import polars as pl
import shapely
from pyproj import Transformer
from pipeline.transform.crime import (
MINOR_CRIME_TYPES,
SERIOUS_CRIME_TYPES,
find_street_crime_csvs,
)
from pipeline.transform.postcode_boundaries.loader import load_postcode_polygons
# Serious types first so column order is stable and self-documenting.
ALL_CRIME_TYPES: tuple[str, ...] = SERIOUS_CRIME_TYPES + MINOR_CRIME_TYPES
# Default outward buffer (metres) applied to each postcode polygon; also the
# default of the --buffer-m command-line option.
DEFAULT_BUFFER_M = 50.0
# Matches a directory name of the form "YYYY-MM" (four digits, hyphen, two
# digits). The digits are not range-checked.
MONTH_DIR_RE = re.compile(r"^\d{4}-\d{2}$")
# Generous GB bounds; points outside fall in no English postcode anyway, but
# filtering first keeps the WGS84->BNG transform out of its undefined region.
LON_BOUNDS = (-9.5, 2.5)
LAT_BOUNDS = (49.0, 61.5)
# Read CSVs in chunks of files to bound peak memory while keeping the STRtree
# query vectorised over a useful number of points.
_CSV_BATCH = 64
def _month_calendar(csvs: list[Path]) -> tuple[list[int], dict[int, int], int]:
    """Work out the annualisation denominators from the CSVs' parent directories.

    Every CSV is expected under a ``YYYY-MM`` directory; the distinct directory
    names are taken as the set of observed months. Directories whose name does
    not match that pattern are ignored.

    Returns ``(years, months_in_year, month_count)``: the sorted distinct
    years, the number of observed months per year, and the total number of
    observed months (the avg/yr denominator).

    Raises:
        ValueError: no CSV sits in a ``YYYY-MM`` directory.
    """
    observed: set[str] = set()
    for csv_path in csvs:
        label = csv_path.parent.name
        if MONTH_DIR_RE.fullmatch(label):
            observed.add(label)
    if not observed:
        raise ValueError("No valid YYYY-MM month directories found among crime CSVs")

    months_in_year: dict[int, int] = {}
    for label in sorted(observed):
        year = int(label[:4])
        months_in_year.setdefault(year, 0)
        months_in_year[year] += 1

    return sorted(months_in_year), months_in_year, len(observed)
def _build_tree(
    polygons: np.ndarray, buffer_m: float
) -> tuple[np.ndarray, shapely.STRtree]:
    """Grow each postcode polygon by ``buffer_m`` and build a spatial index.

    The returned buffer array is position-aligned with ``polygons``, so a tree
    hit at index *i* refers to postcode *i*. A buffer that comes back missing
    or invalid is swapped for an empty polygon: the alignment is kept and that
    postcode simply never matches a point.
    """
    buffered = shapely.buffer(polygons, buffer_m, quad_segs=8)

    unusable = shapely.is_missing(buffered) | ~shapely.is_valid(buffered)
    unusable_count = int(unusable.sum())
    if unusable_count:
        print(f"  {unusable_count:,} postcode buffers unusable; left empty")
        buffered[unusable] = shapely.from_wkt("POLYGON EMPTY")

    index = shapely.STRtree(buffered)
    return buffered, index
def _accumulate_counts(
    csvs: list[Path],
    tree: shapely.STRtree,
    type_to_idx: dict[str, int],
    year_to_idx: dict[int, int],
    transformer: Transformer,
    counts: np.ndarray,
) -> None:
    """Stream the crime CSVs, counting points-in-buffer per (postcode, type, year).

    ``counts`` is updated in place and is indexed as
    ``[postcode index, type index, year index]``; the type and year indices
    come from ``type_to_idx`` / ``year_to_idx``. Rows with a missing or
    out-of-bounds coordinate, an unknown crime type, or a year that is not a
    key of ``year_to_idx`` are filtered out before the spatial query. A point
    that intersects several buffers is counted once for each of them.

    Progress is printed after every batch that contained usable points.
    """
    schema = {
        "Longitude": pl.Float64,
        "Latitude": pl.Float64,
        "Month": pl.Utf8,
        "Crime type": pl.Utf8,
    }
    known_types = list(type_to_idx)
    total_points = 0
    total_matches = 0
    total_dropped = 0
    # Process the files _CSV_BATCH at a time so only one batch is in memory.
    for start in range(0, len(csvs), _CSV_BATCH):
        batch = csvs[start : start + _CSV_BATCH]
        frame = (
            pl.scan_csv(
                batch,
                schema_overrides=schema,
                ignore_errors=True,
            )
            .select("Longitude", "Latitude", "Month", "Crime type")
            # The year is the first four characters of the "Month" column.
            .with_columns(pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"))
            .filter(
                pl.col("Longitude").is_not_null()
                & pl.col("Latitude").is_not_null()
                & pl.col("Longitude").is_between(*LON_BOUNDS)
                & pl.col("Latitude").is_between(*LAT_BOUNDS)
                & pl.col("Crime type").is_in(known_types)
                & pl.col("year").is_in(list(year_to_idx))
            )
            # Map type name and year to their axis positions in ``counts``.
            # The filter above guarantees every value has a mapping, which
            # replace_strict requires.
            .with_columns(
                pl.col("Crime type")
                .replace_strict(type_to_idx, return_dtype=pl.Int32)
                .alias("tidx"),
                pl.col("year")
                .replace_strict(year_to_idx, return_dtype=pl.Int32)
                .alias("yidx"),
            )
            .select("Longitude", "Latitude", "tidx", "yidx")
            .collect(engine="streaming")
        )
        rows_in = frame.height
        if rows_in == 0:
            continue
        lon = frame["Longitude"].to_numpy()
        lat = frame["Latitude"].to_numpy()
        tidx = frame["tidx"].to_numpy()
        yidx = frame["yidx"].to_numpy()
        # Project lon/lat into the coordinate system of the buffered polygons.
        x, y = transformer.transform(lon, lat)
        # Points the transform could not place come back non-finite; drop them.
        finite = np.isfinite(x) & np.isfinite(y)
        total_dropped += int((~finite).sum())
        if not finite.any():
            continue
        x, y, tidx, yidx = x[finite], y[finite], tidx[finite], yidx[finite]
        total_points += x.size
        points = shapely.points(x, y)
        # One (point, buffer) pair per intersection: a point inside several
        # buffers appears once for each.
        point_index, postcode_index = tree.query(points, predicate="intersects")
        if point_index.size:
            # np.add.at accumulates correctly even when the same
            # (postcode, type, year) cell occurs more than once in the pairs.
            np.add.at(
                counts,
                (postcode_index, tidx[point_index], yidx[point_index]),
                1,
            )
            total_matches += point_index.size
        print(
            f"  files {start + len(batch):,}/{len(csvs):,}: "
            f"{total_points:,} located points, {total_matches:,} postcode matches"
        )
    if total_dropped:
        print(f"Dropped {total_dropped:,} points outside the BNG transform domain")
def _rollup_long(
    long: pl.DataFrame, types: tuple[str, ...], rollup_name: str
) -> pl.DataFrame:
    """Collapse the rows of ``types`` into one rollup series per postcode and year.

    The per-year counts of the selected crime types are summed (rounded to one
    decimal) and labelled ``rollup_name`` in the ``Crime type`` column. The
    result has the same four columns, in the same order, as ``long``.
    """
    selected = long.filter(pl.col("Crime type").is_in(list(types)))
    summed = selected.group_by("postcode", "year").agg(
        pl.col("count").sum().round(1).alias("count")
    )
    labelled = summed.with_columns(pl.lit(rollup_name).alias("Crime type"))
    return labelled.select("postcode", "Crime type", "year", "count")
def _write_avg_yr(
    postcodes: np.ndarray,
    counts: np.ndarray,
    valid_month_count: int,
    output_path: Path,
) -> None:
    """Write the per-postcode annualised totals, one column per crime type.

    ``counts`` is summed over its year axis, scaled from the observed number
    of months to a 12-month rate, rounded to one decimal and stored as
    float32. The output has ``postcode`` plus a ``"{type} (avg/yr)"`` column
    for each entry of ``ALL_CRIME_TYPES``.
    """
    per_type_totals = counts.sum(axis=2)  # (n_postcodes, n_types)
    annualised = np.round(per_type_totals / valid_month_count * 12.0, 1).astype(
        np.float32
    )

    columns: dict[str, np.ndarray] = {"postcode": postcodes}
    columns.update(
        {
            f"{name} (avg/yr)": annualised[:, type_idx]
            for type_idx, name in enumerate(ALL_CRIME_TYPES)
        }
    )

    output_path.parent.mkdir(parents=True, exist_ok=True)
    pl.DataFrame(columns).write_parquet(output_path, compression="zstd")
    print(f"Wrote postcode crime averages: {output_path}")
def _write_by_year(
    postcodes: np.ndarray,
    counts: np.ndarray,
    years: list[int],
    months_in_year: dict[int, int],
    output_path: Path,
) -> None:
    """Write nested ``"{type} (by year)"`` series plus Serious/Minor rollups.

    Each non-zero cell of ``counts`` becomes one ``(postcode, type, year)`` row
    whose count is scaled to a 12-month rate using the number of months
    observed in that year. "Serious crime" and "Minor crime" rollups are added,
    and the rows are pivoted so every crime type becomes a column holding a
    year-sorted list of ``{year, count}`` structs.

    Raises:
        ValueError: ``counts`` is all zero (no point matched any buffer).
    """
    pc_i, ty_i, yr_i = np.nonzero(counts)
    if pc_i.size == 0:
        raise ValueError("No crime points matched any postcode buffer")

    # Annualise only the non-zero cells. Converting the whole counts cube to
    # float64 would allocate a second, larger copy of it just to read back the
    # same cells; the per-cell arithmetic here is unchanged.
    months = np.array([months_in_year[year] for year in years], dtype=np.float64)
    observed = counts[pc_i, ty_i, yr_i].astype(np.float64)
    annual = np.round(observed * 12.0 / months[yr_i], 1).astype(np.float32)

    type_names = np.array(ALL_CRIME_TYPES, dtype=object)
    year_values = np.array(years, dtype=np.int32)
    long = pl.DataFrame(
        {
            "postcode": postcodes[pc_i],
            "Crime type": type_names[ty_i],
            "year": year_values[yr_i],
            "count": annual,
        }
    )
    serious = _rollup_long(long, SERIOUS_CRIME_TYPES, "Serious crime")
    minor = _rollup_long(long, MINOR_CRIME_TYPES, "Minor crime")
    combined = pl.concat([long, serious, minor])

    # Sort before grouping so each series is listed in year order.
    by_type = (
        combined.sort("year")
        .group_by("postcode", "Crime type")
        .agg(pl.struct("year", "count").alias("series"))
    )
    wide = by_type.pivot(on="Crime type", index="postcode", values="series")
    type_cols = [c for c in wide.columns if c != "postcode"]
    wide = wide.rename({col: f"{col} (by year)" for col in type_cols})

    output_path.parent.mkdir(parents=True, exist_ok=True)
    wide.write_parquet(output_path, compression="zstd")
    print(f"Wrote postcode crime by-year series: {output_path} {wide.shape}")
def transform_crime_spatial(
    crime_dir: Path,
    boundaries_dir: Path,
    output_path: Path,
    by_year_output_path: Path,
    buffer_m: float = DEFAULT_BUFFER_M,
    max_postcodes: int | None = None,
    max_files: int | None = None,
) -> None:
    """Count crime points near each postcode and write both output parquets.

    Finds the street-crime CSVs under ``crime_dir``, buffers the postcode
    polygons loaded from ``boundaries_dir`` by ``buffer_m``, counts the points
    falling in each buffer per crime type and year, then writes the avg/yr
    table to ``output_path`` and the by-year series to ``by_year_output_path``.
    ``max_postcodes`` / ``max_files`` truncate the inputs for testing.

    Raises:
        FileNotFoundError: no street-crime CSV exists under ``crime_dir``.
    """
    csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
    if not csvs:
        raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")
    if max_files is not None:
        csvs = csvs[:max_files]

    years, months_in_year, valid_month_count = _month_calendar(csvs)
    ignored_note = (
        f" (ignored {ignored_csv_count} non-street CSVs)" if ignored_csv_count else ""
    )
    print(
        f"Found {len(csvs):,} street crime CSVs across {valid_month_count} months "
        f"({years[0]}-{years[-1]})"
        + ignored_note
    )

    postcodes, polygons = load_postcode_polygons(boundaries_dir, max_postcodes)
    print(f"Buffering {len(postcodes):,} postcode polygons by {buffer_m:g}m...")
    # The buffer array itself is not used again here; only the index is queried.
    _unused_buffers, tree = _build_tree(polygons, buffer_m)

    type_to_idx = {name: position for position, name in enumerate(ALL_CRIME_TYPES)}
    year_to_idx = {year: position for position, year in enumerate(years)}
    counts_shape = (len(postcodes), len(ALL_CRIME_TYPES), len(years))
    counts = np.zeros(counts_shape, dtype=np.int32)

    transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
    _accumulate_counts(csvs, tree, type_to_idx, year_to_idx, transformer, counts)

    _write_avg_yr(postcodes, counts, valid_month_count, output_path)
    _write_by_year(postcodes, counts, years, months_in_year, by_year_output_path)
def main() -> None:
    """Command-line entry point for the spatial crime transform.

    Exits with an error message when ``--buffer-m`` is zero or negative.
    """
    parser = argparse.ArgumentParser(
        description="Count police.uk crime points within 50m of each postcode boundary"
    )
    add_option = parser.add_argument

    # Inputs.
    add_option(
        "--input",
        type=Path,
        default=Path("property-data/crime"),
        help="Directory containing police.uk street crime CSVs",
    )
    add_option(
        "--boundaries",
        type=Path,
        default=Path("property-data/postcode_boundaries/units"),
        help="Directory of per-district postcode boundary GeoJSONs",
    )

    # Outputs.
    add_option(
        "--output",
        type=Path,
        required=True,
        help="Output parquet: postcode + '{type} (avg/yr)' columns",
    )
    add_option(
        "--output-by-year",
        type=Path,
        required=True,
        help="Output parquet: postcode + nested '{type} (by year)' columns",
    )

    # Tuning and test-only limits.
    add_option(
        "--buffer-m",
        type=float,
        default=DEFAULT_BUFFER_M,
        help="Outward buffer (metres) added to each postcode boundary",
    )
    add_option(
        "--max-postcodes",
        type=int,
        default=None,
        help="Testing only: process the first N postcodes",
    )
    add_option(
        "--max-files",
        type=int,
        default=None,
        help="Testing only: process the first N monthly CSV files",
    )

    args = parser.parse_args()
    if args.buffer_m <= 0:
        raise SystemExit("--buffer-m must be greater than zero")

    transform_crime_spatial(
        crime_dir=args.input,
        boundaries_dir=args.boundaries,
        output_path=args.output,
        by_year_output_path=args.output_by_year,
        buffer_m=args.buffer_m,
        max_postcodes=args.max_postcodes,
        max_files=args.max_files,
    )


if __name__ == "__main__":
    main()

View file

@ -26,6 +26,7 @@ MIN_PRICE = 50_000
EPC_SOURCE_COLUMNS = [ EPC_SOURCE_COLUMNS = [
"address", "address",
"postcode", "postcode",
"uprn",
"current_energy_rating", "current_energy_rating",
"potential_energy_rating", "potential_energy_rating",
"property_type", "property_type",
@ -57,6 +58,8 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
raw.select( raw.select(
_clean_string("address").alias("epc_address"), _clean_string("address").alias("epc_address"),
_clean_string("postcode").str.to_uppercase().alias("epc_postcode"), _clean_string("postcode").str.to_uppercase().alias("epc_postcode"),
# UPRN keys an exact listing->EPC join downstream (~99% populated).
_clean_string("uprn").alias("uprn"),
_clean_string("current_energy_rating") _clean_string("current_energy_rating")
.str.to_uppercase() .str.to_uppercase()
.alias("current_energy_rating"), .alias("current_energy_rating"),

View file

@ -48,7 +48,7 @@ _AREA_COLUMNS = [
"lon", "lon",
# Runtime provenance for deciding whether missing coordinates are skippable. # Runtime provenance for deciding whether missing coordinates are skippable.
"ctry25cd", "ctry25cd",
# Keyed lookup for postcode-level side tables (e.g. crime time series). # Join key for LSOA-level side tables (e.g. median age).
"lsoa21", "lsoa21",
# Deprivation # Deprivation
"Income Score", "Income Score",
@ -81,8 +81,6 @@ _AREA_COLUMNS = [
"Other crime (avg/yr)", "Other crime (avg/yr)",
"Serious crime (avg/yr)", "Serious crime (avg/yr)",
"Minor crime (avg/yr)", "Minor crime (avg/yr)",
"Serious crime per 1k residents (avg/yr)",
"Minor crime per 1k residents (avg/yr)",
# Amenities # Amenities
"Number of restaurants within 2km", "Number of restaurants within 2km",
"Number of grocery shops and supermarkets within 2km", "Number of grocery shops and supermarkets within 2km",
@ -742,16 +740,13 @@ _PROPERTY_TYPE_VALUES = [
"Other", "Other",
] ]
_EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"] _EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"]
_PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0 # Listings are matched to EPC certificates and Price-Paid properties first by
_PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0 # UPRN (exact) and otherwise by fuzzy street-address similarity within the same
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS = 82 # postcode. A house number in the listing address is the strong disambiguator,
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS = 96 # so a numbered listing may match on a lower street-similarity score than a
_PROPERTY_MATCH_MIN_MARGIN = 4.0 # number-less one (which must match the street almost exactly to be trusted).
_DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0 _LISTING_MATCH_MIN_SCORE_WITH_NUMBERS = 82
_DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0 _LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 90
_DIRECT_EPC_MATCH_MIN_MARGIN = 4.0
_DIRECT_EPC_NEARBY_RADIUS_M = 500.0
_DIRECT_EPC_NEAREST_POSTCODES = 40
_DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = ( _DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = (
("_direct_epc_address", pl.Utf8), ("_direct_epc_address", pl.Utf8),
("_direct_current_energy_rating", pl.Utf8), ("_direct_current_energy_rating", pl.Utf8),
@ -764,7 +759,7 @@ _DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = (
("_direct_was_council_house", pl.Utf8), ("_direct_was_council_house", pl.Utf8),
("_direct_epc_match_status", pl.Utf8), ("_direct_epc_match_status", pl.Utf8),
("_direct_epc_match_score", pl.Float32), ("_direct_epc_match_score", pl.Float32),
("_direct_epc_match_margin", pl.Float32), ("_direct_epc_match_method", pl.Utf8),
) )
_DIRECT_EPC_RAW_COLUMN_MAP = { _DIRECT_EPC_RAW_COLUMN_MAP = {
"epc_address": "_direct_epc_address", "epc_address": "_direct_epc_address",
@ -840,46 +835,6 @@ def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
) )
def _ratio_bonus(
left: float | int | None, right: float | int | None, pct: float, cap: float
) -> float:
if left is None or right is None:
return 0.0
try:
left_f = float(left)
right_f = float(right)
except (TypeError, ValueError):
return 0.0
if left_f <= 0 or right_f <= 0:
return 0.0
rel = abs(left_f - right_f) / max(left_f, right_f)
if rel > pct:
return 0.0
return cap * (1.0 - rel / pct)
def _rooms_bonus(left: int | None, right: int | None) -> float:
if left is None or right is None:
return 0.0
try:
diff = abs(int(left) - int(right))
except (TypeError, ValueError):
return 0.0
if diff == 0:
return 4.0
if diff == 1:
return 2.0
return 0.0
def _enum_bonus(
left: str | None, right: str | None, *, exact: float, mismatch: float
) -> float:
if not left or not right:
return 0.0
return exact if left == right else mismatch
def _address_score(query: str, candidate: str | None) -> int: def _address_score(query: str, candidate: str | None) -> int:
if not candidate: if not candidate:
return 0 return 0
@ -893,6 +848,85 @@ def _has_number(address: str | None) -> bool:
return bool(address and _NUMBER_RE.search(address)) return bool(address and _NUMBER_RE.search(address))
def _normalize_uprn(value: object) -> str | None:
"""Canonical UPRN string (digits only) or None.
UPRNs arrive as strings or ints from the scraper / EPC register; normalise
so a listing UPRN and an EPC/property UPRN compare equal regardless of dtype
or stray whitespace. A float (e.g. a NaN-bearing column read as Float) is
rejected unless it is an exact integer, so "123.0"/"1.5e11" can never be
silently mangled into a bogus all-digits key.
"""
if value is None:
return None
if isinstance(value, float):
if not value.is_integer():
return None
value = int(value)
digits = re.sub(r"\D", "", str(value))
return digits or None
def _best_listing_match(
    listing_uprn: str | None,
    query: str | None,
    uprn_index: dict[str, dict],
    bucket_candidates: list[dict],
    addressed_fields: list[str],
) -> tuple[dict, float, str, str | None] | None:
    """Choose the best candidate record for one listing.

    Two strategies are tried in order:

    1. Exact UPRN lookup in ``uprn_index``. This ignores the postcode bucket,
       so it still works when the listing's postcode is slightly off.
    2. Otherwise, the highest fuzzy address similarity among
       ``bucket_candidates``, comparing ``query`` with every non-empty column
       named in ``addressed_fields`` (a candidate may carry both a register
       and an EPC address).

    No property-attribute heuristics are involved. When the listing address
    contains a number, candidates whose numbers are incompatible
    (``_numbers_compatible``) are skipped and the lower "with numbers" score
    threshold applies; a number-less address has to clear the higher one.

    Returns ``(candidate, score, method, matched_field)`` or None. ``method``
    is ``"uprn"`` (score fixed at 100.0, ``matched_field`` None) or
    ``"address"`` (``matched_field`` is the winning address column).
    """
    uprn_hit = uprn_index.get(listing_uprn) if listing_uprn else None
    if uprn_hit is not None:
        return uprn_hit, 100.0, "uprn", None
    if not query:
        return None

    numbered_query = _has_number(query)
    winner: dict | None = None
    winner_score = 0
    winner_field: str | None = None
    for candidate in bucket_candidates:
        for field in addressed_fields:
            address = candidate.get(field)
            if not address or (
                numbered_query and not _numbers_compatible(query, address)
            ):
                continue
            score = _address_score(query, address)
            # Strictly greater: on a tie the earlier candidate/field is kept.
            if score > winner_score:
                winner, winner_score, winner_field = candidate, score, field

    if winner is None:
        return None

    if numbered_query:
        required_score = _LISTING_MATCH_MIN_SCORE_WITH_NUMBERS
    else:
        required_score = _LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS
    if winner_score < required_score:
        return None
    return winner, float(winner_score), "address", winner_field
def _load_listings_for_merge( def _load_listings_for_merge(
listings_path: Path, arcgis_path: Path listings_path: Path, arcgis_path: Path
) -> pl.DataFrame: ) -> pl.DataFrame:
@ -908,6 +942,20 @@ def _load_listings_for_merge(
raw = pl.scan_parquet(listings_path).with_row_index("_listing_idx") raw = pl.scan_parquet(listings_path).with_row_index("_listing_idx")
postcode_mapping = build_postcode_mapping(arcgis_path).lazy() postcode_mapping = build_postcode_mapping(arcgis_path).lazy()
# UPRN is only present on scraped listings that carry it (Zoopla detail
# pages); tolerate its absence so older parquets and test fixtures still
# load. Digits-only so it compares equal to the EPC register's UPRN.
if "UPRN" in raw.collect_schema().names():
uprn_digits = pl.col("UPRN").cast(pl.Utf8).str.replace_all(r"\D", "")
listing_uprn_expr = (
pl.when(uprn_digits.str.len_chars() > 0)
.then(uprn_digits)
.otherwise(None)
.alias("_listing_uprn")
)
else:
listing_uprn_expr = pl.lit(None, dtype=pl.Utf8).alias("_listing_uprn")
# Listings parquets occasionally carry Float NaNs (e.g. floor area). Polars # Listings parquets occasionally carry Float NaNs (e.g. floor area). Polars
# treats NaN as distinct from null and the downstream `latest_price / # treats NaN as distinct from null and the downstream `latest_price /
# total_floor_area` cast to Int32 explodes on a NaN, so we normalise floats # total_floor_area` cast to Int32 explodes on a NaN, so we normalise floats
@ -936,12 +984,14 @@ def _load_listings_for_merge(
"postcode" "postcode"
), ),
pl.col("Address per Property Register").alias("pp_address"), pl.col("Address per Property Register").alias("pp_address"),
listing_uprn_expr,
*overlay, *overlay,
) )
.select( .select(
"_listing_idx", "_listing_idx",
"postcode", "postcode",
"pp_address", "pp_address",
"_listing_uprn",
*[dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES], *[dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES],
) )
.collect(engine="streaming") .collect(engine="streaming")
@ -972,7 +1022,6 @@ def _empty_direct_epc_matches() -> pl.DataFrame:
def _load_direct_epc_candidates( def _load_direct_epc_candidates(
epc_path: Path, epc_path: Path,
arcgis_path: Path,
listing_outcodes: list[str], listing_outcodes: list[str],
temp_dir: Path, temp_dir: Path,
) -> pl.DataFrame: ) -> pl.DataFrame:
@ -982,8 +1031,7 @@ def _load_direct_epc_candidates(
"_direct_epc_match_postcode": pl.Utf8, "_direct_epc_match_postcode": pl.Utf8,
"_direct_epc_outcode": pl.Utf8, "_direct_epc_outcode": pl.Utf8,
"_direct_epc_canonical_property_type": pl.Utf8, "_direct_epc_canonical_property_type": pl.Utf8,
"_direct_epc_east": pl.Float64, "_direct_epc_uprn": pl.Utf8,
"_direct_epc_north": pl.Float64,
**{column: dtype for column, dtype in _DIRECT_EPC_COLUMNS if column.startswith("_direct_")}, **{column: dtype for column, dtype in _DIRECT_EPC_COLUMNS if column.startswith("_direct_")},
} }
if not listing_outcodes: if not listing_outcodes:
@ -1016,12 +1064,6 @@ def _load_direct_epc_candidates(
.with_columns(pl.lit("Yes").alias("_direct_was_council_house")) .with_columns(pl.lit("Yes").alias("_direct_was_council_house"))
) )
arcgis = pl.scan_parquet(arcgis_path).select(
normalize_postcode_key(pl.col("pcds")).alias("_direct_epc_match_postcode"),
pl.col("east1m").alias("_direct_epc_east"),
pl.col("north1m").alias("_direct_epc_north"),
)
return ( return (
epc_base.sort("inspection_date", descending=True) epc_base.sort("inspection_date", descending=True)
.group_by("_direct_epc_match_address", "_direct_epc_match_postcode") .group_by("_direct_epc_match_address", "_direct_epc_match_postcode")
@ -1031,7 +1073,6 @@ def _load_direct_epc_candidates(
on=["_direct_epc_match_address", "_direct_epc_match_postcode"], on=["_direct_epc_match_address", "_direct_epc_match_postcode"],
how="left", how="left",
) )
.join(arcgis, on="_direct_epc_match_postcode", how="left")
.with_columns( .with_columns(
_canonical_epc_property_type_expr().alias( _canonical_epc_property_type_expr().alias(
"_direct_epc_canonical_property_type" "_direct_epc_canonical_property_type"
@ -1046,6 +1087,7 @@ def _load_direct_epc_candidates(
.otherwise(None) .otherwise(None)
.alias("_direct_potential_energy_rating"), .alias("_direct_potential_energy_rating"),
pl.col("epc_address").alias("_direct_epc_address"), pl.col("epc_address").alias("_direct_epc_address"),
pl.col("uprn").alias("_direct_epc_uprn"),
pl.col("total_floor_area").alias("_direct_total_floor_area"), pl.col("total_floor_area").alias("_direct_total_floor_area"),
pl.col("number_habitable_rooms").alias( pl.col("number_habitable_rooms").alias(
"_direct_number_habitable_rooms" "_direct_number_habitable_rooms"
@ -1066,8 +1108,7 @@ def _load_direct_epc_candidates(
"_direct_epc_match_postcode", "_direct_epc_match_postcode",
"_direct_epc_outcode", "_direct_epc_outcode",
"_direct_epc_canonical_property_type", "_direct_epc_canonical_property_type",
"_direct_epc_east", "_direct_epc_uprn",
"_direct_epc_north",
"_direct_epc_address", "_direct_epc_address",
"_direct_current_energy_rating", "_direct_current_energy_rating",
"_direct_potential_energy_rating", "_direct_potential_energy_rating",
@ -1083,7 +1124,14 @@ def _load_direct_epc_candidates(
def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame: def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
match = listings.with_columns( """Add the normalised address/postcode/outcode keys used to match listings.
Listings are matched to EPC certificates and properties by UPRN and by
fuzzy street address within their (now accurate, detail-page-sourced)
postcode — never by coordinate proximity — so no projected easting/northing
is computed here. `_listing_uprn` flows through from the loaded listings.
"""
return listings.with_columns(
normalize_address_key(pl.col("pp_address")).alias("_listing_match_address"), normalize_address_key(pl.col("pp_address")).alias("_listing_match_address"),
normalize_postcode_key(pl.col("postcode")).alias("_listing_match_postcode"), normalize_postcode_key(pl.col("postcode")).alias("_listing_match_postcode"),
).with_columns( ).with_columns(
@ -1092,21 +1140,6 @@ def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
.alias("_listing_outcode") .alias("_listing_outcode")
) )
if match.is_empty():
return match.with_columns(
pl.Series("_listing_east", [], dtype=pl.Float64),
pl.Series("_listing_north", [], dtype=pl.Float64),
)
transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
east, north = transformer.transform(
match["_actual_lon"].to_numpy(), match["_actual_lat"].to_numpy()
)
return match.with_columns(
pl.Series("_listing_east", east, dtype=pl.Float64),
pl.Series("_listing_north", north, dtype=pl.Float64),
)
def _optional_lazy_col( def _optional_lazy_col(
schema: pl.Schema, column: str, dtype: pl.DataType schema: pl.Schema, column: str, dtype: pl.DataType
@ -1122,8 +1155,7 @@ def _listing_property_match_schema() -> dict[str, pl.DataType]:
"_matched_postcode": pl.Utf8, "_matched_postcode": pl.Utf8,
"_matched_pp_address": pl.Utf8, "_matched_pp_address": pl.Utf8,
"_property_match_score": pl.Float32, "_property_match_score": pl.Float32,
"_property_match_address_score": pl.Int32, "_property_match_method": pl.Utf8,
"_property_match_margin": pl.Float32,
"_property_match_field": pl.Utf8, "_property_match_field": pl.Utf8,
} }
@ -1139,11 +1171,8 @@ def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame:
pl.col("postcode").cast(pl.Utf8).alias("postcode"), pl.col("postcode").cast(pl.Utf8).alias("postcode"),
pl.col("pp_address").cast(pl.Utf8).alias("pp_address"), pl.col("pp_address").cast(pl.Utf8).alias("pp_address"),
_optional_lazy_col(schema, "epc_address", pl.Utf8), _optional_lazy_col(schema, "epc_address", pl.Utf8),
_optional_lazy_col(schema, "pp_property_type", pl.Utf8), # UPRN keys the exact match; present once epc_pp is rebuilt with it.
_optional_lazy_col(schema, "duration", pl.Utf8), _optional_lazy_col(schema, "uprn", pl.Utf8),
_optional_lazy_col(schema, "total_floor_area", pl.Float64),
_optional_lazy_col(schema, "number_habitable_rooms", pl.Int16),
_optional_lazy_col(schema, "latest_price", pl.Int64),
) )
.with_row_index("_property_row") .with_row_index("_property_row")
.with_columns( .with_columns(
@ -1167,110 +1196,52 @@ def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame:
) )
def _property_candidates_by_postcode( def _index_candidates(
candidates: pl.DataFrame, candidates: pl.DataFrame, postcode_key: str, uprn_key: str
) -> dict[str, list[dict]]: ) -> tuple[dict[str, list[dict]], dict[str, dict]]:
"""Index candidate rows for matching, in a single pass over the frame.
Returns ``(postcode_buckets, uprn_index)``. The postcode buckets drive the
fuzzy street-address match; the UPRN index drives the exact match and is
postcode-independent, so it still resolves when a listing's postcode is
slightly off.
"""
buckets: dict[str, list[dict]] = {} buckets: dict[str, list[dict]] = {}
uprn_index: dict[str, dict] = {}
for row in candidates.iter_rows(named=True): for row in candidates.iter_rows(named=True):
postcode = row.get("_property_match_postcode") postcode = row.get(postcode_key)
if postcode: if postcode:
buckets.setdefault(postcode, []).append(row) buckets.setdefault(postcode, []).append(row)
return buckets uprn = _normalize_uprn(row.get(uprn_key))
if uprn and uprn not in uprn_index:
uprn_index[uprn] = row
return buckets, uprn_index
def _best_listing_property_candidate( def _best_listing_property_candidate(
listing: dict, candidates: list[dict] listing: dict, uprn_index: dict[str, dict], candidates: list[dict]
) -> dict | None: ) -> dict | None:
query = listing.get("_listing_match_address") result = _best_listing_match(
if not query: listing.get("_listing_uprn"),
return None listing.get("_listing_match_address"),
uprn_index,
listing_has_numbers = _has_number(query) candidates,
scored: list[tuple[float, int, dict, str]] = [] ["_property_match_address", "_property_epc_match_address"],
for candidate in candidates:
register_address = candidate.get("_property_match_address")
epc_address = candidate.get("_property_epc_match_address")
register_numbers_compatible = bool(
register_address and _numbers_compatible(query, register_address)
)
epc_numbers_compatible = bool(
epc_address and _numbers_compatible(query, epc_address)
)
if not (register_numbers_compatible or epc_numbers_compatible):
continue
register_score = _address_score(query, register_address)
epc_score = _address_score(query, epc_address)
base_score = max(register_score, epc_score)
if base_score == 0:
continue
score = float(base_score)
score += _enum_bonus(
listing.get("_actual_property_type"),
candidate.get("pp_property_type"),
exact=7.0,
mismatch=-8.0,
)
score += _enum_bonus(
listing.get("_actual_leasehold_freehold"),
candidate.get("duration"),
exact=3.0,
mismatch=-3.0,
)
score += _ratio_bonus(
listing.get("_actual_total_floor_area"),
candidate.get("total_floor_area"),
pct=0.15,
cap=8.0,
)
score += _rooms_bonus(
listing.get("_actual_number_habitable_rooms"),
candidate.get("number_habitable_rooms"),
)
score += _ratio_bonus(
listing.get("_actual_asking_price"),
candidate.get("latest_price"),
pct=0.25,
cap=3.0,
)
matched_field = (
"pp_address" if register_score >= epc_score else "epc_address"
)
scored.append((score, base_score, candidate, matched_field))
if not scored:
return None
scored.sort(key=lambda item: item[0], reverse=True)
top = scored[0]
runner_up = scored[1][0] if len(scored) > 1 else None
margin = top[0] - runner_up if runner_up is not None else top[0]
score_threshold = (
_PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS
if listing_has_numbers
else _PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS
) )
address_threshold = ( if result is None:
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS
if listing_has_numbers
else _PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS
)
if (
top[0] < score_threshold
or top[1] < address_threshold
or margin < _PROPERTY_MATCH_MIN_MARGIN
):
return None return None
candidate, score, method, field = result
candidate = top[2] matched_field = {
"_property_match_address": "pp_address",
"_property_epc_match_address": "epc_address",
}.get(field, method)
return { return {
"_listing_idx": listing["_listing_idx"], "_listing_idx": listing["_listing_idx"],
"_matched_postcode": candidate.get("postcode"), "_matched_postcode": candidate.get("postcode"),
"_matched_pp_address": candidate.get("pp_address"), "_matched_pp_address": candidate.get("pp_address"),
"_property_match_score": round(top[0], 1), "_property_match_score": round(score, 1),
"_property_match_address_score": top[1], "_property_match_method": method,
"_property_match_margin": round(margin, 1), "_property_match_field": matched_field,
"_property_match_field": top[3],
} }
@ -1280,23 +1251,32 @@ def _match_listing_properties(
if listing_matches.is_empty() or property_candidates.is_empty(): if listing_matches.is_empty() or property_candidates.is_empty():
return _empty_listing_property_matches() return _empty_listing_property_matches()
buckets = _property_candidates_by_postcode(property_candidates) buckets, uprn_index = _index_candidates(
property_candidates, "_property_match_postcode", "uprn"
)
best_matches = [] best_matches = []
for listing in listing_matches.iter_rows(named=True): for listing in listing_matches.iter_rows(named=True):
postcode = listing.get("_listing_match_postcode") postcode = listing.get("_listing_match_postcode")
if not postcode: bucket = buckets.get(postcode, []) if postcode else []
continue match = _best_listing_property_candidate(listing, uprn_index, bucket)
match = _best_listing_property_candidate(listing, buckets.get(postcode, []))
if match is not None: if match is not None:
best_matches.append(match) best_matches.append(match)
if not best_matches: if not best_matches:
return _empty_listing_property_matches() return _empty_listing_property_matches()
# When two listings claim the same property, keep the most authoritative
# match: an exact UPRN match always wins over a fuzzy address match (both can
# score 100, so method must break the tie before score and listing index).
matches = pl.DataFrame(best_matches, schema=_listing_property_match_schema()) matches = pl.DataFrame(best_matches, schema=_listing_property_match_schema())
return ( return (
matches.sort( matches.sort(
["_property_match_score", "_listing_idx"], descending=[True, False] [
pl.col("_property_match_method") == "uprn",
"_property_match_score",
"_listing_idx",
],
descending=[True, True, False],
) )
.unique( .unique(
["_matched_postcode", "_matched_pp_address"], ["_matched_postcode", "_matched_pp_address"],
@ -1307,133 +1287,19 @@ def _match_listing_properties(
) )
def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, list[dict]]: def _best_direct_epc_candidate(
buckets: dict[str, list[dict]] = {} listing: dict, uprn_index: dict[str, dict], candidates: list[dict]
for row in candidates.iter_rows(named=True): ) -> dict | None:
postcode = row.get("_direct_epc_match_postcode") result = _best_listing_match(
if postcode: listing.get("_listing_uprn"),
buckets.setdefault(postcode, []).append(row) listing.get("_listing_match_address"),
return buckets uprn_index,
candidates,
["_direct_epc_match_address"],
def _epc_postcode_tree(
candidates: pl.DataFrame,
) -> tuple[cKDTree | None, list[str]]:
postcode_points = (
candidates.select(
"_direct_epc_match_postcode",
"_direct_epc_east",
"_direct_epc_north",
)
.drop_nulls()
.filter(
pl.col("_direct_epc_east").is_finite()
& pl.col("_direct_epc_north").is_finite()
)
.unique("_direct_epc_match_postcode")
) )
if postcode_points.is_empty(): if result is None:
return None, []
coords = np.column_stack(
[
postcode_points["_direct_epc_east"].to_numpy(),
postcode_points["_direct_epc_north"].to_numpy(),
]
)
return cKDTree(coords), postcode_points["_direct_epc_match_postcode"].to_list()
def _candidate_postcodes_for_listing(
listing: dict,
postcode_tree: cKDTree | None,
postcode_values: list[str],
) -> list[str]:
postcodes: list[str] = []
exact = listing.get("_listing_match_postcode")
if exact:
postcodes.append(exact)
if postcode_tree is None:
return postcodes
east = listing.get("_listing_east")
north = listing.get("_listing_north")
try:
east_f = float(east)
north_f = float(north)
except (TypeError, ValueError):
return postcodes
if not np.isfinite(east_f) or not np.isfinite(north_f):
return postcodes
k = min(_DIRECT_EPC_NEAREST_POSTCODES, len(postcode_values))
distances, indices = postcode_tree.query(
[east_f, north_f],
k=k,
distance_upper_bound=_DIRECT_EPC_NEARBY_RADIUS_M,
)
distances = np.atleast_1d(distances)
indices = np.atleast_1d(indices)
seen = set(postcodes)
for distance, idx in zip(distances, indices, strict=False):
if not np.isfinite(distance) or idx >= len(postcode_values):
continue
postcode = postcode_values[int(idx)]
if postcode not in seen:
postcodes.append(postcode)
seen.add(postcode)
return postcodes
def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None:
query = listing.get("_listing_match_address")
if not query:
return None return None
candidate, score, method, _field = result
listing_has_numbers = _has_number(query)
scored: list[tuple[float, int, dict]] = []
for candidate in candidates:
address = candidate.get("_direct_epc_match_address")
if listing_has_numbers and not _numbers_compatible(query, address or ""):
continue
base_score = _address_score(query, address)
if base_score == 0:
continue
score = float(base_score)
score += _enum_bonus(
listing.get("_actual_property_type"),
candidate.get("_direct_epc_canonical_property_type"),
exact=6.0,
mismatch=-6.0,
)
score += _ratio_bonus(
listing.get("_actual_total_floor_area"),
candidate.get("_direct_total_floor_area"),
pct=0.12,
cap=8.0,
)
score += _rooms_bonus(
listing.get("_actual_number_habitable_rooms"),
candidate.get("_direct_number_habitable_rooms"),
)
scored.append((score, base_score, candidate))
if not scored:
return None
scored.sort(key=lambda item: item[0], reverse=True)
top = scored[0]
runner_up = scored[1][0] if len(scored) > 1 else None
margin = top[0] - runner_up if runner_up is not None else top[0]
threshold = (
_DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS
if listing_has_numbers
else _DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS
)
if top[0] < threshold or margin < _DIRECT_EPC_MATCH_MIN_MARGIN:
return None
candidate = top[2]
return { return {
"_listing_idx": listing["_listing_idx"], "_listing_idx": listing["_listing_idx"],
"_direct_epc_address": candidate.get("_direct_epc_address"), "_direct_epc_address": candidate.get("_direct_epc_address"),
@ -1452,8 +1318,8 @@ def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict |
), ),
"_direct_was_council_house": candidate.get("_direct_was_council_house"), "_direct_was_council_house": candidate.get("_direct_was_council_house"),
"_direct_epc_match_status": "matched", "_direct_epc_match_status": "matched",
"_direct_epc_match_score": round(top[0], 1), "_direct_epc_match_score": round(score, 1),
"_direct_epc_match_margin": round(margin, 1), "_direct_epc_match_method": method,
} }
@ -1463,25 +1329,14 @@ def _match_direct_epc(
if listing_matches.is_empty() or epc_candidates.is_empty(): if listing_matches.is_empty() or epc_candidates.is_empty():
return _empty_direct_epc_matches() return _empty_direct_epc_matches()
buckets = _epc_candidates_by_postcode(epc_candidates) buckets, uprn_index = _index_candidates(
postcode_tree, postcode_values = _epc_postcode_tree(epc_candidates) epc_candidates, "_direct_epc_match_postcode", "_direct_epc_uprn"
)
matches = [] matches = []
for listing in listing_matches.iter_rows(named=True): for listing in listing_matches.iter_rows(named=True):
candidate_postcodes = _candidate_postcodes_for_listing( postcode = listing.get("_listing_match_postcode")
listing, postcode_tree, postcode_values bucket = buckets.get(postcode, []) if postcode else []
) match = _best_direct_epc_candidate(listing, uprn_index, bucket)
candidate_rows: list[dict] = []
seen_rows: set[int] = set()
for postcode in candidate_postcodes:
for candidate in buckets.get(postcode, []):
row = candidate.get("_direct_epc_row")
if row in seen_rows:
continue
candidate_rows.append(candidate)
if row is not None:
seen_rows.add(row)
match = _best_direct_epc_candidate(listing, candidate_rows)
if match is not None: if match is not None:
matches.append(match) matches.append(match)
@ -1493,7 +1348,6 @@ def _match_direct_epc(
def _enrich_listings_with_direct_epc( def _enrich_listings_with_direct_epc(
listings: pl.DataFrame, listings: pl.DataFrame,
epc_path: Path | None, epc_path: Path | None,
arcgis_path: Path,
) -> pl.DataFrame: ) -> pl.DataFrame:
if epc_path is None: if epc_path is None:
return _ensure_direct_epc_columns(listings) return _ensure_direct_epc_columns(listings)
@ -1513,7 +1367,7 @@ def _enrich_listings_with_direct_epc(
prefix="direct_listing_epc_", dir=local_tmp_dir() prefix="direct_listing_epc_", dir=local_tmp_dir()
) as tmpdir: ) as tmpdir:
epc_candidates = _load_direct_epc_candidates( epc_candidates = _load_direct_epc_candidates(
epc_path, arcgis_path, listing_outcodes, Path(tmpdir) epc_path, listing_outcodes, Path(tmpdir)
) )
print(f"Direct listing EPC candidates: {epc_candidates.height}") print(f"Direct listing EPC candidates: {epc_candidates.height}")
direct_matches = _match_direct_epc(listing_matches, epc_candidates) direct_matches = _match_direct_epc(listing_matches, epc_candidates)
@ -1604,7 +1458,7 @@ def _integrate_listings(
""" """
listings = _load_listings_for_merge(listings_path, arcgis_path) listings = _load_listings_for_merge(listings_path, arcgis_path)
print(f"Listings loaded: {listings.height}") print(f"Listings loaded: {listings.height}")
listings = _enrich_listings_with_direct_epc(listings, epc_path, arcgis_path) listings = _enrich_listings_with_direct_epc(listings, epc_path)
overlay_columns = [dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES] overlay_columns = [dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES]
listing_attachment_columns = [ listing_attachment_columns = [
@ -1660,6 +1514,14 @@ def _finalize_listings(df: pl.DataFrame) -> pl.DataFrame:
"""Project the post-rename wide frame down to enriched-listing rows.""" """Project the post-rename wide frame down to enriched-listing rows."""
df = df.filter(pl.col(_LISTING_FLAG_COLUMN).is_not_null()) df = df.filter(pl.col(_LISTING_FLAG_COLUMN).is_not_null())
# A matched listing's overlay attaches to every wide row sharing its
# (postcode, pp_address). The terminated-postcode remap can collapse several
# distinct wide rows onto one such key, which would otherwise emit one duplicate
# listing per collapsed row. Each listing matches exactly one (postcode,
# pp_address) and each seed row carries a unique URL, so keeping a single row per
# listing URL collapses only that fan-out and never merges distinct listings.
df = df.unique(subset=[_LISTING_FLAG_COLUMN], keep="first", maintain_order=True)
df = df.with_columns( df = df.with_columns(
pl.col("_actual_listing_url").alias("Listing URL"), pl.col("_actual_listing_url").alias("Listing URL"),
pl.col("_actual_listing_date").alias("Listing date"), pl.col("_actual_listing_date").alias("Listing date"),
@ -1750,7 +1612,6 @@ def _build(
broadband_path: Path, broadband_path: Path,
conservation_areas_path: Path, conservation_areas_path: Path,
rental_prices_path: Path, rental_prices_path: Path,
lsoa_population_path: Path,
median_age_path: Path, median_age_path: Path,
election_results_path: Path, election_results_path: Path,
tree_density_postcodes_path: Path | None = None, tree_density_postcodes_path: Path | None = None,
@ -1881,8 +1742,10 @@ def _build(
how="left", how="left",
) )
# Crime is counted spatially per postcode (incidents within 50m of the
# postcode boundary), so it joins on postcode rather than LSOA.
crime = pl.scan_parquet(crime_path) crime = pl.scan_parquet(crime_path)
wide = wide.join(crime, left_on="lsoa21", right_on="LSOA code", how="left") wide = wide.join(crime, on="postcode", how="left")
wide = wide.with_columns( wide = wide.with_columns(
pl.sum_horizontal( pl.sum_horizontal(
@ -1905,17 +1768,6 @@ def _build(
).alias("minor_crime_avg_yr"), ).alias("minor_crime_avg_yr"),
) )
lsoa_pop = pl.scan_parquet(lsoa_population_path)
wide = wide.join(lsoa_pop, on="lsoa21", how="left")
wide = wide.with_columns(
pl.when(pl.col("population") > 0)
.then((pl.col("serious_crime_avg_yr") / pl.col("population") * 1000).round(1))
.alias("serious_crime_per_1k"),
pl.when(pl.col("population") > 0)
.then((pl.col("minor_crime_avg_yr") / pl.col("population") * 1000).round(1))
.alias("minor_crime_per_1k"),
).drop("population")
median_age = pl.scan_parquet(median_age_path) median_age = pl.scan_parquet(median_age_path)
wide = wide.join(median_age, on="lsoa21", how="left") wide = wide.join(median_age, on="lsoa21", how="left")
@ -2082,8 +1934,6 @@ def _build(
"max_download_speed": "Max available download speed (Mbps)", "max_download_speed": "Max available download speed (Mbps)",
"serious_crime_avg_yr": "Serious crime (avg/yr)", "serious_crime_avg_yr": "Serious crime (avg/yr)",
"minor_crime_avg_yr": "Minor crime (avg/yr)", "minor_crime_avg_yr": "Minor crime (avg/yr)",
"serious_crime_per_1k": "Serious crime per 1k residents (avg/yr)",
"minor_crime_per_1k": "Minor crime per 1k residents (avg/yr)",
"mean_monthly_rent": "Estimated monthly rent", "mean_monthly_rent": "Estimated monthly rent",
"floor_height": "Interior height (m)", "floor_height": "Interior height (m)",
"was_council_house": "Former council house", "was_council_house": "Former council house",
@ -2189,12 +2039,6 @@ def main():
required=True, required=True,
help="ONS rental prices by LA and bedroom count parquet file", help="ONS rental prices by LA and bedroom count parquet file",
) )
parser.add_argument(
"--lsoa-population",
type=Path,
required=True,
help="Census 2021 population by LSOA parquet file",
)
parser.add_argument( parser.add_argument(
"--median-age", "--median-age",
type=Path, type=Path,
@ -2279,7 +2123,6 @@ def main():
broadband_path=args.broadband, broadband_path=args.broadband,
conservation_areas_path=args.conservation_areas, conservation_areas_path=args.conservation_areas,
rental_prices_path=args.rental_prices, rental_prices_path=args.rental_prices,
lsoa_population_path=args.lsoa_population,
median_age_path=args.median_age, median_age_path=args.median_age,
election_results_path=args.election_results, election_results_path=args.election_results,
tree_density_postcodes_path=args.tree_density_postcodes, tree_density_postcodes_path=args.tree_density_postcodes,

View file

@ -376,7 +376,7 @@ def main() -> None:
"--pmtiles-bin", type=Path, default=Path("property-data/pmtiles") "--pmtiles-bin", type=Path, default=Path("property-data/pmtiles")
) )
parser.add_argument("--pmtiles-version", default="1.22.3") parser.add_argument("--pmtiles-version", default="1.22.3")
parser.add_argument("--min-zoom", type=int, default=13) parser.add_argument("--min-zoom", type=int, default=12)
parser.add_argument("--max-zoom", type=int, default=14) parser.add_argument("--max-zoom", type=int, default=14)
parser.add_argument("--tile-size", type=int, default=256) parser.add_argument("--tile-size", type=int, default=256)
args = parser.parse_args() args = parser.parse_args()

View file

@ -22,6 +22,12 @@ def main() -> None:
description="Generate postcode boundary polygons from OA + INSPIRE + UPRN data" description="Generate postcode boundary polygons from OA + INSPIRE + UPRN data"
) )
parser.add_argument("--uprn", type=Path, required=True, help="UPRN lookup parquet") parser.add_argument("--uprn", type=Path, required=True, help="UPRN lookup parquet")
parser.add_argument(
"--arcgis",
type=Path,
default=None,
help="Optional ArcGIS postcode parquet used to remap terminated postcodes",
)
parser.add_argument( parser.add_argument(
"--oa-boundaries", type=Path, required=True, help="OA boundaries GeoPackage" "--oa-boundaries", type=Path, required=True, help="OA boundaries GeoPackage"
) )
@ -46,7 +52,7 @@ def main() -> None:
print("=" * 60) print("=" * 60)
oa_geoms = load_oa_boundaries(args.oa_boundaries) oa_geoms = load_oa_boundaries(args.oa_boundaries)
uprn_df, uprn_offsets = load_uprns(args.uprn) uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
# Phase 2: Parse/load INSPIRE # Phase 2: Parse/load INSPIRE
print() print()

View file

@ -0,0 +1,105 @@
"""Load per-district postcode boundary GeoJSONs as EPSG:27700 polygons.
The postcode-boundary pipeline (:mod:`output`) writes one WGS84 GeoJSON per
postcode district under ``units/{district}.geojson``, each feature carrying a
``postcodes`` (full unit string, e.g. "AL1 1AG") property. Spatial transforms
that test points against postcode geometry want those polygons back in British
National Grid (EPSG:27700) so buffers/distances are in metres.
:func:`load_postcode_polygons` reads the files, reprojects WGS84 -> EPSG:27700, repairs
invalid rings, and returns parallel ``(postcodes, polygons)`` arrays sorted by
postcode so callers can use the array index as a stable postcode id -- the same
"buffer index == postcode index" convention used by ``tree_density``.
"""
from __future__ import annotations
import json
from pathlib import Path
import numpy as np
import shapely
from pyproj import Transformer
def _read_district(
path: Path, transformer: Transformer
) -> tuple[np.ndarray, np.ndarray]:
"""Return (postcodes, polygons_27700) for one district GeoJSON."""
with path.open() as file:
collection = json.load(file)
features = collection.get("features", [])
if not features:
return np.empty(0, dtype=object), np.empty(0, dtype=object)
postcodes = np.array(
[feature["properties"]["postcodes"] for feature in features], dtype=object
)
geom_json = np.array(
[json.dumps(feature["geometry"]) for feature in features], dtype=object
)
geoms = shapely.from_geojson(geom_json)
# Reproject every vertex in a single pyproj call, then rebuild the polygons.
coords = shapely.get_coordinates(geoms)
if coords.size:
x, y = transformer.transform(coords[:, 0], coords[:, 1])
geoms = shapely.set_coordinates(geoms, np.column_stack([x, y]))
invalid = ~shapely.is_valid(geoms)
if invalid.any():
geoms[invalid] = shapely.make_valid(geoms[invalid])
return postcodes, geoms
def load_postcode_polygons(
    units_dir: Path, max_postcodes: int | None = None
) -> tuple[np.ndarray, np.ndarray]:
    """Read every district GeoJSON in ``units_dir`` as EPSG:27700 polygons.

    Args:
        units_dir: Directory scanned (non-recursively) for ``*.geojson`` files.
        max_postcodes: Optional cap. File reading stops as soon as at least
            this many features have been loaded, and the sorted, de-duplicated
            result is truncated to this length.

    Returns:
        ``(postcodes, polygons)`` as parallel object arrays sorted by postcode
        with one entry per distinct postcode, so the array index can serve as
        a postcode id.

    Raises:
        FileNotFoundError: If ``units_dir`` holds no ``*.geojson`` files.
        ValueError: If none of the files read contained any features.
    """
    root = Path(units_dir)
    district_files = sorted(root.glob("*.geojson"))
    if not district_files:
        raise FileNotFoundError(f"No postcode-boundary GeoJSONs found in {root}")

    capped = max_postcodes is not None
    to_bng = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)

    # Collect (postcodes, geometries) pairs per non-empty district file.
    chunks: list[tuple[np.ndarray, np.ndarray]] = []
    loaded = 0
    for district_file in district_files:
        codes, shapes = _read_district(district_file, to_bng)
        if len(codes) == 0:
            continue
        chunks.append((codes, shapes))
        loaded += len(codes)
        if capped and loaded >= max_postcodes:
            break

    if not chunks:
        raise ValueError(f"No postcode features found in {root}")

    all_codes = np.concatenate([codes for codes, _ in chunks])
    all_shapes = np.concatenate([shapes for _, shapes in chunks])

    # A stable sort fixes the postcode -> index mapping; np.unique then keeps
    # the first occurrence of each postcode as a defensive de-duplication.
    ranking = np.argsort(all_codes, kind="stable")
    all_codes, all_shapes = all_codes[ranking], all_shapes[ranking]
    _, keep = np.unique(all_codes, return_index=True)
    all_codes, all_shapes = all_codes[keep], all_shapes[keep]

    if capped and len(all_codes) > max_postcodes:
        all_codes = all_codes[:max_postcodes]
        all_shapes = all_shapes[:max_postcodes]

    print(f"Loaded {len(all_codes):,} postcode polygons from {root}")
    return all_codes, all_shapes

View file

@ -121,6 +121,50 @@ class TestWhitespacePostcodes:
loaded_df, _ = load_uprns(path) loaded_df, _ = load_uprns(path)
assert len(loaded_df) == 0 assert len(loaded_df) == 0
def test_non_english_oas_excluded(self, tmp_path):
    """A UPRN row in a ``W``-coded output area is dropped; the ``E`` row stays."""
    uprn_file = tmp_path / "uprn.parquet"
    pl.DataFrame(
        {
            "GRIDGB1E": [500010, 300010],
            "GRIDGB1N": [180010, 220010],
            "PCDS": ["AA1 1AA", "CF1 1AA"],
            "OA21CD": ["E00000001", "W00000001"],
        }
    ).write_parquet(uprn_file)

    kept, oa_offsets = load_uprns(uprn_file)

    # Only the English output area survives, both in the offsets lookup and
    # in the loaded rows.
    assert set(oa_offsets) == {"E00000001"}
    assert kept["PCDS"].to_list() == ["AA1 1AA"]
def test_terminated_postcodes_are_remapped(self, tmp_path):
    """With an ArcGIS lookup, a terminated postcode is replaced by a live one."""
    # Single UPRN whose postcode is lower-case and, per the lookup below,
    # terminated.
    uprn_path = tmp_path / "uprn.parquet"
    pl.DataFrame(
        {
            "GRIDGB1E": [500010],
            "GRIDGB1N": [180010],
            "PCDS": ["aa1 1aa"],
            "OA21CD": ["E00000001"],
        }
    ).write_parquet(uprn_path)

    # "AA1 1AA" carries a termination date; "AA1 1AB" does not.
    arcgis_path = tmp_path / "arcgis.parquet"
    pl.DataFrame(
        {
            "pcds": ["AA1 1AA", "AA1 1AB"],
            "east1m": [500010, 500030],
            "north1m": [180010, 180020],
            "doterm": ["2020-01-01", None],
            "ctry25cd": ["E92000001", "E92000001"],
        }
    ).write_parquet(arcgis_path)

    remapped, _ = load_uprns(uprn_path, arcgis_path)

    assert remapped["PCDS"].to_list() == ["AA1 1AB"]
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Bug 3: Voronoi deduplication is first-seen-wins # Bug 3: Voronoi deduplication is first-seen-wins

View file

@ -4,11 +4,18 @@ import numpy as np
import polars as pl import polars as pl
from pipeline.local_temp import local_tmp_dir from pipeline.local_temp import local_tmp_dir
from pipeline.utils.postcode_mapping import build_postcode_mapping
from .memory import release_memory from .memory import release_memory
def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]: def _canonical_postcode_expr(name: str) -> pl.Expr:
return pl.col(name).str.strip_chars().str.to_uppercase()
def load_uprns(
uprn_path: Path, arcgis_path: Path | None = None
) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
"""Load UPRNs as a sorted polars DataFrame with OA offset lookup. """Load UPRNs as a sorted polars DataFrame with OA offset lookup.
Returns (df, offsets) where offsets[oa_code] = (start_row, end_row). Returns (df, offsets) where offsets[oa_code] = (start_row, end_row).
@ -17,29 +24,46 @@ def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]
import tempfile import tempfile
print("Loading UPRN lookup...") print("Loading UPRN lookup...")
mapping = None
if arcgis_path is not None:
mapping = (
build_postcode_mapping(arcgis_path)
.with_columns(
_canonical_postcode_expr("old_postcode").alias("old_postcode"),
_canonical_postcode_expr("new_postcode").alias("new_postcode"),
)
.unique("old_postcode")
)
# Sort via streaming sink to avoid polars doubling memory during in-memory sort # Sort via streaming sink to avoid polars doubling memory during in-memory sort
with tempfile.NamedTemporaryFile( with tempfile.NamedTemporaryFile(
suffix=".parquet", delete=False, dir=local_tmp_dir() suffix=".parquet", delete=False, dir=local_tmp_dir()
) as tmp: ) as tmp:
tmp_path = Path(tmp.name) tmp_path = Path(tmp.name)
( uprns = (
pl.scan_parquet(uprn_path) pl.scan_parquet(uprn_path)
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD") .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
.filter(~pl.col("OA21CD").str.starts_with("S")) .filter(pl.col("OA21CD").str.starts_with("E"))
.filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null()) .filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null())
.with_columns(pl.col("PCDS").str.strip_chars()) .with_columns(_canonical_postcode_expr("PCDS").alias("PCDS"))
.filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != "")) .filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
.sort("OA21CD")
.sink_parquet(tmp_path)
) )
if mapping is not None and mapping.height > 0:
uprns = (
uprns.join(mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left")
.with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
)
uprns.sort("OA21CD").sink_parquet(tmp_path)
release_memory() release_memory()
# Read the sorted data — only one copy in memory (~2GB) # Read the sorted data — only one copy in memory (~2GB)
df = pl.read_parquet(tmp_path) df = pl.read_parquet(tmp_path)
tmp_path.unlink() tmp_path.unlink()
n = len(df) n = len(df)
print(f" Loaded {n:,} UPRNs (England & Wales)") print(f" Loaded {n:,} UPRNs (England)")
# Compute OA group offsets using polars (avoids 37M Python string creation) # Compute OA group offsets using polars (avoids 37M Python string creation)
boundary_df = ( boundary_df = (

View file

@ -0,0 +1,138 @@
"""Build PMTiles polygon tiles for the INSPIRE property-border overlay.
Reads the HM Land Registry INSPIRE Index Polygons (per-local-authority GML ZIPs
in EPSG:27700), reprojects each parcel to WGS84, and tiles the outlines with
tippecanoe. The dashboard serves the resulting archive through
``/api/overlays/property-borders`` and renders it as thin outlines only at the
postcode zoom level.
The same ZIPs are already downloaded for postcode-boundary generation; this
target re-uses :func:`parse_inspire_zip` to stay self-contained and is wired to
the ``$(INSPIRE_STAMP)`` make dependency rather than the boundary cache.
Data: HM Land Registry INSPIRE Index Polygons, Open Government Licence v3.0.
Boundaries are indicative "general boundaries", not the legal extent of title.
"""
from __future__ import annotations
import argparse
import shutil
import subprocess
import tempfile
from pathlib import Path
import numpy as np
import shapely
from pyproj import Transformer
from shapely.geometry import Polygon
from tqdm import tqdm
from pipeline.local_temp import local_tmp_dir
from pipeline.transform.postcode_boundaries.inspire import parse_inspire_zip
def _require_tippecanoe() -> str:
executable = shutil.which("tippecanoe")
if executable is None:
raise RuntimeError(
"tippecanoe is required to build property border PMTiles. "
"Install tippecanoe and rerun this target."
)
return executable
def _write_property_geojsonseq(inspire_dir: Path, output_path: Path) -> int:
    """Write every INSPIRE parcel as a WGS84 GeoJSONSeq feature, one per line.

    Features carry no properties -- the overlay only draws outlines, so
    dropping attributes keeps the tiles as small as possible. Parsing,
    reprojection and GeoJSON encoding happen one ZIP (one local authority) at
    a time, which bounds memory while keeping the heavy work inside shapely's
    vectorised calls.

    Args:
        inspire_dir: Directory that is searched (non-recursively) for ``*.zip``.
        output_path: File that receives the newline-delimited features.

    Returns:
        The number of features written.

    Raises:
        RuntimeError: if ``inspire_dir`` contains no ZIP files.
    """
    reproject = Transformer.from_crs(
        "EPSG:27700", "EPSG:4326", always_xy=True
    ).transform
    archives = sorted(inspire_dir.glob("*.zip"))
    if not archives:
        raise RuntimeError(f"No INSPIRE ZIP files found in {inspire_dir}")

    written = 0
    with output_path.open("w") as sink:
        for archive in tqdm(archives, desc="INSPIRE ZIPs", unit="file"):
            # list of Nx2 (easting, northing)
            rings = parse_inspire_zip(archive)
            if not rings:
                continue
            parcels = np.array([Polygon(ring) for ring in rings], dtype=object)
            # interleaved=False -> the transform is called once with full
            # x and y arrays rather than once per coordinate pair.
            parcels = shapely.transform(parcels, reproject, interleaved=False)
            for geometry_json in shapely.to_geojson(parcels):
                sink.write(
                    '{"type":"Feature","properties":{},"geometry":'
                    + geometry_json
                    + "}\n"
                )
                written += 1
    return written
def build_property_border_tiles(
    inspire_dir: Path,
    output_path: Path,
    min_zoom: int,
    max_zoom: int,
) -> None:
    """Tile the INSPIRE parcel outlines into a PMTiles archive with tippecanoe.

    The parcels are first streamed to a GeoJSONSeq file inside a scratch
    directory, which is also handed to tippecanoe as its temporary directory;
    the directory is removed when the build finishes.

    Args:
        inspire_dir: Directory holding the INSPIRE ZIP downloads.
        output_path: Destination archive; its parent directory is created
            if missing.
        min_zoom: Value passed to tippecanoe's ``--minimum-zoom``.
        max_zoom: Value passed to tippecanoe's ``--maximum-zoom``.

    Raises:
        RuntimeError: if tippecanoe is not installed or no ZIPs are found.
        subprocess.CalledProcessError: if tippecanoe exits non-zero.
    """
    tippecanoe = _require_tippecanoe()
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as scratch_dir:
        features_path = Path(scratch_dir) / "property_borders.geojsonseq"
        feature_count = _write_property_geojsonseq(inspire_dir, features_path)
        print(f"Writing {feature_count:,} INSPIRE parcel polygons")

        output_args = ["--force", "--output", str(output_path)]
        layer_args = ["--layer", "property_borders"]
        zoom_args = [
            "--minimum-zoom",
            str(min_zoom),
            "--maximum-zoom",
            str(max_zoom),
        ]
        # Borders are only meaningful at street level; thin the densest
        # tiles at low zoom but keep full geometry at max zoom.
        thinning_args = [
            "--drop-smallest-as-needed",
            "--simplify-only-low-zooms",
            "--extend-zooms-if-still-dropping",
        ]
        command = [
            tippecanoe,
            *output_args,
            *layer_args,
            *zoom_args,
            *thinning_args,
            "--temporary-directory",
            scratch_dir,
            str(features_path),
        ]
        subprocess.run(command, check=True)
def main() -> None:
    """Parse the command-line options and build the property border tiles."""
    parser = argparse.ArgumentParser(description=__doc__)
    required_paths = (
        ("--inspire", "INSPIRE ZIP directory"),
        ("--output", "Output .pmtiles path"),
    )
    for flag, help_text in required_paths:
        parser.add_argument(flag, type=Path, required=True, help=help_text)
    for flag, default_zoom in (("--min-zoom", 12), ("--max-zoom", 16)):
        parser.add_argument(flag, type=int, default=default_zoom)

    options = parser.parse_args()
    build_property_border_tiles(
        inspire_dir=options.inspire,
        output_path=options.output,
        min_zoom=options.min_zoom,
        max_zoom=options.max_zoom,
    )
if __name__ == "__main__":
main()

View file

@ -0,0 +1,147 @@
import json
import polars as pl
from pyproj import Transformer
from pipeline.transform.crime_spatial import transform_crime_spatial
# Shared EPSG:27700 -> EPSG:4326 transformer used to turn the grid coordinates
# in these fixtures into the lon/lat values written to the test inputs.
_TO_WGS84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
# Header row placed at the top of every synthetic street-level crime CSV.
_CSV_HEADER = (
    "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,"
    "LSOA code,LSOA name,Crime type,Last outcome category,Context"
)
def _bng_to_wgs84(x: float, y: float) -> tuple[float, float]:
    """Convert an EPSG:27700 (x, y) point to a WGS84 ``(lon, lat)`` pair."""
    longitude, latitude = _TO_WGS84.transform(x, y)
    return (longitude, latitude)
def _square_feature(postcode: str, x0: float, y0: float, x1: float, y1: float) -> dict:
    """Build a GeoJSON rectangle feature for ``postcode``.

    The corners are given in EPSG:27700 and converted to lon/lat; the ring is
    closed by repeating the first corner. ``mapit_code`` is the postcode with
    its spaces removed.
    """
    corners = ((x0, y0), (x1, y0), (x1, y1), (x0, y1), (x0, y0))
    exterior = [
        list(_bng_to_wgs84(easting, northing)) for easting, northing in corners
    ]
    properties = {"postcodes": postcode, "mapit_code": postcode.replace(" ", "")}
    geometry = {"type": "Polygon", "coordinates": [exterior]}
    return {"type": "Feature", "properties": properties, "geometry": geometry}
def _write_boundaries(units_dir, features_by_district: dict[str, list[dict]]) -> None:
units_dir.mkdir(parents=True)
for district, features in features_by_district.items():
collection = {"type": "FeatureCollection", "features": features}
(units_dir / f"{district}.geojson").write_text(json.dumps(collection))
def _crime_row(month: str, x, y, crime_type: str) -> str:
if x is None or y is None:
lon, lat = "", ""
else:
lon, lat = _bng_to_wgs84(x, y)
return f",{month},F,F,{lon},{lat},On or near X,E01000001,L,{crime_type},U,"
def _write_month(crime_dir, month: str, rows: list[str]) -> None:
    """Write ``rows`` under the CSV header to ``<crime_dir>/<month>/``.

    The month directory is created (with parents) and must not already exist;
    the file content ends with a trailing newline.
    """
    target_dir = crime_dir / month
    target_dir.mkdir(parents=True)
    lines = [_CSV_HEADER]
    lines.extend(rows)
    csv_path = target_dir / f"{month}-test-force-street.csv"
    csv_path.write_text("\n".join(lines) + "\n")
def test_buffer_overlap_counts_for_each_postcode(tmp_path):
    """A crime inside two overlapping postcode buffers counts for both.

    Also checks that a point just outside a buffer and a row without
    coordinates contribute to no postcode.
    """
    units = tmp_path / "units"
    # A and B sit 70m apart; their +50m buffers overlap in x in [1030, 1060].
    squares = [
        _square_feature("AB1 1AA", 1000, 1000, 1010, 1010),
        _square_feature("AB1 1AB", 1080, 1000, 1090, 1010),
        _square_feature("AB1 1AC", 5000, 5000, 5010, 5010),
    ]
    _write_boundaries(units, {"AB1": squares})

    crime = tmp_path / "crime"
    january_rows = [
        # In the overlap: 35m east of A, 35m west of B -> counts for both.
        _crime_row("2024-01", 1045, 1005, "Burglary"),
        # 49m east of C's edge -> inside C's buffer.
        _crime_row("2024-01", 5059, 5005, "Robbery"),
        # 51m east of C's edge -> outside every buffer.
        _crime_row("2024-01", 5061, 5005, "Robbery"),
        # No coordinate -> dropped entirely.
        _crime_row("2024-01", None, None, "Anti-social behaviour"),
    ]
    _write_month(crime, "2024-01", january_rows)

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime, units, output, by_year)

    by_postcode = {}
    for record in pl.read_parquet(output).to_dicts():
        by_postcode[record["postcode"]] = record

    # Single month -> annualised x12.
    postcode_a = by_postcode["AB1 1AA"]
    postcode_b = by_postcode["AB1 1AB"]
    postcode_c = by_postcode["AB1 1AC"]
    assert postcode_a["Burglary (avg/yr)"] == 12.0
    assert postcode_b["Burglary (avg/yr)"] == 12.0
    assert postcode_a["Robbery (avg/yr)"] == 0.0
    # Only the 49m robbery counts for C; the 51m one and the blank row do not.
    assert postcode_c["Robbery (avg/yr)"] == 12.0
    assert postcode_c["Burglary (avg/yr)"] == 0.0
    # Anti-social behaviour had no coordinate -> nobody gets it.
    for record in by_postcode.values():
        assert record["Anti-social behaviour (avg/yr)"] == 0.0
def test_by_year_annualises_and_rolls_up(tmp_path):
    """Per-year counts are annualised and rolled up into serious crime."""
    units = tmp_path / "units"
    only_postcode = _square_feature("AB1 1AA", 1000, 1000, 1010, 1010)
    _write_boundaries(units, {"AB1": [only_postcode]})

    crime = tmp_path / "crime"
    # Point at the centre of AB1 1AA, well inside its buffer.
    rows_by_month = {
        "2023-01": [
            _crime_row("2023-01", 1005, 1005, "Burglary"),
            _crime_row("2023-01", 1005, 1005, "Robbery"),
        ],
        "2024-01": [_crime_row("2024-01", 1005, 1005, "Burglary")],
        "2024-02": [
            _crime_row("2024-02", 1005, 1005, "Burglary"),
            _crime_row("2024-02", 1005, 1005, "Anti-social behaviour"),
        ],
    }
    for month, month_rows in rows_by_month.items():
        _write_month(crime, month, month_rows)

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime, units, output, by_year)

    by_year_df = pl.read_parquet(by_year)
    assert by_year_df.height == 1
    expected_columns = {
        "Burglary (by year)",
        "Serious crime (by year)",
        "Minor crime (by year)",
    }
    assert expected_columns.issubset(by_year_df.columns)

    row = by_year_df.row(0, named=True)
    burglary = sorted(row["Burglary (by year)"], key=lambda point: point["year"])
    # 2023: 1 burglary in 1 month -> 12/yr; 2024: 2 in 2 months -> 12/yr.
    assert burglary == [
        {"year": 2023, "count": 12.0},
        {"year": 2024, "count": 12.0},
    ]

    serious = {}
    for point in row["Serious crime (by year)"]:
        serious[point["year"]] = point["count"]
    # 2023 serious = Burglary(12) + Robbery(12) = 24; 2024 = Burglary(12).
    assert serious[2023] == 24.0
    assert serious[2024] == 12.0

View file

@ -24,6 +24,7 @@ def _row(**overrides: str) -> dict[str, str]:
row = { row = {
"address": "1 Example Street", "address": "1 Example Street",
"postcode": " aa1 1aa ", "postcode": " aa1 1aa ",
"uprn": "100012345678",
"current_energy_rating": "c", "current_energy_rating": "c",
"potential_energy_rating": "b", "potential_energy_rating": "b",
"property_type": "House", "property_type": "House",
@ -52,6 +53,7 @@ def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path):
{ {
"epc_address": "1 Example Street", "epc_address": "1 Example Street",
"epc_postcode": "AA1 1AA", "epc_postcode": "AA1 1AA",
"uprn": "100012345678",
"current_energy_rating": "C", "current_energy_rating": "C",
"potential_energy_rating": "B", "potential_energy_rating": "B",
"epc_property_type": "House", "epc_property_type": "House",

View file

@ -15,6 +15,8 @@ from pipeline.transform.merge import (
_finalize_listings, _finalize_listings,
_integrate_listings, _integrate_listings,
_match_direct_epc, _match_direct_epc,
_match_listing_properties,
_normalize_uprn,
_is_dynamic_poi_metric_column, _is_dynamic_poi_metric_column,
_less_deprived_percentile_expr, _less_deprived_percentile_expr,
_load_conservation_area_geometries, _load_conservation_area_geometries,
@ -68,6 +70,15 @@ def test_conservation_area_feature_is_area_level() -> None:
assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
    """Only the raw avg/yr crime columns are registered as area columns."""
    # Crime is now a raw spatial count per postcode; the per-1k-residents
    # variants were dropped along with the LSOA population denominator.
    for kept in ("Serious crime (avg/yr)", "Minor crime (avg/yr)"):
        assert kept in _AREA_COLUMNS
    for dropped in (
        "Serious crime per 1k residents (avg/yr)",
        "Minor crime per 1k residents (avg/yr)",
    ):
        assert dropped not in _AREA_COLUMNS
def test_listed_building_feature_is_property_level() -> None: def test_listed_building_feature_is_property_level() -> None:
assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS
@ -471,71 +482,166 @@ def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
assert seed["was_council_house"].to_list() == ["No"] assert seed["was_council_house"].to_list() == ["No"]
def test_match_direct_epc_considers_nearby_postcodes() -> None: _DIRECT_EPC_CANDIDATE_SCHEMA = {
listing_matches = pl.DataFrame( "_direct_epc_row": pl.UInt32,
{ "_direct_epc_match_address": pl.Utf8,
"_listing_idx": [0], "_direct_epc_match_postcode": pl.Utf8,
"_listing_match_address": ["1 EXAMPLE ROAD"], "_direct_epc_outcode": pl.Utf8,
"_listing_match_postcode": ["AA11AA"], "_direct_epc_canonical_property_type": pl.Utf8,
"_listing_east": [1000.0], "_direct_epc_uprn": pl.Utf8,
"_listing_north": [1000.0], "_direct_epc_address": pl.Utf8,
"_actual_property_type": ["Terraced"], "_direct_current_energy_rating": pl.Utf8,
"_actual_total_floor_area": [100.0], "_direct_potential_energy_rating": pl.Utf8,
"_actual_number_habitable_rooms": [4], "_direct_total_floor_area": pl.Float64,
}, "_direct_number_habitable_rooms": pl.Int16,
schema={ "_direct_floor_height": pl.Float64,
"_listing_idx": pl.UInt32, "_direct_construction_age_band": pl.UInt16,
"_listing_match_address": pl.Utf8, "_direct_is_construction_date_approximate": pl.UInt8,
"_listing_match_postcode": pl.Utf8, "_direct_was_council_house": pl.Utf8,
"_listing_east": pl.Float64, }
"_listing_north": pl.Float64,
"_actual_property_type": pl.Utf8, _LISTING_MATCH_SCHEMA = {
"_actual_total_floor_area": pl.Float64, "_listing_idx": pl.UInt32,
"_actual_number_habitable_rooms": pl.Int16, "_listing_match_address": pl.Utf8,
}, "_listing_match_postcode": pl.Utf8,
) "_listing_uprn": pl.Utf8,
epc_candidates = pl.DataFrame( }
{
"_direct_epc_row": [0],
"_direct_epc_match_address": ["1 EXAMPLE ROAD"], def _direct_epc_candidates(rows: list[dict]) -> pl.DataFrame:
"_direct_epc_match_postcode": ["BB11BB"], base = {
"_direct_epc_east": [1020.0], "_direct_epc_row": 0,
"_direct_epc_north": [1010.0], "_direct_epc_match_address": "1 EXAMPLE ROAD",
"_direct_epc_canonical_property_type": ["Terraced"], "_direct_epc_match_postcode": "AA11AA",
"_direct_epc_address": ["1, Example Road"], "_direct_epc_outcode": "AA1",
"_direct_current_energy_rating": ["C"], "_direct_epc_canonical_property_type": "Terraced",
"_direct_potential_energy_rating": ["B"], "_direct_epc_uprn": None,
"_direct_total_floor_area": [101.0], "_direct_epc_address": "1, Example Road",
"_direct_number_habitable_rooms": [4], "_direct_current_energy_rating": "C",
"_direct_floor_height": [2.5], "_direct_potential_energy_rating": "B",
"_direct_construction_age_band": [1930], "_direct_total_floor_area": 101.0,
"_direct_is_construction_date_approximate": [1], "_direct_number_habitable_rooms": 4,
"_direct_was_council_house": ["No"], "_direct_floor_height": 2.5,
}, "_direct_construction_age_band": 1930,
schema={ "_direct_is_construction_date_approximate": 1,
"_direct_epc_row": pl.UInt32, "_direct_was_council_house": "No",
"_direct_epc_match_address": pl.Utf8, }
"_direct_epc_match_postcode": pl.Utf8, return pl.DataFrame(
"_direct_epc_east": pl.Float64, [{**base, **row} for row in rows], schema=_DIRECT_EPC_CANDIDATE_SCHEMA
"_direct_epc_north": pl.Float64,
"_direct_epc_canonical_property_type": pl.Utf8,
"_direct_epc_address": pl.Utf8,
"_direct_current_energy_rating": pl.Utf8,
"_direct_potential_energy_rating": pl.Utf8,
"_direct_total_floor_area": pl.Float64,
"_direct_number_habitable_rooms": pl.Int16,
"_direct_floor_height": pl.Float64,
"_direct_construction_age_band": pl.UInt16,
"_direct_is_construction_date_approximate": pl.UInt8,
"_direct_was_council_house": pl.Utf8,
},
) )
matches = _match_direct_epc(listing_matches, epc_candidates)
def _listing_matches(rows: list[dict]) -> pl.DataFrame:
    """Build a listing-match frame, filling unspecified fields with defaults.

    Each entry of ``rows`` overrides the defaults below; the frame uses
    ``_LISTING_MATCH_SCHEMA``.
    """
    defaults = {
        "_listing_idx": 0,
        "_listing_match_address": "1 EXAMPLE ROAD",
        "_listing_match_postcode": "AA11AA",
        "_listing_uprn": None,
    }
    records = [{**defaults, **overrides} for overrides in rows]
    return pl.DataFrame(records, schema=_LISTING_MATCH_SCHEMA)
def test_match_direct_epc_matches_by_uprn_across_postcodes() -> None:
# UPRN is matched globally (not within a postcode bucket), so a listing
# whose detail-page postcode is slightly off still resolves to the right
# EPC certificate by its UPRN.
matches = _match_direct_epc(
_listing_matches(
[{"_listing_uprn": "100000000001", "_listing_match_postcode": "ZZ99ZZ"}]
),
_direct_epc_candidates(
[{"_direct_epc_uprn": "100000000001", "_direct_epc_match_postcode": "AA11AA"}]
),
)
assert matches.height == 1 assert matches.height == 1
assert matches["_listing_idx"].to_list() == [0]
assert matches["_direct_epc_address"].to_list() == ["1, Example Road"] assert matches["_direct_epc_address"].to_list() == ["1, Example Road"]
assert matches["_direct_epc_match_method"].to_list() == ["uprn"]
def test_match_direct_epc_matches_by_address_in_same_postcode() -> None:
    """With no UPRN on either side, an equal address and postcode match."""
    listings = _listing_matches([{"_listing_match_address": "1 EXAMPLE ROAD"}])
    candidates = _direct_epc_candidates(
        [{"_direct_epc_match_address": "1 EXAMPLE ROAD"}]
    )

    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["1, Example Road"]
    assert matches["_direct_epc_match_method"].to_list() == ["address"]
def test_normalize_uprn_handles_types_and_floats() -> None:
    """``_normalize_uprn`` yields digit strings and rejects unusable input."""
    for missing in (None, ""):
        assert _normalize_uprn(missing) is None

    accepted = (
        (" 100012345678 ", "100012345678"),
        (100012345678, "100012345678"),
        # An integral float normalises to its digits, NOT "1230".
        (123.0, "123"),
    )
    for raw, expected in accepted:
        assert _normalize_uprn(raw) == expected

    # Non-integral / NaN floats are rejected rather than mangled.
    for unusable in (1.5, float("nan")):
        assert _normalize_uprn(unusable) is None
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
    """Build a property-candidate frame, filling unspecified fields with defaults.

    Each entry of ``rows`` overrides the defaults below; every column is a
    string column.
    """
    schema = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "_property_match_postcode": pl.Utf8,
        "_property_match_address": pl.Utf8,
        "_property_epc_match_address": pl.Utf8,
        "uprn": pl.Utf8,
    }
    defaults = {
        "postcode": "AA1 1AA",
        "pp_address": "1 Example Road",
        "_property_match_postcode": "AA11AA",
        "_property_match_address": "1 EXAMPLE ROAD",
        "_property_epc_match_address": "1 EXAMPLE ROAD",
        "uprn": None,
    }
    records = [{**defaults, **overrides} for overrides in rows]
    return pl.DataFrame(records, schema=schema)
def test_match_listing_properties_uprn_wins_dedup_tie() -> None:
    """A UPRN match beats an equally scored address match for one property."""
    # Two listings claim the same property: one by UPRN, one by exact address
    # (both score 100). The UPRN match must win even though it has the higher
    # _listing_idx (which would otherwise break the tie the wrong way).
    uprn_listing = {
        "_listing_idx": 5,
        "_listing_uprn": "100000000001",
        "_listing_match_address": "SOMETHING ELSE",
    }
    address_listing = {
        "_listing_idx": 1,
        "_listing_uprn": None,
        "_listing_match_address": "1 EXAMPLE ROAD",
    }
    listings = _listing_matches([uprn_listing, address_listing])
    properties = _property_candidates([{"uprn": "100000000001"}])

    matches = _match_listing_properties(listings, properties)

    assert matches.height == 1
    assert matches["_listing_idx"].to_list() == [5]
    assert matches["_property_match_method"].to_list() == ["uprn"]
def test_match_direct_epc_does_not_match_other_postcode_without_uprn() -> None:
    """An EPC in another postcode with no shared UPRN yields no match."""
    # Matching is by postcode/UPRN/street — never by coordinate proximity — so a
    # same-street EPC in a different postcode with no shared UPRN is skipped.
    listings = _listing_matches([{"_listing_match_postcode": "AA11AA"}])
    other_postcode_epc = {
        "_direct_epc_match_postcode": "BB22BB",
        "_direct_epc_uprn": None,
    }
    candidates = _direct_epc_candidates([other_postcode_epc])

    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 0
def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -> None: def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -> None:
@ -588,11 +694,72 @@ def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -
assert other["_actual_listing_url"].to_list() == [None] assert other["_actual_listing_url"].to_list() == [None]
def test_integrate_listings_rejects_low_confidence_no_number_match(tmp_path) -> None: def test_integrate_listings_matches_by_uprn_over_address(tmp_path) -> None:
# The listing's address deliberately does not match the property's, but the
# shared UPRN drives an exact match anyway (UPRN beats fuzzy street).
listings_path = tmp_path / "listings.parquet" listings_path = tmp_path / "listings.parquet"
arcgis_path = tmp_path / "arcgis.parquet" arcgis_path = tmp_path / "arcgis.parquet"
_sample_listings_frame().with_columns( _sample_listings_frame().with_columns(
pl.lit("Rose Cottage High Street").alias("Address per Property Register"), pl.lit("Totally Different Road").alias("Address per Property Register"),
pl.lit("100000000009").alias("UPRN"),
).write_parquet(listings_path)
_stub_arcgis(arcgis_path)
wide = pl.DataFrame(
{
"postcode": ["SW1A 1AA"],
"pp_address": ["1 Example Road"],
"uprn": ["100000000009"],
"pp_property_type": ["Terraced"],
"duration": ["Freehold"],
"total_floor_area": [90.0],
"number_habitable_rooms": [4],
"latest_price": [600_000],
"epc_address": ["1 Example Road"],
"current_energy_rating": ["C"],
"potential_energy_rating": ["B"],
"floor_height": [2.4],
"construction_age_band": [1930],
"is_construction_date_approximate": [1],
"was_council_house": ["No"],
},
schema={
"postcode": pl.Utf8,
"pp_address": pl.Utf8,
"uprn": pl.Utf8,
"pp_property_type": pl.Utf8,
"duration": pl.Utf8,
"total_floor_area": pl.Float64,
"number_habitable_rooms": pl.Int16,
"latest_price": pl.Int64,
"epc_address": pl.Utf8,
"current_energy_rating": pl.Utf8,
"potential_energy_rating": pl.Utf8,
"floor_height": pl.Float64,
"construction_age_band": pl.UInt16,
"is_construction_date_approximate": pl.UInt8,
"was_council_house": pl.Utf8,
},
)
integrated = _integrate_listings(
wide.lazy(), listings_path, arcgis_path, epc_path=None
).collect()
matched = integrated.filter(pl.col("pp_address") == "1 Example Road")
# The listing overlay attached to the UPRN-matched property row.
assert matched["_actual_listing_url"].to_list() == ["https://example.test/abc"]
# No spurious seed row for the listing's (non-matching) address.
assert "Totally Different Road" not in integrated["pp_address"].to_list()
def test_integrate_listings_seeds_listing_with_unmatched_street(tmp_path) -> None:
# A number-less listing whose street is not the property's street (and which
# shares no UPRN) must not be force-matched onto it; it becomes its own seed
# row instead of stamping the wrong property's overlay.
listings_path = tmp_path / "listings.parquet"
arcgis_path = tmp_path / "arcgis.parquet"
_sample_listings_frame().with_columns(
pl.lit("Juniper Crescent").alias("Address per Property Register"),
).write_parquet(listings_path) ).write_parquet(listings_path)
_stub_arcgis(arcgis_path) _stub_arcgis(arcgis_path)
wide = pl.DataFrame( wide = pl.DataFrame(
@ -635,7 +802,7 @@ def test_integrate_listings_rejects_low_confidence_no_number_match(tmp_path) ->
).collect() ).collect()
existing = integrated.filter(pl.col("pp_address") == "Old Cottage High Street") existing = integrated.filter(pl.col("pp_address") == "Old Cottage High Street")
seed = integrated.filter(pl.col("pp_address") == "Rose Cottage High Street") seed = integrated.filter(pl.col("pp_address") == "Juniper Crescent")
assert existing["_actual_listing_url"].to_list() == [None] assert existing["_actual_listing_url"].to_list() == [None]
assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"] assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"]
@ -731,3 +898,77 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
# Overlay scaffolding is dropped. # Overlay scaffolding is dropped.
for src, dst, _dt in _LISTING_OVERLAY_SOURCES: for src, dst, _dt in _LISTING_OVERLAY_SOURCES:
assert dst not in finalized.columns, src assert dst not in finalized.columns, src
def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
    """Two wide rows carrying the same listing URL finalize to one row."""
    # The terminated-postcode remap can collapse two distinct wide rows onto the same
    # (postcode, pp_address), so a single matched listing attaches to both. Finalize
    # must emit one row per listing URL, not one per collapsed wide row.
    #
    # Each entry is (column name, dtype, values for the two collapsed rows).
    columns = [
        ("Postcode", pl.Utf8, ["SW1A 1AA", "SW1A 1AA"]),
        (
            "Address per Property Register",
            pl.Utf8,
            ["1 Example Road", "1 Example Road"],
        ),
        ("Address per EPC", pl.Utf8, ["1 Example Road", "1 Example Road"]),
        ("Date of last transaction", pl.Float64, [1990.0, 1995.0]),
        ("lat", pl.Float64, [51.5, 51.5]),
        ("lon", pl.Float64, [-0.1, -0.1]),
        ("Total floor area (sqm)", pl.Float64, [100.0, 95.0]),
        ("Number of bedrooms & living rooms", pl.Int16, [3, 3]),
        ("Property type", pl.Utf8, ["Terraced", "Terraced"]),
        ("Leasehold/Freehold", pl.Utf8, ["Leasehold", "Leasehold"]),
        ("Last known price", pl.Int64, [500_000, 480_000]),
        ("Street tree density percentile", pl.Float32, [42.0, 42.0]),
        # Same listing URL on both collapsed rows — the fan-out to fix.
        ("_actual_listing_url", pl.Utf8, ["url0", "url0"]),
        ("_actual_asking_price", pl.Int64, [600_000, 600_000]),
        ("_actual_asking_price_per_sqm", pl.Int32, [5_000, 5_000]),
        ("_actual_listing_date", pl.Datetime("us"), [None, None]),
        ("_actual_listing_status", pl.Utf8, ["For sale", "For sale"]),
        ("_actual_listing_features", pl.List(pl.Utf8), [["Garden"], ["Garden"]]),
        ("_actual_bedrooms", pl.Int32, [3, 3]),
        ("_actual_bathrooms", pl.Int32, [1, 1]),
        ("_actual_price_qualifier", pl.Utf8, ["", ""]),
        ("_actual_property_sub_type", pl.Utf8, ["Mid-Terrace", "Mid-Terrace"]),
        ("_actual_lat", pl.Float64, [51.51, 51.51]),
        ("_actual_lon", pl.Float64, [-0.11, -0.11]),
        ("_actual_total_floor_area", pl.Float64, [110.0, 110.0]),
        ("_actual_number_habitable_rooms", pl.Int16, [4, 4]),
        ("_actual_property_type", pl.Utf8, ["Terraced", "Terraced"]),
        ("_actual_leasehold_freehold", pl.Utf8, ["Freehold", "Freehold"]),
    ]
    data = {name: values for name, _dtype, values in columns}
    schema = {name: dtype for name, dtype, _values in columns}
    df = pl.DataFrame(data, schema=schema)

    finalized = _finalize_listings(df)

    assert finalized.height == 1
    assert finalized["Listing URL"].to_list() == ["url0"]

View file

@ -1,19 +1,83 @@
import math
from pathlib import Path from pathlib import Path
import numpy as np
import polars as pl import polars as pl
import pytest import pytest
import shapely
from pipeline.transform.tree_density import ( from pipeline.transform.tree_density import (
STREET_TREE_COVERAGE_COL, STREET_TREE_COVERAGE_COL,
STREET_TREE_DENSITY_COL, STREET_TREE_DENSITY_COL,
_add_nfi_batch,
_coverage_percentile_expr, _coverage_percentile_expr,
_metric_columns, _metric_columns,
_postcode_buffers,
_postcode_density_percentile_col, _postcode_density_percentile_col,
_with_postcode_density_percentiles, _with_postcode_density_percentiles,
_write_street_rollups, _write_street_rollups,
) )
def test_nfi_accumulation_adds_only_clipped_overlap_area() -> None:
    """NFI parcels add only their overlap with each postcode buffer."""
    radius_m = 50
    points = pl.DataFrame({"postcode": ["A", "B"], "x": [0.0, 1000.0], "y": [0.0, 0.0]})
    circles, tree = _postcode_buffers(points, radius_m)
    buffer_area = math.pi * radius_m * radius_m

    def accumulate(parcel, category):
        """Run one single-parcel batch against fresh accumulators."""
        canopy_area = np.zeros(2)
        feature_count = np.zeros(2, dtype=np.uint32)
        _add_nfi_batch(
            np.array([parcel], dtype=object),
            np.array([category], dtype=object),
            circles,
            tree,
            canopy_area,
            feature_count,
            radius_m,
        )
        return canopy_area, feature_count

    # A large woodland square centred on postcode A fully covers A's circle.
    big = shapely.box(-500, -500, 500, 500)  # 1,000,000 sqm parcel
    canopy_area, feature_count = accumulate(big, "Woodland")
    # Only the clipped circle area is added (the 32-gon buffer approximates the
    # circle to ~1%), NOT the full 1,000,000 sqm polygon.
    assert canopy_area[0] == pytest.approx(buffer_area, rel=1e-2)
    assert canopy_area[0] <= buffer_area  # never exceeds the buffer area
    assert canopy_area[1] == 0.0  # postcode B is 1km away, no overlap
    assert feature_count.tolist() == [1, 0]

    # A large parcel that only slivers into B's circle must add only the sliver,
    # not its full area -- the failure mode the old centroid path could not avoid.
    sliver = shapely.box(1040, -500, 2000, 500)  # left edge 10m inside B's circle
    canopy_area, feature_count = accumulate(sliver, "Woodland")
    assert canopy_area[0] == 0.0
    assert 0.0 < canopy_area[1] < buffer_area  # tiny segment, far below 1M sqm

    # Non-woodland categories contribute nothing.
    canopy_area, feature_count = accumulate(big, "Non woodland")
    assert canopy_area.tolist() == [0.0, 0.0]
    assert feature_count.tolist() == [0, 0]
def test_coverage_percentile_expr_ranks_higher_coverage_higher() -> None: def test_coverage_percentile_expr_ranks_higher_coverage_higher() -> None:
df = pl.DataFrame({"coverage": [0.0, 5.0, 10.0, None]}) df = pl.DataFrame({"coverage": [0.0, 5.0, 10.0, None]})

View file

@ -1,10 +1,16 @@
"""Derive street-scale tree density metrics from Forest Research TOW data. """Derive street-scale tree density metrics from Forest Research TOW + NFI data.
The Forest Research Trees Outside Woodland release is an Esri File Geodatabase The Forest Research Trees Outside Woodland release is an Esri File Geodatabase
inside property-data/FR_TOW_V1_ALL.zip. This transformer computes a compact inside property-data/FR_TOW_V1_ALL.zip. This transformer computes a compact
postcode-level metric from the tree polygons, then optionally rolls that up to postcode-level metric from the tree polygons, then optionally rolls that up to
Price Paid street names so the dashboard can answer "what is this address's Price Paid street names so the dashboard can answer "what is this address's
street like?" without loading the full geodatabase at runtime. street like?" without loading the full geodatabase at runtime.
TOW only covers trees *outside* woodland, so the National Forest Inventory (NFI)
woodland layer is optionally unioned in. TOW canopy is accumulated by centroid
proximity (tiny crowns), while large NFI woodland parcels are accumulated by
true buffer-clipped intersection area so they cannot saturate a postcode from
mere centroid proximity.
""" """
from __future__ import annotations from __future__ import annotations
@ -22,7 +28,6 @@ import shapely
from scipy.spatial import cKDTree from scipy.spatial import cKDTree
DEFAULT_TOW_TYPES = ("Lone Tree", "Group of Trees")
TOW_GDB_NAME = "FR_TOW_V1_ALL.gdb" TOW_GDB_NAME = "FR_TOW_V1_ALL.gdb"
STREET_TREE_DENSITY_COL = "Street tree density percentile" STREET_TREE_DENSITY_COL = "Street tree density percentile"
STREET_TREE_COVERAGE_COL = "Street tree coverage (%)" STREET_TREE_COVERAGE_COL = "Street tree coverage (%)"
@ -32,6 +37,14 @@ POSTCODE_AREA_COL = "Tree canopy area within {radius}m (sqm)"
POSTCODE_COUNT_COL = "Tree features within {radius}m" POSTCODE_COUNT_COL = "Tree features within {radius}m"
POSTCODE_HEIGHT_COL = "Mean TOW height within {radius}m (m)" POSTCODE_HEIGHT_COL = "Mean TOW height within {radius}m (m)"
# National Forest Inventory (NFI) woodland — the geometric complement of TOW.
# NFI ships as a zipped shapefile of woodland parcels (>=0.5 ha) in EPSG:27700.
# Field names are from the NFI Woodland England 2022 release; re-check on bumps.
# Attribute holding each parcel's category; only rows whose value equals
# NFI_WOODLAND_VALUE are accumulated or exported.
NFI_CATEGORY_COL = "CATEGORY"
NFI_WOODLAND_VALUE = "Woodland"
# Attribute exported by the overlay tile builder as the feature's woodland_type.
NFI_TYPE_COL = "IFT_IOA"
# Parcel area attribute; the overlay tile builder multiplies it by 10000 to
# report square metres, i.e. it is treated as hectares.
NFI_AREA_HA_COL = "Area_ha"
def _safe_extract_zip(zip_path: Path, extract_dir: Path, force: bool) -> Path: def _safe_extract_zip(zip_path: Path, extract_dir: Path, force: bool) -> Path:
"""Extract the TOW zip and return the extracted .gdb path.""" """Extract the TOW zip and return the extracted .gdb path."""
@ -83,12 +96,60 @@ def _tow_dataset_path(
return str(_safe_extract_zip(zip_path, extract_dir, force_extract)) return str(_safe_extract_zip(zip_path, extract_dir, force_extract))
def _where_for_tow_types(tow_types: tuple[str, ...] | None) -> str | None: def _safe_extract_zip_dir(zip_path: Path, extract_dir: Path, force: bool) -> Path:
if not tow_types: """Extract an arbitrary zip into extract_dir and return the directory."""
return None if extract_dir.exists() and not force:
escaped = [tow_type.replace("'", "''") for tow_type in tow_types] print(f"Using existing extraction directory: {extract_dir}")
values = ", ".join(f"'{tow_type}'" for tow_type in escaped) return extract_dir
return f"Woodland_Type IN ({values})" if extract_dir.exists():
shutil.rmtree(extract_dir)
tmp_dir = extract_dir.with_name(f".{extract_dir.name}.tmp")
if tmp_dir.exists():
shutil.rmtree(tmp_dir)
tmp_dir.mkdir(parents=True)
root = tmp_dir.resolve()
print(f"Extracting {zip_path} to {extract_dir}...")
with zipfile.ZipFile(zip_path) as archive:
for member in archive.infolist():
target = (tmp_dir / member.filename).resolve()
if root != target and root not in target.parents:
raise ValueError(f"Unsafe path in zip archive: {member.filename}")
if member.is_dir():
target.mkdir(parents=True, exist_ok=True)
continue
target.parent.mkdir(parents=True, exist_ok=True)
with archive.open(member) as source, target.open("wb") as dest:
shutil.copyfileobj(source, dest, length=1024 * 1024)
tmp_dir.rename(extract_dir)
print(f"Extracted archive: {extract_dir}")
return extract_dir
def _nfi_dataset_path(
zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool
) -> str:
"""Resolve the NFI woodland shapefile path, extracting the zip if needed."""
if use_vsizip:
return f"/vsizip/{zip_path.resolve()}"
extracted = _safe_extract_zip_dir(zip_path, extract_dir, force_extract)
shapefiles = sorted(extracted.rglob("*.shp"))
if not shapefiles:
raise FileNotFoundError(f"No .shp found inside {zip_path}")
return str(shapefiles[0])
def _geometry_column(metadata: dict, column_names: list[str]) -> str:
"""Resolve the geometry column name from pyogrio Arrow metadata."""
geometry_name = metadata.get("geometry_name")
if geometry_name:
return str(geometry_name)
for name in ("wkb_geometry", "geometry", "geom"):
if name in column_names:
return name
return column_names[-1]
def _postcode_points(arcgis_path: Path, max_postcodes: int | None) -> pl.DataFrame: def _postcode_points(arcgis_path: Path, max_postcodes: int | None) -> pl.DataFrame:
@ -172,26 +233,20 @@ def _accumulate_tree_metrics(
dataset_path: str, dataset_path: str,
points: pl.DataFrame, points: pl.DataFrame,
radius_m: int, radius_m: int,
tow_types: tuple[str, ...] | None,
batch_size: int, batch_size: int,
layer_names: tuple[str, ...] | None, layer_names: tuple[str, ...] | None,
max_features_per_layer: int | None, max_features_per_layer: int | None,
workers: int, workers: int,
) -> pl.DataFrame: canopy_area: np.ndarray,
feature_count: np.ndarray,
height_weighted_sum: np.ndarray,
height_weight: np.ndarray,
) -> None:
xy = points.select("x", "y").to_numpy() xy = points.select("x", "y").to_numpy()
tree = cKDTree(xy) tree = cKDTree(xy)
n_points = points.height
canopy_area = np.zeros(n_points, dtype=np.float64)
feature_count = np.zeros(n_points, dtype=np.uint32)
height_weighted_sum = np.zeros(n_points, dtype=np.float64)
height_weight = np.zeros(n_points, dtype=np.float64)
where = _where_for_tow_types(tow_types)
layers = _layers(dataset_path, layer_names) layers = _layers(dataset_path, layer_names)
print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}") print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
if where:
print(f"TOW type filter: {where}")
columns = ["Woodland_Type", "TOW_Area_M", "MEANHT"] columns = ["Woodland_Type", "TOW_Area_M", "MEANHT"]
total_features_seen = 0 total_features_seen = 0
@ -206,7 +261,6 @@ def _accumulate_tree_metrics(
dataset_path, dataset_path,
layer=layer, layer=layer,
columns=columns, columns=columns,
where=where,
batch_size=batch_size, batch_size=batch_size,
use_pyarrow=True, use_pyarrow=True,
) as (_meta, reader): ) as (_meta, reader):
@ -297,6 +351,132 @@ def _accumulate_tree_metrics(
f"{total_features_used:,} features with usable centroids" f"{total_features_used:,} features with usable centroids"
) )
def _postcode_buffers(
    points: pl.DataFrame, radius_m: int
) -> tuple[np.ndarray, shapely.STRtree]:
    """Return one buffered circle per postcode and a spatial index over them.

    Circles are built from the ``x``/``y`` columns in row order, so the i-th
    circle (and the i-th entry of the STRtree) belongs to the i-th row of
    ``points``.
    """
    centres = shapely.points(points.select("x", "y").to_numpy())
    discs = shapely.buffer(centres, radius_m, quad_segs=8)
    index = shapely.STRtree(discs)
    return discs, index
def _add_nfi_batch(
    geoms: np.ndarray,
    category: np.ndarray,
    circles: np.ndarray,
    tree: shapely.STRtree,
    canopy_area: np.ndarray,
    feature_count: np.ndarray,
    radius_m: int,
) -> None:
    """Add NFI woodland into the shared arrays by true buffer-clipped area.

    Unlike the TOW centroid path, this clips each woodland polygon to each
    nearby postcode circle and adds only area(polygon ∩ circle); a large parcel
    therefore cannot saturate a postcode from mere centroid proximity, and a
    buffer-filling parcel whose centroid is outside the radius is not missed.

    Args:
        geoms: Parcel geometries for one batch (may contain missing entries).
        category: Per-parcel category values, aligned with ``geoms``.
        circles: Postcode buffer polygons; index == postcode index.
        tree: STRtree built over ``circles`` (same order).
        canopy_area: Per-postcode area accumulator, updated in place.
        feature_count: Per-postcode feature counter, updated in place.
        radius_m: Buffer radius. Unused here because ``circles`` already
            encode it; kept so existing callers do not change.
    """
    keep = (category == NFI_WOODLAND_VALUE) & ~shapely.is_missing(geoms)
    geoms = geoms[keep]
    if geoms.size:
        geoms = geoms[~shapely.is_empty(geoms)]
    if geoms.size == 0:
        return

    # The tree indexes the already-buffered circles, so a plain intersects
    # query yields exactly the (parcel, postcode) pairs that can have a
    # non-zero clipped area. A dwithin(radius) query against the circles would
    # also return parcels up to a further radius away, each costing a wasted
    # intersection that is then discarded by the positive-area filter below.
    nfi_index, postcode_index = tree.query(geoms, predicate="intersects")
    if nfi_index.size == 0:
        return

    clipped_area = shapely.area(
        shapely.intersection(geoms[nfi_index], circles[postcode_index])
    )
    # Pairs that only touch along a boundary contribute no area and must not
    # be counted as a feature either.
    positive = clipped_area > 0
    postcode_index = postcode_index[positive]
    clipped_area = clipped_area[positive]
    # np.add.at accumulates correctly when a postcode index repeats.
    np.add.at(canopy_area, postcode_index, clipped_area)
    np.add.at(feature_count, postcode_index, 1)
def _accumulate_nfi_metrics(
    dataset_path: str,
    circles: np.ndarray,
    tree: shapely.STRtree,
    canopy_area: np.ndarray,
    feature_count: np.ndarray,
    radius_m: int,
    batch_size: int,
    max_nfi_features: int | None,
) -> None:
    """Stream every NFI layer in Arrow batches into the shared accumulators.

    Only the category column and the geometry are read: the area credited to a
    postcode is clipped from its buffer rather than taken from the file.
    ``max_nfi_features`` caps the total number of rows read across all layers;
    ``canopy_area`` and ``feature_count`` are updated in place.
    """
    nfi_layers = _layers(dataset_path, None)
    print(f"Processing {len(nfi_layers)} NFI layer(s): {', '.join(nfi_layers)}")

    def _object_column(record_batch, field_names, field):
        # Materialise one Arrow column as a NumPy object array.
        position = field_names.index(field)
        return np.asarray(
            record_batch.column(position).to_numpy(zero_copy_only=False),
            dtype=object,
        )

    wanted_columns = [NFI_CATEGORY_COL]
    rows_read = 0
    for layer_name in nfi_layers:
        with pyogrio.open_arrow(
            dataset_path,
            layer=layer_name,
            columns=wanted_columns,
            batch_size=batch_size,
            use_pyarrow=True,
        ) as (meta, reader):
            for batch_number, record_batch in enumerate(reader, start=1):
                if max_nfi_features is not None:
                    budget = max_nfi_features - rows_read
                    if budget <= 0:
                        break
                    if record_batch.num_rows > budget:
                        # Trim the final batch so the cap is hit exactly.
                        record_batch = record_batch.slice(0, budget)

                rows_read += record_batch.num_rows
                field_names = record_batch.schema.names
                geometry_field = _geometry_column(meta, field_names)
                categories = _object_column(
                    record_batch, field_names, NFI_CATEGORY_COL
                )
                wkb_values = _object_column(record_batch, field_names, geometry_field)
                _add_nfi_batch(
                    shapely.from_wkb(wkb_values),
                    categories,
                    circles,
                    tree,
                    canopy_area,
                    feature_count,
                    radius_m,
                )
                if batch_number == 1 or batch_number % 25 == 0:
                    print(f"  NFI batch {batch_number:,}: {rows_read:,} rows read")
def _finalize_metrics(
points: pl.DataFrame,
canopy_area: np.ndarray,
feature_count: np.ndarray,
height_weighted_sum: np.ndarray,
height_weight: np.ndarray,
radius_m: int,
) -> pl.DataFrame:
n_points = points.height
density_col, area_col, count_col, height_col = _metric_columns(radius_m) density_col, area_col, count_col, height_col = _metric_columns(radius_m)
buffer_area = math.pi * radius_m * radius_m buffer_area = math.pi * radius_m * radius_m
density_pct = np.minimum(canopy_area / buffer_area * 100.0, 100.0) density_pct = np.minimum(canopy_area / buffer_area * 100.0, 100.0)
@ -518,6 +698,18 @@ def main() -> None:
action="store_true", action="store_true",
help="Read the geodatabase directly from the zip instead of extracting it", help="Read the geodatabase directly from the zip instead of extracting it",
) )
parser.add_argument(
"--nfi-zip",
type=Path,
default=Path("property-data/NFI_WOODLAND_ENGLAND.zip"),
help="Optional NFI woodland shapefile zip to union with TOW (skipped if absent)",
)
parser.add_argument(
"--nfi-extract-dir",
type=Path,
default=Path("property-data/nfi_woodland_england"),
help="Directory where the NFI zip is extracted",
)
parser.add_argument( parser.add_argument(
"--arcgis", "--arcgis",
type=Path, type=Path,
@ -554,11 +746,6 @@ def main() -> None:
default=50, default=50,
help="Radius around each postcode centroid used as the street-scale buffer", help="Radius around each postcode centroid used as the street-scale buffer",
) )
parser.add_argument(
"--tow-types",
default=",".join(DEFAULT_TOW_TYPES),
help='Comma-separated Woodland_Type values to include, or "all"',
)
parser.add_argument( parser.add_argument(
"--layers", "--layers",
default=None, default=None,
@ -588,6 +775,12 @@ def main() -> None:
default=None, default=None,
help="Testing only: process at most N TOW features per layer", help="Testing only: process at most N TOW features per layer",
) )
parser.add_argument(
"--max-nfi-features",
type=int,
default=None,
help="Testing only: process at most N NFI woodland features",
)
args = parser.parse_args() args = parser.parse_args()
if (args.output_streets or args.output_addresses) and args.price_paid is None: if (args.output_streets or args.output_addresses) and args.price_paid is None:
@ -600,18 +793,53 @@ def main() -> None:
args.tow_zip, args.extract_dir, args.force_extract, args.use_vsizip args.tow_zip, args.extract_dir, args.force_extract, args.use_vsizip
) )
points = _postcode_points(args.arcgis, args.max_postcodes) points = _postcode_points(args.arcgis, args.max_postcodes)
tow_types = _parse_csv_arg(args.tow_types)
layer_names = _parse_csv_arg(args.layers) layer_names = _parse_csv_arg(args.layers)
postcode_metrics = _accumulate_tree_metrics( n_points = points.height
canopy_area = np.zeros(n_points, dtype=np.float64)
feature_count = np.zeros(n_points, dtype=np.uint32)
height_weighted_sum = np.zeros(n_points, dtype=np.float64)
height_weight = np.zeros(n_points, dtype=np.float64)
_accumulate_tree_metrics(
dataset_path=dataset_path, dataset_path=dataset_path,
points=points, points=points,
radius_m=args.radius_m, radius_m=args.radius_m,
tow_types=tow_types,
batch_size=args.batch_size, batch_size=args.batch_size,
layer_names=layer_names, layer_names=layer_names,
max_features_per_layer=args.max_features_per_layer, max_features_per_layer=args.max_features_per_layer,
workers=args.workers, workers=args.workers,
canopy_area=canopy_area,
feature_count=feature_count,
height_weighted_sum=height_weighted_sum,
height_weight=height_weight,
)
if args.nfi_zip is not None and args.nfi_zip.exists():
nfi_path = _nfi_dataset_path(
args.nfi_zip, args.nfi_extract_dir, args.force_extract, args.use_vsizip
)
circles, nfi_tree = _postcode_buffers(points, args.radius_m)
_accumulate_nfi_metrics(
dataset_path=nfi_path,
circles=circles,
tree=nfi_tree,
canopy_area=canopy_area,
feature_count=feature_count,
radius_m=args.radius_m,
batch_size=args.batch_size,
max_nfi_features=args.max_nfi_features,
)
elif args.nfi_zip is not None:
print(f"NFI zip not found, skipping woodland union: {args.nfi_zip}")
postcode_metrics = _finalize_metrics(
points,
canopy_area,
feature_count,
height_weighted_sum,
height_weight,
args.radius_m,
) )
postcode_metrics = _with_postcode_density_percentiles( postcode_metrics = _with_postcode_density_percentiles(
postcode_metrics, args.radius_m postcode_metrics, args.radius_m

View file

@ -1,4 +1,4 @@
"""Build PMTiles polygon tiles for the Trees Outside Woodland overlay.""" """Build PMTiles polygon tiles for the Trees Outside Woodland + NFI overlay."""
from __future__ import annotations from __future__ import annotations
@ -16,10 +16,14 @@ from pyproj import Transformer
from pipeline.local_temp import local_tmp_dir from pipeline.local_temp import local_tmp_dir
from pipeline.transform.tree_density import ( from pipeline.transform.tree_density import (
DEFAULT_TOW_TYPES, NFI_AREA_HA_COL,
NFI_CATEGORY_COL,
NFI_TYPE_COL,
NFI_WOODLAND_VALUE,
_geometry_column,
_layers, _layers,
_nfi_dataset_path,
_tow_dataset_path, _tow_dataset_path,
_where_for_tow_types,
) )
@ -55,17 +59,13 @@ def _number_or_none(value) -> float | int | None:
def _write_tree_geojsonseq( def _write_tree_geojsonseq(
dataset_path: str, dataset_path: str,
output_path: Path, output_path: Path,
tow_types: tuple[str, ...],
batch_size: int, batch_size: int,
layer_names: tuple[str, ...] | None, layer_names: tuple[str, ...] | None,
max_features_per_layer: int | None, max_features_per_layer: int | None,
) -> int: ) -> int:
to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True) to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
where = _where_for_tow_types(tow_types)
layers = _layers(dataset_path, layer_names) layers = _layers(dataset_path, layer_names)
print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}") print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
if where:
print(f"TOW type filter: {where}")
columns = [ columns = [
"TOW_ID", "TOW_ID",
@ -88,7 +88,6 @@ def _write_tree_geojsonseq(
dataset_path, dataset_path,
layer=layer, layer=layer,
columns=columns, columns=columns,
where=where,
batch_size=batch_size, batch_size=batch_size,
use_pyarrow=True, use_pyarrow=True,
) as (_meta, reader): ) as (_meta, reader):
@ -136,6 +135,7 @@ def _write_tree_geojsonseq(
for idx, geometry_json in zip(valid_indexes, geometries_json): for idx, geometry_json in zip(valid_indexes, geometries_json):
properties = { properties = {
"source": "tow",
"tow_id": str(tow_id[idx]) if tow_id is not None else "", "tow_id": str(tow_id[idx]) if tow_id is not None else "",
"woodland_type": ( "woodland_type": (
str(woodland_type[idx]) str(woodland_type[idx])
@ -176,11 +176,105 @@ def _write_tree_geojsonseq(
return feature_count return feature_count
def _append_nfi_geojsonseq(
    dataset_path: str,
    output_path: Path,
    batch_size: int,
    max_nfi_features: int | None,
) -> int:
    """Append NFI woodland polygons to the same GeoJSONSeq as the TOW features.

    Rows whose category is not woodland, and woodland rows whose geometry is
    missing or empty, are skipped. Each written feature carries the same
    property keys as the TOW features, with ``source`` set to ``"nfi"`` and the
    TOW-only fields left empty/null.

    Args:
        dataset_path: NFI dataset path.
        output_path: Existing newline-delimited GeoJSON file to append to.
        batch_size: Arrow batch size used when streaming the dataset.
        max_nfi_features: Optional cap on rows read across all layers.

    Returns:
        Number of features appended.
    """
    to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
    layers = _layers(dataset_path, None)
    print(f"Processing {len(layers)} NFI layer(s): {', '.join(layers)}")

    columns = [NFI_CATEGORY_COL, NFI_TYPE_COL, NFI_AREA_HA_COL]
    feature_count = 0
    features_seen = 0
    with output_path.open("a") as file:
        for layer in layers:
            with pyogrio.open_arrow(
                dataset_path,
                layer=layer,
                columns=columns,
                batch_size=batch_size,
                use_pyarrow=True,
            ) as (meta, reader):
                for batch in reader:
                    if max_nfi_features is not None:
                        remaining = max_nfi_features - features_seen
                        if remaining <= 0:
                            break
                        if batch.num_rows > remaining:
                            batch = batch.slice(0, remaining)

                    features_seen += batch.num_rows
                    names = batch.schema.names
                    geometry_column = _geometry_column(meta, names)
                    category = np.asarray(
                        batch.column(names.index(NFI_CATEGORY_COL)).to_numpy(
                            zero_copy_only=False
                        ),
                        dtype=object,
                    )
                    geometry = np.asarray(
                        batch.column(names.index(geometry_column)).to_numpy(
                            zero_copy_only=False
                        ),
                        dtype=object,
                    )

                    valid_indexes = np.flatnonzero(category == NFI_WOODLAND_VALUE)
                    if valid_indexes.size == 0:
                        continue

                    geometries = shapely.from_wkb(geometry[valid_indexes])
                    # A null geometry survives from_wkb/to_geojson as None and
                    # would crash json.loads below; an empty one draws nothing.
                    usable = ~(
                        shapely.is_missing(geometries) | shapely.is_empty(geometries)
                    )
                    if not usable.any():
                        continue
                    valid_indexes = valid_indexes[usable]
                    geometries = geometries[usable]

                    woodland_type = _column_or_none(batch, names, NFI_TYPE_COL)
                    area_ha = _column_or_none(batch, names, NFI_AREA_HA_COL)
                    geometries = shapely.transform(
                        geometries,
                        to_wgs84.transform,
                        interleaved=False,
                    )
                    geometries_json = shapely.to_geojson(geometries)

                    for idx, geometry_json in zip(valid_indexes, geometries_json):
                        raw_area_ha = area_ha[idx] if area_ha is not None else None
                        # Guard the multiply: a null area must become null,
                        # not a TypeError.
                        area_sqm = (
                            _number_or_none(raw_area_ha * 10000.0)
                            if raw_area_ha is not None
                            else None
                        )
                        properties = {
                            "source": "nfi",
                            "tow_id": "",
                            "woodland_type": (
                                str(woodland_type[idx])
                                if woodland_type is not None
                                else ""
                            ),
                            "area_sqm": area_sqm,
                            "mean_height_m": None,
                            "min_height_m": None,
                            "max_height_m": None,
                            "lidar_year": None,
                            "source_layer": layer,
                        }
                        feature = {
                            "type": "Feature",
                            "geometry": json.loads(geometry_json),
                            "properties": properties,
                        }
                        file.write(json.dumps(feature, separators=(",", ":")) + "\n")
                        feature_count += 1

    return feature_count
def build_tree_overlay_tiles( def build_tree_overlay_tiles(
tow_zip: Path, tow_zip: Path,
output_path: Path, output_path: Path,
extract_dir: Path, extract_dir: Path,
tow_types: tuple[str, ...],
batch_size: int, batch_size: int,
layer_names: tuple[str, ...] | None, layer_names: tuple[str, ...] | None,
max_features_per_layer: int | None, max_features_per_layer: int | None,
@ -188,6 +282,9 @@ def build_tree_overlay_tiles(
max_zoom: int, max_zoom: int,
force_extract: bool, force_extract: bool,
use_vsizip: bool, use_vsizip: bool,
nfi_zip: Path | None = None,
nfi_extract_dir: Path = Path("property-data/nfi_woodland_england"),
max_nfi_features: int | None = None,
) -> None: ) -> None:
tippecanoe = _require_tippecanoe() tippecanoe = _require_tippecanoe()
dataset_path = _tow_dataset_path(tow_zip, extract_dir, force_extract, use_vsizip) dataset_path = _tow_dataset_path(tow_zip, extract_dir, force_extract, use_vsizip)
@ -198,13 +295,26 @@ def build_tree_overlay_tiles(
feature_count = _write_tree_geojsonseq( feature_count = _write_tree_geojsonseq(
dataset_path, dataset_path,
ndjson_path, ndjson_path,
tow_types,
batch_size, batch_size,
layer_names, layer_names,
max_features_per_layer, max_features_per_layer,
) )
print(f"Writing {feature_count:,} TOW polygon features") print(f"Writing {feature_count:,} TOW polygon features")
if nfi_zip is not None and nfi_zip.exists():
nfi_path = _nfi_dataset_path(
nfi_zip, nfi_extract_dir, force_extract, use_vsizip
)
nfi_count = _append_nfi_geojsonseq(
nfi_path,
ndjson_path,
batch_size,
max_nfi_features,
)
print(f"Writing {nfi_count:,} NFI woodland polygon features")
elif nfi_zip is not None:
print(f"NFI zip not found, skipping woodland union: {nfi_zip}")
subprocess.run( subprocess.run(
[ [
tippecanoe, tippecanoe,
@ -237,26 +347,32 @@ def main() -> None:
default=Path("property-data/fr_tow_v1_all"), default=Path("property-data/fr_tow_v1_all"),
help="Directory used to extract the FileGDB", help="Directory used to extract the FileGDB",
) )
parser.add_argument(
"--tow-type",
action="append",
dest="tow_types",
help="Woodland_Type to include; repeatable. Defaults to TOW outside-woodland classes.",
)
parser.add_argument("--batch-size", type=int, default=50_000) parser.add_argument("--batch-size", type=int, default=50_000)
parser.add_argument("--layer", action="append", dest="layers") parser.add_argument("--layer", action="append", dest="layers")
parser.add_argument("--max-features-per-layer", type=int) parser.add_argument("--max-features-per-layer", type=int)
parser.add_argument("--min-zoom", type=int, default=15) parser.add_argument("--min-zoom", type=int, default=12)
parser.add_argument("--max-zoom", type=int, default=17) parser.add_argument("--max-zoom", type=int, default=17)
parser.add_argument("--force-extract", action="store_true") parser.add_argument("--force-extract", action="store_true")
parser.add_argument("--use-vsizip", action="store_true") parser.add_argument("--use-vsizip", action="store_true")
parser.add_argument(
"--nfi-zip",
type=Path,
default=None,
help="Optional NFI woodland shapefile zip to union into the overlay",
)
parser.add_argument(
"--nfi-extract-dir",
type=Path,
default=Path("property-data/nfi_woodland_england"),
help="Directory used to extract the NFI zip",
)
parser.add_argument("--max-nfi-features", type=int)
args = parser.parse_args() args = parser.parse_args()
build_tree_overlay_tiles( build_tree_overlay_tiles(
tow_zip=args.tow_zip, tow_zip=args.tow_zip,
output_path=args.output, output_path=args.output,
extract_dir=args.extract_dir, extract_dir=args.extract_dir,
tow_types=tuple(args.tow_types or DEFAULT_TOW_TYPES),
batch_size=args.batch_size, batch_size=args.batch_size,
layer_names=tuple(args.layers) if args.layers else None, layer_names=tuple(args.layers) if args.layers else None,
max_features_per_layer=args.max_features_per_layer, max_features_per_layer=args.max_features_per_layer,
@ -264,6 +380,9 @@ def main() -> None:
max_zoom=args.max_zoom, max_zoom=args.max_zoom,
force_extract=args.force_extract, force_extract=args.force_extract,
use_vsizip=args.use_vsizip, use_vsizip=args.use_vsizip,
nfi_zip=args.nfi_zip,
nfi_extract_dir=args.nfi_extract_dir,
max_nfi_features=args.max_nfi_features,
) )

View file

@ -3,6 +3,7 @@
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import json
import sys import sys
import zipfile import zipfile
from pathlib import Path from pathlib import Path
@ -76,6 +77,24 @@ def _split_glob(spec: str) -> tuple[Path, str]:
return Path(base), pattern return Path(base), pattern
def _split_pair(spec: str, label: str) -> tuple[Path, Path]:
if "::" not in spec:
raise argparse.ArgumentTypeError(
f"{spec!r} must use LEFT::RIGHT for {label}"
)
left, right = spec.split("::", 1)
if not left or not right:
raise argparse.ArgumentTypeError(f"{spec!r} must include both paths")
return Path(left), Path(right)
def _canonical_postcode(value: object) -> str:
compact = "".join(str(value).split()).upper()
if len(compact) >= 5:
return f"{compact[:-3]} {compact[-3:]}"
return compact
def _matched_files(spec: str) -> tuple[Path, str, list[Path]]: def _matched_files(spec: str) -> tuple[Path, str, list[Path]]:
base, pattern = _split_glob(spec) base, pattern = _split_glob(spec)
if not base.exists(): if not base.exists():
@ -105,6 +124,79 @@ def _failures_for_zip_glob(spec: str) -> list[str]:
return failures return failures
def _postcode_column(columns: list[str]) -> str | None:
for name in ("postcode", "Postcode", "pcds", "PCDS"):
if name in columns:
return name
return None
def _parquet_postcodes(path: Path) -> set[str]:
    """Return the distinct canonical postcodes stored in a parquet file.

    Null values are dropped, and values that canonicalise to an empty string
    are discarded.

    Raises:
        ValueError: If the file has no recognised postcode column.
    """
    frame = pl.scan_parquet(path)
    column = _postcode_column(frame.collect_schema().names())
    if column is None:
        raise ValueError(f"{path}: missing postcode column")
    values = (
        frame.select(pl.col(column).drop_nulls().unique())
        .collect()
        .get_column(column)
        .to_list()
    )
    # Canonicalise each value once, then drop the empty results.
    canonical = (_canonical_postcode(value) for value in values)
    return {postcode for postcode in canonical if postcode}
def _boundary_postcodes(path: Path) -> set[str]:
    """Collect canonical postcodes from the boundary GeoJSON files under path.

    Files are read from ``path/units`` when that directory exists, otherwise
    from ``path`` itself (non-recursive, ``*.geojson`` only). Each feature's
    ``postcodes`` property is canonicalised; features without properties or
    without that property are ignored.
    """
    nested = path / "units"
    search_dir = nested if nested.is_dir() else path
    found: set[str] = set()
    for geojson_file in sorted(search_dir.glob("*.geojson")):
        with geojson_file.open("r", encoding="utf-8") as stream:
            collection = json.load(stream)
        for feature in collection.get("features", []):
            raw = (feature.get("properties") or {}).get("postcodes")
            if raw is None:
                continue
            canonical = _canonical_postcode(raw)
            if canonical:
                found.add(canonical)
    return found
def _sample(values: set[str]) -> str:
return ", ".join(sorted(values)[:10])
def _failures_for_postcode_boundary_match(spec: str) -> list[str]:
    """Check that a parquet's postcodes exactly match the boundary GeoJSON set.

    Args:
        spec: ``PARQUET::DIR`` pair naming the postcode parquet and the
            boundary directory.

    Returns:
        Human-readable failure messages; empty when the two postcode sets are
        identical. A malformed spec, an unreadable input, or an error while
        loading either side is reported as a failure rather than raised.
    """
    try:
        parquet_path, boundaries_path = _split_pair(spec, "postcode boundary matching")
    except argparse.ArgumentTypeError as exc:
        # The spec is parsed after argparse has finished, so this error would
        # otherwise escape as a traceback instead of a validation failure.
        return [f"--postcode-boundary-match: {exc}"]

    failures = _failures_for_parquet(parquet_path) + _failures_for_dir(boundaries_path)
    if failures:
        return failures

    try:
        parquet_postcodes = _parquet_postcodes(parquet_path)
        boundary_postcodes = _boundary_postcodes(boundaries_path)
    except Exception as exc:
        # Validation boundary: surface any load problem as a failure line.
        return [f"{parquet_path} / {boundaries_path}: postcode match check failed: {exc}"]

    failures = []
    if not boundary_postcodes:
        failures.append(f"{boundaries_path}: no boundary postcodes found")

    missing_boundaries = parquet_postcodes - boundary_postcodes
    orphan_boundaries = boundary_postcodes - parquet_postcodes
    if missing_boundaries:
        failures.append(
            f"{boundaries_path}: {len(missing_boundaries):,} postcodes from {parquet_path} "
            f"are missing boundaries; sample: {_sample(missing_boundaries)}"
        )
    if orphan_boundaries:
        failures.append(
            f"{boundaries_path}: {len(orphan_boundaries):,} boundary postcodes are absent from "
            f"{parquet_path}; sample: {_sample(orphan_boundaries)}"
        )
    return failures
def main() -> int: def main() -> int:
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--file", action="append", default=[], type=Path) parser.add_argument("--file", action="append", default=[], type=Path)
@ -123,6 +215,12 @@ def main() -> int:
default=[], default=[],
help="Require at least one readable zip matching BASE::PATTERN", help="Require at least one readable zip matching BASE::PATTERN",
) )
parser.add_argument(
"--postcode-boundary-match",
action="append",
default=[],
help="Require postcode parquet keys to exactly match boundary GeoJSON postcodes: PARQUET::DIR",
)
args = parser.parse_args() args = parser.parse_args()
failures: list[str] = [] failures: list[str] = []
@ -138,6 +236,8 @@ def main() -> int:
failures.extend(_failures_for_glob(spec)) failures.extend(_failures_for_glob(spec))
for spec in args.zip_glob: for spec in args.zip_glob:
failures.extend(_failures_for_zip_glob(spec)) failures.extend(_failures_for_zip_glob(spec))
for spec in args.postcode_boundary_match:
failures.extend(_failures_for_postcode_boundary_match(spec))
if failures: if failures:
print("Output validation failed:", file=sys.stderr) print("Output validation failed:", file=sys.stderr)