scraping and data
This commit is contained in:
parent
d98819b569
commit
8688b7475e
43 changed files with 4920 additions and 531 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -22,6 +22,8 @@ video/auth.*
|
||||||
*.jpeg
|
*.jpeg
|
||||||
*.mp4
|
*.mp4
|
||||||
|
|
||||||
|
**/*.log
|
||||||
|
|
||||||
r5-java/tmp
|
r5-java/tmp
|
||||||
property-data
|
property-data
|
||||||
property-data2
|
property-data2
|
||||||
|
|
|
||||||
25
finder/Dockerfile
Normal file
25
finder/Dockerfile
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
# Finder scraper image. Runs via docker-compose sharing the media_gluetun VPN
|
||||||
|
# network namespace; the source tree is bind-mounted at runtime, so this image
|
||||||
|
# only needs the Python deps. The venv lives OUTSIDE the bind-mount target
|
||||||
|
# (/opt/venv) so the mount doesn't shadow it.
|
||||||
|
FROM python:3.12-slim
|
||||||
|
|
||||||
|
ENV UV_PROJECT_ENVIRONMENT=/opt/venv \
|
||||||
|
UV_COMPILE_BYTECODE=1 \
|
||||||
|
UV_LINK_MODE=copy \
|
||||||
|
PYTHONUNBUFFERED=1
|
||||||
|
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y --no-install-recommends ca-certificates curl \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
||||||
|
|
||||||
|
WORKDIR /app/finder
|
||||||
|
|
||||||
|
# Install dependencies into /opt/venv (cached layer; project code is mounted at runtime).
|
||||||
|
COPY pyproject.toml uv.lock ./
|
||||||
|
RUN uv sync --no-install-project --frozen
|
||||||
|
|
||||||
|
# Source is bind-mounted over /app/finder by compose. `uv run` uses /opt/venv.
|
||||||
|
CMD ["sleep", "infinity"]
|
||||||
|
|
@ -6,7 +6,9 @@ REPO_DIR = FINDER_DIR.parent
|
||||||
|
|
||||||
DATA_DIR = Path(os.environ.get("DATA_DIR", str(FINDER_DIR / "data")))
|
DATA_DIR = Path(os.environ.get("DATA_DIR", str(FINDER_DIR / "data")))
|
||||||
ARCGIS_PATH = Path(
|
ARCGIS_PATH = Path(
|
||||||
os.environ.get("ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet"))
|
os.environ.get(
|
||||||
|
"ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet")
|
||||||
|
)
|
||||||
)
|
)
|
||||||
PAGE_SIZE = 24
|
PAGE_SIZE = 24
|
||||||
DELAY_BETWEEN_PAGES = 0.3
|
DELAY_BETWEEN_PAGES = 0.3
|
||||||
|
|
@ -19,6 +21,19 @@ MAX_BEDROOMS = 20 # sanity cap — values above this are almost certainly parsi
|
||||||
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
|
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
|
||||||
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
|
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
|
||||||
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
|
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
|
||||||
|
# Detail page (plain HTTPS GET, no Cloudflare). Its window.__PAGE_MODEL embeds
|
||||||
|
# propertyData.address.{outcode,incode}, which together form the property's TRUE
|
||||||
|
# full postcode — the search API only exposes the outcode. {id} is the numeric
|
||||||
|
# listing id from the search response.
|
||||||
|
RIGHTMOVE_DETAIL_URL = "https://www.rightmove.co.uk/properties/{id}"
|
||||||
|
|
||||||
|
# The Rightmove search API gives only an outcode-level display address, so the
|
||||||
|
# true full postcode is recovered from each listing's detail page (see
|
||||||
|
# finder/rightmove.py::parse_detail_postcode). One extra GET per listing is a
|
||||||
|
# big increase in request volume over the ~1000-result-per-outcode search, so detail
|
||||||
|
# fetching is gated and capped per outcode (mirrors ZOOPLA_* below). Default ON.
|
||||||
|
RIGHTMOVE_FETCH_DETAILS = True # fetch detail pages for true per-listing postcodes
|
||||||
|
RIGHTMOVE_MAX_DETAILS_PER_OUTCODE = 4000 # max detail-page fetches per outcode
|
||||||
|
|
||||||
# OnTheMarket
|
# OnTheMarket
|
||||||
ONTHEMARKET_BASE = "https://www.onthemarket.com"
|
ONTHEMARKET_BASE = "https://www.onthemarket.com"
|
||||||
|
|
@ -26,6 +41,41 @@ ONTHEMARKET_BASE = "https://www.onthemarket.com"
|
||||||
# Zoopla
|
# Zoopla
|
||||||
ZOOPLA_BASE = "https://www.zoopla.co.uk"
|
ZOOPLA_BASE = "https://www.zoopla.co.uk"
|
||||||
|
|
||||||
|
# Zoopla search cards only carry an outcode-level address, so the full postcode
|
||||||
|
# and precise coordinates are scraped from each listing's detail page. These
|
||||||
|
# bound that extra work (see finder/zoopla.py and finder/scraper.py).
|
||||||
|
ZOOPLA_FETCH_DETAILS = True # fetch detail pages for precise per-listing postcodes
|
||||||
|
ZOOPLA_MAX_DETAILS_PER_OUTCODE = 4000 # max detail-page fetches per outcode
|
||||||
|
# NOTE(review): 1500000 ms is 25 minutes for a single detail-page navigation.
# Confirm this is intended — a smaller value such as 15000 may have been meant.
ZOOPLA_DETAIL_GOTO_TIMEOUT_MS = 1500000 # per detail-page navigation timeout
|
||||||
|
# Fraction of a single outcode's wall-clock budget (ZOOPLA_OUTCODE_TIMEOUT_SECONDS)
|
||||||
|
# spent fetching details; the remainder is reserved for search pagination so
|
||||||
|
# detail fetches can never trip the timeout and discard collected listings.
|
||||||
|
ZOOPLA_DETAIL_BUDGET_FRACTION = 0.6
|
||||||
|
|
||||||
|
# Gluetun VPN. Network endpoints are env-overridable because they are
|
||||||
|
# deployment-specific: when finder runs in a SEPARATE container they use the
|
||||||
|
# `gluetun` hostname (defaults below); when finder SHARES gluetun's network
|
||||||
|
# namespace (docker-compose.yml, network_mode container:media_gluetun) they
|
||||||
|
# become localhost and GLUETUN_PROXY is empty (the shared netns already tunnels
|
||||||
|
# all traffic, so no HTTP proxy is needed).
|
||||||
|
# GLUETUN_PROXY="" (empty) => direct connection (no proxy); used in shared-netns.
|
||||||
|
GLUETUN_PROXY = os.environ.get("GLUETUN_PROXY", "http://gluetun:8888") or None
|
||||||
|
GLUETUN_CONTROL_URL = os.environ.get("GLUETUN_CONTROL_URL", "http://gluetun:8000")
|
||||||
|
# SECURITY: this Gluetun API key is committed to source control, so it should be
# treated as compromised. Rotate it, supply the new value through the
# GLUETUN_API_KEY environment variable, and then delete the literal fallback
# below. The fallback is kept only so existing deployments keep working.
GLUETUN_API_KEY = os.environ.get("GLUETUN_API_KEY", "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K")
|
||||||
|
# Egress-IP rotations to try per Cloudflare challenge. Keep at 0 for Zoopla:
|
||||||
|
# rotating among Gluetun's datacenter IPs doesn't clear Cloudflare and would
|
||||||
|
# rotate away from the IP a cleared Cloudflare session was bound to, voiding it.
|
||||||
|
# Raise only with residential IPs where rotation helps.
|
||||||
|
GLUETUN_MAX_ROTATIONS = 0 # max egress-IP rotations per Cloudflare challenge
|
||||||
|
|
||||||
|
# Zoopla fetcher: "flaresolverr" (default) solves Cloudflare via the FlareSolverr
|
||||||
|
# sidecar (docker-compose.yml) and needs no display/VNC — verified to return the
|
||||||
|
# RSC flight stream with postcode + coordinates; "camoufox" drives a local
|
||||||
|
# anti-fingerprint browser (needs an interactive solve on datacenter IPs).
|
||||||
|
ZOOPLA_FETCHER = os.environ.get("ZOOPLA_FETCHER", "flaresolverr")
|
||||||
|
FLARESOLVERR_URL = os.environ.get("FLARESOLVERR_URL", "http://gluetun:8191/v1")
|
||||||
|
FLARESOLVERR_MAX_TIMEOUT_MS = 120000 # per-request solve budget; first solve is slow
|
||||||
|
|
||||||
# Greater London-ish postcode areas. This intentionally uses broad area
|
# Greater London-ish postcode areas. This intentionally uses broad area
|
||||||
# prefixes so a manual scrape can include central/inner London plus common
|
# prefixes so a manual scrape can include central/inner London plus common
|
||||||
# outer-London and near-London outcodes without maintaining a long borough list.
|
# outer-London and near-London outcodes without maintaining a long borough list.
|
||||||
|
|
|
||||||
57
finder/docker-compose.yml
Normal file
57
finder/docker-compose.yml
Normal file
|
|
@ -0,0 +1,57 @@
|
||||||
|
# Finder scraper + FlareSolverr, both sharing the EXISTING media_gluetun VPN
|
||||||
|
# container's network namespace. Everything egresses through the VPN, and
|
||||||
|
# FlareSolverr solves Zoopla's Cloudflare automatically (no VNC needed).
|
||||||
|
#
|
||||||
|
# Prerequisites:
|
||||||
|
# - The `media_gluetun` container (qmcgaw/gluetun) is running on this host.
|
||||||
|
# It is managed by a different compose; it is referenced here as external
|
||||||
|
# via network_mode "container:media_gluetun".
|
||||||
|
# - Because these services share gluetun's netns, they reach each other and
|
||||||
|
# gluetun on localhost (flaresolverr :8191, gluetun control :8000) and need
|
||||||
|
# NO published ports (which is exactly why this avoids the dev-container
|
||||||
|
# port-forwarding pain).
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# cd finder
|
||||||
|
# docker compose up -d --build flaresolverr finder # start the sidecars
|
||||||
|
# docker compose exec finder uv run python main.py --source zoopla --outcodes SW9 --test
|
||||||
|
# docker compose exec finder uv run python main.py --source all # full run
|
||||||
|
# docker compose down
|
||||||
|
#
|
||||||
|
# NOTE: a manually-started `finder_flaresolverr` container from testing must be
|
||||||
|
# removed first (`docker rm -f finder_flaresolverr`) to avoid a name clash.
|
||||||
|
|
||||||
|
services:
|
||||||
|
flaresolverr:
|
||||||
|
image: ghcr.io/flaresolverr/flaresolverr:latest
|
||||||
|
container_name: finder_flaresolverr
|
||||||
|
network_mode: "container:media_gluetun"
|
||||||
|
environment:
|
||||||
|
LOG_LEVEL: info
|
||||||
|
TZ: Europe/London
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
finder:
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
image: finder-scraper:latest
|
||||||
|
container_name: finder_scraper
|
||||||
|
network_mode: "container:media_gluetun"
|
||||||
|
depends_on:
|
||||||
|
- flaresolverr
|
||||||
|
volumes:
|
||||||
|
- .:/app/finder # live-mounted finder source
|
||||||
|
- ../property-data:/app/property-data:ro # ARCGIS postcode data
|
||||||
|
working_dir: /app/finder
|
||||||
|
environment:
|
||||||
|
# Shared netns: sidecars are on localhost, and the netns already tunnels
|
||||||
|
# all traffic through the VPN, so no HTTP proxy is used.
|
||||||
|
ZOOPLA_FETCHER: flaresolverr
|
||||||
|
FLARESOLVERR_URL: http://localhost:8191/v1
|
||||||
|
GLUETUN_CONTROL_URL: http://localhost:8000
|
||||||
|
GLUETUN_PROXY: "" # empty => direct (shared netns already tunnels)
|
||||||
|
DATA_DIR: /app/finder/data
|
||||||
|
ARCGIS_PATH: /app/property-data/arcgis_data.parquet
|
||||||
|
restart: "no"
|
||||||
|
command: ["sleep", "infinity"] # stays up; run scrapes via `docker compose exec`
|
||||||
91
finder/flaresolverr.py
Normal file
91
finder/flaresolverr.py
Normal file
|
|
@ -0,0 +1,91 @@
|
||||||
|
"""FlareSolverr client — fetch Cloudflare-protected pages as rendered HTML.
|
||||||
|
|
||||||
|
FlareSolverr (https://github.com/FlareSolverr/FlareSolverr) drives an
|
||||||
|
undetected browser to pass Cloudflare's challenge and returns the fully
|
||||||
|
rendered HTML. It runs as a sidecar service (see docker-compose.yml) sharing
|
||||||
|
the Gluetun VPN network namespace, so its browser egresses through the VPN.
|
||||||
|
|
||||||
|
Verified working against Zoopla's managed Turnstile on a datacenter VPN IP,
|
||||||
|
provided a reused session and a generous maxTimeout (~120s) — the first
|
||||||
|
challenge solve is slow, subsequent requests on the warm session are fast.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from constants import FLARESOLVERR_MAX_TIMEOUT_MS, FLARESOLVERR_URL
|
||||||
|
|
||||||
|
log = logging.getLogger("flaresolverr")
|
||||||
|
|
||||||
|
|
||||||
|
class FlareSolverrError(Exception):
    """Signal that FlareSolverr failed to fetch or solve a requested URL."""
|
||||||
|
|
||||||
|
|
||||||
|
class FlareSolverrSession:
    """A reusable FlareSolverr browser session (context manager).

    Reusing one session keeps the cleared Cloudflare cookies warm across
    requests, so only the first fetch pays the full challenge-solve cost.

    Entering the context destroys any stale session with the same name and
    creates a fresh one; leaving it destroys the session and closes the
    underlying HTTP client, so an instance is single-use.
    """

    def __init__(
        self,
        url: str = FLARESOLVERR_URL,
        session: str = "finder",
        max_timeout_ms: int = FLARESOLVERR_MAX_TIMEOUT_MS,
    ) -> None:
        """Prepare the client.

        Args:
            url: FlareSolverr API endpoint that commands are POSTed to.
            session: Name of the FlareSolverr session to create and reuse.
            max_timeout_ms: Per-request solve budget sent as ``maxTimeout``.
        """
        self._url = url
        self._session = session
        self._max_timeout = max_timeout_ms
        # Read timeout must comfortably exceed maxTimeout (FlareSolverr blocks
        # for up to maxTimeout while solving before responding).
        self._client = httpx.Client(timeout=httpx.Timeout(self._max_timeout / 1000 + 30))
        self._active = False

    def _post(self, payload: dict) -> dict:
        """POST one command to FlareSolverr and return its JSON reply.

        Raises:
            FlareSolverrError: on transport/HTTP failure, a body that is not a
                JSON object, or a reply whose ``status`` is not ``"ok"``.
        """
        try:
            resp = self._client.post(self._url, json=payload)
            resp.raise_for_status()
            data = resp.json()
        except (httpx.HTTPError, ValueError) as exc:
            raise FlareSolverrError(
                f"FlareSolverr request to {self._url} failed: {exc}"
            ) from exc
        # A JSON array/scalar would otherwise blow up with AttributeError on
        # ``.get`` and escape callers that only handle FlareSolverrError.
        if not isinstance(data, dict):
            raise FlareSolverrError(
                f"FlareSolverr {payload.get('cmd')} failed: unexpected response {data!r}"
            )
        if data.get("status") != "ok":
            raise FlareSolverrError(
                f"FlareSolverr {payload.get('cmd')} failed: {data.get('message')}"
            )
        return data

    def __enter__(self) -> "FlareSolverrSession":
        """Create a fresh session and return ``self``.

        Raises:
            FlareSolverrError: if the session cannot be created.
        """
        # Start from a clean session (ignore destroy errors for a fresh name).
        try:
            self._post({"cmd": "sessions.destroy", "session": self._session})
        except FlareSolverrError:
            pass
        try:
            self._post({"cmd": "sessions.create", "session": self._session})
        except FlareSolverrError:
            # __exit__ is never invoked when __enter__ raises, so release the
            # HTTP client here instead of leaking its connection pool.
            self._client.close()
            raise
        self._active = True
        log.info("FlareSolverr session %r ready at %s", self._session, self._url)
        return self

    def get(self, url: str) -> str:
        """Fetch a URL through FlareSolverr; return the solved HTML.

        Returns an empty string when the reply carries no response body.

        Raises:
            FlareSolverrError: if the request fails or is not solved.
        """
        data = self._post(
            {
                "cmd": "request.get",
                "session": self._session,
                "url": url,
                "maxTimeout": self._max_timeout,
            }
        )
        solution = data.get("solution") or {}
        return solution.get("response", "") or ""

    def __exit__(self, *exc_info) -> None:
        """Destroy the session (best effort) and always close the client."""
        try:
            if self._active:
                self._active = False
                try:
                    self._post({"cmd": "sessions.destroy", "session": self._session})
                except FlareSolverrError as exc:
                    log.debug("FlareSolverr session destroy failed: %s", exc)
        finally:
            # Close even if the destroy call raised something unexpected.
            self._client.close()
|
||||||
53
finder/gdal-ecw/Dockerfile
Normal file
53
finder/gdal-ecw/Dockerfile
Normal file
|
|
@ -0,0 +1,53 @@
|
||||||
|
# GDAL with ECW (read) support, for decoding Environment Agency Vertical Aerial
|
||||||
|
# Photography in the satellite-highres pipeline (pipeline/download/satellite_highres.py).
|
||||||
|
#
|
||||||
|
# EA VAP ships as ECW **v2** rasters, which are readable by the open-source
|
||||||
|
# libecwj2 3.3 SDK -- the same library the official OSGeo image uses when built
|
||||||
|
# with WITH_ECW=yes. We therefore avoid the proprietary, login-gated Hexagon
|
||||||
|
# ERDAS ECW/JP2 SDK (which is only needed for ECW v3) and its licensing
|
||||||
|
# restrictions entirely.
|
||||||
|
#
|
||||||
|
# We build only the ECW driver as a GDAL *plugin* on top of the official runtime
|
||||||
|
# image (no full GDAL rebuild). The plugin's GDAL sources are pinned to the exact
|
||||||
|
# commit reported by the base image so libgdal and the plugin stay ABI-compatible.
|
||||||
|
#
|
||||||
|
# Build: docker build -t perfect-postcode/gdal-ecw:latest finder/gdal-ecw
|
||||||
|
# Verify: docker run --rm perfect-postcode/gdal-ecw:latest gdalinfo --formats | grep -i ECW
|
||||||
|
|
||||||
|
FROM ghcr.io/osgeo/gdal:ubuntu-full-latest
|
||||||
|
|
||||||
|
ARG LIBECWJ2_URL=https://github.com/rouault/libecwj2-3.3-builds/releases/download/v1/install-libecwj2-3.3-ubuntu-20.04.tar.gz
|
||||||
|
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
cmake g++ make git curl ca-certificates \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Open-source ECW v2 SDK (extracts to /opt/libecwj2-3.3) + make its libs loadable.
|
||||||
|
RUN curl --retry 3 --retry-all-errors --retry-delay 3 -fsSL -o /tmp/libecwj2.tar.gz "$LIBECWJ2_URL" \
|
||||||
|
&& tar -C / -xzf /tmp/libecwj2.tar.gz \
|
||||||
|
&& rm -f /tmp/libecwj2.tar.gz \
|
||||||
|
&& (cd /opt/libecwj2-3.3/lib && for so in *.so*; do \
|
||||||
|
ln -sf "/opt/libecwj2-3.3/lib/$so" "/usr/lib/x86_64-linux-gnu/$so"; \
|
||||||
|
done) \
|
||||||
|
&& ldconfig
|
||||||
|
|
||||||
|
# Build the ECW driver plugin against the base image's exact GDAL sources.
|
||||||
|
RUN set -eux; \
|
||||||
|
GDAL_COMMIT="$(gdalinfo --version | sed -nE 's/.*-([0-9a-f]{8,}).*/\1/p')"; \
|
||||||
|
test -n "$GDAL_COMMIT"; \
|
||||||
|
echo "Building ECW plugin for GDAL commit ${GDAL_COMMIT}"; \
|
||||||
|
mkdir -p /tmp/gdal && cd /tmp/gdal && git init -q; \
|
||||||
|
git fetch --depth 1 -q https://github.com/OSGeo/gdal.git "$GDAL_COMMIT"; \
|
||||||
|
git checkout -q FETCH_HEAD; \
|
||||||
|
cmake -S frmts/ecw -B /tmp/ecw-build \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
|
-DCMAKE_PREFIX_PATH=/usr \
|
||||||
|
-DECW_ROOT=/opt/libecwj2-3.3; \
|
||||||
|
cmake --build /tmp/ecw-build -j"$(nproc)"; \
|
||||||
|
PLUGIN_DIR=/usr/lib/x86_64-linux-gnu/gdalplugins; \
|
||||||
|
mkdir -p "$PLUGIN_DIR"; \
|
||||||
|
find /tmp/ecw-build -name 'gdal_ECW*.so' -exec cp {} "$PLUGIN_DIR/" \; ; \
|
||||||
|
rm -rf /tmp/gdal /tmp/ecw-build
|
||||||
|
|
||||||
|
# Fail the build if the driver is not actually available.
|
||||||
|
RUN gdalinfo --formats | grep -iq 'ECW.*rw' && echo "ECW driver OK"
|
||||||
|
|
@ -5,7 +5,7 @@ import time
|
||||||
import httpx
|
import httpx
|
||||||
from fake_useragent import UserAgent
|
from fake_useragent import UserAgent
|
||||||
|
|
||||||
from constants import MAX_RETRIES, RETRY_BASE_DELAY
|
from constants import GLUETUN_PROXY, MAX_RETRIES, RETRY_BASE_DELAY
|
||||||
|
|
||||||
log = logging.getLogger("rightmove")
|
log = logging.getLogger("rightmove")
|
||||||
|
|
||||||
|
|
@ -15,10 +15,12 @@ _ua = UserAgent(
|
||||||
|
|
||||||
|
|
||||||
def make_client() -> httpx.Client:
    """Build the shared HTTP client with a random User-Agent.

    Requests go through the Gluetun HTTP proxy (VPN egress) when
    ``GLUETUN_PROXY`` is set; an empty/unset value means a direct connection.
    """
    default_headers = {"User-Agent": _ua.random, "Accept": "application/json"}
    egress_proxy = GLUETUN_PROXY or None
    return httpx.Client(
        timeout=30,
        headers=default_headers,
        follow_redirects=True,
        proxy=egress_proxy,
    )
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -57,6 +57,16 @@ def parse_args() -> argparse.Namespace:
|
||||||
default=DATA_DIR,
|
default=DATA_DIR,
|
||||||
help=f"Directory for parquet output. Defaults to {DATA_DIR}.",
|
help=f"Directory for parquet output. Defaults to {DATA_DIR}.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--outcodes",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help=(
|
||||||
|
"Comma-separated outcodes to scrape (e.g. 'SW9' or 'SW9,E14,BR1') "
|
||||||
|
"instead of the full Greater London set. Must fall within the "
|
||||||
|
"London-ish areas; takes precedence over --test/--limit-outcodes."
|
||||||
|
),
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--limit-outcodes",
|
"--limit-outcodes",
|
||||||
type=int,
|
type=int,
|
||||||
|
|
@ -116,17 +126,32 @@ def main() -> int:
|
||||||
from scraper import (
|
from scraper import (
|
||||||
build_postcode_coords,
|
build_postcode_coords,
|
||||||
build_postcode_index,
|
build_postcode_index,
|
||||||
|
filter_londonish_outcodes,
|
||||||
load_outcodes,
|
load_outcodes,
|
||||||
run_scrape,
|
run_scrape,
|
||||||
)
|
)
|
||||||
|
|
||||||
outcodes = load_outcodes()
|
if args.outcodes is not None:
|
||||||
if args.test and args.limit_outcodes is None:
|
requested = [code.strip().upper() for code in args.outcodes.split(",") if code.strip()]
|
||||||
preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)]
|
if not requested:
|
||||||
if preferred:
|
raise SystemExit("--outcodes was empty")
|
||||||
outcodes = preferred
|
outcodes = filter_londonish_outcodes(requested)
|
||||||
if args.limit_outcodes is not None:
|
dropped = sorted(set(requested) - set(outcodes))
|
||||||
outcodes = outcodes[: args.limit_outcodes]
|
if dropped:
|
||||||
|
log.warning("Ignoring outcodes outside the Greater London-ish areas: %s", ", ".join(dropped))
|
||||||
|
if not outcodes:
|
||||||
|
raise SystemExit(
|
||||||
|
"None of the requested outcodes are within the Greater London-ish areas "
|
||||||
|
f"({', '.join(requested)})."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
outcodes = load_outcodes()
|
||||||
|
if args.test and args.limit_outcodes is None:
|
||||||
|
preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)]
|
||||||
|
if preferred:
|
||||||
|
outcodes = preferred
|
||||||
|
if args.limit_outcodes is not None:
|
||||||
|
outcodes = outcodes[: args.limit_outcodes]
|
||||||
|
|
||||||
if not outcodes:
|
if not outcodes:
|
||||||
raise SystemExit("No Greater London-ish outcodes loaded; nothing to scrape.")
|
raise SystemExit("No Greater London-ish outcodes loaded; nothing to scrape.")
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,30 @@ Each rendered page contains 30 listings under
|
||||||
`humanised-property-type`, `features` (a list where the first element is
|
`humanised-property-type`, `features` (a list where the first element is
|
||||||
typically `"Tenure: <value>"`), and `details-url`. Pagination is via
|
typically `"Tenure: <value>"`), and `details-url`. Pagination is via
|
||||||
`?page=N`; the loop terminates when `paginationControls.next` is null.
|
`?page=N`; the loop terminates when `paginationControls.next` is null.
|
||||||
|
|
||||||
|
Postcodes
|
||||||
|
---------
|
||||||
|
The search card exposes only an *outcode*-level address (e.g. "Padfield Road,
|
||||||
|
London, SE5") and a map pin, so the old behaviour derived the postcode from the
|
||||||
|
nearest postcode to that pin — a guess that frequently lands on a neighbouring
|
||||||
|
unit (the pin can sit on the wrong side of a street boundary).
|
||||||
|
|
||||||
|
Each *detail* page (`/details/{id}/`) is a plain HTTPS GET whose `__NEXT_DATA__`
|
||||||
|
embeds the property's analytics dataLayer at
|
||||||
|
`props.initialReduxState.metadata.dataLayer`, which carries the property's own
|
||||||
|
`postcode` (full unit postcode, e.g. "SE5 9AA") keyed to this listing by
|
||||||
|
`property-id`. Crucially this is NOT the agent's office postcode — that lives
|
||||||
|
separately at `…property.agent.postcode` ("SE5 8RS" for the same listing) and
|
||||||
|
is the classic trap when blindly scanning the page for a postcode. We read the
|
||||||
|
dataLayer postcode, verify `property-id` matches the listing, and accept it only
|
||||||
|
when its outcode agrees with the coordinate-nearest postcode (via
|
||||||
|
``resolve_listing_postcode``) — exactly the trust rule the other scrapers use.
|
||||||
|
Measured over a sample of real listings this yields a trustworthy, usually
|
||||||
|
exact-unit postcode for ~11/12 listings; the rest safely fall back to the
|
||||||
|
coordinate-nearest postcode.
|
||||||
|
|
||||||
|
Detail fetching costs one extra HTTPS GET per listing, so it is gated behind
|
||||||
|
``OTM_FETCH_DETAILS`` and capped at ``OTM_MAX_DETAILS_PER_OUTCODE`` per outcode.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
|
@ -31,14 +55,26 @@ from spatial import PostcodeSpatialIndex
|
||||||
from transform import (
|
from transform import (
|
||||||
clean_listing_address,
|
clean_listing_address,
|
||||||
extract_full_postcode,
|
extract_full_postcode,
|
||||||
|
extract_outcode,
|
||||||
fix_coords,
|
fix_coords,
|
||||||
map_property_type,
|
map_property_type,
|
||||||
normalize_sub_type,
|
normalize_sub_type,
|
||||||
parse_display_size,
|
parse_display_size,
|
||||||
|
resolve_listing_postcode,
|
||||||
)
|
)
|
||||||
|
|
||||||
log = logging.getLogger("rightmove")
|
log = logging.getLogger("rightmove")
|
||||||
|
|
||||||
|
# Detail-page postcode recovery (see module docstring). When enabled, each
|
||||||
|
# listing's detail page is fetched so its analytics dataLayer postcode — the
|
||||||
|
# property's own full unit postcode — can replace the coordinate-nearest guess.
|
||||||
|
# Bounded per outcode so a large outcode can't balloon into unbounded extra
|
||||||
|
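# HTTPS GETs. NOTE(review): described as at parity with the Rightmove/Zoopla
# detail caps, but constants.py sets those to 4000 while this cap is 400 —
# confirm which value is intended. The cap should be high enough that a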
|
||||||
|
# typical outcode's listings all get their real postcode rather than a
|
||||||
|
# coordinate-nearest guess.
|
||||||
|
OTM_FETCH_DETAILS = True
|
||||||
|
OTM_MAX_DETAILS_PER_OUTCODE = 400
|
||||||
|
|
||||||
_NEXT_DATA_RE = re.compile(
|
_NEXT_DATA_RE = re.compile(
|
||||||
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
|
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
|
||||||
re.DOTALL,
|
re.DOTALL,
|
||||||
|
|
@ -51,6 +87,11 @@ _HTML_HEADERS = {
|
||||||
"Accept-Language": "en-GB,en;q=0.9",
|
"Accept-Language": "en-GB,en;q=0.9",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# listingId -> recovered full postcode (or None). Failures are cached too so a
|
||||||
|
# broken or postcode-less detail page is not re-fetched within a run (the same
|
||||||
|
# listing can reappear across overlapping outcode searches).
|
||||||
|
_detail_postcode_cache: dict[str, str | None] = {}
|
||||||
|
|
||||||
|
|
||||||
def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict | None:
|
def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict | None:
|
||||||
"""GET one search-results page and return the embedded __NEXT_DATA__ JSON.
|
"""GET one search-results page and return the embedded __NEXT_DATA__ JSON.
|
||||||
|
|
@ -119,6 +160,116 @@ def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_detail_postcode(html: str, listing_id: str | None = None) -> str | None:
|
||||||
|
"""Extract the property's own full postcode from an OnTheMarket detail page.
|
||||||
|
|
||||||
|
Pure and network-free so it is unit-testable: callers pass `page.content()`
|
||||||
|
/ the GET body and this does the parsing.
|
||||||
|
|
||||||
|
The postcode lives in the analytics dataLayer embedded in `__NEXT_DATA__` at
|
||||||
|
``props.initialReduxState.metadata.dataLayer.postcode`` and is the
|
||||||
|
property's own unit postcode (e.g. "SE5 9AA"). It is deliberately NOT the
|
||||||
|
agent's office postcode, which sits separately at
|
||||||
|
``…property.agent.postcode`` — the trap when scanning a detail page for "a"
|
||||||
|
postcode. When ``listing_id`` is given, the dataLayer's ``property-id`` must
|
||||||
|
match it, guaranteeing we read this listing's postcode and not a stray one.
|
||||||
|
|
||||||
|
Returns a normalized full postcode (e.g. "SE5 9AA") or ``None`` when the
|
||||||
|
page has no usable property postcode. Trust (outcode-vs-coordinates
|
||||||
|
agreement) is enforced later in ``transform_property``.
|
||||||
|
"""
|
||||||
|
if not html:
|
||||||
|
return None
|
||||||
|
|
||||||
|
match = _NEXT_DATA_RE.search(html)
|
||||||
|
if not match:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
data = json.loads(match.group(1))
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_layer = data["props"]["initialReduxState"]["metadata"]["dataLayer"]
|
||||||
|
except (KeyError, TypeError):
|
||||||
|
return None
|
||||||
|
if not isinstance(data_layer, dict):
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Guard against reading a different listing's postcode: the dataLayer is the
|
||||||
|
# property's own analytics payload, so its property-id must match.
|
||||||
|
if listing_id is not None:
|
||||||
|
page_id = data_layer.get("property-id")
|
||||||
|
if page_id is not None and str(page_id) != str(listing_id):
|
||||||
|
return None
|
||||||
|
|
||||||
|
raw_postcode = data_layer.get("postcode")
|
||||||
|
if not isinstance(raw_postcode, str):
|
||||||
|
return None
|
||||||
|
return extract_full_postcode(raw_postcode)
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_detail_postcode(
    client: httpx.Client, details_url: str, listing_id: str
) -> str | None:
    """GET one listing's detail page and return its dataLayer postcode (or None).

    Results (including failures) are cached by listing id so a listing that
    reappears across overlapping outcode searches is fetched at most once.
    Transport errors and HTTP 429/5xx responses are retried with exponential
    backoff up to ``MAX_RETRIES`` attempts; any other HTTP status or httpx
    request error gives up immediately. Every failure degrades to ``None`` so
    the caller falls back to the coordinate-nearest postcode.

    Args:
        client: Shared httpx client used for the GET.
        details_url: Absolute URL, or a path relative to ``ONTHEMARKET_BASE``.
        listing_id: Listing id; cache key and expected ``property-id``.

    Returns:
        The parsed full postcode, or ``None`` when it could not be recovered.
    """
    if listing_id in _detail_postcode_cache:
        return _detail_postcode_cache[listing_id]

    full_url = (
        ONTHEMARKET_BASE + details_url
        if details_url and not details_url.startswith("http")
        else details_url
    )
    result: str | None = None
    if full_url:
        for attempt in range(MAX_RETRIES):
            try:
                resp = client.get(
                    full_url, headers=_HTML_HEADERS, follow_redirects=True
                )
            except httpx.TransportError as exc:
                # Covers every timeout (including ConnectTimeout), network and
                # protocol error — all plausibly transient, so retry.
                failure = type(exc).__name__
            except httpx.HTTPError as exc:
                # Non-transient request errors (e.g. too many redirects):
                # retrying will not help, so fall back straight away.
                log.debug(
                    "OnTheMarket detail %s failed with %s (no postcode)",
                    listing_id, type(exc).__name__,
                )
                break
            else:
                if resp.status_code == 200:
                    result = parse_detail_postcode(resp.text, listing_id)
                    break
                if resp.status_code not in (429, 500, 502, 503, 504):
                    log.debug(
                        "OnTheMarket detail %s returned HTTP %d (no postcode)",
                        listing_id, resp.status_code,
                    )
                    break
                failure = f"HTTP {resp.status_code}"

            if attempt == MAX_RETRIES - 1:
                # No retry follows the last attempt, so do not sleep for it.
                log.warning(
                    "%s from %s, giving up after %d attempts",
                    failure, full_url, MAX_RETRIES,
                )
                break
            delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
            log.warning(
                "%s from %s, retry %d/%d in %.1fs",
                failure, full_url, attempt + 1, MAX_RETRIES, delay,
            )
            time.sleep(delay)

    _detail_postcode_cache[listing_id] = result
    return result
|
||||||
|
|
||||||
|
|
||||||
def _parse_price(price_value) -> int:
|
def _parse_price(price_value) -> int:
|
||||||
"""Parse a formatted price string like '£450,000' into an integer.
|
"""Parse a formatted price string like '£450,000' into an integer.
|
||||||
Returns 0 for POA/auction/null values."""
|
Returns 0 for POA/auction/null values."""
|
||||||
|
|
@ -166,9 +317,19 @@ def _extract_floor_area(features: list) -> float | None:
|
||||||
|
|
||||||
|
|
||||||
def transform_property(
|
def transform_property(
|
||||||
raw: dict, pc_index: PostcodeSpatialIndex
|
raw: dict,
|
||||||
|
pc_index: PostcodeSpatialIndex,
|
||||||
|
detail_postcode: str | None = None,
|
||||||
) -> dict | None:
|
) -> dict | None:
|
||||||
"""Transform a raw OnTheMarket listing dict into our output schema."""
|
"""Transform a raw OnTheMarket listing dict into our output schema.
|
||||||
|
|
||||||
|
``detail_postcode`` is the property's own full postcode recovered from its
|
||||||
|
detail page (see ``parse_detail_postcode`` / ``_fetch_detail_postcode``),
|
||||||
|
or ``None`` when no detail fetch was done / no postcode was found. When
|
||||||
|
present and trustworthy (its outcode agrees with the coordinate-nearest
|
||||||
|
postcode) it supersedes the coordinate guess and is labelled
|
||||||
|
``"detail_address"``.
|
||||||
|
"""
|
||||||
loc = raw.get("location") or {}
|
loc = raw.get("location") or {}
|
||||||
raw_lat = loc.get("lat")
|
raw_lat = loc.get("lat")
|
||||||
raw_lng = loc.get("lon")
|
raw_lng = loc.get("lon")
|
||||||
|
|
@ -184,8 +345,29 @@ def transform_property(
|
||||||
return None
|
return None
|
||||||
raw_address = raw.get("address", "") or ""
|
raw_address = raw.get("address", "") or ""
|
||||||
extracted_postcode = extract_full_postcode(raw_address)
|
extracted_postcode = extract_full_postcode(raw_address)
|
||||||
postcode = extracted_postcode or inferred_postcode
|
|
||||||
postcode_source = "address" if extracted_postcode else "coordinates"
|
# Prefer the property's own detail-page postcode when we have one and it is
|
||||||
|
# trustworthy. The detail postcode is a full unit postcode (better than the
|
||||||
|
# coordinate-nearest guess and than the usually outcode-only card address),
|
||||||
|
# but a stale/mislabelled value would silently override the spatially
|
||||||
|
# correct one, so apply the same outcode-agreement trust rule the address
|
||||||
|
# postcode uses: keep it only when its outcode matches the
|
||||||
|
# coordinate-nearest postcode's outcode.
|
||||||
|
detail_postcode = extract_full_postcode(detail_postcode)
|
||||||
|
if detail_postcode and extract_outcode(detail_postcode) == extract_outcode(
|
||||||
|
inferred_postcode
|
||||||
|
):
|
||||||
|
postcode, postcode_source = detail_postcode, "detail_address"
|
||||||
|
else:
|
||||||
|
if detail_postcode:
|
||||||
|
log.debug(
|
||||||
|
"OnTheMarket %s: rejecting detail postcode %s "
|
||||||
|
"(outcode mismatch with inferred %s)",
|
||||||
|
raw.get("id", "?"), detail_postcode, inferred_postcode,
|
||||||
|
)
|
||||||
|
postcode, postcode_source = resolve_listing_postcode(
|
||||||
|
extracted_postcode, inferred_postcode
|
||||||
|
)
|
||||||
|
|
||||||
raw_beds = raw.get("bedrooms") or 0
|
raw_beds = raw.get("bedrooms") or 0
|
||||||
raw_baths = raw.get("bathrooms") or 0
|
raw_baths = raw.get("bathrooms") or 0
|
||||||
|
|
@ -223,6 +405,10 @@ def transform_property(
|
||||||
"Inferred postcode": inferred_postcode,
|
"Inferred postcode": inferred_postcode,
|
||||||
"Listing raw address": raw_address,
|
"Listing raw address": raw_address,
|
||||||
"Address per Property Register": clean_listing_address(raw_address),
|
"Address per Property Register": clean_listing_address(raw_address),
|
||||||
|
# OnTheMarket search JSON exposes only a street-level address; no UPRN
|
||||||
|
# or house number/name is available without a detail-page fetch.
|
||||||
|
"UPRN": None,
|
||||||
|
"Property number or name": None,
|
||||||
"Leasehold/Freehold": _extract_tenure(features),
|
"Leasehold/Freehold": _extract_tenure(features),
|
||||||
"Property type": map_property_type(sub_type),
|
"Property type": map_property_type(sub_type),
|
||||||
"Property sub-type": normalize_sub_type(sub_type),
|
"Property sub-type": normalize_sub_type(sub_type),
|
||||||
|
|
@ -242,10 +428,17 @@ def search_outcode(
|
||||||
pc_index: PostcodeSpatialIndex,
|
pc_index: PostcodeSpatialIndex,
|
||||||
max_properties: int | None = None,
|
max_properties: int | None = None,
|
||||||
) -> list[dict]:
|
) -> list[dict]:
|
||||||
"""Paginate through OnTheMarket sale results for one outcode."""
|
"""Paginate through OnTheMarket sale results for one outcode.
|
||||||
|
|
||||||
|
When ``OTM_FETCH_DETAILS`` is enabled, up to
|
||||||
|
``OTM_MAX_DETAILS_PER_OUTCODE`` listings per outcode have their detail page
|
||||||
|
fetched for the property's own postcode (see ``_fetch_detail_postcode``);
|
||||||
|
the rest fall back to the coordinate-nearest postcode.
|
||||||
|
"""
|
||||||
properties: list[dict] = []
|
properties: list[dict] = []
|
||||||
seen_ids: set[str] = set()
|
seen_ids: set[str] = set()
|
||||||
page_num = 1
|
page_num = 1
|
||||||
|
details_fetched = 0
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
data = _fetch_page_json(client, outcode, page_num)
|
data = _fetch_page_json(client, outcode, page_num)
|
||||||
|
|
@ -269,8 +462,22 @@ def search_outcode(
|
||||||
if listing_id and listing_id in seen_ids:
|
if listing_id and listing_id in seen_ids:
|
||||||
continue
|
continue
|
||||||
seen_ids.add(listing_id)
|
seen_ids.add(listing_id)
|
||||||
|
|
||||||
|
detail_postcode = None
|
||||||
|
if OTM_FETCH_DETAILS and listing_id:
|
||||||
|
# Cached lookups are free; only fresh GETs count toward the cap
|
||||||
|
# and incur the inter-request delay.
|
||||||
|
cached = listing_id in _detail_postcode_cache
|
||||||
|
if cached or details_fetched < OTM_MAX_DETAILS_PER_OUTCODE:
|
||||||
|
detail_postcode = _fetch_detail_postcode(
|
||||||
|
client, raw.get("details-url") or "", listing_id
|
||||||
|
)
|
||||||
|
if not cached:
|
||||||
|
details_fetched += 1
|
||||||
|
time.sleep(DELAY_BETWEEN_PAGES)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
transformed = transform_property(raw, pc_index)
|
transformed = transform_property(raw, pc_index, detail_postcode)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
log.warning(
|
log.warning(
|
||||||
"OnTheMarket %s property %s failed to transform: %s",
|
"OnTheMarket %s property %s failed to transform: %s",
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,6 @@
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
@ -6,12 +8,15 @@ import httpx
|
||||||
from constants import (
|
from constants import (
|
||||||
PAGE_SIZE,
|
PAGE_SIZE,
|
||||||
DELAY_BETWEEN_PAGES,
|
DELAY_BETWEEN_PAGES,
|
||||||
|
RIGHTMOVE_DETAIL_URL,
|
||||||
|
RIGHTMOVE_FETCH_DETAILS,
|
||||||
|
RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
|
||||||
SEARCH_URL,
|
SEARCH_URL,
|
||||||
TYPEAHEAD_URL,
|
TYPEAHEAD_URL,
|
||||||
)
|
)
|
||||||
from http_client import fetch_with_retry
|
from http_client import fetch_with_retry
|
||||||
from spatial import PostcodeSpatialIndex
|
from spatial import PostcodeSpatialIndex
|
||||||
from transform import transform_property
|
from transform import extract_full_postcode, normalize_postcode, transform_property
|
||||||
|
|
||||||
log = logging.getLogger("rightmove")
|
log = logging.getLogger("rightmove")
|
||||||
|
|
||||||
|
|
@ -23,6 +28,176 @@ outcode_cache: dict[str, str] = {}
|
||||||
_MAX_INDEX = 1008
|
_MAX_INDEX = 1008
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Detail-page postcode extraction
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# The search API (_paginate) only returns an outcode-level `displayAddress`
|
||||||
|
# (e.g. "Akerman Road, Brixton, London, SW9") — never the full postcode. Each
|
||||||
|
# listing's detail page, however, embeds the property's OWN full postcode in a
|
||||||
|
# `window.__PAGE_MODEL` script as `propertyData.address.{outcode, incode}`
|
||||||
|
# (e.g. outcode "SW9" + incode "0HD" → "SW9 0HD"), independently corroborated by
|
||||||
|
# `propertyData.propertyUrls.similarPropertiesUrl` ("/property-for-sale/SW9-0HD.html").
|
||||||
|
# This is the property's own postcode, NOT a nearest station/school: the
|
||||||
|
# `nearestStations`/`nearestAirports` arrays carry only names + distances, no
|
||||||
|
# postcodes, and the address outcode always matches the searched outcode.
|
||||||
|
# Recon over 24 live listings across SW9/E1/M1/LS6/E20 (incl. APPROXIMATE_POINT
|
||||||
|
# new-builds) found the full postcode present 100% of the time. There is no
|
||||||
|
# UPRN or house-number field anywhere in propertyData, so those stay None.
|
||||||
|
#
|
||||||
|
# __PAGE_MODEL is a "devalue"-style flattened object graph: its `data` field is
|
||||||
|
# a JSON STRING holding a flat array where every integer inside a container is
|
||||||
|
# an index reference into that same array (so the graph can dedupe). We
|
||||||
|
# brace-match the (large, deeply-nested) object literal — a non-greedy regex
|
||||||
|
# cannot — then rehydrate the reference graph before reading the address.
|
||||||
|
|
||||||
|
_PAGE_MODEL_RE = re.compile(r"window\.__PAGE_MODEL\s*=\s*")
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_page_model_literal(html: str) -> str | None:
|
||||||
|
"""Return the `{...}` object literal assigned to window.__PAGE_MODEL.
|
||||||
|
|
||||||
|
Brace-matches with string/escape awareness so embedded braces and quotes in
|
||||||
|
string values don't end the match early. Returns None when absent."""
|
||||||
|
marker = _PAGE_MODEL_RE.search(html)
|
||||||
|
if not marker:
|
||||||
|
return None
|
||||||
|
start = marker.end()
|
||||||
|
if start >= len(html) or html[start] != "{":
|
||||||
|
return None
|
||||||
|
depth = 0
|
||||||
|
in_str = False
|
||||||
|
esc = False
|
||||||
|
for j in range(start, len(html)):
|
||||||
|
ch = html[j]
|
||||||
|
if in_str:
|
||||||
|
if esc:
|
||||||
|
esc = False
|
||||||
|
elif ch == "\\":
|
||||||
|
esc = True
|
||||||
|
elif ch == '"':
|
||||||
|
in_str = False
|
||||||
|
elif ch == '"':
|
||||||
|
in_str = True
|
||||||
|
elif ch == "{":
|
||||||
|
depth += 1
|
||||||
|
elif ch == "}":
|
||||||
|
depth -= 1
|
||||||
|
if depth == 0:
|
||||||
|
return html[start : j + 1]
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _rehydrate(flat: list) -> object:
|
||||||
|
"""Resolve a devalue-style flattened reference array into a nested object.
|
||||||
|
|
||||||
|
Index 0 is the root; every int inside a dict/list is an index back into
|
||||||
|
``flat``. Memoised so shared/cyclic references resolve once."""
|
||||||
|
cache: dict[int, object] = {}
|
||||||
|
|
||||||
|
def resolve(idx: int) -> object:
|
||||||
|
if not isinstance(idx, int) or idx < 0 or idx >= len(flat):
|
||||||
|
return None
|
||||||
|
if idx in cache:
|
||||||
|
return cache[idx]
|
||||||
|
node = flat[idx]
|
||||||
|
if isinstance(node, dict):
|
||||||
|
out: dict = {}
|
||||||
|
cache[idx] = out
|
||||||
|
for key, value in node.items():
|
||||||
|
out[key] = resolve(value) if isinstance(value, int) else value
|
||||||
|
return out
|
||||||
|
if isinstance(node, list):
|
||||||
|
arr: list = []
|
||||||
|
cache[idx] = arr
|
||||||
|
for value in node:
|
||||||
|
arr.append(resolve(value) if isinstance(value, int) else value)
|
||||||
|
return arr
|
||||||
|
cache[idx] = node
|
||||||
|
return node
|
||||||
|
|
||||||
|
return resolve(0)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_detail_postcode(html: str) -> str | None:
    """Extract a Rightmove property's own full postcode from its detail HTML.

    Pure and network-free so it is unit-testable: callers pass the page HTML.
    Reads ``propertyData.address.outcode`` + ``.incode`` from the
    ``window.__PAGE_MODEL`` payload and returns a normalised full postcode
    (e.g. "SW9 0HD").

    Args:
        html: Raw detail-page HTML (may be empty).

    Returns:
        The full postcode, or ``None`` when the page has no parseable address
        (missing/invalid payload, missing outcode or incode, or a pair that
        the shared postcode helpers do not accept). The caller then keeps its
        coordinate-derived fallback.
    """
    if not html:
        return None
    literal = _extract_page_model_literal(html)
    if not literal:
        return None
    try:
        outer = json.loads(literal)
        # ``data`` is itself a JSON string holding the flat reference array.
        flat = json.loads(outer["data"])
    except (ValueError, KeyError, TypeError, RecursionError):
        # RecursionError: json.loads raises it on pathologically deep nesting;
        # treat that like any other unparseable payload instead of letting it
        # escape to the caller.
        return None
    if not isinstance(flat, list) or not flat:
        return None

    try:
        node = _rehydrate(flat)
    except RecursionError:
        # A deeply nested graph must degrade to "no postcode", not abort the
        # listing that triggered the lookup.
        return None

    # Walk root -> propertyData -> address, requiring a dict at every step.
    for key in ("propertyData", "address"):
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    if not isinstance(node, dict):
        return None

    outcode = node.get("outcode")
    incode = node.get("incode")
    if not isinstance(outcode, str) or not isinstance(incode, str):
        return None
    outcode, incode = outcode.strip(), incode.strip()
    if not outcode or not incode:
        return None

    # Round-trip through the shared postcode validator/normaliser: this both
    # canonicalises spacing and rejects an outcode/incode pair that doesn't form
    # a structurally-valid UK postcode.
    return extract_full_postcode(normalize_postcode(f"{outcode} {incode}"))
|
||||||
|
|
||||||
|
|
||||||
|
# listingId -> true full postcode (or None when unavailable). Failures are
|
||||||
|
# cached too, so a broken/duplicate listing is fetched at most once per run (the
|
||||||
|
# same listing can reappear across overlapping outcode searches).
|
||||||
|
_detail_postcode_cache: dict[str, str | None] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_detail_postcode(client: httpx.Client, property_id: str) -> str | None:
    """GET a listing's detail page and return its full postcode (or None).

    The outcome is memoised in ``_detail_postcode_cache`` under the listing
    id, and that includes ``None`` outcomes, so each listing is requested at
    most once per run. A non-200 response or an ``httpx.HTTPError`` is logged
    at debug level and yields ``None``.

    Args:
        client: HTTP client used for the single GET.
        property_id: Rightmove listing id; an empty value short-circuits to
            ``None`` without touching the cache.
    """
    if not property_id:
        return None
    if property_id in _detail_postcode_cache:
        return _detail_postcode_cache[property_id]

    detail_url = RIGHTMOVE_DETAIL_URL.format(id=property_id)
    found: str | None = None
    try:
        response = client.get(detail_url, headers={"Accept": "text/html"})
        if response.status_code != 200:
            log.debug(
                "Rightmove detail %s returned HTTP %d",
                detail_url,
                response.status_code,
            )
        else:
            found = parse_detail_postcode(response.text)
    except httpx.HTTPError as exc:
        log.debug("Rightmove detail fetch failed %s: %s", detail_url, exc)

    # Store failures too, so a broken listing is not re-requested this run.
    _detail_postcode_cache[property_id] = found
    return found
|
||||||
|
|
||||||
|
|
||||||
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
|
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
|
||||||
"""Look up Rightmove's internal ID for an outcode via typeahead API."""
|
"""Look up Rightmove's internal ID for an outcode via typeahead API."""
|
||||||
if outcode in outcode_cache:
|
if outcode in outcode_cache:
|
||||||
|
|
@ -44,6 +219,31 @@ def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _detail_postcode_for(
|
||||||
|
client: httpx.Client,
|
||||||
|
prop: dict,
|
||||||
|
fetch_details: bool,
|
||||||
|
detail_budget: dict,
|
||||||
|
) -> str | None:
|
||||||
|
"""Look up a listing's true postcode, honouring the per-outcode fetch cap.
|
||||||
|
|
||||||
|
Cached listings are always served (they cost neither a cap slot nor a GET);
|
||||||
|
a fresh fetch is made only while ``detail_budget['remaining'] > 0``."""
|
||||||
|
if not fetch_details:
|
||||||
|
return None
|
||||||
|
property_id = str(prop.get("id") or "")
|
||||||
|
if not property_id:
|
||||||
|
return None
|
||||||
|
if property_id in _detail_postcode_cache:
|
||||||
|
return _detail_postcode_cache[property_id]
|
||||||
|
if detail_budget["remaining"] <= 0:
|
||||||
|
return None
|
||||||
|
detail_budget["remaining"] -= 1
|
||||||
|
postcode = _fetch_detail_postcode(client, property_id)
|
||||||
|
time.sleep(DELAY_BETWEEN_PAGES)
|
||||||
|
return postcode
|
||||||
|
|
||||||
|
|
||||||
def _paginate(
|
def _paginate(
|
||||||
client: httpx.Client,
|
client: httpx.Client,
|
||||||
outcode_id: str,
|
outcode_id: str,
|
||||||
|
|
@ -51,11 +251,19 @@ def _paginate(
|
||||||
channel_cfg: dict,
|
channel_cfg: dict,
|
||||||
pc_index: PostcodeSpatialIndex,
|
pc_index: PostcodeSpatialIndex,
|
||||||
max_properties: int | None = None,
|
max_properties: int | None = None,
|
||||||
|
fetch_details: bool = False,
|
||||||
|
detail_cap: int = 0,
|
||||||
) -> tuple[list[dict], int]:
|
) -> tuple[list[dict], int]:
|
||||||
"""Paginate through search results. Returns (properties, result_count)."""
|
"""Paginate through search results. Returns (properties, result_count).
|
||||||
|
|
||||||
|
When ``fetch_details`` is set, up to ``detail_cap`` listings per outcode have
|
||||||
|
their detail page fetched for the property's TRUE full postcode (see
|
||||||
|
``parse_detail_postcode``); the rest fall back to coordinate-derived
|
||||||
|
postcodes."""
|
||||||
properties = []
|
properties = []
|
||||||
index = 0
|
index = 0
|
||||||
result_count = 0
|
result_count = 0
|
||||||
|
detail_budget = {"remaining": detail_cap}
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
params = {
|
params = {
|
||||||
|
|
@ -82,7 +290,12 @@ def _paginate(
|
||||||
|
|
||||||
for prop in raw_props:
|
for prop in raw_props:
|
||||||
try:
|
try:
|
||||||
transformed = transform_property(prop, outcode, pc_index)
|
detail_postcode = _detail_postcode_for(
|
||||||
|
client, prop, fetch_details, detail_budget
|
||||||
|
)
|
||||||
|
transformed = transform_property(
|
||||||
|
prop, outcode, pc_index, detail_postcode=detail_postcode
|
||||||
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
log.warning(
|
log.warning(
|
||||||
"Rightmove %s/%s property %s failed to transform: %s",
|
"Rightmove %s/%s property %s failed to transform: %s",
|
||||||
|
|
@ -127,7 +340,12 @@ def search_outcode(
|
||||||
pc_index: PostcodeSpatialIndex,
|
pc_index: PostcodeSpatialIndex,
|
||||||
max_properties: int | None = None,
|
max_properties: int | None = None,
|
||||||
) -> list[dict]:
|
) -> list[dict]:
|
||||||
"""Paginate through unfiltered sale results for one outcode+channel."""
|
"""Paginate through unfiltered sale results for one outcode+channel.
|
||||||
|
|
||||||
|
Each listing's detail page is fetched for the property's TRUE full postcode
|
||||||
|
(gated by ``RIGHTMOVE_FETCH_DETAILS`` and capped per outcode by
|
||||||
|
``RIGHTMOVE_MAX_DETAILS_PER_OUTCODE``); listings beyond the cap keep the
|
||||||
|
coordinate-derived postcode."""
|
||||||
properties, _ = _paginate(
|
properties, _ = _paginate(
|
||||||
client,
|
client,
|
||||||
outcode_id,
|
outcode_id,
|
||||||
|
|
@ -135,6 +353,8 @@ def search_outcode(
|
||||||
channel_cfg,
|
channel_cfg,
|
||||||
pc_index,
|
pc_index,
|
||||||
max_properties=max_properties,
|
max_properties=max_properties,
|
||||||
|
fetch_details=RIGHTMOVE_FETCH_DETAILS,
|
||||||
|
detail_cap=RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
|
||||||
)
|
)
|
||||||
|
|
||||||
if max_properties is not None and len(properties) >= max_properties:
|
if max_properties is not None and len(properties) >= max_properties:
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,10 @@ from constants import (
|
||||||
DATA_DIR,
|
DATA_DIR,
|
||||||
DELAY_BETWEEN_OUTCODES,
|
DELAY_BETWEEN_OUTCODES,
|
||||||
LONDON_OUTCODE_PREFIXES,
|
LONDON_OUTCODE_PREFIXES,
|
||||||
|
ZOOPLA_DETAIL_BUDGET_FRACTION,
|
||||||
|
ZOOPLA_FETCH_DETAILS,
|
||||||
|
ZOOPLA_FETCHER,
|
||||||
|
ZOOPLA_MAX_DETAILS_PER_OUTCODE,
|
||||||
)
|
)
|
||||||
|
|
||||||
from http_client import make_client
|
from http_client import make_client
|
||||||
|
|
@ -371,6 +375,36 @@ def _zoopla_outcode_timeout_seconds() -> int:
|
||||||
return timeout
|
return timeout
|
||||||
|
|
||||||
|
|
||||||
|
def _zoopla_detail_cap() -> int:
    """Return the per-outcode limit on Zoopla detail-page fetches.

    Yields ``ZOOPLA_MAX_DETAILS_PER_OUTCODE`` when ``ZOOPLA_FETCH_DETAILS`` is
    truthy and ``0`` otherwise; callers treat ``0`` as "detail fetching
    disabled". Both values come from constants.py.

    The cap exists because search cards only carry an outcode-level address,
    so the full postcode has to come from each listing's detail page, and
    those extra page loads share the per-outcode wall-clock budget
    (ZOOPLA_OUTCODE_TIMEOUT_SECONDS).
    """
    if not ZOOPLA_FETCH_DETAILS:
        return 0
    return ZOOPLA_MAX_DETAILS_PER_OUTCODE
|
||||||
|
|
||||||
|
|
||||||
|
def _open_zoopla_detail_tab(page, detail_cap: int):
|
||||||
|
"""Open a second tab on the same context for detail-page fetches.
|
||||||
|
|
||||||
|
Sharing the persistent context means the detail tab inherits the search
|
||||||
|
tab's Cloudflare clearance cookies. Returns None when detail fetching is
|
||||||
|
disabled or the tab cannot be created (the scrape then degrades to
|
||||||
|
outcode-level postcodes rather than failing)."""
|
||||||
|
if detail_cap <= 0:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return page.context.new_page()
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning(
|
||||||
|
"Zoopla detail tab unavailable (%s); using outcode-level postcodes",
|
||||||
|
_exception_detail(exc),
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def _wall_clock_timeout(seconds: int, label: str):
|
def _wall_clock_timeout(seconds: int, label: str):
|
||||||
"""SIGALRM-based wall-clock guard (POSIX). Raises OutcodeTimeout on expiry.
|
"""SIGALRM-based wall-clock guard (POSIX). Raises OutcodeTimeout on expiry.
|
||||||
|
|
@ -438,6 +472,50 @@ def _close_zoopla_browser(browser, label: str) -> None:
|
||||||
log.warning("%s browser force-close failed: %s", label, _exception_detail(exc))
|
log.warning("%s browser force-close failed: %s", label, _exception_detail(exc))
|
||||||
|
|
||||||
|
|
||||||
|
def _scrape_zoopla_flaresolverr(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    results: dict[str, list[dict]],
    errors: list[str],
    max_properties_per_source: int | None,
) -> None:
    """Scrape Zoopla via the FlareSolverr sidecar (no browser/VNC).

    Opens one FlareSolverr session, searches each outcode in turn and stores
    the listings under ``results["zoopla"]``. A failing outcode is recorded in
    ``errors`` and the loop continues; an unavailable sidecar records a single
    error and skips Zoopla entirely. The session is always closed on exit.

    Args:
        outcodes: Outcodes to search, in order.
        pc_index: Postcode spatial index passed through to the search.
        pc_coords: Postcode coordinate lookup passed through to the search.
        results: Per-source result lists, mutated in place.
        errors: Error messages, appended to in place.
        max_properties_per_source: Optional cap on stored Zoopla listings.
    """
    from flaresolverr import FlareSolverrError, FlareSolverrSession
    from zoopla_flaresolverr import search_outcode as fs_search_outcode

    # Go through the shared helper so ZOOPLA_FETCH_DETAILS gates this fetcher
    # the same way it gates the browser fetcher (cap becomes 0 when disabled).
    # NOTE(review): assumes fs_search_outcode treats detail_cap=0 as "no
    # detail fetches", like the browser path does -- confirm.
    detail_cap = _zoopla_detail_cap()

    try:
        session = FlareSolverrSession(session="zoopla")
        session.__enter__()
    except FlareSolverrError as exc:
        errors.append(f"zoopla: FlareSolverr unavailable: {exc}")
        log.warning("Zoopla skipped: FlareSolverr unavailable: %s", exc)
        return

    try:
        for outcode in outcodes:
            remaining = _source_remaining(results, "zoopla", max_properties_per_source)
            if remaining == 0:
                log.info("Zoopla cap reached")
                return
            try:
                props, _ = fs_search_outcode(
                    outcode,
                    pc_index,
                    pc_coords,
                    session,
                    max_properties=remaining,
                    detail_cap=detail_cap,
                )
                added = _store_properties(results, "zoopla", props, max_properties_per_source)
                log.info("Zoopla %s: +%d", outcode, added)
            except Exception as exc:  # noqa: BLE001 - one outcode must not kill the run
                _record_error(errors, "zoopla", outcode, exc)
            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
        # Paired with the explicit __enter__ above; runs on early return too.
        session.__exit__(None, None, None)
|
||||||
|
|
||||||
|
|
||||||
def _scrape_zoopla(
|
def _scrape_zoopla(
|
||||||
outcodes: list[str],
|
outcodes: list[str],
|
||||||
pc_index: PostcodeSpatialIndex,
|
pc_index: PostcodeSpatialIndex,
|
||||||
|
|
@ -446,6 +524,12 @@ def _scrape_zoopla(
|
||||||
errors: list[str],
|
errors: list[str],
|
||||||
max_properties_per_source: int | None,
|
max_properties_per_source: int | None,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
if ZOOPLA_FETCHER == "flaresolverr":
|
||||||
|
_scrape_zoopla_flaresolverr(
|
||||||
|
outcodes, pc_index, pc_coords, results, errors, max_properties_per_source
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
browser, page = _launch_zoopla_with_retries()
|
browser, page = _launch_zoopla_with_retries()
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
|
|
@ -454,6 +538,12 @@ def _scrape_zoopla(
|
||||||
return
|
return
|
||||||
|
|
||||||
outcode_timeout = _zoopla_outcode_timeout_seconds()
|
outcode_timeout = _zoopla_outcode_timeout_seconds()
|
||||||
|
detail_cap = _zoopla_detail_cap()
|
||||||
|
detail_page = _open_zoopla_detail_tab(page, detail_cap)
|
||||||
|
# Spend at most a fraction of each outcode's budget on detail fetches so the
|
||||||
|
# SIGALRM guard never trips mid-outcode and discards already-collected
|
||||||
|
# search listings; the rest is left for search pagination and transform.
|
||||||
|
detail_budget_seconds = max(10.0, outcode_timeout * ZOOPLA_DETAIL_BUDGET_FRACTION)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for outcode in outcodes:
|
for outcode in outcodes:
|
||||||
|
|
@ -470,6 +560,9 @@ def _scrape_zoopla(
|
||||||
pc_index,
|
pc_index,
|
||||||
pc_coords,
|
pc_coords,
|
||||||
max_properties=None,
|
max_properties=None,
|
||||||
|
detail_page=detail_page,
|
||||||
|
detail_cap=detail_cap,
|
||||||
|
detail_budget_seconds=detail_budget_seconds,
|
||||||
)
|
)
|
||||||
added = _store_properties(
|
added = _store_properties(
|
||||||
results,
|
results,
|
||||||
|
|
@ -496,6 +589,8 @@ def _scrape_zoopla(
|
||||||
_close_zoopla_browser(browser, f"zoopla {outcode}")
|
_close_zoopla_browser(browser, f"zoopla {outcode}")
|
||||||
try:
|
try:
|
||||||
browser, page = _launch_zoopla_with_retries()
|
browser, page = _launch_zoopla_with_retries()
|
||||||
|
# The old context (and its detail tab) is gone; reopen one.
|
||||||
|
detail_page = _open_zoopla_detail_tab(page, detail_cap)
|
||||||
log.info("Zoopla %s retrying with fresh browser", outcode)
|
log.info("Zoopla %s retrying with fresh browser", outcode)
|
||||||
except Exception as relaunch_exc:
|
except Exception as relaunch_exc:
|
||||||
_record_error(errors, "zoopla", outcode, relaunch_exc)
|
_record_error(errors, "zoopla", outcode, relaunch_exc)
|
||||||
|
|
@ -503,6 +598,11 @@ def _scrape_zoopla(
|
||||||
|
|
||||||
time.sleep(DELAY_BETWEEN_OUTCODES)
|
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||||
finally:
|
finally:
|
||||||
|
if detail_page is not None:
|
||||||
|
try:
|
||||||
|
detail_page.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
_close_zoopla_browser(browser, "zoopla final")
|
_close_zoopla_browser(browser, "zoopla final")
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -126,6 +126,14 @@ def write_parquet(properties: list[dict], path: Path) -> None:
|
||||||
"Address per Property Register": [
|
"Address per Property Register": [
|
||||||
p["Address per Property Register"] for p in properties
|
p["Address per Property Register"] for p in properties
|
||||||
],
|
],
|
||||||
|
# UPRN (when the scraper recovered it) keys an exact listing->EPC
|
||||||
|
# join; Property number or name is the house identifier for the
|
||||||
|
# Price-Paid address join. Both are None for sources/listings without
|
||||||
|
# a detail-page fetch.
|
||||||
|
"UPRN": [p.get("UPRN") for p in properties],
|
||||||
|
"Property number or name": [
|
||||||
|
p.get("Property number or name") for p in properties
|
||||||
|
],
|
||||||
"Leasehold/Freehold": [p["Leasehold/Freehold"] for p in properties],
|
"Leasehold/Freehold": [p["Leasehold/Freehold"] for p in properties],
|
||||||
"Property type": [p["Property type"] for p in properties],
|
"Property type": [p["Property type"] for p in properties],
|
||||||
"Property sub-type": [p["Property sub-type"] for p in properties],
|
"Property sub-type": [p["Property sub-type"] for p in properties],
|
||||||
|
|
@ -149,6 +157,8 @@ def write_parquet(properties: list[dict], path: Path) -> None:
|
||||||
"Inferred postcode": pl.Utf8,
|
"Inferred postcode": pl.Utf8,
|
||||||
"Listing raw address": pl.Utf8,
|
"Listing raw address": pl.Utf8,
|
||||||
"Address per Property Register": pl.Utf8,
|
"Address per Property Register": pl.Utf8,
|
||||||
|
"UPRN": pl.Utf8,
|
||||||
|
"Property number or name": pl.Utf8,
|
||||||
"Leasehold/Freehold": pl.Utf8,
|
"Leasehold/Freehold": pl.Utf8,
|
||||||
"Property type": pl.Utf8,
|
"Property type": pl.Utf8,
|
||||||
"Property sub-type": pl.Utf8,
|
"Property sub-type": pl.Utf8,
|
||||||
|
|
|
||||||
206
finder/test_onthemarket.py
Normal file
206
finder/test_onthemarket.py
Normal file
|
|
@ -0,0 +1,206 @@
|
||||||
|
"""Tests for the OnTheMarket scraper's detail-page postcode recovery.
|
||||||
|
|
||||||
|
`parse_detail_postcode` is pure (takes the detail-page HTML, returns a postcode
|
||||||
|
or None), so these tests use a trimmed but faithful copy of a real OnTheMarket
|
||||||
|
detail page's `__NEXT_DATA__` payload. The fixture mirrors the live structure:
|
||||||
|
the property's own postcode lives in the analytics dataLayer
|
||||||
|
(`props.initialReduxState.metadata.dataLayer.postcode`) while the agent's office
|
||||||
|
postcode sits separately under `…property.agent.postcode` — the trap we must not
|
||||||
|
fall into.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
import onthemarket
|
||||||
|
from onthemarket import parse_detail_postcode, transform_property
|
||||||
|
|
||||||
|
|
||||||
|
class _StubIndex:
|
||||||
|
"""Minimal stand-in for PostcodeSpatialIndex returning a fixed postcode."""
|
||||||
|
|
||||||
|
def __init__(self, postcode: str | None):
|
||||||
|
self._postcode = postcode
|
||||||
|
|
||||||
|
def nearest(self, lat: float, lng: float) -> str | None:
|
||||||
|
return self._postcode
|
||||||
|
|
||||||
|
|
||||||
|
def _detail_html(
|
||||||
|
*,
|
||||||
|
property_id: int = 19522441,
|
||||||
|
datalayer_postcode: str = "SE5 9AA",
|
||||||
|
agent_postcode: str = "SE5 8RS",
|
||||||
|
) -> str:
|
||||||
|
"""Build detail-page HTML with a real-shaped __NEXT_DATA__ payload."""
|
||||||
|
next_data = {
|
||||||
|
"props": {
|
||||||
|
"initialReduxState": {
|
||||||
|
"metadata": {
|
||||||
|
"dataLayer": {
|
||||||
|
"page-type": "details-section",
|
||||||
|
"property-type": "homes",
|
||||||
|
# The property's own unit postcode.
|
||||||
|
"postcode": datalayer_postcode,
|
||||||
|
"property-id": property_id,
|
||||||
|
"price": "275,000",
|
||||||
|
"addressline_2": "Padfield Road",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"property": {
|
||||||
|
"displayAddress": "Padfield Road, London, SE5",
|
||||||
|
"location": {"lon": -0.100233, "lat": 51.466129},
|
||||||
|
# The agent block carries the AGENT'S office postcode — the
|
||||||
|
# trap. parse_detail_postcode must not return this.
|
||||||
|
"agent": {
|
||||||
|
"address": "29 Denmark Hill, Camberwell\nLondon\nSE5 8RS",
|
||||||
|
"postcode": agent_postcode,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
payload = json.dumps(next_data)
|
||||||
|
return (
|
||||||
|
"<html><body>"
|
||||||
|
'<script id="__NEXT_DATA__" type="application/json">'
|
||||||
|
f"{payload}"
|
||||||
|
"</script></body></html>"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# parse_detail_postcode
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_returns_property_postcode_not_agent():
    """The dataLayer postcode is returned, not the agent's office postcode."""
    page = _detail_html(datalayer_postcode="SE5 9AA", agent_postcode="SE5 8RS")
    postcode = parse_detail_postcode(page, "19522441")
    assert postcode == "SE5 9AA"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_normalizes_spacing():
    """A lower-case, unspaced postcode comes back upper-cased with a space."""
    page = _detail_html(datalayer_postcode="se59aa")
    postcode = parse_detail_postcode(page, "19522441")
    assert postcode == "SE5 9AA"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_ignores_mismatched_property_id():
    """A page for one listing yields nothing when a different id is requested."""
    # The fixture's dataLayer belongs to property 19522441, so a lookup for
    # any other listing id must not be handed that postcode.
    page = _detail_html(property_id=19522441)
    postcode = parse_detail_postcode(page, "99999999")
    assert postcode is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_accepts_when_no_listing_id_given():
    """With no listing id to compare against, the postcode is still returned."""
    page = _detail_html(datalayer_postcode="SE5 9AA")
    parsed = parse_detail_postcode(page, None)
    assert parsed == "SE5 9AA"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_handles_missing_postcode():
    """An empty dataLayer postcode yields None."""
    page = _detail_html(datalayer_postcode="")
    parsed = parse_detail_postcode(page, "19522441")
    assert parsed is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_handles_no_next_data():
    """A page without any __NEXT_DATA__ script yields None."""
    page = "<html><body>no script here</body></html>"
    assert parse_detail_postcode(page, "1") is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_handles_empty_html():
    """An empty document yields None."""
    parsed = parse_detail_postcode("", "1")
    assert parsed is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_handles_malformed_json():
    """A __NEXT_DATA__ script whose body is not valid JSON yields None."""
    page = '<script id="__NEXT_DATA__" type="application/json">{not json}</script>'
    parsed = parse_detail_postcode(page, "1")
    assert parsed is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_handles_missing_datalayer():
    """Valid __NEXT_DATA__ JSON with an empty metadata object yields None."""
    payload = json.dumps({"props": {"initialReduxState": {"metadata": {}}}})
    page = f'<script id="__NEXT_DATA__" type="application/json">{payload}</script>'
    assert parse_detail_postcode(page, "1") is None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# transform_property — detail postcode wiring + trust rule
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
# Search-card payload shared by the transform_property tests below. Its address
# ends in the bare outcode "SE5", i.e. it carries no full postcode of its own.
_RAW_LISTING = {
    "id": "19522441",
    "address": "Padfield Road, London, SE5",
    "location": {"lon": -0.100233, "lat": 51.466129},
    "bedrooms": 2,
    "bathrooms": 1,
    "price": "£275,000",
    "humanised-property-type": "Apartment",
    "features": ["Tenure: Leasehold (99 years remaining)"],
    "details-url": "/details/19522441/",
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_transform_uses_trusted_detail_postcode():
    """A detail postcode sharing the coordinate outcode is adopted as-is."""
    # SE5 9AA (detail page) and SE5 1AA (nearest to the coordinates) agree on
    # the SE5 outcode, so the more precise detail value is kept and labelled
    # detail_address.
    row = transform_property(
        _RAW_LISTING, _StubIndex("SE5 1AA"), detail_postcode="SE5 9AA"
    )
    assert row is not None
    assert row["Postcode"] == "SE5 9AA"
    assert row["Postcode source"] == "detail_address"
|
||||||
|
|
||||||
|
|
||||||
|
def test_transform_rejects_detail_postcode_on_outcode_mismatch():
    """A detail postcode in a different outcode loses to the coordinate one."""
    # SW9 6BZ (detail page) disagrees with SE5 1AA (coordinate-nearest) on the
    # outcode, so the detail value is dropped and coordinate logic applies.
    row = transform_property(
        _RAW_LISTING, _StubIndex("SE5 1AA"), detail_postcode="SW9 6BZ"
    )
    assert row is not None
    assert row["Postcode"] == "SE5 1AA"
    assert row["Postcode source"] == "coordinates"
|
||||||
|
|
||||||
|
|
||||||
|
def test_transform_without_detail_postcode_uses_coordinates():
    """With no detail postcode the coordinate-nearest postcode is reported."""
    row = transform_property(_RAW_LISTING, _StubIndex("SE5 1AA"), detail_postcode=None)
    assert row is not None
    assert row["Postcode"] == "SE5 1AA"
    assert row["Postcode source"] == "coordinates"
    # OnTheMarket exposes neither a UPRN nor a house number, so both stay empty.
    assert row["UPRN"] is None
    assert row["Property number or name"] is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_transform_detail_postcode_via_search_address_outcode():
    """Without a detail postcode, a full card-address postcode still wins."""
    # When the card address already carries a full postcode that agrees with the
    # coordinates, the existing "address" source still applies absent a detail
    # postcode — detail recovery never regresses that path.
    raw = dict(_RAW_LISTING, address="Padfield Road, London, SE5 1AA")
    index = _StubIndex("SE5 1AA")
    out = transform_property(raw, index, detail_postcode=None)
    # Guard first so a dropped listing fails with a clear assertion instead of
    # a TypeError raised by subscripting None.
    assert out is not None
    assert out["Postcode"] == "SE5 1AA"
    assert out["Postcode source"] == "address"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _fetch_detail_postcode caching (no real network)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_detail_postcode_is_cached(monkeypatch):
    """A cached listing id is answered from the cache without touching the client."""
    # Seed only the entry under test. monkeypatch undoes setitem at teardown, so
    # the entry is removed even if an assertion below fails, and any unrelated
    # entries already in the shared module-level cache are left alone.
    monkeypatch.setitem(onthemarket._detail_postcode_cache, "19522441", "SE5 9AA")

    def _boom(*args, **kwargs):  # pragma: no cover - must never be called
        raise AssertionError("network was hit despite a cached value")

    # Any use of the client would explode; the cache hit must short-circuit first.
    stub_client = type("C", (), {"get": _boom})()
    result = onthemarket._fetch_detail_postcode(
        client=stub_client,
        details_url="/details/19522441/",
        listing_id="19522441",
    )
    assert result == "SE5 9AA"
|
||||||
113
finder/test_rightmove.py
Normal file
113
finder/test_rightmove.py
Normal file
|
|
@ -0,0 +1,113 @@
|
||||||
|
"""Tests for the Rightmove detail-page postcode extractor.
|
||||||
|
|
||||||
|
The search API only returns an outcode-level ``displayAddress``; the property's
|
||||||
|
TRUE full postcode lives on its detail page inside ``window.__PAGE_MODEL`` as
|
||||||
|
``propertyData.address.{outcode, incode}``. ``parse_detail_postcode`` recovers
|
||||||
|
it. These tests build a faithful __PAGE_MODEL: a devalue-style flattened object
|
||||||
|
graph whose ``data`` field is a JSON STRING of a flat array where every integer
|
||||||
|
inside a container is an index reference into that same array.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
from rightmove import _extract_page_model_literal, parse_detail_postcode
|
||||||
|
|
||||||
|
|
||||||
|
def _page_model_html(flat: list, *, encoding: str = "json") -> str:
    """Embed a flattened object-graph array in a detail-page style <script>.

    The array is JSON-encoded into the ``data`` field of the outer object, and
    that object is JSON-encoded again as the ``window.__PAGE_MODEL`` literal,
    so the array's quotes arrive escaped.
    """
    compact = {"separators": (",", ":")}
    model = json.dumps(
        {"data": json.dumps(flat, **compact), "encoding": encoding}, **compact
    )
    lines = [
        "<html><head></head><body>",
        "<script>",
        f" window.__PAGE_MODEL = {model};",
        "</script>",
        "</body></html>",
    ]
    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
# A faithful slice of a real listing: root -> propertyData -> address, with a
|
||||||
|
# decoy nearestStations array (which carries NO postcodes on the live page) to
|
||||||
|
# prove the parser anchors on the property's own address, not a nearby POI.
|
||||||
|
_FLAT_SW9 = [
    # 0: root
    {"propertyData": 1},
    # 1: propertyData — integer values are indexes into this same list
    {"id": "89089584", "address": 2, "location": 4, "nearestStations": 6},
    # 2: address
    {
        "displayAddress": "Caldwell Street, Stockwell",
        "countryCode": "GB",
        "ukCountry": "England",
        "outcode": "SW9",
        "incode": "0HD",
    },
    # 3: filler
    None,
    # 4: location
    {"latitude": 51.477238, "longitude": -0.116819, "pinType": "ACCURATE_POINT"},
    # 5: filler
    None,
    # 6: nearestStations (references to entries 7 and 8)
    [7, 8],
    # 7, 8: stations, neither of which carries a postcode
    {"name": "Oval Station", "distance": 0.36},
    {"name": "Stockwell Station", "distance": 0.41},
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parses_full_postcode_from_outcode_and_incode() -> None:
    """Outcode 'SW9' and incode '0HD' are combined into 'SW9 0HD'."""
    page = _page_model_html(_FLAT_SW9)
    parsed = parse_detail_postcode(page)
    assert parsed == "SW9 0HD"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_page_model_literal_brace_matches_nested_object() -> None:
    """The extracted literal spans the whole outer object."""
    # Extraction must not stop at the first closing brace, which sits inside
    # the escaped ``data`` string rather than at the end of the object.
    literal = _extract_page_model_literal(_page_model_html(_FLAT_SW9))
    assert literal is not None
    assert literal.startswith("{")
    assert literal.endswith("}")
    # It parses back into a dict with exactly the expected top-level keys.
    assert json.loads(literal).keys() == {"data", "encoding"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalises_unspaced_incode() -> None:
    """Lower-case outcode/incode parts come back as an upper-case postcode."""
    nodes = [node.copy() if isinstance(node, dict) else node for node in _FLAT_SW9]
    nodes[2] = dict(_FLAT_SW9[2], outcode="e20", incode="1fh")
    parsed = parse_detail_postcode(_page_model_html(nodes))
    assert parsed == "E20 1FH"
|
||||||
|
|
||||||
|
|
||||||
|
def test_returns_none_when_address_missing() -> None:
    """A propertyData node with no ``address`` reference yields None."""
    # The caller keeps its coordinate fallback in this case, so the parser has
    # to return None rather than raise.
    root = {"propertyData": 1}
    property_data = {"id": "1", "location": 2}
    location = {"latitude": 51.5, "longitude": -0.1}
    page = _page_model_html([root, property_data, location])
    assert parse_detail_postcode(page) is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_returns_none_when_incode_blank() -> None:
    """An address whose incode is the empty string yields None."""
    nodes = [node.copy() if isinstance(node, dict) else node for node in _FLAT_SW9]
    nodes[2] = dict(_FLAT_SW9[2], incode="")
    parsed = parse_detail_postcode(_page_model_html(nodes))
    assert parsed is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_returns_none_for_non_postcode_pair() -> None:
    """An outcode/incode pair that is not postcode-shaped yields None."""
    nodes = [node.copy() if isinstance(node, dict) else node for node in _FLAT_SW9]
    nodes[2] = dict(_FLAT_SW9[2], outcode="NOTAPC", incode="ZZ")
    parsed = parse_detail_postcode(_page_model_html(nodes))
    assert parsed is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_returns_none_without_page_model() -> None:
    """Pages with no usable __PAGE_MODEL all yield None."""
    pages = (
        "",
        "<html><body>no model</body></html>",
        # Malformed JSON in the data field must degrade gracefully.
        '<script>window.__PAGE_MODEL = {"data":"[not json"};</script>',
    )
    for page in pages:
        assert parse_detail_postcode(page) is None
|
||||||
|
|
@ -1,13 +1,19 @@
|
||||||
from transform import (
|
from transform import (
|
||||||
|
build_register_address,
|
||||||
clean_listing_address,
|
clean_listing_address,
|
||||||
extract_full_postcode,
|
extract_full_postcode,
|
||||||
|
extract_outcode,
|
||||||
|
resolve_listing_postcode,
|
||||||
transform_property,
|
transform_property,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class StubPostcodeIndex:
|
class StubPostcodeIndex:
|
||||||
|
def __init__(self, postcode: str = "SW1A 9ZZ") -> None:
|
||||||
|
self._postcode = postcode
|
||||||
|
|
||||||
def nearest(self, lat: float, lng: float) -> str:
|
def nearest(self, lat: float, lng: float) -> str:
|
||||||
return "SW1A 9ZZ"
|
return self._postcode
|
||||||
|
|
||||||
|
|
||||||
def test_extract_full_postcode_normalizes_spacing() -> None:
|
def test_extract_full_postcode_normalizes_spacing() -> None:
|
||||||
|
|
@ -24,6 +30,46 @@ def test_clean_listing_address_removes_postcode_and_outcode_suffixes() -> None:
|
||||||
assert clean_listing_address("Kings Avenue, Bromley") == "Kings Avenue, Bromley"
|
assert clean_listing_address("Kings Avenue, Bromley") == "Kings Avenue, Bromley"
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_register_address_prepends_house_number_or_name() -> None:
    """Table-driven check of how a house number/name is merged into the address."""
    cases = [
        # House number/name is prepended and the trailing outcode is stripped.
        ("South Street, Bromley BR1", "12", "12, South Street, Bromley"),
        ("Riverside, Martham NR29", "Martham Mill", "Martham Mill, Riverside, Martham"),
        # No number/name: nothing is prepended.
        ("Kings Avenue, Bromley", None, "Kings Avenue, Bromley"),
        # Address already starts with the number/name: it is not duplicated.
        ("12 South Street, Bromley", "12", "12 South Street, Bromley"),
        # A whitespace-only number/name is ignored.
        ("Kings Avenue, Bromley", " ", "Kings Avenue, Bromley"),
    ]
    for address, number_or_name, expected in cases:
        assert build_register_address(address, number_or_name) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_outcode() -> None:
    """extract_outcode handles spaced, lower-case, unspaced and empty input."""
    expectations = [
        ("SW1A 2AA", "SW1A"),
        ("n4 2ha", "N4"),
        ("SW1A2AA", "SW1A"),
    ]
    for postcode, outcode in expectations:
        assert extract_outcode(postcode) == outcode
    for missing in (None, ""):
        assert extract_outcode(missing) is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_listing_postcode() -> None:
    """The extracted postcode is trusted only when its outcode matches."""
    inferred = "SW1A 9ZZ"
    fallback = (inferred, "coordinates")
    cases = [
        # Outcode matches -> the more precise extracted postcode is kept.
        ("SW1A 2AA", ("SW1A 2AA", "address")),
        # Outcode mismatch -> the coordinate-inferred postcode is used.
        ("E14 9SS", fallback),
        # Well-formed but fabricated postcode in another outcode is rejected.
        ("ZZ9 9ZZ", fallback),
        # Nothing extracted -> the inferred postcode is authoritative.
        (None, fallback),
    ]
    for extracted, expected in cases:
        assert resolve_listing_postcode(extracted, inferred) == expected
|
||||||
|
|
||||||
|
|
||||||
def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
|
def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
|
||||||
prop = {
|
prop = {
|
||||||
"id": "123",
|
"id": "123",
|
||||||
|
|
@ -46,3 +92,84 @@ def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
|
||||||
assert result["Inferred postcode"] == "SW1A 9ZZ"
|
assert result["Inferred postcode"] == "SW1A 9ZZ"
|
||||||
assert result["Listing raw address"] == "Flat 2, 10 Downing Street, SW1A 2AA"
|
assert result["Listing raw address"] == "Flat 2, 10 Downing Street, SW1A 2AA"
|
||||||
assert result["Address per Property Register"] == "Flat 2, 10 Downing Street"
|
assert result["Address per Property Register"] == "Flat 2, 10 Downing Street"
|
||||||
|
|
||||||
|
|
||||||
|
def test_rightmove_transform_rejects_postcode_from_wrong_outcode() -> None:
    """An address postcode in the wrong outcode is recorded but not adopted."""
    listing = {
        "id": "124",
        "location": {"latitude": 51.5, "longitude": -0.1},
        "price": {"amount": 750000, "displayPrices": []},
        "propertySubType": "Terraced",
        "bedrooms": 3,
        "bathrooms": 1,
        "keyFeatures": [],
        "propertyUrl": "/properties/124",
        # E14 differs from the outcode of the stub's coordinate-nearest postcode.
        "displayAddress": "10 Downing Street, E14 9SS",
    }

    row = transform_property(listing, "SW1A", StubPostcodeIndex())

    assert row is not None
    # The coordinate-inferred postcode wins; the mismatching extracted one is
    # still surfaced in its own column.
    assert row["Postcode"] == "SW1A 9ZZ"
    assert row["Postcode source"] == "coordinates"
    assert row["Extracted postcode"] == "E14 9SS"
|
||||||
|
|
||||||
|
|
||||||
|
def _rightmove_prop() -> dict:
    """Return a fresh search-result listing whose address ends in outcode SW9."""
    return dict(
        id="200",
        location={"latitude": 51.5, "longitude": -0.1},
        price={"amount": 750000, "displayPrices": []},
        propertySubType="Terraced",
        bedrooms=3,
        bathrooms=1,
        keyFeatures=[],
        propertyUrl="/properties/200",
        # The search API only ever exposes the outcode in the display address.
        displayAddress="Caldwell Street, Stockwell, SW9",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_rightmove_transform_prefers_detail_postcode() -> None:
    """A detail postcode in the coordinate outcode beats the nearest guess."""
    index = StubPostcodeIndex("SW9 7AA")
    row = transform_property(_rightmove_prop(), "SW9", index, detail_postcode="SW9 0HD")

    assert row is not None
    assert row["Postcode"] == "SW9 0HD"
    assert row["Postcode source"] == "detail_address"
    # The coordinate inference is still reported in its own column.
    assert row["Inferred postcode"] == "SW9 7AA"
|
||||||
|
|
||||||
|
|
||||||
|
def test_rightmove_transform_rejects_detail_postcode_from_wrong_outcode() -> None:
    """A detail postcode in another outcode must not relocate the listing."""
    index = StubPostcodeIndex("SW9 7AA")
    row = transform_property(_rightmove_prop(), "SW9", index, detail_postcode="E14 9SS")

    assert row is not None
    # E14 disagrees with SW9, so the coordinate postcode is reported instead.
    assert row["Postcode"] == "SW9 7AA"
    assert row["Postcode source"] == "coordinates"
|
||||||
|
|
||||||
|
|
||||||
|
def test_rightmove_transform_without_detail_keeps_coordinate_logic() -> None:
    """Omitting detail_postcode leaves the coordinate-nearest result in place."""
    index = StubPostcodeIndex("SW9 7AA")
    row = transform_property(_rightmove_prop(), "SW9", index)

    assert row is not None
    assert row["Postcode"] == "SW9 7AA"
    assert row["Postcode source"] == "coordinates"
|
||||||
|
|
|
||||||
288
finder/test_zoopla.py
Normal file
288
finder/test_zoopla.py
Normal file
|
|
@ -0,0 +1,288 @@
|
||||||
|
from zoopla import _detail_cache_key, parse_detail_geo, transform_property
|
||||||
|
|
||||||
|
|
||||||
|
def test_detail_cache_key_uses_listing_id() -> None:
    """The cache key is the listing id when the URL has one, else the URL."""
    cases = {
        "/for-sale/details/59888978/": "59888978",
        "https://www.zoopla.co.uk/for-sale/details/59888978/": "59888978",
        # No id in the URL -> the URL itself is the key.
        "/for-sale/property/br1/": "/for-sale/property/br1/",
    }
    for url, key in cases.items():
        assert _detail_cache_key(url) == key
|
||||||
|
|
||||||
|
|
||||||
|
class StubPostcodeIndex:
    """Stand-in spatial index that answers every nearest() call identically."""

    def __init__(self, postcode: str = "BR1 2AB") -> None:
        """Remember the postcode that nearest() will hand back."""
        self._fixed_postcode = postcode

    def nearest(self, lat: float, lng: float) -> str:
        """Return the configured postcode; the coordinates are not consulted."""
        return self._fixed_postcode
|
||||||
|
|
||||||
|
|
||||||
|
# London-ish postcodes with coordinates, plus the Norfolk sample used by the
|
||||||
|
# verified detail-page snippet (well inside the England bounds check).
|
||||||
|
# Postcode -> (latitude, longitude) lookup handed to transform_property.
PC_COORDS = dict(
    [
        ("BR1 2AB", (51.40, 0.01)),
        ("SW1A 1AA", (51.50, -0.14)),
        ("NR29 4RG", (52.716014, 1.614495)),
    ]
)
|
||||||
|
|
||||||
|
# Verified RSC `location` object (listing 59888978), as it appears escaped inside
|
||||||
|
# a self.__next_f flight chunk in page.content().
|
||||||
|
# Raw string pieces keep each backslash-escaped quote (\") exactly as written.
_LOCATION_ESCAPED = (
    '<script>self.__next_f.push([1,"...'
    r'\"location\":{\"outcode\":\"NR29\",'
    r'\"coordinates\":{\"latitude\":52.716014,\"longitude\":1.614495},'
    r'\"uprn\":\"10023461458\",\"postalCode\":\"NR29 4RG\",'
    r'\"propertyNumberOrName\":\"Martham Mill\"}'
    '..."])</script>'
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_detail_geo_location_object_escaped() -> None:
    """An escaped `location` object is parsed into the full geo record."""
    expected = {
        "lat": 52.716014,
        "lng": 1.614495,
        "postcode": "NR29 4RG",
        "outcode": "NR29",
        "source": "detail_location",
        "uprn": "10023461458",
        "number_or_name": "Martham Mill",
        # The snippet has no `address` twin, hence no full street address.
        "full_address": None,
    }
    assert parse_detail_geo(_LOCATION_ESCAPED, search_outcode="NR29") == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_detail_geo_location_object_unescaped() -> None:
    """The same `location` object is recognised when its quotes are unescaped."""
    snippet = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    parsed = parse_detail_geo(snippet)
    assert parsed is not None
    assert parsed["source"] == "detail_location"
    assert parsed["postcode"] == "NR29 4RG"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_detail_geo_address_twin() -> None:
    """A lone `address` object supplies coordinates, postcode, uprn and street."""
    snippet = (
        '"address":{"fullAddress":"Riverside, Martham NR29",'
        '"latitude":52.716014,"longitude":1.614495,'
        '"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}'
    )
    parsed = parse_detail_geo(snippet)
    assert parsed is not None
    assert parsed["source"] == "detail_address_obj"
    position = (parsed["lat"], parsed["lng"], parsed["postcode"])
    assert position == (52.716014, 1.614495, "NR29 4RG")
    assert parsed["uprn"] == "10023461458"
    assert parsed["full_address"] == "Riverside, Martham NR29"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_detail_geo_merges_location_uprn_with_address_full_address() -> None:
    """fullAddress from the `address` twin is attached when the uprns match."""
    # The `location` object holds the uprn and house number/name; the `address`
    # twin holds the full street address. Both carry uprn 10023461458.
    location_obj = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG",'
        '"propertyNumberOrName":"Martham Mill"}'
    )
    address_obj = (
        '"address":{"fullAddress":"Riverside, Martham NR29",'
        '"latitude":52.716014,"longitude":1.614495,'
        '"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}'
    )
    parsed = parse_detail_geo(location_obj + address_obj)
    assert parsed is not None
    assert parsed["source"] == "detail_location"
    assert parsed["uprn"] == "10023461458"
    assert parsed["number_or_name"] == "Martham Mill"
    assert parsed["full_address"] == "Riverside, Martham NR29"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_detail_geo_does_not_borrow_comparable_full_address() -> None:
    """An `address` twin with a different uprn never supplies full_address."""
    # The only `address` object here carries uprn 99999999 (a comparable
    # listing), so full_address must stay None rather than take its street.
    location_obj = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    comparable_obj = (
        '"address":{"fullAddress":"Some Comparable, Elsewhere EN2",'
        '"latitude":51.65,"longitude":-0.08,"uprn":"99999999"}'
    )
    parsed = parse_detail_geo(location_obj + comparable_obj)
    assert parsed is not None
    assert parsed["uprn"] == "10023461458"
    assert parsed["full_address"] is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_detail_geo_ignores_poi_coordinates() -> None:
    """Coordinates outside a `location` wrapper (a POI) are not picked up."""
    # The charger POI comes first and its coordinates are NOT wrapped in a
    # "location" object; the property's own wrapper follows it.
    charger = (
        '"name":"Martham Community Centre","numberOfConnectors":2,'
        '"postcode":"NR29 4SN","coordinates":{"latitude":52.699379,"longitude":1.62921}'
    )
    listing = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    parsed = parse_detail_geo(charger + listing)
    assert parsed is not None
    assert parsed["source"] == "detail_location"
    # The property's coordinates are returned, not the community centre's.
    assert (parsed["lat"], parsed["lng"]) == (52.716014, 1.614495)
    assert parsed["postcode"] == "NR29 4RG"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_detail_geo_prefers_location_matching_search_outcode() -> None:
    """With two location objects, the search outcode decides which one wins."""
    en2_location = (
        '"location":{"outcode":"EN2",'
        '"coordinates":{"latitude":51.65,"longitude":-0.08},'
        '"postalCode":"EN2 6SN"}'
    )
    nr29_location = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"postalCode":"NR29 4RG"}'
    )
    page = en2_location + nr29_location

    # A search outcode selects the location object in that outcode.
    matched = parse_detail_geo(page, search_outcode="NR29")
    assert matched is not None
    assert matched["postcode"] == "NR29 4RG"

    # Without one, the first object in document order is returned.
    first = parse_detail_geo(page)
    assert first is not None
    assert first["postcode"] == "EN2 6SN"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_detail_geo_rejects_out_of_england() -> None:
    """A location at latitude 10, longitude 10 is rejected outright."""
    snippet = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":10.0,"longitude":10.0},'
        '"uprn":"1","postalCode":"NR29 4RG"}'
    )
    parsed = parse_detail_geo(snippet)
    assert parsed is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_detail_geo_drops_inconsistent_postcode() -> None:
    """A postalCode outside the object's own outcode is dropped, coords kept."""
    # postalCode AB12 3CD disagrees with the object's outcode NR29.
    snippet = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"1","postalCode":"AB12 3CD"}'
    )
    parsed = parse_detail_geo(snippet)
    assert parsed is not None
    assert parsed["lat"] == 52.716014
    assert parsed["postcode"] is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_detail_geo_returns_none_for_garbage() -> None:
    """Inputs with no property location/address wrapper all yield None."""
    inputs = (
        "<html><body>no data here</body></html>",
        "",
        # Coordinates that sit outside any location/address wrapper (e.g. an
        # unwrapped POI) yield nothing, so the caller degrades to the outcode.
        '"name":"X","coordinates":{"latitude":51.5,"longitude":-0.1}',
    )
    for text in inputs:
        assert parse_detail_geo(text) is None
|
||||||
|
|
||||||
|
|
||||||
|
def _raw(**overrides) -> dict:
    """Return a fresh baseline search listing with ``overrides`` applied on top."""
    baseline = {
        "id": "123",
        "url": "/for-sale/details/123/",
        "address": "South Street, Bromley BR1",
        "price": 500000,
        "beds": 2,
        "baths": 1,
        "property_type": "Flat",
    }
    return {**baseline, **overrides}
|
||||||
|
|
||||||
|
|
||||||
|
def test_transform_uses_detail_coordinates_with_agreeing_postcode() -> None:
    """Detail coordinates are used and an agreeing detail postcode is trusted."""
    detail = dict(lat=51.401, lng=0.011, postcode="BR1 3CD", outcode="BR1")
    index = StubPostcodeIndex("BR1 2AB")
    row = transform_property(_raw(), index, PC_COORDS, search_outcode="BR1", detail=detail)
    assert row is not None
    # The detail postcode shares outcode BR1 with the coordinate-nearest one.
    assert row["Postcode"] == "BR1 3CD"
    assert row["Postcode source"] == "detail_address"
    assert row["Inferred postcode"] == "BR1 2AB"
    assert (row["lat"], row["lon"]) == (51.401, 0.011)
|
||||||
|
|
||||||
|
|
||||||
|
def test_transform_uses_nearest_when_detail_postcode_mismatches() -> None:
    """A detail postcode in another outcode loses to the spatial postcode."""
    detail = dict(lat=51.401, lng=0.011, postcode="E14 9SS", outcode="E14")
    index = StubPostcodeIndex("BR1 2AB")
    row = transform_property(_raw(), index, PC_COORDS, search_outcode="BR1", detail=detail)
    assert row is not None
    # E14 9SS is rejected; the coordinate-nearest BR1 2AB is reported instead.
    assert row["Postcode"] == "BR1 2AB"
    assert row["Postcode source"] == "detail_coordinates"
|
||||||
|
|
||||||
|
|
||||||
|
def test_transform_geocodes_detail_postcode_without_coordinates() -> None:
    """A detail postcode with no coordinates is placed via the PC_COORDS lookup."""
    detail = dict(lat=None, lng=None, postcode="SW1A 1AA", outcode="SW1A")
    index = StubPostcodeIndex()
    row = transform_property(_raw(), index, PC_COORDS, search_outcode="BR1", detail=detail)
    assert row is not None
    assert row["Postcode"] == "SW1A 1AA"
    assert row["Postcode source"] == "detail_address"
    assert (row["lat"], row["lon"]) == PC_COORDS["SW1A 1AA"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_transform_without_detail_falls_back_to_search_outcode() -> None:
    """No detail and no outcode in the address -> the search outcode is used."""
    listing = _raw(address="A street with no postcode")
    index = StubPostcodeIndex()
    row = transform_property(listing, index, PC_COORDS, search_outcode="BR1", detail=None)
    assert row is not None
    assert row["Postcode"] == "BR1 2AB"
    assert row["Postcode source"] == "search_outcode"
    # Without a detail page there is no UPRN or house number to report.
    assert row["UPRN"] is None
    assert row["Property number or name"] is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_transform_emits_uprn_and_house_numbered_address_from_detail() -> None:
    """UPRN, house number and full address from the detail reach the output."""
    detail = dict(
        lat=51.401,
        lng=0.011,
        postcode="BR1 3CD",
        outcode="BR1",
        uprn="100023461458",
        number_or_name="12",
        full_address="South Street, Bromley BR1",
    )
    index = StubPostcodeIndex("BR1 2AB")
    row = transform_property(_raw(), index, PC_COORDS, search_outcode="BR1", detail=detail)
    assert row is not None
    assert row["UPRN"] == "100023461458"
    assert row["Property number or name"] == "12"
    # The raw address is the detail's full address, and the register address
    # has the house number prepended with the trailing outcode removed.
    assert row["Listing raw address"] == "South Street, Bromley BR1"
    assert row["Address per Property Register"] == "12, South Street, Bromley"
|
||||||
|
|
||||||
|
|
||||||
|
def test_transform_ignores_out_of_england_detail_coords() -> None:
    """Detail coordinates at (10, 10) are discarded in favour of the address."""
    detail = dict(lat=10.0, lng=10.0, postcode="ZZ9 9ZZ", outcode="ZZ9")
    index = StubPostcodeIndex("BR1 2AB")
    row = transform_property(_raw(), index, PC_COORDS, search_outcode="BR1", detail=detail)
    assert row is not None
    # With the bad coordinates dropped, the address outcode (BR1) is used.
    assert row["Postcode source"] == "address_outcode"
    assert 49 <= row["lat"] <= 56
|
||||||
|
|
@ -205,6 +205,41 @@ def extract_full_postcode(text: str | None) -> str | None:
|
||||||
return normalize_postcode(match.group(1))
|
return normalize_postcode(match.group(1))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_outcode(postcode: str | None) -> str | None:
    """Return the outward code (district) of a UK postcode.

    Example: ``'SW1A 1AA'`` → ``'SW1A'``. The input is normalized first and the
    text before the first space is taken as the outcode. ``None`` is returned
    for a missing/empty postcode or when nothing precedes the first space.
    """
    if not postcode:
        return None
    outward, _, _ = normalize_postcode(postcode).partition(" ")
    if outward:
        return outward
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_listing_postcode(
    extracted_postcode: str | None, inferred_postcode: str
) -> tuple[str, str]:
    """Choose the authoritative postcode for a listing as ``(postcode, source)``.

    A postcode extracted from the address text is more precise than the
    coordinate-nearest one, but it can only be trusted when it agrees with the
    listing's location: a stale, mistyped or well-formed-but-fabricated value
    (e.g. 'ZZ9 9ZZ') would otherwise silently override the spatially-correct
    postcode. The spatial index only offers a nearest-lookup, so agreement is
    judged by outcode: the extracted postcode wins (source ``"address"``) only
    when its outcode equals the inferred postcode's outcode. In every other
    case the inferred postcode is returned with source ``"coordinates"``.
    """
    # Nothing was extracted from the address: the coordinate guess is all we have.
    if not extracted_postcode:
        return inferred_postcode, "coordinates"

    same_district = extract_outcode(extracted_postcode) == extract_outcode(
        inferred_postcode
    )
    if same_district:
        return extracted_postcode, "address"

    log.debug(
        "Rejecting extracted postcode %s (outcode mismatch with inferred %s)",
        extracted_postcode,
        inferred_postcode,
    )
    return inferred_postcode, "coordinates"
|
||||||
|
|
||||||
|
|
||||||
def clean_listing_address(address: str | None) -> str:
|
def clean_listing_address(address: str | None) -> str:
|
||||||
"""Remove postcode/outcode suffixes from listing display addresses.
|
"""Remove postcode/outcode suffixes from listing display addresses.
|
||||||
|
|
||||||
|
|
@ -222,10 +257,48 @@ def clean_listing_address(address: str | None) -> str:
|
||||||
return cleaned.strip(" ,")
|
return cleaned.strip(" ,")
|
||||||
|
|
||||||
|
|
||||||
|
def build_register_address(
    raw_address: str | None, number_or_name: str | None = None
) -> str:
    """Build a Property Register-style address, prepending the house number/name.

    Listing display addresses are usually street-level ("South Street, Bromley")
    because the portals hide the exact unit. When a scraper can recover the
    property's own number or name (e.g. Zoopla detail pages expose
    ``propertyNumberOrName`` = "12" or "Martham Mill"), prepend it so the address
    carries the house identifier that the EPC/Price-Paid register addresses also
    use — turning a fuzzy street match into a near-exact one.

    Args:
        raw_address: The listing's display address; it is cleaned with
            ``clean_listing_address`` before use.
        number_or_name: The property's house number or name, if known.
            Surrounding whitespace is ignored; ``None``/blank means "unknown".

    Returns:
        The cleaned address, prefixed with ``"<number_or_name>, "`` unless the
        cleaned address already starts with that identifier as a whole token.
        When the cleaned address is empty the identifier alone is returned, and
        when no identifier is available the plain cleaned address is returned.
    """
    cleaned = clean_listing_address(raw_address)
    identifier = (number_or_name or "").strip()
    if not identifier:
        return cleaned
    if not cleaned:
        return identifier

    # Avoid duplicating a number/name the display address already starts with.
    # The match must end on a token boundary: a bare prefix test would treat
    # "2" as already present in "2nd Avenue" (or "Mill" in "Millfield Road")
    # and silently drop the house identifier.
    if cleaned.lower().startswith(identifier.lower()):
        following = cleaned[len(identifier) : len(identifier) + 1]
        if not following.isalnum():
            return cleaned

    return f"{identifier}, {cleaned}"
|
||||||
|
|
||||||
|
|
||||||
def transform_property(
|
def transform_property(
|
||||||
prop: dict, outcode: str, pc_index: PostcodeSpatialIndex
|
prop: dict,
|
||||||
|
outcode: str,
|
||||||
|
pc_index: PostcodeSpatialIndex,
|
||||||
|
detail_postcode: str | None = None,
|
||||||
) -> dict | None:
|
) -> dict | None:
|
||||||
"""Transform a raw Rightmove property dict into our output schema."""
|
"""Transform a raw Rightmove property dict into our output schema.
|
||||||
|
|
||||||
|
``detail_postcode`` is the property's TRUE full postcode recovered from its
|
||||||
|
detail page (see ``rightmove.parse_detail_postcode``); the search API itself
|
||||||
|
only exposes the outcode-level ``displayAddress``. When supplied and it
|
||||||
|
agrees with the coordinate-nearest postcode's outcode, it is preferred over
|
||||||
|
the coordinate guess and recorded with source ``"detail_address"``. A
|
||||||
|
detail postcode whose outcode disagrees with the location is discarded in
|
||||||
|
favour of the spatially-correct coordinate postcode, so a stale or wrong
|
||||||
|
detail value can never silently relocate a listing.
|
||||||
|
"""
|
||||||
loc = prop.get("location")
|
loc = prop.get("location")
|
||||||
if not loc:
|
if not loc:
|
||||||
return None
|
return None
|
||||||
|
|
@ -268,8 +341,25 @@ def transform_property(
|
||||||
return None
|
return None
|
||||||
raw_address = prop.get("displayAddress", "") or ""
|
raw_address = prop.get("displayAddress", "") or ""
|
||||||
extracted_postcode = extract_full_postcode(raw_address)
|
extracted_postcode = extract_full_postcode(raw_address)
|
||||||
postcode = extracted_postcode or inferred_postcode
|
|
||||||
postcode_source = "address" if extracted_postcode else "coordinates"
|
# Prefer the detail page's true full postcode when it agrees with the
|
||||||
|
# location; otherwise fall back to the (display-address-or-coordinate) logic.
|
||||||
|
detail_full = extract_full_postcode(detail_postcode)
|
||||||
|
if detail_full and extract_outcode(detail_full) == extract_outcode(
|
||||||
|
inferred_postcode
|
||||||
|
):
|
||||||
|
postcode, postcode_source = detail_full, "detail_address"
|
||||||
|
else:
|
||||||
|
if detail_full:
|
||||||
|
log.debug(
|
||||||
|
"Rejecting Rightmove detail postcode %s (outcode mismatch with "
|
||||||
|
"inferred %s)",
|
||||||
|
detail_full,
|
||||||
|
inferred_postcode,
|
||||||
|
)
|
||||||
|
postcode, postcode_source = resolve_listing_postcode(
|
||||||
|
extracted_postcode, inferred_postcode
|
||||||
|
)
|
||||||
|
|
||||||
property_url = prop.get("propertyUrl") or ""
|
property_url = prop.get("propertyUrl") or ""
|
||||||
if not isinstance(property_url, str):
|
if not isinstance(property_url, str):
|
||||||
|
|
@ -291,6 +381,9 @@ def transform_property(
|
||||||
"Inferred postcode": inferred_postcode,
|
"Inferred postcode": inferred_postcode,
|
||||||
"Listing raw address": raw_address,
|
"Listing raw address": raw_address,
|
||||||
"Address per Property Register": clean_listing_address(raw_address),
|
"Address per Property Register": clean_listing_address(raw_address),
|
||||||
|
# Rightmove's displayAddress is street-level; no UPRN/house number.
|
||||||
|
"UPRN": None,
|
||||||
|
"Property number or name": None,
|
||||||
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
|
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
|
||||||
"Property type": map_property_type(sub_type),
|
"Property type": map_property_type(sub_type),
|
||||||
"Property sub-type": normalize_sub_type(sub_type),
|
"Property sub-type": normalize_sub_type(sub_type),
|
||||||
|
|
|
||||||
381
finder/zoopla.py
381
finder/zoopla.py
|
|
@ -32,16 +32,24 @@ import httpx
|
||||||
from constants import (
|
from constants import (
|
||||||
DATA_DIR,
|
DATA_DIR,
|
||||||
DELAY_BETWEEN_PAGES,
|
DELAY_BETWEEN_PAGES,
|
||||||
|
GLUETUN_API_KEY,
|
||||||
|
GLUETUN_CONTROL_URL,
|
||||||
|
GLUETUN_MAX_ROTATIONS,
|
||||||
|
GLUETUN_PROXY,
|
||||||
MAX_BEDROOMS,
|
MAX_BEDROOMS,
|
||||||
PROPERTY_TYPE_MAP,
|
PROPERTY_TYPE_MAP,
|
||||||
ZOOPLA_BASE,
|
ZOOPLA_BASE,
|
||||||
|
ZOOPLA_DETAIL_GOTO_TIMEOUT_MS,
|
||||||
)
|
)
|
||||||
from spatial import PostcodeSpatialIndex
|
from spatial import PostcodeSpatialIndex
|
||||||
from transform import (
|
from transform import (
|
||||||
clean_listing_address,
|
build_register_address,
|
||||||
extract_full_postcode,
|
extract_full_postcode,
|
||||||
|
extract_outcode,
|
||||||
|
fix_coords,
|
||||||
normalize_sub_type,
|
normalize_sub_type,
|
||||||
parse_int_value,
|
parse_int_value,
|
||||||
|
resolve_listing_postcode,
|
||||||
validate_floor_area,
|
validate_floor_area,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -468,27 +476,20 @@ def _challenge_timeout_seconds() -> int:
|
||||||
# cookies (bound to the previous IP), then reload and re-check the challenge.
|
# cookies (bound to the previous IP), then reload and re-check the challenge.
|
||||||
|
|
||||||
|
|
||||||
_GLUETUN_API_KEY = "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K"
|
|
||||||
|
|
||||||
|
|
||||||
def _gluetun_base_url() -> str:
|
def _gluetun_base_url() -> str:
|
||||||
return os.environ.get("GLUETUN_URL", "http://gluetun:8000").rstrip("/")
|
return GLUETUN_CONTROL_URL.rstrip("/")
|
||||||
|
|
||||||
|
|
||||||
def _gluetun_api_key() -> str | None:
|
def _gluetun_api_key() -> str | None:
|
||||||
return _GLUETUN_API_KEY
|
return GLUETUN_API_KEY
|
||||||
|
|
||||||
|
|
||||||
def _gluetun_max_rotations() -> int:
|
def _gluetun_max_rotations() -> int:
|
||||||
raw = os.environ.get("GLUETUN_MAX_ROTATIONS", "3")
|
return max(GLUETUN_MAX_ROTATIONS, 0)
|
||||||
try:
|
|
||||||
value = int(raw)
|
|
||||||
except ValueError as exc:
|
|
||||||
raise ValueError("GLUETUN_MAX_ROTATIONS must be an integer") from exc
|
|
||||||
return max(value, 0)
|
|
||||||
|
|
||||||
|
|
||||||
def _gluetun_client() -> httpx.Client:
|
def _gluetun_client() -> httpx.Client:
|
||||||
|
# Talks to the control server directly (not through the VPN proxy).
|
||||||
headers = {}
|
headers = {}
|
||||||
api_key = _gluetun_api_key()
|
api_key = _gluetun_api_key()
|
||||||
if api_key:
|
if api_key:
|
||||||
|
|
@ -694,10 +695,19 @@ def launch_browser():
|
||||||
profile_dir.mkdir(parents=True, exist_ok=True)
|
profile_dir.mkdir(parents=True, exist_ok=True)
|
||||||
_remove_stale_profile_locks(profile_dir)
|
_remove_stale_profile_locks(profile_dir)
|
||||||
|
|
||||||
|
# Route the browser through the Gluetun VPN proxy when configured. (geoip
|
||||||
|
# fingerprint alignment is intentionally not enabled: it needs the optional
|
||||||
|
# camoufox[geoip] extra and would spoof to the VPN exit's country, which
|
||||||
|
# fights the en-GB locale unless the exit is in the UK.)
|
||||||
|
proxy_options: dict = {}
|
||||||
|
if GLUETUN_PROXY:
|
||||||
|
proxy_options = {"proxy": {"server": GLUETUN_PROXY}}
|
||||||
|
|
||||||
log.info(
|
log.info(
|
||||||
"Launching Camoufox browser for Zoopla (headless=%s, profile=%s)...",
|
"Launching Camoufox browser for Zoopla (headless=%s, profile=%s, proxy=%s)...",
|
||||||
headless_mode,
|
headless_mode,
|
||||||
profile_dir,
|
profile_dir,
|
||||||
|
GLUETUN_PROXY or "direct",
|
||||||
)
|
)
|
||||||
camoufox = Camoufox(
|
camoufox = Camoufox(
|
||||||
headless=headless_mode,
|
headless=headless_mode,
|
||||||
|
|
@ -705,6 +715,7 @@ def launch_browser():
|
||||||
user_data_dir=str(profile_dir),
|
user_data_dir=str(profile_dir),
|
||||||
locale=["en-GB", "en"],
|
locale=["en-GB", "en"],
|
||||||
enable_cache=True,
|
enable_cache=True,
|
||||||
|
**proxy_options,
|
||||||
)
|
)
|
||||||
raw_browser = camoufox.__enter__()
|
raw_browser = camoufox.__enter__()
|
||||||
browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
|
browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
|
||||||
|
|
@ -926,13 +937,47 @@ def _paginate(
|
||||||
page,
|
page,
|
||||||
total_results: int,
|
total_results: int,
|
||||||
max_properties: int | None = None,
|
max_properties: int | None = None,
|
||||||
|
fetch_detail=None,
|
||||||
|
detail_cap: int = 0,
|
||||||
|
detail_state: dict | None = None,
|
||||||
|
detail_deadline: float | None = None,
|
||||||
) -> list[dict]:
|
) -> list[dict]:
|
||||||
"""Extract listings from all pages of search results.
|
"""Extract listings from all pages of search results.
|
||||||
|
|
||||||
Page 1 is already loaded. For subsequent pages, follow Zoopla's rendered
|
Page 1 is already loaded. For subsequent pages, follow Zoopla's rendered
|
||||||
next link when present, otherwise advance via the pn=N URL parameter while
|
next link when present, otherwise advance via the pn=N URL parameter while
|
||||||
the advertised result count says more listings remain."""
|
the advertised result count says more listings remain.
|
||||||
|
|
||||||
|
When ``fetch_detail`` is supplied, each listing has its detail page fetched
|
||||||
|
(up to ``detail_cap`` fresh loads per outcode, counted in the shared
|
||||||
|
``detail_state`` dict, and only until ``detail_deadline``) and the parsed
|
||||||
|
geo stored under ``listing['_detail']`` for ``transform_property``. The
|
||||||
|
detail page is the only source of the listing's UPRN, full street address
|
||||||
|
and precise postcode, so it is fetched even when the search card already
|
||||||
|
pins a full postcode. Cached detail results are always attached but cost
|
||||||
|
neither a cap slot nor a delay."""
|
||||||
|
|
||||||
|
def _maybe_fetch(listing: dict) -> None:
|
||||||
|
if fetch_detail is None or detail_state is None:
|
||||||
|
return
|
||||||
|
url = listing.get("url", "")
|
||||||
|
cached = _detail_cache_key(url) in _detail_cache
|
||||||
|
if not cached:
|
||||||
|
# Fresh loads are bounded by the per-outcode cap and the wall-clock
|
||||||
|
# deadline so detail fetching never starves the SIGALRM budget that
|
||||||
|
# also guards the search pagination for this outcode.
|
||||||
|
if detail_state["fetched"] >= detail_cap:
|
||||||
|
return
|
||||||
|
if detail_deadline is not None and time.monotonic() >= detail_deadline:
|
||||||
|
return
|
||||||
|
listing["_detail"] = fetch_detail(url)
|
||||||
|
if not cached:
|
||||||
|
detail_state["fetched"] += 1
|
||||||
|
time.sleep(DELAY_BETWEEN_PAGES)
|
||||||
|
|
||||||
all_listings = _extract_listings(page)
|
all_listings = _extract_listings(page)
|
||||||
|
for listing in all_listings:
|
||||||
|
_maybe_fetch(listing)
|
||||||
if max_properties is not None and len(all_listings) >= max_properties:
|
if max_properties is not None and len(all_listings) >= max_properties:
|
||||||
return all_listings[:max_properties]
|
return all_listings[:max_properties]
|
||||||
|
|
||||||
|
|
@ -984,6 +1029,7 @@ def _paginate(
|
||||||
if listing["id"] not in seen_ids:
|
if listing["id"] not in seen_ids:
|
||||||
seen_ids.add(listing["id"])
|
seen_ids.add(listing["id"])
|
||||||
all_listings.append(listing)
|
all_listings.append(listing)
|
||||||
|
_maybe_fetch(listing)
|
||||||
new_count += 1
|
new_count += 1
|
||||||
if max_properties is not None and len(all_listings) >= max_properties:
|
if max_properties is not None and len(all_listings) >= max_properties:
|
||||||
return all_listings[:max_properties]
|
return all_listings[:max_properties]
|
||||||
|
|
@ -1053,6 +1099,214 @@ def _extract_outcode(text: str) -> str | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Detail-page geocoding
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# Zoopla search result cards only expose an outcode-level display address (e.g.
|
||||||
|
# "South Street, Bromley BR1"); the full postcode and precise coordinates exist
|
||||||
|
# only on each listing's detail page (/for-sale/details/{id}/). The detail page
|
||||||
|
# is a Next.js App Router route whose React Server Components flight stream
|
||||||
|
# embeds the property's own location object, e.g.
|
||||||
|
# "location":{"outcode":"NR29","coordinates":{"latitude":52.716,"longitude":1.614},
|
||||||
|
# "uprn":"10023461458","postalCode":"NR29 4RG",...}
|
||||||
|
# plus a twin "address":{"fullAddress":...,"latitude":...,"longitude":...,
|
||||||
|
# "outcode":...,"postcode":...,"uprn":...} feeding the map widgets.
|
||||||
|
# Nearby points of interest (stations, schools, EV chargers) and comparable
|
||||||
|
# listings carry their own "coordinates" too, but never inside the property's
|
||||||
|
# own "location" / "address":{"fullAddress" wrapper — so the wrapper, not a
|
||||||
|
# loose coordinates object, is what we anchor on (see parse_detail_geo).
|
||||||
|
|
||||||
|
# listingId -> parsed detail dict (or None). Failures are cached too, so a
|
||||||
|
# broken listing is not re-fetched within a run (the same listing reappears
|
||||||
|
# across overlapping outcode searches).
|
||||||
|
_detail_cache: dict[str, dict | None] = {}
|
||||||
|
|
||||||
|
_LISTING_ID_RE = re.compile(r"/details/(\d+)/?")
|
||||||
|
|
||||||
|
# The property's own location is carried by a `"location":{...}` wrapper and a
|
||||||
|
# twin `"address":{"fullAddress":...}` widget object. We anchor on those
|
||||||
|
# wrappers (and capture their full object body, which contains exactly one
|
||||||
|
# nested object — `coordinates`) rather than scanning for loose coordinate
|
||||||
|
# objects: nearby points of interest (stations/schools/EV chargers) and
|
||||||
|
# comparable/"similar" listings also embed coordinates, but never inside the
|
||||||
|
# property's own `"location"` / `"address":{"fullAddress"` wrapper, so the
|
||||||
|
# wrapper is the discriminator. Field order and an optional `uprn` are tolerated.
|
||||||
|
_DETAIL_LOCATION_RE = re.compile(r'"location":\{((?:[^{}]|\{[^{}]*\})*)\}')
|
||||||
|
_DETAIL_ADDRESS_RE = re.compile(r'"address":\{"fullAddress":"([^"]*)"((?:[^{}]|\{[^{}]*\})*)\}')
|
||||||
|
_DETAIL_COORDS_IN_BODY_RE = re.compile(
|
||||||
|
r'"coordinates":\{"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)\}'
|
||||||
|
)
|
||||||
|
_DETAIL_LATLNG_IN_BODY_RE = re.compile(
|
||||||
|
r'"latitude":(-?\d+\.\d+),"longitude":(-?\d+\.\d+)'
|
||||||
|
)
|
||||||
|
_DETAIL_OUTCODE_IN_BODY_RE = re.compile(r'"outcode":"([A-Z0-9]+)"')
|
||||||
|
# The location object spells it "postalCode"; the address twin uses "postcode".
|
||||||
|
_DETAIL_POSTCODE_IN_BODY_RE = re.compile(r'"(?:postalCode|postcode)":"([A-Z0-9 ]+)"')
|
||||||
|
# The UPRN (Unique Property Reference Number) appears in both the location and
|
||||||
|
# address objects and is the linchpin for an exact listing->EPC join (EPC open
|
||||||
|
# data is ~99% UPRN-keyed). propertyNumberOrName carries the house number/name
|
||||||
|
# (e.g. "12", "Martham Mill") only in the location object.
|
||||||
|
_DETAIL_UPRN_IN_BODY_RE = re.compile(r'"uprn":"(\d+)"')
|
||||||
|
_DETAIL_NUMBER_OR_NAME_IN_BODY_RE = re.compile(r'"propertyNumberOrName":"([^"]*)"')
|
||||||
|
|
||||||
|
|
||||||
|
def parse_detail_geo(html: str, search_outcode: str | None = None) -> dict | None:
    """Extract the property's own coordinates/postcode from a Zoopla detail page.

    Pure and browser-free: the live browser only produces the HTML string
    (``page.content()``); this does the parsing so it is unit-testable.

    Args:
        html: Full HTML of a listing detail page. Empty/None yields ``None``.
        search_outcode: Outcode being searched, if known. Used only as a
            tie-break to pick the right ``location`` object on pages that also
            embed comparable listings.

    Returns:
        ``{"lat", "lng", "postcode", "outcode", "source", "uprn",
        "number_or_name", "full_address"}`` or ``None`` when no property
        location wrapper with in-bounds coordinates is found. ``lat``/``lng``
        and ``source`` (``"detail_location"`` or ``"detail_address_obj"``) are
        always set; every other field may be ``None``. The ``uprn`` enables an
        exact listing->EPC join; ``number_or_name`` (house number/name) and
        ``full_address`` give a register-style address for the Price Paid join.

    Coordinates are bounds-checked to England and a postcode is kept only when
    it agrees with its own object's outcode. See the module-level comments on
    detail-page geocoding for the data model.
    """
    if not html:
        return None

    # RSC flight strings are embedded as escaped JS string literals, so quotes
    # and slashes arrive escaped; normalize them so the regexes match.
    buf = html.replace('\\"', '"').replace("\\u002F", "/").replace("\\/", "/")

    def in_england(lat: float, lng: float) -> tuple[float, float] | None:
        # Returns the (possibly adjusted) pair when it falls inside the
        # England bounding box, else None.
        # NOTE(review): fix_coords is defined elsewhere; presumably it repairs
        # swapped or malformed lat/lng pairs — confirm against its definition.
        lat, lng = fix_coords(lat, lng)
        if 49 <= lat <= 56 and -7 <= lng <= 2:
            return lat, lng
        return None

    def build(
        body: str,
        coords: tuple[float, float],
        source: str,
        full_address: str | None = None,
    ) -> dict:
        # Assemble the result dict from one wrapper object's body.
        # outcode and postcode are read from the SAME object body as the coords,
        # so the postcode is self-consistent; drop it only if it somehow isn't.
        outcode_match = _DETAIL_OUTCODE_IN_BODY_RE.search(body)
        outcode = outcode_match.group(1) if outcode_match else None
        postcode_match = _DETAIL_POSTCODE_IN_BODY_RE.search(body)
        postcode = extract_full_postcode(postcode_match.group(1)) if postcode_match else None
        if postcode and outcode and extract_outcode(postcode) != outcode.upper():
            postcode = None
        uprn_match = _DETAIL_UPRN_IN_BODY_RE.search(body)
        number_match = _DETAIL_NUMBER_OR_NAME_IN_BODY_RE.search(body)
        number_or_name = number_match.group(1).strip() if number_match else None
        return {
            "lat": coords[0],
            "lng": coords[1],
            "postcode": postcode,
            "outcode": outcode,
            "source": source,
            "uprn": uprn_match.group(1) if uprn_match else None,
            # A blank propertyNumberOrName is reported as None, not "".
            "number_or_name": number_or_name or None,
            "full_address": full_address,
        }

    def attach_full_address(result: dict | None) -> dict | None:
        # Mutates and returns `result`, filling in "full_address" when possible.
        # The house-numbered street address lives in the `address` map-widget
        # twin, not the `location` wrapper we anchor coordinates on. Pull it from
        # the twin that shares this property's uprn; when there is no uprn to
        # disambiguate, fall back to the first twin (document order = primary
        # listing), but never guess a twin when a uprn exists and none matches —
        # that would risk grabbing a comparable listing's address.
        if result is None or result.get("full_address"):
            return result
        target = result.get("uprn")
        first = None
        for match in _DETAIL_ADDRESS_RE.finditer(buf):
            full_address = match.group(1) or None
            if full_address is None:
                # Twins with an empty fullAddress carry nothing useful.
                continue
            if first is None:
                first = full_address
            uprn_match = _DETAIL_UPRN_IN_BODY_RE.search(match.group(2))
            if target and uprn_match and uprn_match.group(1) == target:
                result["full_address"] = full_address
                return result
        if target is None:
            # No uprn to match on: use the first non-empty twin (may be None).
            result["full_address"] = first
        return result

    # Strategy 1 — the property's own `location` wrapper (authoritative). Take
    # the first match (the primary listing precedes any comparables in the
    # flight stream), but prefer one whose outcode matches the searched outcode.
    first_location = None
    for match in _DETAIL_LOCATION_RE.finditer(buf):
        body = match.group(1)
        coords_match = _DETAIL_COORDS_IN_BODY_RE.search(body)
        if not coords_match:
            continue
        coords = in_england(float(coords_match.group(1)), float(coords_match.group(2)))
        if not coords:
            continue
        candidate = build(body, coords, "detail_location")
        if first_location is None:
            # Remember the first usable wrapper as the fallback answer.
            first_location = candidate
        if (
            search_outcode
            and candidate["outcode"]
            and candidate["outcode"].upper() == search_outcode.upper()
        ):
            return attach_full_address(candidate)
    if first_location is not None:
        return attach_full_address(first_location)

    # Strategy 2 — the `address` map-widget twin (same coordinates, backup).
    for match in _DETAIL_ADDRESS_RE.finditer(buf):
        full_address = match.group(1) or None
        body = match.group(2)
        latlng_match = _DETAIL_LATLNG_IN_BODY_RE.search(body)
        if not latlng_match:
            continue
        coords = in_england(float(latlng_match.group(1)), float(latlng_match.group(2)))
        if coords:
            return build(body, coords, "detail_address_obj", full_address=full_address)

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _detail_cache_key(listing_url: str) -> str:
    """Return the detail-cache key for a listing URL.

    The key is the numeric listing id taken from the ``/details/{id}/`` path
    segment; URLs without such a segment are used verbatim as their own key.
    """
    found = _LISTING_ID_RE.search(listing_url)
    if found is None:
        return listing_url
    return found.group(1)
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_listing_detail(
    detail_page,
    listing_url: str,
    search_outcode: str | None = None,
) -> dict | None:
    """Load a listing detail page and return its parsed geo dict (or None).

    Results (including failures) are cached by listingId. Ordinary navigation
    and extraction errors are swallowed so the caller can fall back to
    outcode-level resolution, but TurnstileError is allowed to propagate so the
    scraper's "Cloudflare ends the run" contract still holds. The goto timeout
    is kept short so one slow detail page can't eat the per-outcode budget.

    Args:
        detail_page: Browser page object used for navigation; it must provide
            ``goto(url, wait_until=..., timeout=...)`` and ``content()``.
        listing_url: Absolute or site-relative URL of the listing detail page.
            An empty/missing URL yields ``None`` without any navigation.
        search_outcode: Outcode being searched, forwarded to
            ``parse_detail_geo`` as a tie-break.

    Returns:
        The dict produced by ``parse_detail_geo``, or ``None`` when the page
        could not be loaded or no property location was found.

    Raises:
        TurnstileError: Propagated unchanged from the page load/challenge check.
    """
    # A card without a URL has no detail page. Without this guard the URL would
    # resolve to the bare site root, and whatever location the home page embeds
    # would be cached under the empty key and attached to every URL-less
    # listing. Cache the miss so repeat lookups stay free for the caller.
    if not listing_url:
        _detail_cache.setdefault("", None)
        return None

    cache_key = _detail_cache_key(listing_url)
    if cache_key in _detail_cache:
        return _detail_cache[cache_key]

    url = listing_url if listing_url.startswith("http") else ZOOPLA_BASE + listing_url
    result: dict | None = None
    try:
        detail_page.goto(
            url, wait_until="domcontentloaded", timeout=ZOOPLA_DETAIL_GOTO_TIMEOUT_MS
        )
        _ensure_not_challenged(detail_page)
        html = detail_page.content()
        result = parse_detail_geo(html, search_outcode=search_outcode)
    except TurnstileError:
        # A Cloudflare challenge must end the run; never mask it as a miss.
        raise
    except Exception as exc:
        # Deliberate best-effort: any other failure degrades to "no detail".
        log.debug("Zoopla detail fetch failed %s: %s", url, _exception_detail(exc))
        result = None

    # Failures are cached too, so a broken listing is not re-fetched this run.
    _detail_cache[cache_key] = result
    return result
|
||||||
|
|
||||||
|
|
||||||
def _map_property_type(raw_type: str | None) -> str:
|
def _map_property_type(raw_type: str | None) -> str:
|
||||||
"""Map Zoopla property type text to canonical type."""
|
"""Map Zoopla property type text to canonical type."""
|
||||||
if not raw_type:
|
if not raw_type:
|
||||||
|
|
@ -1109,28 +1363,64 @@ def transform_property(
|
||||||
pc_index: PostcodeSpatialIndex,
|
pc_index: PostcodeSpatialIndex,
|
||||||
pc_coords: dict[str, tuple[float, float]],
|
pc_coords: dict[str, tuple[float, float]],
|
||||||
search_outcode: str | None = None,
|
search_outcode: str | None = None,
|
||||||
|
detail: dict | None = None,
|
||||||
) -> dict | None:
|
) -> dict | None:
|
||||||
"""Transform a raw Zoopla listing dict into the standard output schema.
|
"""Transform a raw Zoopla listing dict into the standard output schema.
|
||||||
|
|
||||||
Zoopla search cards do not include coordinates, so we resolve lat/lng
|
Zoopla search cards only expose an outcode-level address, so precise
|
||||||
from postcodes extracted from the address text."""
|
location comes from the listing's detail page (see ``parse_detail_geo`` /
|
||||||
|
``_fetch_listing_detail``), passed in as ``detail``. When detail-page
|
||||||
|
coordinates are available we resolve the nearest postcode via the spatial
|
||||||
|
index — mirroring rightmove/onthemarket — and only fall back to the coarse
|
||||||
|
outcode centroid when no detail location could be obtained."""
|
||||||
price = parse_int_value(raw.get("price")) or 0
|
price = parse_int_value(raw.get("price")) or 0
|
||||||
|
|
||||||
address = raw.get("address", "") or ""
|
address = raw.get("address", "") or ""
|
||||||
|
|
||||||
# Resolve postcode and coordinates from address
|
|
||||||
extracted_postcode = extract_full_postcode(address)
|
extracted_postcode = extract_full_postcode(address)
|
||||||
postcode = extracted_postcode
|
detail = detail or {}
|
||||||
postcode_source = "address" if extracted_postcode else None
|
detail_postcode = extract_full_postcode(detail.get("postcode"))
|
||||||
|
# Detail-page address fields: the UPRN keys an exact EPC join, and the
|
||||||
|
# full street address / house number-or-name beat the outcode-level card
|
||||||
|
# address for the Price-Paid join. All three are absent unless the detail
|
||||||
|
# page was fetched, so every consumer must tolerate None.
|
||||||
|
detail_uprn = detail.get("uprn") or None
|
||||||
|
detail_full_address = detail.get("full_address") or None
|
||||||
|
detail_number_or_name = detail.get("number_or_name") or None
|
||||||
|
|
||||||
|
postcode = postcode_source = inferred_postcode = None
|
||||||
lat = lng = None
|
lat = lng = None
|
||||||
|
|
||||||
if postcode:
|
# (A) Best: detail-page coordinates -> nearest postcode (authoritative).
|
||||||
coords = pc_coords.get(postcode)
|
detail_lat, detail_lng = detail.get("lat"), detail.get("lng")
|
||||||
if coords:
|
if detail_lat is not None and detail_lng is not None:
|
||||||
lat, lng = coords
|
fixed_lat, fixed_lng = fix_coords(detail_lat, detail_lng)
|
||||||
|
if 49 <= fixed_lat <= 56 and -7 <= fixed_lng <= 2:
|
||||||
|
nearest = pc_index.nearest(fixed_lat, fixed_lng)
|
||||||
|
if nearest:
|
||||||
|
lat, lng, inferred_postcode = fixed_lat, fixed_lng, nearest
|
||||||
|
candidate = detail_postcode or extracted_postcode
|
||||||
|
postcode, resolved_source = resolve_listing_postcode(candidate, nearest)
|
||||||
|
postcode_source = (
|
||||||
|
"detail_address"
|
||||||
|
if resolved_source == "address"
|
||||||
|
else "detail_coordinates"
|
||||||
|
)
|
||||||
|
|
||||||
|
# (B) Detail-page postcode without usable coordinates -> geocode it.
|
||||||
|
if lat is None and detail_postcode and detail_postcode in pc_coords:
|
||||||
|
lat, lng = pc_coords[detail_postcode]
|
||||||
|
postcode = inferred_postcode = detail_postcode
|
||||||
|
postcode_source = "detail_address"
|
||||||
|
|
||||||
|
# (C) Full postcode in the search-card address -> geocode it.
|
||||||
|
if lat is None and extracted_postcode and extracted_postcode in pc_coords:
|
||||||
|
lat, lng = pc_coords[extracted_postcode]
|
||||||
|
postcode = extracted_postcode
|
||||||
|
postcode_source = "address"
|
||||||
|
|
||||||
|
# (D) Last resort: coarse outcode-level centroid (loses per-listing precision).
|
||||||
if lat is None:
|
if lat is None:
|
||||||
# Try outcode-level fallback from address text
|
|
||||||
addr_outcode = _extract_outcode(address)
|
addr_outcode = _extract_outcode(address)
|
||||||
if addr_outcode:
|
if addr_outcode:
|
||||||
result = _resolve_outcode_coords(addr_outcode, pc_coords)
|
result = _resolve_outcode_coords(addr_outcode, pc_coords)
|
||||||
|
|
@ -1138,7 +1428,6 @@ def transform_property(
|
||||||
postcode, lat, lng = result
|
postcode, lat, lng = result
|
||||||
postcode_source = "address_outcode"
|
postcode_source = "address_outcode"
|
||||||
|
|
||||||
# Final fallback: use the outcode we know we're searching
|
|
||||||
if lat is None and search_outcode:
|
if lat is None and search_outcode:
|
||||||
result = _resolve_outcode_coords(search_outcode, pc_coords)
|
result = _resolve_outcode_coords(search_outcode, pc_coords)
|
||||||
if result:
|
if result:
|
||||||
|
|
@ -1188,9 +1477,17 @@ def transform_property(
|
||||||
"Postcode": postcode,
|
"Postcode": postcode,
|
||||||
"Postcode source": postcode_source or "unknown",
|
"Postcode source": postcode_source or "unknown",
|
||||||
"Extracted postcode": extracted_postcode,
|
"Extracted postcode": extracted_postcode,
|
||||||
"Inferred postcode": postcode if postcode_source != "address" else None,
|
"Inferred postcode": (
|
||||||
"Listing raw address": address,
|
inferred_postcode
|
||||||
"Address per Property Register": clean_listing_address(address),
|
if inferred_postcode is not None
|
||||||
|
else (postcode if postcode_source != "address" else None)
|
||||||
|
),
|
||||||
|
"Listing raw address": detail_full_address or address,
|
||||||
|
"Address per Property Register": build_register_address(
|
||||||
|
detail_full_address or address, detail_number_or_name
|
||||||
|
),
|
||||||
|
"UPRN": detail_uprn,
|
||||||
|
"Property number or name": detail_number_or_name,
|
||||||
"Leasehold/Freehold": raw.get("tenure") or None,
|
"Leasehold/Freehold": raw.get("tenure") or None,
|
||||||
"Property type": _map_property_type(raw.get("property_type")),
|
"Property type": _map_property_type(raw.get("property_type")),
|
||||||
"Property sub-type": normalize_sub_type(raw.get("property_type")),
|
"Property sub-type": normalize_sub_type(raw.get("property_type")),
|
||||||
|
|
@ -1215,6 +1512,9 @@ def search_outcode(
|
||||||
pc_index: PostcodeSpatialIndex,
|
pc_index: PostcodeSpatialIndex,
|
||||||
pc_coords: dict[str, tuple[float, float]],
|
pc_coords: dict[str, tuple[float, float]],
|
||||||
max_properties: int | None = None,
|
max_properties: int | None = None,
|
||||||
|
detail_page=None,
|
||||||
|
detail_cap: int = 0,
|
||||||
|
detail_budget_seconds: float | None = None,
|
||||||
) -> tuple[list[dict], str | None]:
|
) -> tuple[list[dict], str | None]:
|
||||||
"""Search Zoopla for properties in one outcode.
|
"""Search Zoopla for properties in one outcode.
|
||||||
|
|
||||||
|
|
@ -1222,6 +1522,12 @@ def search_outcode(
|
||||||
search flow, extracts listings from rendered DOM, and transforms to the
|
search flow, extracts listings from rendered DOM, and transforms to the
|
||||||
standard output schema.
|
standard output schema.
|
||||||
|
|
||||||
|
When ``detail_page`` (a second browser tab) and a positive ``detail_cap``
|
||||||
|
are supplied, up to ``detail_cap`` listings per outcode have their detail
|
||||||
|
page fetched for a precise postcode (see ``_fetch_listing_detail``).
|
||||||
|
``detail_budget_seconds`` caps the wall-clock time spent fetching details so
|
||||||
|
the per-outcode timeout that also guards search pagination is never starved.
|
||||||
|
|
||||||
Returns (properties, search_url).
|
Returns (properties, search_url).
|
||||||
|
|
||||||
Raises TurnstileError if Cloudflare blocks us mid-session.
|
Raises TurnstileError if Cloudflare blocks us mid-session.
|
||||||
|
|
@ -1231,12 +1537,25 @@ def search_outcode(
|
||||||
|
|
||||||
total_results = _get_result_count(page)
|
total_results = _get_result_count(page)
|
||||||
|
|
||||||
|
fetch_detail = None
|
||||||
|
detail_deadline = None
|
||||||
|
if detail_page is not None and detail_cap > 0:
|
||||||
|
fetch_detail = lambda url: _fetch_listing_detail( # noqa: E731
|
||||||
|
detail_page, url, search_outcode=outcode
|
||||||
|
)
|
||||||
|
if detail_budget_seconds is not None:
|
||||||
|
detail_deadline = time.monotonic() + detail_budget_seconds
|
||||||
|
|
||||||
# Always try extraction even if result count is 0 — the count regex may
|
# Always try extraction even if result count is 0 — the count regex may
|
||||||
# not match Zoopla's current text format, but listings may still be in DOM
|
# not match Zoopla's current text format, but listings may still be in DOM
|
||||||
raw_listings = _paginate(
|
raw_listings = _paginate(
|
||||||
page,
|
page,
|
||||||
total_results,
|
total_results,
|
||||||
max_properties=max_properties,
|
max_properties=max_properties,
|
||||||
|
fetch_detail=fetch_detail,
|
||||||
|
detail_cap=detail_cap,
|
||||||
|
detail_state={"fetched": 0},
|
||||||
|
detail_deadline=detail_deadline,
|
||||||
)
|
)
|
||||||
if not raw_listings:
|
if not raw_listings:
|
||||||
if total_results > 0:
|
if total_results > 0:
|
||||||
|
|
@ -1252,7 +1571,11 @@ def search_outcode(
|
||||||
for raw in raw_listings:
|
for raw in raw_listings:
|
||||||
try:
|
try:
|
||||||
transformed = transform_property(
|
transformed = transform_property(
|
||||||
raw, pc_index, pc_coords, search_outcode=outcode
|
raw,
|
||||||
|
pc_index,
|
||||||
|
pc_coords,
|
||||||
|
search_outcode=outcode,
|
||||||
|
detail=raw.get("_detail"),
|
||||||
)
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
log.warning(
|
log.warning(
|
||||||
|
|
|
||||||
164
finder/zoopla_flaresolverr.py
Normal file
164
finder/zoopla_flaresolverr.py
Normal file
|
|
@ -0,0 +1,164 @@
|
||||||
|
"""Zoopla scraping via FlareSolverr (no browser/VNC needed).
|
||||||
|
|
||||||
|
FlareSolverr solves Zoopla's Cloudflare and returns the rendered HTML, which
|
||||||
|
still contains the React Server Components flight stream — so the existing pure
|
||||||
|
parsers work unchanged:
|
||||||
|
- the search page yields the outcode's listing detail URLs, and
|
||||||
|
- each detail page's flight stream carries the property's location object
|
||||||
|
(postcode + coordinates) that ``parse_detail_geo`` extracts, plus the
|
||||||
|
listing fields (price/beds/baths/tenure/floor area) parsed here.
|
||||||
|
|
||||||
|
Verified live (2026-05-30) against Zoopla through the Gluetun VPN: a warm
|
||||||
|
FlareSolverr session solves the SW9 search + detail pages and the flight data
|
||||||
|
is present (e.g. detail 73326946 -> SW9 0HD @ 51.477238,-0.116819).
|
||||||
|
|
||||||
|
This is selected by constants.ZOOPLA_FETCHER == "flaresolverr"; the Camoufox
|
||||||
|
path in zoopla.py remains for ZOOPLA_FETCHER == "camoufox".
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
|
||||||
|
from constants import DELAY_BETWEEN_PAGES, ZOOPLA_BASE
|
||||||
|
from flaresolverr import FlareSolverrError, FlareSolverrSession
|
||||||
|
from spatial import PostcodeSpatialIndex
|
||||||
|
from zoopla import _url_with_page, parse_detail_geo, transform_property
|
||||||
|
|
||||||
|
log = logging.getLogger("zoopla")
|
||||||
|
|
||||||
|
# Safety bound on how many search-result pages to walk per outcode.
|
||||||
|
_MAX_SERP_PAGES = 60
|
||||||
|
|
||||||
|
_DETAIL_PATH_RE = re.compile(r"/(?:for-sale|new-homes)/details/\d+/")
|
||||||
|
_LISTING_ID_RE = re.compile(r"/details/(\d+)/")
|
||||||
|
|
||||||
|
|
||||||
|
def _int(pattern: str, buf: str) -> int | None:
|
||||||
|
match = re.search(pattern, buf)
|
||||||
|
return int(match.group(1)) if match else None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_detail_listing(html: str) -> dict:
    """Pull the non-location listing fields out of a Zoopla detail page.

    Mirrors the fields the Camoufox SERP-card extractor produced, read from the
    detail page's flight stream (validated against real Zoopla detail HTML).
    Every field is best-effort: anything that cannot be found is returned as
    None so a listing with a known location is still emitted.

    Returns a dict with the keys ``price``, ``beds``, ``baths``,
    ``receptions``, ``floor_area_sqft``, ``tenure``, ``property_type`` and
    ``address``.
    """
    # Strip the backslash escaping of quotes and slashes so the key/value
    # regexes below can match plain ``"key":value`` text.
    buf = html.replace('\\"', '"').replace("\\/", "/")

    # Prefer "internalValue"; fall back to "priceUnformatted" when it is absent.
    price = _int(r'"internalValue":(\d+)', buf)
    if price is None:
        price = _int(r'"priceUnformatted":(\d+)', buf)

    tenure_hit = re.search(r'"tenure":"([a-zA-Z]+)"', buf)
    tenure = None if tenure_hit is None else tenure_hit.group(1).title()

    # Address + property type come from the page <title>, e.g.
    # "Caldwell Street, Stockwell SW9, 4 bed property for sale, £995,000 - Zoopla"
    title_hit = re.search(r'"children":"([^"]*? for sale[^"]*?)"', buf)
    # An empty title matches neither pattern below, which leaves both fields None.
    title = title_hit.group(1) if title_hit else ""

    addr_hit = re.match(r"(.+?),\s*\d+\s*bed", title)
    address = addr_hit.group(1).strip() if addr_hit else None

    # An explicit "propertyType" value always wins over the type guessed from
    # the title text.
    explicit_hit = re.search(r'"propertyType":"([^"]+)"', buf)
    if explicit_hit:
        property_type = explicit_hit.group(1)
    else:
        type_hit = re.search(r"\d+\s*bed\s+([\w\s-]+?)\s+for sale", title)
        property_type = type_hit.group(1).strip() if type_hit else None

    return {
        "price": price,
        "beds": _int(r'"numBedrooms":(\d+)', buf),
        "baths": _int(r'"numBaths":(\d+)', buf),
        "receptions": _int(r'"numLivingRooms":(\d+)', buf),
        "floor_area_sqft": _int(r'"sizeSqft":(\d+)', buf),
        "tenure": tenure,
        "property_type": property_type,
        "address": address,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _enumerate_detail_paths(fs: FlareSolverrSession, outcode: str, limit: int | None) -> list[str]:
    """Walk the outcode's search-result pages and collect listing detail paths.

    Pages are fetched through ``fs.get`` one at a time, at most
    ``_MAX_SERP_PAGES`` of them. Paths are de-duplicated by listing id and
    returned in first-seen order. The walk stops as soon as ``limit`` paths
    have been collected (when ``limit`` is not None) or a page contributes no
    new listing. Errors raised by ``fs.get`` are not caught here.
    """
    first_page = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"
    paths: list[str] = []
    known_ids: set[str] = set()
    for page_num in range(1, _MAX_SERP_PAGES + 1):
        if page_num == 1:
            page_url = first_page
        else:
            page_url = _url_with_page(first_page, page_num)
        html = fs.get(page_url)

        count_before = len(paths)
        for hit in _DETAIL_PATH_RE.finditer(html):
            path = hit.group(0)
            id_hit = _LISTING_ID_RE.search(path)
            # Fall back to the whole path as the de-duplication key if no id is found.
            listing_id = path if id_hit is None else id_hit.group(1)
            if listing_id not in known_ids:
                known_ids.add(listing_id)
                paths.append(path)
                if limit is not None and len(paths) >= limit:
                    return paths

        # A page that added nothing new ends the walk.
        if len(paths) == count_before:
            break
        time.sleep(DELAY_BETWEEN_PAGES)
    return paths
|
||||||
|
|
||||||
|
|
||||||
|
def search_outcode(
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    fs: FlareSolverrSession,
    max_properties: int | None = None,
    detail_cap: int = 0,
    detail_budget_seconds: float | None = None,
) -> tuple[list[dict], str | None]:
    """Scrape one outcode via FlareSolverr. Returns (properties, search_url).

    Every listing's detail page is fetched (that is where the postcode lives),
    so the effective listing count is bounded by both ``max_properties`` and
    ``detail_cap``; ``detail_budget_seconds`` caps wall-clock time on details.

    A listing whose fetch or transform raises is logged and counted as
    dropped rather than aborting the outcode. Errors raised while enumerating
    the search pages are not caught here.
    """
    # The effective limit is the smaller of the caps that are actually set.
    caps = []
    if detail_cap and detail_cap > 0:
        caps.append(detail_cap)
    if max_properties is not None:
        caps.append(max_properties)
    limit = min(caps) if caps else None

    search_url = f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/?q={outcode}&search_source=home"
    detail_paths = _enumerate_detail_paths(fs, outcode, limit)
    if not detail_paths:
        return [], search_url

    # The budget clock starts only after the search pages have been walked.
    # NOTE(review): the check is by truthiness, so a budget of 0 means "no deadline".
    deadline = None
    if detail_budget_seconds:
        deadline = time.monotonic() + detail_budget_seconds

    properties: list[dict] = []
    dropped = 0
    for path in detail_paths:
        if deadline is not None and time.monotonic() >= deadline:
            log.info("Zoopla %s: detail-fetch budget reached after %d", outcode, len(properties))
            break

        id_hit = _LISTING_ID_RE.search(path)
        listing_id = path if id_hit is None else id_hit.group(1)

        prop = None
        try:
            html = fs.get(ZOOPLA_BASE + path)
            geo = parse_detail_geo(html, search_outcode=outcode)
            raw = {"id": listing_id, "url": path}
            raw.update(parse_detail_listing(html))
            prop = transform_property(
                raw, pc_index, pc_coords, search_outcode=outcode, detail=geo
            )
        except FlareSolverrError as exc:
            log.warning("Zoopla %s detail %s fetch failed: %s", outcode, listing_id, exc)
        except Exception as exc:  # noqa: BLE001 - never let one listing kill the outcode
            log.warning("Zoopla %s detail %s transform failed: %s", outcode, listing_id, exc)

        if prop:
            properties.append(prop)
        else:
            dropped += 1
        time.sleep(DELAY_BETWEEN_PAGES)

    log.info("Zoopla %s: %d listings (%d dropped)", outcode, len(properties), dropped)
    return properties, search_url
|
||||||
|
|
@ -606,12 +606,13 @@ function OverlayTileLayers({
|
||||||
const showTrees = activeOverlays.has('trees-outside-woodlands');
|
const showTrees = activeOverlays.has('trees-outside-woodlands');
|
||||||
const showPropertyBorders = activeOverlays.has('property-borders');
|
const showPropertyBorders = activeOverlays.has('property-borders');
|
||||||
|
|
||||||
// Restrict the heatmap to the selected crime types. When every type is
|
// Restrict the heatmap to the selected crime types. This must always be a
|
||||||
// selected we omit the filter entirely so all features contribute.
|
// concrete expression: passing `filter={undefined}` makes react-map-gl call
|
||||||
const crimeFilter =
|
// map.addLayer({filter: undefined}), which MapLibre rejects at validation
|
||||||
activeCrimeTypes.size >= CRIME_TYPE_VALUES.length
|
// ("filter: array expected, undefined found"), so the layer is never created
|
||||||
? undefined
|
// and the heatmap stays blank until a later setFilter call. An `in` over the
|
||||||
: ['in', ['get', 'crime_type'], ['literal', Array.from(activeCrimeTypes)]];
|
// selected types matches everything when all 14 are selected.
|
||||||
|
const crimeFilter = ['in', ['get', 'crime_type'], ['literal', Array.from(activeCrimeTypes)]];
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<>
|
<>
|
||||||
|
|
|
||||||
107
frontend/src/components/map/MobileDrawer.test.tsx
Normal file
107
frontend/src/components/map/MobileDrawer.test.tsx
Normal file
|
|
@ -0,0 +1,107 @@
|
||||||
|
import { cleanup, fireEvent, render, screen } from '@testing-library/react';
|
||||||
|
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||||
|
|
||||||
|
import MobileDrawer from './MobileDrawer';
|
||||||
|
|
||||||
|
vi.mock('react-i18next', () => ({
|
||||||
|
useTranslation: () => ({
|
||||||
|
t: (key: string) => key,
|
||||||
|
}),
|
||||||
|
}));
|
||||||
|
|
||||||
|
const originalSetPointerCapture = HTMLElement.prototype.setPointerCapture;
|
||||||
|
|
||||||
|
/**
 * Renders a MobileDrawer on the "area" tab and returns the render result
 * together with the drag handle, root and panel elements the tests poke at.
 * Throws if any of those elements is missing from the rendered output.
 */
function renderDrawer(onClose = vi.fn()) {
  const view = render(
    <MobileDrawer
      onClose={onClose}
      renderArea={() => <div>Area content</div>}
      renderProperties={() => <div>Properties content</div>}
      tab="area"
      onTabChange={vi.fn()}
    />
  );

  // Looks up one element inside the rendered container and narrows it to
  // HTMLElement, failing loudly with `message` when it is absent.
  const requireElement = (selector: string, message: string): HTMLElement => {
    const found = view.container.querySelector(selector);
    if (!(found instanceof HTMLElement)) throw new Error(message);
    return found;
  };

  const handle = requireElement('[data-mobile-drawer-drag-handle]', 'Expected drawer drag handle');
  const root = requireElement('[data-tutorial="right-pane"]', 'Expected drawer root');
  const panel = requireElement(
    '[data-tutorial="right-pane"] > div:last-child',
    'Expected drawer panel'
  );

  return { ...view, handle, onClose, panel, root };
}
|
||||||
|
|
||||||
|
describe('MobileDrawer', () => {
  // Simulates one complete drag on `target`: press at `fromY`, move to `toY`,
  // then release at `toY`, all with the same pointer id.
  const swipe = (target: HTMLElement, pointerId: number, fromY: number, toY: number) => {
    fireEvent.pointerDown(target, { pointerId, clientY: fromY });
    fireEvent.pointerMove(target, { pointerId, clientY: toY });
    fireEvent.pointerUp(target, { pointerId, clientY: toY });
  };

  beforeEach(() => {
    // Stub pointer capture for every test (presumably the test DOM does not
    // implement it -- confirm against the test environment).
    HTMLElement.prototype.setPointerCapture = vi.fn();
  });

  afterEach(() => {
    cleanup();
    // Put back whatever was on the prototype before the stub was installed.
    Object.defineProperty(HTMLElement.prototype, 'setPointerCapture', {
      configurable: true,
      value: originalSetPointerCapture,
    });
  });

  it('lowers and stays open when swiped down from the handle', () => {
    const { handle, onClose, panel } = renderDrawer();

    swipe(handle, 1, 120, 230);

    expect(onClose).not.toHaveBeenCalled();
    expect(panel.style.transform).toBe('translateY(110px)');
  });

  it('can be raised again after being lowered', () => {
    const { handle, onClose, panel } = renderDrawer();

    swipe(handle, 1, 120, 230);
    swipe(handle, 2, 230, 170);

    expect(onClose).not.toHaveBeenCalled();
    expect(panel.style.transform).toBe('translateY(50px)');
  });

  it('keeps the close control reachable when dragged down far', () => {
    const { handle, panel } = renderDrawer();

    // Give the panel a known height so the expected offset below is deterministic.
    Object.defineProperty(panel, 'offsetHeight', {
      configurable: true,
      value: 200,
    });

    swipe(handle, 1, 120, 420);

    expect(panel.style.transform).toBe('translateY(96px)');
  });

  it('leaves the rest of the mobile map usable while the panel is open', () => {
    const { panel, root } = renderDrawer();
    const spacer = root.firstElementChild;

    if (!(spacer instanceof HTMLElement)) throw new Error('Expected drawer spacer');

    expect(root.className).toContain('pointer-events-none');
    expect(panel.className).toContain('pointer-events-auto');
    expect(spacer.className).not.toContain('bg-black');
  });

  it('closes from the close button', () => {
    const { onClose } = renderDrawer();

    fireEvent.click(screen.getByLabelText('mobileDrawer.closeDrawer'));

    expect(onClose).toHaveBeenCalledTimes(1);
  });
});
|
||||||
11
frontend/src/lib/color-opacity.ts
Normal file
11
frontend/src/lib/color-opacity.ts
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
// Opacity used when no usable value is supplied.
export const DEFAULT_COLOR_OPACITY = 1;
// Lowest opacity a supplied value is clamped up to.
export const MIN_COLOR_OPACITY = 0.1;

/**
 * Clamps an opacity into the range [MIN_COLOR_OPACITY, 1].
 * null, undefined, NaN and +/-Infinity all yield DEFAULT_COLOR_OPACITY.
 */
export function normalizeColorOpacity(value: number | null | undefined): number {
  if (typeof value !== 'number' || !Number.isFinite(value)) {
    return DEFAULT_COLOR_OPACITY;
  }
  const raised = value < MIN_COLOR_OPACITY ? MIN_COLOR_OPACITY : value;
  return raised > 1 ? 1 : raised;
}

/** Converts an opacity to a whole-number percentage after normalizing it. */
export function colorOpacityToPercent(value: number): number {
  const normalized = normalizeColorOpacity(value);
  return Math.round(normalized * 100);
}
|
||||||
35
frontend/src/lib/crime-types.ts
Normal file
35
frontend/src/lib/crime-types.ts
Normal file
|
|
@ -0,0 +1,35 @@
|
||||||
|
// Street-crime categories carried by the `crime_hotspots` vector tiles in the
|
||||||
|
// `crime_type` feature property. The `value` strings must match the police.uk
|
||||||
|
// "Crime type" values exactly (see pipeline/transform/crime_hotspot_tiles.py),
|
||||||
|
// because they are used directly in the MapLibre heatmap `filter` expression.
|
||||||
|
// `label` is a shorter, human-friendly name for the overlay-selector checkboxes.
|
||||||
|
|
||||||
|
/** One selectable street-crime category. */
export interface CrimeTypeDef {
  // Exact `crime_type` feature-property string used in the heatmap filter.
  value: string;
  // Shorter display name for the overlay-selector checkboxes.
  label: string;
}

export const CRIME_TYPES: readonly CrimeTypeDef[] = [
  { value: 'Violence and sexual offences', label: 'Violence & sexual offences' },
  { value: 'Anti-social behaviour', label: 'Anti-social behaviour' },
  { value: 'Criminal damage and arson', label: 'Criminal damage & arson' },
  { value: 'Public order', label: 'Public order' },
  { value: 'Shoplifting', label: 'Shoplifting' },
  { value: 'Vehicle crime', label: 'Vehicle crime' },
  { value: 'Burglary', label: 'Burglary' },
  { value: 'Other theft', label: 'Other theft' },
  { value: 'Theft from the person', label: 'Theft from the person' },
  { value: 'Bicycle theft', label: 'Bicycle theft' },
  { value: 'Drugs', label: 'Drugs' },
  { value: 'Robbery', label: 'Robbery' },
  { value: 'Possession of weapons', label: 'Possession of weapons' },
  { value: 'Other crime', label: 'Other crime' },
] as const;

// Just the filter strings, in the same order as CRIME_TYPES.
export const CRIME_TYPE_VALUES: readonly string[] = CRIME_TYPES.map(({ value }) => value);

// Set form of CRIME_TYPE_VALUES for constant-time membership checks.
const CRIME_TYPE_VALUE_SET: ReadonlySet<string> = new Set<string>(CRIME_TYPE_VALUES);

/** Returns true when `value` is exactly one of the known crime-type strings. */
export function isCrimeTypeValue(value: string): boolean {
  return CRIME_TYPE_VALUE_SET.has(value);
}
|
||||||
|
|
@ -4,7 +4,10 @@ Downloads GML files for all local authorities from the INSPIRE download page.
|
||||||
Each ZIP contains a GML file with title extent polygons for that authority.
|
Each ZIP contains a GML file with title extent polygons for that authority.
|
||||||
|
|
||||||
Source: https://use-land-property-data.service.gov.uk/datasets/inspire/download
|
Source: https://use-land-property-data.service.gov.uk/datasets/inspire/download
|
||||||
License: INSPIRE End User Licence
|
License: Open Government Licence v3.0 (since 1 July 2020, under the PSGA).
|
||||||
|
Requires HM Land Registry + Ordnance Survey (AC0000851063) attribution; see
|
||||||
|
the conditions page at the source URL. Boundaries are indicative "general
|
||||||
|
boundaries", not the legal extent of title.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
|
|
||||||
505
pipeline/download/satellite_highres.py
Normal file
505
pipeline/download/satellite_highres.py
Normal file
|
|
@ -0,0 +1,505 @@
|
||||||
|
"""Build a high-resolution England aerial PMTiles archive from EA Vertical Aerial Photography.
|
||||||
|
|
||||||
|
The Environment Agency / Defra Vertical Aerial Photography (VAP) archive is open
|
||||||
|
(OGL v3.0) RGB orthophotography at 10-50 cm, distributed as 5 km ECW tiles on the
|
||||||
|
British National Grid. There is no public imagery tile service, so we mirror the
|
||||||
|
Sentinel-2 ``satellite.pmtiles`` approach: query the Defra survey download API for
|
||||||
|
an area of interest, pick the best RGB capture per OS tile, download and decode the
|
||||||
|
ECW rasters, re-tile them into Web-Mercator raster tiles, and bake a single PMTiles
|
||||||
|
archive that the server stacks *over* the Sentinel-2 base where coverage exists.
|
||||||
|
|
||||||
|
ECW decoding needs a GDAL build that includes the (free, read-only) ERDAS ECW/JP2
|
||||||
|
SDK, which is not present in the rasterio wheel. The mosaic + tiling step therefore
|
||||||
|
runs inside a GDAL-with-ECW Docker image (see ``docker/gdal-ecw/Dockerfile``); the
|
||||||
|
rest of the pipeline is plain Python plus the ``pmtiles`` CLI.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import sqlite3
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
import urllib.error
|
||||||
|
import urllib.request
|
||||||
|
import zipfile
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pipeline.download.tiles import ensure_pmtiles_cli
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
|
|
||||||
|
# Defra Data Services Platform survey download API (reverse-engineered from the
|
||||||
|
# environment.data.gov.uk/survey front-end; no official API is documented).
|
||||||
|
SEARCH_URL = (
|
||||||
|
"https://environment.data.gov.uk/backend/catalog/api/tiles/collections/survey/search"
|
||||||
|
)
|
||||||
|
SURVEY_PAGE_URL = "https://environment.data.gov.uk/survey"
|
||||||
|
# Static public key baked into the survey page JS. May rotate -- we try to scrape a
|
||||||
|
# fresh one from the page and only fall back to this literal.
|
||||||
|
DEFAULT_SUBSCRIPTION_KEY = "dspui"
|
||||||
|
SUBSCRIPTION_KEY_RE = re.compile(r"subscription-key=([A-Za-z0-9]+)")
|
||||||
|
|
||||||
|
# True-colour RGB product only (skip IRRGB near-infra-red and Night Time variants).
|
||||||
|
VAP_RGB_PRODUCT = "vertical_aerial_photography_tiles_rgb"
|
||||||
|
|
||||||
|
# Greater London bounding box (lon/lat). The API only returns tiles where coverage
|
||||||
|
# exists, so a generous bbox is fine -- it does not force blank downloads.
|
||||||
|
DEFAULT_AOI: dict = {
|
||||||
|
"type": "Polygon",
|
||||||
|
"coordinates": [
|
||||||
|
[
|
||||||
|
[-0.55, 51.25],
|
||||||
|
[0.30, 51.25],
|
||||||
|
[0.30, 51.70],
|
||||||
|
[-0.55, 51.70],
|
||||||
|
[-0.55, 51.25],
|
||||||
|
]
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFAULT_MIN_ZOOM = 14
|
||||||
|
DEFAULT_MAX_ZOOM = 19
|
||||||
|
# GDAL image with the ECW driver. The official OSGeo image does not ship ECW, so
|
||||||
|
# this defaults to the locally-built image from docker/gdal-ecw/Dockerfile.
|
||||||
|
DEFAULT_GDAL_IMAGE = "perfect-postcode/gdal-ecw:latest"
|
||||||
|
USER_AGENT = "perfect-postcode-satellite-highres/1.0"
|
||||||
|
ATTRIBUTION_TEMPLATE = (
|
||||||
|
"Environment Agency Vertical Aerial Photography - "
|
||||||
|
"© Environment Agency copyright and/or database right {year}. "
|
||||||
|
"All rights reserved. Licensed under the Open Government Licence v3.0."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class VapTile:
    """Immutable description of one survey download record.

    ``parse_search_results`` builds one of these from each well-formed entry
    in the search API's ``results`` list.
    """

    # Product identifier, read from result["product"]["id"].
    product_id: str
    # Survey year, read from result["year"]["id"] and converted with int().
    year: int
    # Resolution, read from result["resolution"]["id"] and converted with float().
    resolution_m: float
    # OS tile identifier, read from result["tile"]["id"].
    os_tile_id: str
    # Download location, read from result["uri"].
    uri: str
    # Free-text label from result["label"]; empty when the record has none.
    label: str
|
||||||
|
|
||||||
|
|
||||||
|
def parse_search_results(payload: dict) -> list[VapTile]:
    """Convert a raw search-API JSON payload into ``VapTile`` records.

    Reads the ``results`` list (treated as empty when the key is absent).
    Any record that is missing a field or carries a value that cannot be
    converted is skipped, so one bad record never fails the whole search.
    """
    parsed: list[VapTile] = []
    for record in payload.get("results", []):
        try:
            tile = VapTile(
                product_id=record["product"]["id"],
                year=int(record["year"]["id"]),
                resolution_m=float(record["resolution"]["id"]),
                os_tile_id=record["tile"]["id"],
                uri=record["uri"],
                label=record.get("label", ""),
            )
        except (KeyError, TypeError, ValueError):
            # Malformed record: drop it and keep going.
            continue
        parsed.append(tile)
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def select_best_rgb_tiles(tiles: list[VapTile]) -> list[VapTile]:
    """Keep the single best RGB capture for each OS tile.

    Records whose product is not ``VAP_RGB_PRODUCT`` are ignored. Among the
    rest, ``_is_better`` decides which capture wins per ``os_tile_id``. The
    result is ordered by OS tile id.

    Pure function -- the unit test exercises this against a real-shaped payload.
    """
    winners: dict[str, VapTile] = {}
    rgb_only = (tile for tile in tiles if tile.product_id == VAP_RGB_PRODUCT)
    for candidate in rgb_only:
        incumbent = winners.get(candidate.os_tile_id)
        if incumbent is not None and not _is_better(candidate, incumbent):
            continue
        winners[candidate.os_tile_id] = candidate
    ordered = sorted(winners.items(), key=lambda item: item[0])
    return [tile for _, tile in ordered]
|
||||||
|
|
||||||
|
|
||||||
|
def _is_better(candidate: VapTile, incumbent: VapTile) -> bool:
|
||||||
|
"""Finer resolution wins; ties broken by the most recent survey year."""
|
||||||
|
if candidate.resolution_m != incumbent.resolution_m:
|
||||||
|
return candidate.resolution_m < incumbent.resolution_m
|
||||||
|
return candidate.year > incumbent.year
|
||||||
|
|
||||||
|
|
||||||
|
def _http_get(url: str, timeout: float) -> bytes:
    """Fetch ``url`` with the module's User-Agent header and return the raw body.

    ``timeout`` is passed straight to ``urllib.request.urlopen``; any error it
    raises propagates to the caller.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    response = urllib.request.urlopen(request, timeout=timeout)
    with response:
        body = response.read()
    return body
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_subscription_key(explicit: str | None, timeout: float = 30.0) -> str:
    """Work out which subscription key to use for survey downloads.

    Order of preference: a non-empty ``explicit`` key, then a key scraped
    from the survey page or one of its script chunks, then
    ``DEFAULT_SUBSCRIPTION_KEY``. Network failures while scraping are
    reported on stdout and fall through to the default.
    """
    if explicit:
        return explicit

    def _documents():
        """Yield the survey page, then each referenced script chunk, fetching lazily."""
        page = _http_get(SURVEY_PAGE_URL, timeout).decode("utf-8", "ignore")
        yield page
        # The key usually lives in a referenced JS chunk; try them in page order.
        for chunk in re.findall(r'src="(/_next/static/[^"]+\.js)"', page):
            js = _http_get(f"https://environment.data.gov.uk{chunk}", timeout)
            yield js.decode("utf-8", "ignore")

    try:
        for text in _documents():
            hit = SUBSCRIPTION_KEY_RE.search(text)
            if hit:
                return hit.group(1)
    except (urllib.error.URLError, TimeoutError, ConnectionError) as err:
        print(f"Could not scrape subscription key ({err}); using default", flush=True)
    return DEFAULT_SUBSCRIPTION_KEY
|
||||||
|
|
||||||
|
|
||||||
|
def search_vap_tiles(aoi: dict, timeout: float = 60.0) -> list[VapTile]:
    """POST the area-of-interest polygon and return the RGB tiles to download.

    ``aoi`` is serialized as JSON and sent to ``SEARCH_URL``. The response is
    parsed with ``parse_search_results`` and narrowed with
    ``select_best_rgb_tiles``. A one-line summary is printed to stdout.
    Network and JSON errors propagate to the caller.
    """
    request_headers = {
        "Content-Type": "application/geo+json",
        "Referer": SURVEY_PAGE_URL,
        "User-Agent": USER_AGENT,
    }
    request = urllib.request.Request(
        SEARCH_URL,
        data=json.dumps(aoi).encode("utf-8"),
        headers=request_headers,
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        payload = json.load(response)

    records = parse_search_results(payload)
    selected = select_best_rgb_tiles(records)
    # The total is the API's own "count" field, not the number of parsed records.
    reported = payload.get("count", 0)
    print(
        f"Search returned {reported} records; selected {len(selected)} RGB tile(s)",
        flush=True,
    )
    return selected
|
||||||
|
|
||||||
|
|
||||||
|
def _download_and_extract(
    tile: VapTile, ecw_dir: Path, key: str, timeout: float, retries: int
) -> list[Path]:
    """Download one survey zip and extract its ECW raster(s).

    The archive is saved as ``<os_tile_id>.zip`` inside ``ecw_dir``, every
    ``.ecw`` member is extracted next to it as ``<os_tile_id>_<member name>``,
    and the zip is always removed afterwards -- including when the download
    or the extraction fails, so no partial archive is left behind.

    Args:
        tile: Record whose ``uri`` is downloaded and whose ``os_tile_id``
            names the output files.
        ecw_dir: Existing directory that receives the zip and the rasters.
        key: Subscription key appended to the download URL.
        timeout: Timeout passed to ``urllib.request.urlopen``.
        retries: Extra attempts after the first failure; negative values are
            treated as 0 so at least one attempt is always made.

    Returns:
        Paths of the extracted ECW files (empty if the archive has none).

    Raises:
        RuntimeError: the download failed on every attempt.
    """
    url = f"{tile.uri}?subscription-key={key}"
    zip_path = ecw_dir / f"{tile.os_tile_id}.zip"
    attempts = max(retries, 0) + 1
    extracted: list[Path] = []
    try:
        for attempt in range(1, attempts + 1):
            try:
                with urllib.request.urlopen(
                    urllib.request.Request(url, headers={"User-Agent": USER_AGENT}),
                    timeout=timeout,
                ) as response, zip_path.open("wb") as out:
                    shutil.copyfileobj(response, out, length=1 << 20)
                break
            except (urllib.error.URLError, TimeoutError, ConnectionError) as err:
                if attempt == attempts:
                    raise RuntimeError(f"Failed to download {url}: {err}") from err

        with zipfile.ZipFile(zip_path) as archive:
            for member in archive.infolist():
                if member.is_dir() or not member.filename.lower().endswith(".ecw"):
                    continue
                # Only the member's base name is used, so directory components
                # inside the archive cannot redirect the write outside ecw_dir.
                target = ecw_dir / f"{tile.os_tile_id}_{Path(member.filename).name}"
                with archive.open(member) as src, target.open("wb") as dst:
                    shutil.copyfileobj(src, dst, length=1 << 20)
                extracted.append(target)
    finally:
        # The zip is a temporary artifact; remove it on success and on failure.
        zip_path.unlink(missing_ok=True)

    if not extracted:
        print(f"  {tile.os_tile_id}: no ECW in archive (skipped)", flush=True)
    return extracted
|
||||||
|
|
||||||
|
|
||||||
|
def download_tiles(
    tiles: list[VapTile],
    ecw_dir: Path,
    key: str,
    max_workers: int,
    timeout: float,
    retries: int,
) -> list[Path]:
    """Fetch all selected tiles on a thread pool and gather the extracted ECW paths.

    ``ecw_dir`` is created if needed. One ``_download_and_extract`` job is
    submitted per tile; a progress line is printed as each job completes. An
    exception raised by any job propagates out of this function.
    """
    ecw_dir.mkdir(parents=True, exist_ok=True)
    total = len(tiles)
    collected: list[Path] = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for candidate in tiles:
            job = pool.submit(
                _download_and_extract, candidate, ecw_dir, key, timeout, retries
            )
            pending[job] = candidate
        for finished, job in enumerate(as_completed(pending), start=1):
            source = pending[job]
            collected += job.result()
            print(
                f"Downloaded {finished}/{total} tiles "
                f"(latest: {source.os_tile_id} {source.resolution_m}m {source.year})",
                flush=True,
            )
    return collected
|
||||||
|
|
||||||
|
|
||||||
|
def _build_tiles_with_gdal(
    work_dir: Path,
    gdal_image: str,
    min_zoom: int,
    max_zoom: int,
    jobs: int,
    webp_quality: int,
) -> Path:
    """Mosaic the ECW rasters and render XYZ WebP tiles inside a GDAL container.

    ``work_dir`` is mounted at ``/work`` in ``gdal_image``; the rasters are read
    from ``ecw/*.ecw`` and the tiles are written to ``xyz``. Returns the host
    path of that ``xyz`` directory.

    Lossy WebP with an alpha channel is used: ~6x smaller than lossless PNG for
    photographic imagery while keeping transparency, so coverage gaps stay
    see-through and the Sentinel-2 base shows through them.

    Raises:
        subprocess.CalledProcessError: If the container exits non-zero.
        RuntimeError: If no ``xyz`` directory exists afterwards.
    """
    # EA "RGB" ECWs are 4-band RGBA (band 4 is a constant-255 validity/alpha
    # mask), so a plain 4-band VRT is built -- no -addalpha, which would add a
    # fifth band. The VRT is then edited to:
    #   * force EPSG:27700 -- the pixels are already British National Grid, and
    #     the EPSG code lets PROJ apply the OSTN15 datum shift (grid ships in
    #     the image) for metre-accurate reprojection to Web Mercator;
    #   * label band 4 as alpha so gdal2tiles writes transparent tiles. Gaps
    #     between blocks that the VRT fills with 0 then read as alpha=0, letting
    #     the Sentinel-2 base show through wherever VAP coverage is missing.
    steps = [
        "set -euo pipefail",
        "cd /work",
        "gdalbuildvrt -resolution highest mosaic.vrt ecw/*.ecw",
        "gdal_edit.py -a_srs EPSG:27700 "
        "-colorinterp_1 red -colorinterp_2 green -colorinterp_3 blue "
        "-colorinterp_4 alpha mosaic.vrt",
        f"gdal2tiles.py --xyz --zoom={min_zoom}-{max_zoom} "
        f"--processes={jobs} --resampling=average --webviewer=none "
        f"--tiledriver=WEBP --webp-quality={webp_quality} "
        "mosaic.vrt xyz",
    ]
    script = "; ".join(steps)
    mount = f"{work_dir.resolve()}:/work"
    command = ["docker", "run", "--rm", "-v", mount, gdal_image, "bash", "-c", script]
    subprocess.run(command, check=True)

    output_dir = work_dir / "xyz"
    if output_dir.exists():
        return output_dir
    raise RuntimeError("gdal2tiles produced no output directory")
|
||||||
|
|
||||||
|
|
||||||
|
def _pack_xyz_to_mbtiles(
    xyz_dir: Path,
    mbtiles_path: Path,
    bounds: tuple[float, float, float, float],
    min_zoom: int,
    max_zoom: int,
    attribution: str,
) -> int:
    """Write a ``{z}/{x}/{y}.webp`` tile tree into a fresh MBTiles SQLite file.

    Any existing file at ``mbtiles_path`` is replaced. XYZ rows are flipped to
    TMS rows (``2**z - 1 - y``) on insert. Directory and file names that are not
    purely numeric are ignored. Returns the number of tiles inserted.
    """

    def iter_tiles():
        # Yield (zoom, column, tms_row, file) lazily so each tile is read from
        # disk only at the moment it is inserted.
        for level_dir in sorted(xyz_dir.iterdir()):
            if not (level_dir.is_dir() and level_dir.name.isdigit()):
                continue
            level = int(level_dir.name)
            for column_dir in level_dir.iterdir():
                if not (column_dir.is_dir() and column_dir.name.isdigit()):
                    continue
                column = int(column_dir.name)
                for tile_path in column_dir.glob("*.webp"):
                    if tile_path.stem.isdigit():
                        xyz_row = int(tile_path.stem)
                        yield level, column, (1 << level) - 1 - xyz_row, tile_path

    metadata = {
        "name": "EA Vertical Aerial Photography",
        "type": "overlay",
        "version": "1",
        "description": "Environment Agency high-resolution aerial imagery",
        "format": "webp",
        "attribution": attribution,
        "bounds": ",".join(f"{value:.6f}" for value in bounds),
        "minzoom": str(min_zoom),
        "maxzoom": str(max_zoom),
    }

    if mbtiles_path.exists():
        mbtiles_path.unlink()
    db = sqlite3.connect(mbtiles_path)
    try:
        for statement in (
            "PRAGMA journal_mode = WAL",
            "PRAGMA synchronous = NORMAL",
            "CREATE TABLE metadata (name TEXT, value TEXT)",
            "CREATE TABLE tiles (zoom_level INTEGER, tile_column INTEGER, "
            "tile_row INTEGER, tile_data BLOB)",
            "CREATE UNIQUE INDEX tile_index ON tiles "
            "(zoom_level, tile_column, tile_row)",
        ):
            db.execute(statement)
        db.executemany(
            "INSERT INTO metadata (name, value) VALUES (?, ?)",
            list(metadata.items()),
        )

        packed = 0
        for packed, (level, column, tms_row, tile_path) in enumerate(
            iter_tiles(), start=1
        ):
            db.execute(
                "INSERT OR REPLACE INTO tiles VALUES (?, ?, ?, ?)",
                (level, column, tms_row, tile_path.read_bytes()),
            )
            # Commit in batches and report progress on large tile sets.
            if packed % 5000 == 0:
                db.commit()
                print(f" packed {packed:,} tiles", flush=True)
        db.commit()
    finally:
        db.close()
    return packed
|
||||||
|
|
||||||
|
|
||||||
|
def build_satellite_highres_tiles(
    output_path: Path,
    pmtiles_bin: Path,
    pmtiles_version: str,
    aoi: dict,
    min_zoom: int,
    max_zoom: int,
    gdal_image: str,
    subscription_key: str | None,
    max_workers: int,
    timeout: float,
    retries: int,
    jobs: int,
    webp_quality: int,
) -> None:
    """Run the full pipeline: search, download, tile, pack, convert to PMTiles.

    Steps, in order: validate the zoom range, make sure the pmtiles CLI is
    available, search for tiles covering ``aoi``, resolve the subscription key,
    then inside a temporary work directory download the rasters, render XYZ
    tiles with GDAL, pack them into MBTiles and convert that to ``output_path``.

    Raises:
        ValueError: If ``min_zoom`` exceeds ``max_zoom``.
        RuntimeError: If the search, the downloads or the tiling yield nothing.
    """
    if max_zoom < min_zoom:
        raise ValueError("--min-zoom must be <= --max-zoom")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    ensure_pmtiles_cli(pmtiles_bin, pmtiles_version)

    selected = search_vap_tiles(aoi)
    if not selected:
        raise RuntimeError("No RGB Vertical Aerial Photography tiles for the AOI")
    api_key = resolve_subscription_key(subscription_key)
    # The attribution carries the most recent survey year among the tiles used.
    newest_year = max(tile.year for tile in selected)
    attribution = ATTRIBUTION_TEMPLATE.format(year=newest_year)

    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as scratch:
        work_dir = Path(scratch)
        rasters = download_tiles(
            selected, work_dir / "ecw", api_key, max_workers, timeout, retries
        )
        if not rasters:
            raise RuntimeError("No ECW rasters were extracted from the downloads")

        xyz_dir = _build_tiles_with_gdal(
            work_dir, gdal_image, min_zoom, max_zoom, jobs, webp_quality
        )

        mbtiles_path = work_dir / "satellite_highres.mbtiles"
        envelope = _aoi_bounds(aoi)
        tile_count = _pack_xyz_to_mbtiles(
            xyz_dir, mbtiles_path, envelope, min_zoom, max_zoom, attribution
        )
        if tile_count == 0:
            raise RuntimeError("Tiling produced no tiles to pack")
        print(f"Packed {tile_count:,} tiles into MBTiles", flush=True)

        convert_cmd = [
            str(pmtiles_bin),
            "convert",
            str(mbtiles_path),
            str(output_path),
            "--force",
        ]
        subprocess.run(convert_cmd, check=True)

    size_mb = output_path.stat().st_size / (1024 * 1024)
    print(f"Wrote {output_path} ({size_mb:.1f} MB) -- {attribution}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _aoi_bounds(aoi: dict) -> tuple[float, float, float, float]:
    """Return ``(min_lon, min_lat, max_lon, max_lat)`` for a GeoJSON geometry.

    The ``coordinates`` member is walked to whatever depth it is nested, so a
    Polygon (rings of positions) and a MultiPolygon (polygons of rings of
    positions) are both handled. Only the first two values of each position
    are used.

    Raises:
        KeyError: If ``aoi`` has no ``coordinates`` member.
        ValueError: If the geometry contains no positions.
    """
    lons: list[float] = []
    lats: list[float] = []
    pending = [aoi["coordinates"]]
    while pending:
        node = pending.pop()
        if not node:
            continue
        if isinstance(node[0], (int, float)):
            # A position: [lon, lat, ...].
            lons.append(node[0])
            lats.append(node[1])
        else:
            # A container of rings / polygons / positions: descend.
            pending.extend(node)
    if not lons:
        raise ValueError("AOI geometry has no coordinates")
    return min(lons), min(lats), max(lons), max(lats)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_aoi(path: Path | None) -> dict:
|
||||||
|
if path is None:
|
||||||
|
return DEFAULT_AOI
|
||||||
|
data = json.loads(path.read_text())
|
||||||
|
if data.get("type") == "FeatureCollection":
|
||||||
|
return data["features"][0]["geometry"]
|
||||||
|
if data.get("type") == "Feature":
|
||||||
|
return data["geometry"]
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Parse the command line and run ``build_satellite_highres_tiles``.

    Worker, retry and job counts are clamped to sane minimums before the
    build is started.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--output", type=Path, required=True)
    cli.add_argument("--pmtiles-bin", type=Path, default=Path("property-data/pmtiles"))
    cli.add_argument("--pmtiles-version", default="1.22.3")
    cli.add_argument(
        "--aoi-geojson",
        type=Path,
        default=None,
        help="GeoJSON Polygon/Feature/FeatureCollection for the area of interest "
        "(default: Greater London)",
    )
    for zoom_flag, zoom_default in (
        ("--min-zoom", DEFAULT_MIN_ZOOM),
        ("--max-zoom", DEFAULT_MAX_ZOOM),
    ):
        cli.add_argument(zoom_flag, type=int, default=zoom_default)
    cli.add_argument(
        "--gdal-image",
        default=DEFAULT_GDAL_IMAGE,
        help="Docker image with a GDAL that has the ECW driver",
    )
    cli.add_argument(
        "--subscription-key",
        default=None,
        help="Override the Defra survey API key (default: scrape, then 'dspui')",
    )
    for flag, kind, fallback in (
        ("--max-workers", int, 4),
        ("--timeout", float, 600.0),
        ("--retries", int, 3),
    ):
        cli.add_argument(flag, type=kind, default=fallback)
    cli.add_argument(
        "--jobs",
        type=int,
        default=8,
        help="Parallel processes for gdal2tiles",
    )
    cli.add_argument(
        "--webp-quality",
        type=int,
        default=85,
        help="WebP tile quality (1-100); lower is smaller",
    )
    opts = cli.parse_args()

    # Clamp the counts so a zero or negative flag cannot break the pools.
    workers = max(1, opts.max_workers)
    attempts = max(0, opts.retries)
    processes = max(1, opts.jobs)

    build_satellite_highres_tiles(
        output_path=opts.output,
        pmtiles_bin=opts.pmtiles_bin,
        pmtiles_version=opts.pmtiles_version,
        aoi=_load_aoi(opts.aoi_geojson),
        min_zoom=opts.min_zoom,
        max_zoom=opts.max_zoom,
        gdal_image=opts.gdal_image,
        subscription_key=opts.subscription_key,
        max_workers=workers,
        timeout=opts.timeout,
        retries=attempts,
        jobs=processes,
        webp_quality=opts.webp_quality,
    )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
97
pipeline/download/test_satellite_highres.py
Normal file
97
pipeline/download/test_satellite_highres.py
Normal file
|
|
@ -0,0 +1,97 @@
|
||||||
|
from pipeline.download import satellite_highres
|
||||||
|
from pipeline.download.satellite_highres import (
|
||||||
|
VapTile,
|
||||||
|
parse_search_results,
|
||||||
|
select_best_rgb_tiles,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _result(product: str, year: str, resolution: str, tile: str) -> dict:
    """Build one search-API record shaped like the real response."""

    def ref(identifier: str, label: str) -> dict:
        # Every facet in the response is an {"id", "label"} pair.
        return {"id": identifier, "label": label}

    base = "https://environment.data.gov.uk/tiles/collections/survey/"
    record = {
        "product": ref(product, product),
        "year": ref(year, year),
        "resolution": ref(resolution, f"{resolution}m"),
        "tile": ref(tile, tile),
    }
    record["label"] = f"{product}-{year}-{resolution}m-{tile}"
    record["uri"] = base + f"{product}/{year}/{resolution}/{tile}"
    return record
|
||||||
|
|
||||||
|
|
||||||
|
# Mirrors a real Greater-London response: RGB at 0.4m (2008), 0.1m (2008) and
# 0.25m (2011), plus Night Time, IRRGB and LIDAR products that must be ignored.
|
||||||
|
SAMPLE_PAYLOAD = {
|
||||||
|
"count": 6,
|
||||||
|
"results": [
|
||||||
|
_result("vertical_aerial_photography_tiles_rgb", "2008", "0.4", "TQ2575"),
|
||||||
|
_result("vertical_aerial_photography_tiles_night_time", "2012", "0.2", "TQ2575"),
|
||||||
|
_result("lidar_composite_dtm", "2022", "1", "TQ2575"),
|
||||||
|
# TQ3080 has two RGB captures: a finer-but-older and a coarser-but-newer.
|
||||||
|
_result("vertical_aerial_photography_tiles_rgb", "2008", "0.1", "TQ3080"),
|
||||||
|
_result("vertical_aerial_photography_tiles_rgb", "2011", "0.25", "TQ3080"),
|
||||||
|
_result("vertical_aerial_photography_tiles_irrgb", "2012", "0.5", "TQ3080"),
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_search_results_skips_malformed_records() -> None:
    """A record missing required fields is dropped; the valid one parses fully."""
    good = _result("vertical_aerial_photography_tiles_rgb", "2008", "0.4", "TQ2575")
    broken = {"product": {"id": "broken"}}  # missing year/resolution/tile/uri
    parsed = parse_search_results({"results": [good, broken]})

    expected = VapTile(
        product_id="vertical_aerial_photography_tiles_rgb",
        year=2008,
        resolution_m=0.4,
        os_tile_id="TQ2575",
        uri="https://environment.data.gov.uk/tiles/collections/survey/"
        "vertical_aerial_photography_tiles_rgb/2008/0.4/TQ2575",
        label="vertical_aerial_photography_tiles_rgb-2008-0.4m-TQ2575",
    )
    assert len(parsed) == 1
    assert parsed[0] == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_select_best_rgb_filters_non_rgb_products() -> None:
    """Only the RGB product survives selection from the mixed sample payload."""
    chosen = select_best_rgb_tiles(parse_search_results(SAMPLE_PAYLOAD))
    products = {tile.product_id for tile in chosen}
    assert products == {satellite_highres.VAP_RGB_PRODUCT}
|
||||||
|
|
||||||
|
|
||||||
|
def test_select_best_rgb_one_tile_per_os_square() -> None:
    """Each OS grid square appears exactly once in the selection."""
    candidates = parse_search_results(SAMPLE_PAYLOAD)
    squares = sorted(tile.os_tile_id for tile in select_best_rgb_tiles(candidates))
    assert squares == ["TQ2575", "TQ3080"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_select_best_rgb_prefers_finest_resolution_then_latest_year() -> None:
    """Resolution outranks recency when choosing the capture for a square."""
    by_square = {}
    for tile in select_best_rgb_tiles(parse_search_results(SAMPLE_PAYLOAD)):
        by_square[tile.os_tile_id] = tile

    # TQ2575: only one RGB capture.
    assert by_square["TQ2575"].resolution_m == 0.4

    # TQ3080: finest resolution (0.1m) wins even though it is the older survey.
    winner = by_square["TQ3080"]
    assert winner.resolution_m == 0.1
    assert winner.year == 2008
|
||||||
|
|
||||||
|
|
||||||
|
def test_select_best_rgb_breaks_resolution_ties_by_year() -> None:
    """With equal resolution on one square, the most recent year is kept."""
    candidates = [
        VapTile(satellite_highres.VAP_RGB_PRODUCT, year, 0.25, "TQ0101", "u", label)
        for year, label in ((2009, "a"), (2018, "b"), (2015, "c"))
    ]
    chosen = select_best_rgb_tiles(candidates)
    assert len(chosen) == 1
    assert chosen[0].year == 2018
|
||||||
|
|
||||||
|
|
||||||
|
def test_select_best_rgb_empty_when_no_rgb() -> None:
    """A payload without any RGB product selects nothing."""
    lidar_only = {"results": [_result("lidar_composite_dtm", "2022", "1", "TQ2575")]}
    chosen = select_best_rgb_tiles(parse_search_results(lidar_only))
    assert chosen == []
|
||||||
|
|
@ -1,12 +1,25 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import zipfile
|
import zipfile
|
||||||
|
import json
|
||||||
|
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
from pipeline.validate_outputs import main
|
from pipeline.validate_outputs import main
|
||||||
|
|
||||||
|
|
||||||
|
def write_boundary(path, postcodes):
    """Create ``path/units/AA1.geojson`` with one geometry-less feature per postcode.

    The ``units`` directory (and any missing parents) is created; it must not
    already exist.
    """
    units_dir = path / "units"
    units_dir.mkdir(parents=True)
    collection = {"type": "FeatureCollection", "features": []}
    for code in postcodes:
        collection["features"].append(
            {"type": "Feature", "properties": {"postcodes": code}, "geometry": None}
        )
    target = units_dir / "AA1.geojson"
    target.write_text(json.dumps(collection))
|
||||||
|
|
||||||
|
|
||||||
def test_validates_parquet_file_and_zip(tmp_path, monkeypatch):
|
def test_validates_parquet_file_and_zip(tmp_path, monkeypatch):
|
||||||
parquet_path = tmp_path / "data.parquet"
|
parquet_path = tmp_path / "data.parquet"
|
||||||
file_path = tmp_path / "plain.txt"
|
file_path = tmp_path / "plain.txt"
|
||||||
|
|
@ -59,3 +72,42 @@ def test_rejects_missing_and_empty_outputs(tmp_path, monkeypatch, capsys):
|
||||||
assert "empty file" in stderr
|
assert "empty file" in stderr
|
||||||
assert "missing" in stderr
|
assert "missing" in stderr
|
||||||
assert "no files matched" in stderr
|
assert "no files matched" in stderr
|
||||||
|
|
||||||
|
|
||||||
|
def test_validates_postcode_boundary_matches(tmp_path, monkeypatch):
    """Identical postcode sets in the parquet and the boundaries validate cleanly."""
    codes = ["AA1 1AA", "AA1 1AB"]
    postcodes_path = tmp_path / "postcodes.parquet"
    boundaries_path = tmp_path / "postcode_boundaries"
    pl.DataFrame({"postcode": codes}).write_parquet(postcodes_path)
    write_boundary(boundaries_path, codes)

    argv = [
        "validate_outputs",
        "--postcode-boundary-match",
        f"{postcodes_path}::{boundaries_path}",
    ]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_rejects_postcode_boundary_mismatch(tmp_path, monkeypatch, capsys):
    """Postcodes missing on either side fail validation and are reported on stderr."""
    postcodes_path = tmp_path / "postcodes.parquet"
    boundaries_path = tmp_path / "postcode_boundaries"
    pl.DataFrame({"postcode": ["AA1 1AA", "AA1 1AB"]}).write_parquet(postcodes_path)
    # "AA1 1AB" has no boundary and "AA1 1AC" has no postcode row.
    write_boundary(boundaries_path, ["AA1 1AA", "AA1 1AC"])

    argv = [
        "validate_outputs",
        "--postcode-boundary-match",
        f"{postcodes_path}::{boundaries_path}",
    ]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 1
    stderr = capsys.readouterr().err
    for fragment in ("missing boundaries", "boundary postcodes are absent"):
        assert fragment in stderr
|
||||||
|
|
|
||||||
358
pipeline/transform/crime_spatial.py
Normal file
358
pipeline/transform/crime_spatial.py
Normal file
|
|
@ -0,0 +1,358 @@
|
||||||
|
"""Aggregate police.uk street crime to postcodes by 50m spatial proximity.
|
||||||
|
|
||||||
|
Instead of attributing each incident to its published LSOA code, this transform
|
||||||
|
counts the anonymised incident *points* that fall within 50m of each postcode's
|
||||||
|
boundary polygon (the polygon buffered outward by 50m). A point inside several
|
||||||
|
overlapping buffers counts for each postcode -- the same multiplicity the
|
||||||
|
tree-density filter uses for features near more than one postcode.
|
||||||
|
|
||||||
|
The metric is a raw annualised count ("incidents/year within 50m"); there is no
|
||||||
|
per-capita denominator. Outputs mirror the old LSOA transform's shape but are
|
||||||
|
keyed on ``postcode`` instead of ``LSOA code``:
|
||||||
|
|
||||||
|
* ``crime_by_postcode.parquet`` -- ``postcode`` + ``"{type} (avg/yr)"`` columns.
|
||||||
|
* ``crime_by_postcode_by_year.parquet`` -- ``postcode`` + ``"{type} (by year)"``
|
||||||
|
nested ``list[struct{year, count}]`` columns, with Serious/Minor rollups.
|
||||||
|
|
||||||
|
Caveat: police.uk coordinates are snapped to a fixed set of anonymous "map
|
||||||
|
points", not true locations, and a share of rows have no coordinate at all
|
||||||
|
(dropped here). Spatial totals are therefore lower than, and fuzzier than, the
|
||||||
|
old LSOA-tagged counts -- by design, not a regression.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import polars as pl
|
||||||
|
import shapely
|
||||||
|
from pyproj import Transformer
|
||||||
|
|
||||||
|
from pipeline.transform.crime import (
|
||||||
|
MINOR_CRIME_TYPES,
|
||||||
|
SERIOUS_CRIME_TYPES,
|
||||||
|
find_street_crime_csvs,
|
||||||
|
)
|
||||||
|
from pipeline.transform.postcode_boundaries.loader import load_postcode_polygons
|
||||||
|
|
||||||
|
# Serious types first so column order is stable and self-documenting.
|
||||||
|
ALL_CRIME_TYPES: tuple[str, ...] = SERIOUS_CRIME_TYPES + MINOR_CRIME_TYPES
|
||||||
|
|
||||||
|
DEFAULT_BUFFER_M = 50.0
|
||||||
|
MONTH_DIR_RE = re.compile(r"^\d{4}-\d{2}$")
|
||||||
|
|
||||||
|
# Generous GB bounds; points outside fall in no English postcode anyway, but
|
||||||
|
# filtering first keeps the WGS84->BNG transform out of its undefined region.
|
||||||
|
LON_BOUNDS = (-9.5, 2.5)
|
||||||
|
LAT_BOUNDS = (49.0, 61.5)
|
||||||
|
|
||||||
|
# Read CSVs in chunks of files to bound peak memory while keeping the STRtree
|
||||||
|
# query vectorised over a useful number of points.
|
||||||
|
_CSV_BATCH = 64
|
||||||
|
|
||||||
|
|
||||||
|
def _month_calendar(csvs: list[Path]) -> tuple[list[int], dict[int, int], int]:
    """Work out the annualisation denominators from the CSVs' parent directories.

    Each file is expected under a ``YYYY-MM`` directory; the distinct directory
    names that match ``MONTH_DIR_RE`` are the observed months. Returns the
    sorted distinct years, a year -> months-observed mapping, and the total
    number of observed months (the avg/yr denominator).

    Raises:
        ValueError: If no parent directory name matches ``YYYY-MM``.
    """
    observed: set[str] = set()
    for csv_path in csvs:
        label = csv_path.parent.name
        if MONTH_DIR_RE.fullmatch(label):
            observed.add(label)
    if not observed:
        raise ValueError("No valid YYYY-MM month directories found among crime CSVs")

    per_year: dict[int, int] = {}
    # Sorted iteration keeps the mapping's insertion order chronological.
    for label in sorted(observed):
        key = int(label[:4])
        per_year.setdefault(key, 0)
        per_year[key] += 1

    return sorted(per_year), per_year, len(observed)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_tree(
    polygons: np.ndarray, buffer_m: float
) -> tuple[np.ndarray, shapely.STRtree]:
    """Grow every postcode polygon by ``buffer_m`` and build an STRtree over them.

    Position ``i`` in the returned buffer array corresponds to postcode ``i``.
    A buffer that is missing or invalid is swapped for an empty polygon so the
    positions stay aligned.
    """
    buffered = shapely.buffer(polygons, buffer_m, quad_segs=8)
    unusable = shapely.is_missing(buffered) | ~shapely.is_valid(buffered)
    if unusable.any():
        bad_count = int(unusable.sum())
        print(f" {bad_count:,} postcode buffers unusable; left empty")
        buffered[unusable] = shapely.from_wkt("POLYGON EMPTY")
    index = shapely.STRtree(buffered)
    return buffered, index
|
||||||
|
|
||||||
|
|
||||||
|
def _accumulate_counts(
    csvs: list[Path],
    tree: shapely.STRtree,
    type_to_idx: dict[str, int],
    year_to_idx: dict[int, int],
    transformer: Transformer,
    counts: np.ndarray,
) -> None:
    """Stream the crime CSVs, counting points-in-buffer per (postcode, type, year).

    ``counts`` is updated in place and is indexed as
    ``[postcode_index, type_index, year_index]``. Files are read
    ``_CSV_BATCH`` at a time; a progress line is printed after each batch that
    yields located points.

    Args:
        csvs: Crime CSV paths to read.
        tree: Spatial index over the buffered postcode polygons.
        type_to_idx: Crime-type name -> index along axis 1 of ``counts``.
        year_to_idx: Year -> index along axis 2 of ``counts``.
        transformer: Converts (longitude, latitude) into the CRS of ``tree``.
        counts: Accumulator array, mutated in place.
    """
    # Pin the dtypes of the four columns that are used.
    schema = {
        "Longitude": pl.Float64,
        "Latitude": pl.Float64,
        "Month": pl.Utf8,
        "Crime type": pl.Utf8,
    }
    known_types = list(type_to_idx)
    total_points = 0
    total_matches = 0
    total_dropped = 0

    for start in range(0, len(csvs), _CSV_BATCH):
        batch = csvs[start : start + _CSV_BATCH]
        frame = (
            pl.scan_csv(
                batch,
                schema_overrides=schema,
                ignore_errors=True,
            )
            .select("Longitude", "Latitude", "Month", "Crime type")
            # The year is the first four characters of the "Month" column.
            .with_columns(pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"))
            # Keep rows that have a coordinate inside the bounding box, a known
            # crime type and a year that has a slot in ``counts``.
            .filter(
                pl.col("Longitude").is_not_null()
                & pl.col("Latitude").is_not_null()
                & pl.col("Longitude").is_between(*LON_BOUNDS)
                & pl.col("Latitude").is_between(*LAT_BOUNDS)
                & pl.col("Crime type").is_in(known_types)
                & pl.col("year").is_in(list(year_to_idx))
            )
            # Map type and year to their integer positions in ``counts``.
            .with_columns(
                pl.col("Crime type")
                .replace_strict(type_to_idx, return_dtype=pl.Int32)
                .alias("tidx"),
                pl.col("year")
                .replace_strict(year_to_idx, return_dtype=pl.Int32)
                .alias("yidx"),
            )
            .select("Longitude", "Latitude", "tidx", "yidx")
            .collect(engine="streaming")
        )

        rows_in = frame.height
        if rows_in == 0:
            continue

        lon = frame["Longitude"].to_numpy()
        lat = frame["Latitude"].to_numpy()
        tidx = frame["tidx"].to_numpy()
        yidx = frame["yidx"].to_numpy()

        x, y = transformer.transform(lon, lat)
        # Discard points whose transformed coordinates are not finite.
        finite = np.isfinite(x) & np.isfinite(y)
        total_dropped += int((~finite).sum())
        if not finite.any():
            continue
        x, y, tidx, yidx = x[finite], y[finite], tidx[finite], yidx[finite]
        total_points += x.size

        points = shapely.points(x, y)
        # One (point, postcode) pair per intersection, so a point that lies in
        # several buffers contributes to each of those postcodes.
        point_index, postcode_index = tree.query(points, predicate="intersects")
        if point_index.size:
            # np.add.at accumulates repeated index triples instead of
            # overwriting them.
            np.add.at(
                counts,
                (postcode_index, tidx[point_index], yidx[point_index]),
                1,
            )
            total_matches += point_index.size

        print(
            f" files {start + len(batch):,}/{len(csvs):,}: "
            f"{total_points:,} located points, {total_matches:,} postcode matches"
        )

    if total_dropped:
        print(f"Dropped {total_dropped:,} points outside the BNG transform domain")
|
||||||
|
|
||||||
|
|
||||||
|
def _rollup_long(
    long: pl.DataFrame, types: tuple[str, ...], rollup_name: str
) -> pl.DataFrame:
    """Collapse the rows for ``types`` into one per-(postcode, year) rollup series.

    The result has the same four columns as ``long``; its ``Crime type`` is
    ``rollup_name`` and its ``count`` is the summed count rounded to one
    decimal place.
    """
    wanted = pl.col("Crime type").is_in(list(types))
    members = long.filter(wanted)
    summed = members.group_by("postcode", "year").agg(
        pl.col("count").sum().round(1).alias("count")
    )
    labelled = summed.with_columns(pl.lit(rollup_name).alias("Crime type"))
    return labelled.select("postcode", "Crime type", "year", "count")
|
||||||
|
|
||||||
|
|
||||||
|
def _write_avg_yr(
    postcodes: np.ndarray,
    counts: np.ndarray,
    valid_month_count: int,
    output_path: Path,
) -> None:
    """Write the per-postcode annualised totals, one ``"{type} (avg/yr)"`` column each.

    Counts are summed over the year axis, scaled from ``valid_month_count``
    observed months to twelve, rounded to one decimal and stored as float32.
    """
    per_type_totals = counts.sum(axis=2)  # (n_postcodes, n_types)
    annualised = np.round(per_type_totals / valid_month_count * 12.0, 1).astype(
        np.float32
    )

    columns: dict[str, np.ndarray] = {"postcode": postcodes}
    columns.update(
        {
            f"{name} (avg/yr)": annualised[:, position]
            for position, name in enumerate(ALL_CRIME_TYPES)
        }
    )

    output_path.parent.mkdir(parents=True, exist_ok=True)
    table = pl.DataFrame(columns)
    table.write_parquet(output_path, compression="zstd")
    print(f"Wrote postcode crime averages: {output_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def _write_by_year(
    postcodes: np.ndarray,
    counts: np.ndarray,
    years: list[int],
    months_in_year: dict[int, int],
    output_path: Path,
) -> None:
    """Write nested ``"{type} (by year)"`` series plus Serious/Minor rollups.

    Args:
        postcodes: Postcode labels, aligned with axis 0 of ``counts``.
        counts: Raw counts indexed ``[postcode, type, year]``.
        years: Years aligned with axis 2 of ``counts``.
        months_in_year: Number of observed months for each year.
        output_path: Destination parquet file; its parent is created if needed.

    Raises:
        ValueError: If ``counts`` contains no non-zero cell.
    """
    # Scale each year's counts from its observed months up to twelve months.
    months = np.array([months_in_year[year] for year in years], dtype=np.float64)
    annual = np.round(counts.astype(np.float64) * 12.0 / months[None, None, :], 1)

    # Only non-zero cells become rows of the long table.
    pc_i, ty_i, yr_i = np.nonzero(counts)
    if pc_i.size == 0:
        raise ValueError("No crime points matched any postcode buffer")

    type_names = np.array(ALL_CRIME_TYPES, dtype=object)
    year_values = np.array(years, dtype=np.int32)
    long = pl.DataFrame(
        {
            "postcode": postcodes[pc_i],
            "Crime type": type_names[ty_i],
            "year": year_values[yr_i],
            "count": annual[pc_i, ty_i, yr_i].astype(np.float32),
        }
    )

    # The rollups are appended as two extra pseudo crime types.
    serious = _rollup_long(long, SERIOUS_CRIME_TYPES, "Serious crime")
    minor = _rollup_long(long, MINOR_CRIME_TYPES, "Minor crime")
    combined = pl.concat([long, serious, minor])

    # Sort by year first so each collected series is in chronological order.
    by_type = (
        combined.sort("year")
        .group_by("postcode", "Crime type")
        .agg(pl.struct("year", "count").alias("series"))
    )
    # One column per crime type, each cell holding that postcode's series.
    wide = by_type.pivot(on="Crime type", index="postcode", values="series")
    type_cols = [c for c in wide.columns if c != "postcode"]
    wide = wide.rename({col: f"{col} (by year)" for col in type_cols})

    output_path.parent.mkdir(parents=True, exist_ok=True)
    wide.write_parquet(output_path, compression="zstd")
    print(f"Wrote postcode crime by-year series: {output_path} {wide.shape}")
|
||||||
|
|
||||||
|
|
||||||
|
def transform_crime_spatial(
    crime_dir: Path,
    boundaries_dir: Path,
    output_path: Path,
    by_year_output_path: Path,
    buffer_m: float = DEFAULT_BUFFER_M,
    max_postcodes: int | None = None,
    max_files: int | None = None,
) -> None:
    """Count crime points near each postcode and write both output parquets.

    Finds the street-crime CSVs under ``crime_dir``, buffers the postcode
    polygons from ``boundaries_dir`` by ``buffer_m``, accumulates counts per
    (postcode, type, year), then writes the avg/yr table to ``output_path`` and
    the by-year series to ``by_year_output_path``. ``max_postcodes`` and
    ``max_files`` truncate the inputs.

    Raises:
        FileNotFoundError: If no street crime CSV is found in ``crime_dir``.
    """
    csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
    if not csvs:
        raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")
    if max_files is not None:
        csvs = csvs[:max_files]

    years, months_in_year, valid_month_count = _month_calendar(csvs)
    ignored_note = (
        f" (ignored {ignored_csv_count} non-street CSVs)" if ignored_csv_count else ""
    )
    summary = (
        f"Found {len(csvs):,} street crime CSVs across {valid_month_count} months "
        f"({years[0]}-{years[-1]})"
    )
    print(summary + ignored_note)

    postcodes, polygons = load_postcode_polygons(boundaries_dir, max_postcodes)
    print(f"Buffering {len(postcodes):,} postcode polygons by {buffer_m:g}m...")
    # Only the index is needed from here on; the buffer array is discarded.
    _, tree = _build_tree(polygons, buffer_m)

    type_to_idx = dict(zip(ALL_CRIME_TYPES, range(len(ALL_CRIME_TYPES))))
    year_to_idx = dict(zip(years, range(len(years))))
    shape = (len(postcodes), len(ALL_CRIME_TYPES), len(years))
    counts = np.zeros(shape, dtype=np.int32)

    to_bng = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
    _accumulate_counts(csvs, tree, type_to_idx, year_to_idx, to_bng, counts)

    _write_avg_yr(postcodes, counts, valid_month_count, output_path)
    _write_by_year(postcodes, counts, years, months_in_year, by_year_output_path)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Parse the command line and run ``transform_crime_spatial``.

    ``--output`` and ``--output-by-year`` are required; the input and boundary
    directories default to paths under ``property-data``.

    Raises:
        SystemExit: ``--buffer-m`` is not a finite number greater than zero,
            ``--max-files`` is smaller than 1, or argparse rejects the
            arguments.
    """
    import math  # local import: only the CLI validation below needs it

    parser = argparse.ArgumentParser(
        # The buffer is configurable, so describe the default rather than
        # hard-coding a distance that --buffer-m can override.
        description=(
            "Count police.uk crime points within a buffer "
            f"(default {DEFAULT_BUFFER_M:g}m) of each postcode boundary"
        )
    )
    parser.add_argument(
        "--input",
        type=Path,
        default=Path("property-data/crime"),
        help="Directory containing police.uk street crime CSVs",
    )
    parser.add_argument(
        "--boundaries",
        type=Path,
        default=Path("property-data/postcode_boundaries/units"),
        help="Directory of per-district postcode boundary GeoJSONs",
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet: postcode + '{type} (avg/yr)' columns",
    )
    parser.add_argument(
        "--output-by-year",
        type=Path,
        required=True,
        help="Output parquet: postcode + nested '{type} (by year)' columns",
    )
    parser.add_argument(
        "--buffer-m",
        type=float,
        default=DEFAULT_BUFFER_M,
        help="Outward buffer (metres) added to each postcode boundary",
    )
    parser.add_argument(
        "--max-postcodes",
        type=int,
        default=None,
        help="Testing only: process the first N postcodes",
    )
    parser.add_argument(
        "--max-files",
        type=int,
        default=None,
        help="Testing only: process the first N monthly CSV files",
    )
    args = parser.parse_args()

    # `float` happily parses "nan"/"inf", and `nan <= 0` is False, so a bare
    # sign check would let both through to the geometry code.
    if not math.isfinite(args.buffer_m) or args.buffer_m <= 0:
        raise SystemExit("--buffer-m must be a finite number greater than zero")
    # "First N files" is meaningless for N < 1 (0 leaves nothing to process,
    # a negative N would slice files off the end instead).
    if args.max_files is not None and args.max_files < 1:
        raise SystemExit("--max-files must be at least 1")

    transform_crime_spatial(
        crime_dir=args.input,
        boundaries_dir=args.boundaries,
        output_path=args.output,
        by_year_output_path=args.output_by_year,
        buffer_m=args.buffer_m,
        max_postcodes=args.max_postcodes,
        max_files=args.max_files,
    )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -26,6 +26,7 @@ MIN_PRICE = 50_000
|
||||||
EPC_SOURCE_COLUMNS = [
|
EPC_SOURCE_COLUMNS = [
|
||||||
"address",
|
"address",
|
||||||
"postcode",
|
"postcode",
|
||||||
|
"uprn",
|
||||||
"current_energy_rating",
|
"current_energy_rating",
|
||||||
"potential_energy_rating",
|
"potential_energy_rating",
|
||||||
"property_type",
|
"property_type",
|
||||||
|
|
@ -57,6 +58,8 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
|
||||||
raw.select(
|
raw.select(
|
||||||
_clean_string("address").alias("epc_address"),
|
_clean_string("address").alias("epc_address"),
|
||||||
_clean_string("postcode").str.to_uppercase().alias("epc_postcode"),
|
_clean_string("postcode").str.to_uppercase().alias("epc_postcode"),
|
||||||
|
# UPRN keys an exact listing->EPC join downstream (~99% populated).
|
||||||
|
_clean_string("uprn").alias("uprn"),
|
||||||
_clean_string("current_energy_rating")
|
_clean_string("current_energy_rating")
|
||||||
.str.to_uppercase()
|
.str.to_uppercase()
|
||||||
.alias("current_energy_rating"),
|
.alias("current_energy_rating"),
|
||||||
|
|
|
||||||
|
|
@ -48,7 +48,7 @@ _AREA_COLUMNS = [
|
||||||
"lon",
|
"lon",
|
||||||
# Runtime provenance for deciding whether missing coordinates are skippable.
|
# Runtime provenance for deciding whether missing coordinates are skippable.
|
||||||
"ctry25cd",
|
"ctry25cd",
|
||||||
# Keyed lookup for postcode-level side tables (e.g. crime time series).
|
# Join key for LSOA-level side tables (e.g. median age).
|
||||||
"lsoa21",
|
"lsoa21",
|
||||||
# Deprivation
|
# Deprivation
|
||||||
"Income Score",
|
"Income Score",
|
||||||
|
|
@ -81,8 +81,6 @@ _AREA_COLUMNS = [
|
||||||
"Other crime (avg/yr)",
|
"Other crime (avg/yr)",
|
||||||
"Serious crime (avg/yr)",
|
"Serious crime (avg/yr)",
|
||||||
"Minor crime (avg/yr)",
|
"Minor crime (avg/yr)",
|
||||||
"Serious crime per 1k residents (avg/yr)",
|
|
||||||
"Minor crime per 1k residents (avg/yr)",
|
|
||||||
# Amenities
|
# Amenities
|
||||||
"Number of restaurants within 2km",
|
"Number of restaurants within 2km",
|
||||||
"Number of grocery shops and supermarkets within 2km",
|
"Number of grocery shops and supermarkets within 2km",
|
||||||
|
|
@ -742,16 +740,13 @@ _PROPERTY_TYPE_VALUES = [
|
||||||
"Other",
|
"Other",
|
||||||
]
|
]
|
||||||
_EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"]
|
_EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"]
|
||||||
_PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
|
# Listings are matched to EPC certificates and Price-Paid properties first by
|
||||||
_PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
|
# UPRN (exact) and otherwise by fuzzy street-address similarity within the same
|
||||||
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS = 82
|
# postcode. A house number in the listing address is the strong disambiguator,
|
||||||
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS = 96
|
# so a numbered listing may match on a lower street-similarity score than a
|
||||||
_PROPERTY_MATCH_MIN_MARGIN = 4.0
|
# number-less one (which must match the street almost exactly to be trusted).
|
||||||
_DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
|
_LISTING_MATCH_MIN_SCORE_WITH_NUMBERS = 82
|
||||||
_DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
|
_LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 90
|
||||||
_DIRECT_EPC_MATCH_MIN_MARGIN = 4.0
|
|
||||||
_DIRECT_EPC_NEARBY_RADIUS_M = 500.0
|
|
||||||
_DIRECT_EPC_NEAREST_POSTCODES = 40
|
|
||||||
_DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = (
|
_DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = (
|
||||||
("_direct_epc_address", pl.Utf8),
|
("_direct_epc_address", pl.Utf8),
|
||||||
("_direct_current_energy_rating", pl.Utf8),
|
("_direct_current_energy_rating", pl.Utf8),
|
||||||
|
|
@ -764,7 +759,7 @@ _DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = (
|
||||||
("_direct_was_council_house", pl.Utf8),
|
("_direct_was_council_house", pl.Utf8),
|
||||||
("_direct_epc_match_status", pl.Utf8),
|
("_direct_epc_match_status", pl.Utf8),
|
||||||
("_direct_epc_match_score", pl.Float32),
|
("_direct_epc_match_score", pl.Float32),
|
||||||
("_direct_epc_match_margin", pl.Float32),
|
("_direct_epc_match_method", pl.Utf8),
|
||||||
)
|
)
|
||||||
_DIRECT_EPC_RAW_COLUMN_MAP = {
|
_DIRECT_EPC_RAW_COLUMN_MAP = {
|
||||||
"epc_address": "_direct_epc_address",
|
"epc_address": "_direct_epc_address",
|
||||||
|
|
@ -840,46 +835,6 @@ def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _ratio_bonus(
|
|
||||||
left: float | int | None, right: float | int | None, pct: float, cap: float
|
|
||||||
) -> float:
|
|
||||||
if left is None or right is None:
|
|
||||||
return 0.0
|
|
||||||
try:
|
|
||||||
left_f = float(left)
|
|
||||||
right_f = float(right)
|
|
||||||
except (TypeError, ValueError):
|
|
||||||
return 0.0
|
|
||||||
if left_f <= 0 or right_f <= 0:
|
|
||||||
return 0.0
|
|
||||||
rel = abs(left_f - right_f) / max(left_f, right_f)
|
|
||||||
if rel > pct:
|
|
||||||
return 0.0
|
|
||||||
return cap * (1.0 - rel / pct)
|
|
||||||
|
|
||||||
|
|
||||||
def _rooms_bonus(left: int | None, right: int | None) -> float:
|
|
||||||
if left is None or right is None:
|
|
||||||
return 0.0
|
|
||||||
try:
|
|
||||||
diff = abs(int(left) - int(right))
|
|
||||||
except (TypeError, ValueError):
|
|
||||||
return 0.0
|
|
||||||
if diff == 0:
|
|
||||||
return 4.0
|
|
||||||
if diff == 1:
|
|
||||||
return 2.0
|
|
||||||
return 0.0
|
|
||||||
|
|
||||||
|
|
||||||
def _enum_bonus(
|
|
||||||
left: str | None, right: str | None, *, exact: float, mismatch: float
|
|
||||||
) -> float:
|
|
||||||
if not left or not right:
|
|
||||||
return 0.0
|
|
||||||
return exact if left == right else mismatch
|
|
||||||
|
|
||||||
|
|
||||||
def _address_score(query: str, candidate: str | None) -> int:
|
def _address_score(query: str, candidate: str | None) -> int:
|
||||||
if not candidate:
|
if not candidate:
|
||||||
return 0
|
return 0
|
||||||
|
|
@ -893,6 +848,85 @@ def _has_number(address: str | None) -> bool:
|
||||||
return bool(address and _NUMBER_RE.search(address))
|
return bool(address and _NUMBER_RE.search(address))
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_uprn(value: object) -> str | None:
|
||||||
|
"""Canonical UPRN string (digits only) or None.
|
||||||
|
|
||||||
|
UPRNs arrive as strings or ints from the scraper / EPC register; normalise
|
||||||
|
so a listing UPRN and an EPC/property UPRN compare equal regardless of dtype
|
||||||
|
or stray whitespace. A float (e.g. a NaN-bearing column read as Float) is
|
||||||
|
rejected unless it is an exact integer, so "123.0"/"1.5e11" can never be
|
||||||
|
silently mangled into a bogus all-digits key.
|
||||||
|
"""
|
||||||
|
if value is None:
|
||||||
|
return None
|
||||||
|
if isinstance(value, float):
|
||||||
|
if not value.is_integer():
|
||||||
|
return None
|
||||||
|
value = int(value)
|
||||||
|
digits = re.sub(r"\D", "", str(value))
|
||||||
|
return digits or None
|
||||||
|
|
||||||
|
|
||||||
|
def _best_listing_match(
|
||||||
|
listing_uprn: str | None,
|
||||||
|
query: str | None,
|
||||||
|
uprn_index: dict[str, dict],
|
||||||
|
bucket_candidates: list[dict],
|
||||||
|
addressed_fields: list[str],
|
||||||
|
) -> tuple[dict, float, str, str | None] | None:
|
||||||
|
"""Pick the best candidate for a listing.
|
||||||
|
|
||||||
|
Matching is, in order: (1) an exact UPRN equality against the global
|
||||||
|
``uprn_index`` (postcode-independent, so it is robust even when the
|
||||||
|
listing's postcode is slightly off); (2) failing that, the highest
|
||||||
|
fuzzy street-address similarity within the listing's own postcode bucket.
|
||||||
|
No property-attribute heuristics are used — a house number in the listing
|
||||||
|
address gates the fuzzy match (`_numbers_compatible`) and lowers the score
|
||||||
|
threshold; a number-less address must match the street almost exactly.
|
||||||
|
|
||||||
|
``addressed_fields`` names the candidate columns to fuzzy-match against (a
|
||||||
|
candidate may carry both a register and an EPC address). Returns
|
||||||
|
``(candidate, score, method, matched_field)`` or None. ``method`` is
|
||||||
|
"uprn" or "address"; ``matched_field`` is the winning address column (or
|
||||||
|
None for a UPRN match).
|
||||||
|
"""
|
||||||
|
if listing_uprn:
|
||||||
|
hit = uprn_index.get(listing_uprn)
|
||||||
|
if hit is not None:
|
||||||
|
return hit, 100.0, "uprn", None
|
||||||
|
|
||||||
|
if not query:
|
||||||
|
return None
|
||||||
|
|
||||||
|
listing_has_numbers = _has_number(query)
|
||||||
|
best: dict | None = None
|
||||||
|
best_score = 0
|
||||||
|
best_field: str | None = None
|
||||||
|
for candidate in bucket_candidates:
|
||||||
|
for field in addressed_fields:
|
||||||
|
address = candidate.get(field)
|
||||||
|
if not address:
|
||||||
|
continue
|
||||||
|
if listing_has_numbers and not _numbers_compatible(query, address):
|
||||||
|
continue
|
||||||
|
score = _address_score(query, address)
|
||||||
|
if score > best_score:
|
||||||
|
best_score = score
|
||||||
|
best = candidate
|
||||||
|
best_field = field
|
||||||
|
|
||||||
|
if best is None:
|
||||||
|
return None
|
||||||
|
threshold = (
|
||||||
|
_LISTING_MATCH_MIN_SCORE_WITH_NUMBERS
|
||||||
|
if listing_has_numbers
|
||||||
|
else _LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS
|
||||||
|
)
|
||||||
|
if best_score < threshold:
|
||||||
|
return None
|
||||||
|
return best, float(best_score), "address", best_field
|
||||||
|
|
||||||
|
|
||||||
def _load_listings_for_merge(
|
def _load_listings_for_merge(
|
||||||
listings_path: Path, arcgis_path: Path
|
listings_path: Path, arcgis_path: Path
|
||||||
) -> pl.DataFrame:
|
) -> pl.DataFrame:
|
||||||
|
|
@ -908,6 +942,20 @@ def _load_listings_for_merge(
|
||||||
raw = pl.scan_parquet(listings_path).with_row_index("_listing_idx")
|
raw = pl.scan_parquet(listings_path).with_row_index("_listing_idx")
|
||||||
postcode_mapping = build_postcode_mapping(arcgis_path).lazy()
|
postcode_mapping = build_postcode_mapping(arcgis_path).lazy()
|
||||||
|
|
||||||
|
# UPRN is only present on scraped listings that carry it (Zoopla detail
|
||||||
|
# pages); tolerate its absence so older parquets and test fixtures still
|
||||||
|
# load. Digits-only so it compares equal to the EPC register's UPRN.
|
||||||
|
if "UPRN" in raw.collect_schema().names():
|
||||||
|
uprn_digits = pl.col("UPRN").cast(pl.Utf8).str.replace_all(r"\D", "")
|
||||||
|
listing_uprn_expr = (
|
||||||
|
pl.when(uprn_digits.str.len_chars() > 0)
|
||||||
|
.then(uprn_digits)
|
||||||
|
.otherwise(None)
|
||||||
|
.alias("_listing_uprn")
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
listing_uprn_expr = pl.lit(None, dtype=pl.Utf8).alias("_listing_uprn")
|
||||||
|
|
||||||
# Listings parquets occasionally carry Float NaNs (e.g. floor area). Polars
|
# Listings parquets occasionally carry Float NaNs (e.g. floor area). Polars
|
||||||
# treats NaN as distinct from null and the downstream `latest_price /
|
# treats NaN as distinct from null and the downstream `latest_price /
|
||||||
# total_floor_area` cast to Int32 explodes on a NaN, so we normalise floats
|
# total_floor_area` cast to Int32 explodes on a NaN, so we normalise floats
|
||||||
|
|
@ -936,12 +984,14 @@ def _load_listings_for_merge(
|
||||||
"postcode"
|
"postcode"
|
||||||
),
|
),
|
||||||
pl.col("Address per Property Register").alias("pp_address"),
|
pl.col("Address per Property Register").alias("pp_address"),
|
||||||
|
listing_uprn_expr,
|
||||||
*overlay,
|
*overlay,
|
||||||
)
|
)
|
||||||
.select(
|
.select(
|
||||||
"_listing_idx",
|
"_listing_idx",
|
||||||
"postcode",
|
"postcode",
|
||||||
"pp_address",
|
"pp_address",
|
||||||
|
"_listing_uprn",
|
||||||
*[dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES],
|
*[dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES],
|
||||||
)
|
)
|
||||||
.collect(engine="streaming")
|
.collect(engine="streaming")
|
||||||
|
|
@ -972,7 +1022,6 @@ def _empty_direct_epc_matches() -> pl.DataFrame:
|
||||||
|
|
||||||
def _load_direct_epc_candidates(
|
def _load_direct_epc_candidates(
|
||||||
epc_path: Path,
|
epc_path: Path,
|
||||||
arcgis_path: Path,
|
|
||||||
listing_outcodes: list[str],
|
listing_outcodes: list[str],
|
||||||
temp_dir: Path,
|
temp_dir: Path,
|
||||||
) -> pl.DataFrame:
|
) -> pl.DataFrame:
|
||||||
|
|
@ -982,8 +1031,7 @@ def _load_direct_epc_candidates(
|
||||||
"_direct_epc_match_postcode": pl.Utf8,
|
"_direct_epc_match_postcode": pl.Utf8,
|
||||||
"_direct_epc_outcode": pl.Utf8,
|
"_direct_epc_outcode": pl.Utf8,
|
||||||
"_direct_epc_canonical_property_type": pl.Utf8,
|
"_direct_epc_canonical_property_type": pl.Utf8,
|
||||||
"_direct_epc_east": pl.Float64,
|
"_direct_epc_uprn": pl.Utf8,
|
||||||
"_direct_epc_north": pl.Float64,
|
|
||||||
**{column: dtype for column, dtype in _DIRECT_EPC_COLUMNS if column.startswith("_direct_")},
|
**{column: dtype for column, dtype in _DIRECT_EPC_COLUMNS if column.startswith("_direct_")},
|
||||||
}
|
}
|
||||||
if not listing_outcodes:
|
if not listing_outcodes:
|
||||||
|
|
@ -1016,12 +1064,6 @@ def _load_direct_epc_candidates(
|
||||||
.with_columns(pl.lit("Yes").alias("_direct_was_council_house"))
|
.with_columns(pl.lit("Yes").alias("_direct_was_council_house"))
|
||||||
)
|
)
|
||||||
|
|
||||||
arcgis = pl.scan_parquet(arcgis_path).select(
|
|
||||||
normalize_postcode_key(pl.col("pcds")).alias("_direct_epc_match_postcode"),
|
|
||||||
pl.col("east1m").alias("_direct_epc_east"),
|
|
||||||
pl.col("north1m").alias("_direct_epc_north"),
|
|
||||||
)
|
|
||||||
|
|
||||||
return (
|
return (
|
||||||
epc_base.sort("inspection_date", descending=True)
|
epc_base.sort("inspection_date", descending=True)
|
||||||
.group_by("_direct_epc_match_address", "_direct_epc_match_postcode")
|
.group_by("_direct_epc_match_address", "_direct_epc_match_postcode")
|
||||||
|
|
@ -1031,7 +1073,6 @@ def _load_direct_epc_candidates(
|
||||||
on=["_direct_epc_match_address", "_direct_epc_match_postcode"],
|
on=["_direct_epc_match_address", "_direct_epc_match_postcode"],
|
||||||
how="left",
|
how="left",
|
||||||
)
|
)
|
||||||
.join(arcgis, on="_direct_epc_match_postcode", how="left")
|
|
||||||
.with_columns(
|
.with_columns(
|
||||||
_canonical_epc_property_type_expr().alias(
|
_canonical_epc_property_type_expr().alias(
|
||||||
"_direct_epc_canonical_property_type"
|
"_direct_epc_canonical_property_type"
|
||||||
|
|
@ -1046,6 +1087,7 @@ def _load_direct_epc_candidates(
|
||||||
.otherwise(None)
|
.otherwise(None)
|
||||||
.alias("_direct_potential_energy_rating"),
|
.alias("_direct_potential_energy_rating"),
|
||||||
pl.col("epc_address").alias("_direct_epc_address"),
|
pl.col("epc_address").alias("_direct_epc_address"),
|
||||||
|
pl.col("uprn").alias("_direct_epc_uprn"),
|
||||||
pl.col("total_floor_area").alias("_direct_total_floor_area"),
|
pl.col("total_floor_area").alias("_direct_total_floor_area"),
|
||||||
pl.col("number_habitable_rooms").alias(
|
pl.col("number_habitable_rooms").alias(
|
||||||
"_direct_number_habitable_rooms"
|
"_direct_number_habitable_rooms"
|
||||||
|
|
@ -1066,8 +1108,7 @@ def _load_direct_epc_candidates(
|
||||||
"_direct_epc_match_postcode",
|
"_direct_epc_match_postcode",
|
||||||
"_direct_epc_outcode",
|
"_direct_epc_outcode",
|
||||||
"_direct_epc_canonical_property_type",
|
"_direct_epc_canonical_property_type",
|
||||||
"_direct_epc_east",
|
"_direct_epc_uprn",
|
||||||
"_direct_epc_north",
|
|
||||||
"_direct_epc_address",
|
"_direct_epc_address",
|
||||||
"_direct_current_energy_rating",
|
"_direct_current_energy_rating",
|
||||||
"_direct_potential_energy_rating",
|
"_direct_potential_energy_rating",
|
||||||
|
|
@ -1083,7 +1124,14 @@ def _load_direct_epc_candidates(
|
||||||
|
|
||||||
|
|
||||||
def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
|
def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
|
||||||
match = listings.with_columns(
|
"""Add the normalised address/postcode/outcode keys used to match listings.
|
||||||
|
|
||||||
|
Listings are matched to EPC certificates and properties by UPRN and by
|
||||||
|
fuzzy street address within their (now accurate, detail-page-sourced)
|
||||||
|
postcode — never by coordinate proximity — so no projected easting/northing
|
||||||
|
is computed here. `_listing_uprn` flows through from the loaded listings.
|
||||||
|
"""
|
||||||
|
return listings.with_columns(
|
||||||
normalize_address_key(pl.col("pp_address")).alias("_listing_match_address"),
|
normalize_address_key(pl.col("pp_address")).alias("_listing_match_address"),
|
||||||
normalize_postcode_key(pl.col("postcode")).alias("_listing_match_postcode"),
|
normalize_postcode_key(pl.col("postcode")).alias("_listing_match_postcode"),
|
||||||
).with_columns(
|
).with_columns(
|
||||||
|
|
@ -1092,21 +1140,6 @@ def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
|
||||||
.alias("_listing_outcode")
|
.alias("_listing_outcode")
|
||||||
)
|
)
|
||||||
|
|
||||||
if match.is_empty():
|
|
||||||
return match.with_columns(
|
|
||||||
pl.Series("_listing_east", [], dtype=pl.Float64),
|
|
||||||
pl.Series("_listing_north", [], dtype=pl.Float64),
|
|
||||||
)
|
|
||||||
|
|
||||||
transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
|
|
||||||
east, north = transformer.transform(
|
|
||||||
match["_actual_lon"].to_numpy(), match["_actual_lat"].to_numpy()
|
|
||||||
)
|
|
||||||
return match.with_columns(
|
|
||||||
pl.Series("_listing_east", east, dtype=pl.Float64),
|
|
||||||
pl.Series("_listing_north", north, dtype=pl.Float64),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _optional_lazy_col(
|
def _optional_lazy_col(
|
||||||
schema: pl.Schema, column: str, dtype: pl.DataType
|
schema: pl.Schema, column: str, dtype: pl.DataType
|
||||||
|
|
@ -1122,8 +1155,7 @@ def _listing_property_match_schema() -> dict[str, pl.DataType]:
|
||||||
"_matched_postcode": pl.Utf8,
|
"_matched_postcode": pl.Utf8,
|
||||||
"_matched_pp_address": pl.Utf8,
|
"_matched_pp_address": pl.Utf8,
|
||||||
"_property_match_score": pl.Float32,
|
"_property_match_score": pl.Float32,
|
||||||
"_property_match_address_score": pl.Int32,
|
"_property_match_method": pl.Utf8,
|
||||||
"_property_match_margin": pl.Float32,
|
|
||||||
"_property_match_field": pl.Utf8,
|
"_property_match_field": pl.Utf8,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1139,11 +1171,8 @@ def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame:
|
||||||
pl.col("postcode").cast(pl.Utf8).alias("postcode"),
|
pl.col("postcode").cast(pl.Utf8).alias("postcode"),
|
||||||
pl.col("pp_address").cast(pl.Utf8).alias("pp_address"),
|
pl.col("pp_address").cast(pl.Utf8).alias("pp_address"),
|
||||||
_optional_lazy_col(schema, "epc_address", pl.Utf8),
|
_optional_lazy_col(schema, "epc_address", pl.Utf8),
|
||||||
_optional_lazy_col(schema, "pp_property_type", pl.Utf8),
|
# UPRN keys the exact match; present once epc_pp is rebuilt with it.
|
||||||
_optional_lazy_col(schema, "duration", pl.Utf8),
|
_optional_lazy_col(schema, "uprn", pl.Utf8),
|
||||||
_optional_lazy_col(schema, "total_floor_area", pl.Float64),
|
|
||||||
_optional_lazy_col(schema, "number_habitable_rooms", pl.Int16),
|
|
||||||
_optional_lazy_col(schema, "latest_price", pl.Int64),
|
|
||||||
)
|
)
|
||||||
.with_row_index("_property_row")
|
.with_row_index("_property_row")
|
||||||
.with_columns(
|
.with_columns(
|
||||||
|
|
@ -1167,110 +1196,52 @@ def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _property_candidates_by_postcode(
|
def _index_candidates(
|
||||||
candidates: pl.DataFrame,
|
candidates: pl.DataFrame, postcode_key: str, uprn_key: str
|
||||||
) -> dict[str, list[dict]]:
|
) -> tuple[dict[str, list[dict]], dict[str, dict]]:
|
||||||
|
"""Index candidate rows for matching, in a single pass over the frame.
|
||||||
|
|
||||||
|
Returns ``(postcode_buckets, uprn_index)``. The postcode buckets drive the
|
||||||
|
fuzzy street-address match; the UPRN index drives the exact match and is
|
||||||
|
postcode-independent, so it still resolves when a listing's postcode is
|
||||||
|
slightly off.
|
||||||
|
"""
|
||||||
buckets: dict[str, list[dict]] = {}
|
buckets: dict[str, list[dict]] = {}
|
||||||
|
uprn_index: dict[str, dict] = {}
|
||||||
for row in candidates.iter_rows(named=True):
|
for row in candidates.iter_rows(named=True):
|
||||||
postcode = row.get("_property_match_postcode")
|
postcode = row.get(postcode_key)
|
||||||
if postcode:
|
if postcode:
|
||||||
buckets.setdefault(postcode, []).append(row)
|
buckets.setdefault(postcode, []).append(row)
|
||||||
return buckets
|
uprn = _normalize_uprn(row.get(uprn_key))
|
||||||
|
if uprn and uprn not in uprn_index:
|
||||||
|
uprn_index[uprn] = row
|
||||||
|
return buckets, uprn_index
|
||||||
|
|
||||||
|
|
||||||
def _best_listing_property_candidate(
|
def _best_listing_property_candidate(
|
||||||
listing: dict, candidates: list[dict]
|
listing: dict, uprn_index: dict[str, dict], candidates: list[dict]
|
||||||
) -> dict | None:
|
) -> dict | None:
|
||||||
query = listing.get("_listing_match_address")
|
result = _best_listing_match(
|
||||||
if not query:
|
listing.get("_listing_uprn"),
|
||||||
return None
|
listing.get("_listing_match_address"),
|
||||||
|
uprn_index,
|
||||||
listing_has_numbers = _has_number(query)
|
candidates,
|
||||||
scored: list[tuple[float, int, dict, str]] = []
|
["_property_match_address", "_property_epc_match_address"],
|
||||||
for candidate in candidates:
|
|
||||||
register_address = candidate.get("_property_match_address")
|
|
||||||
epc_address = candidate.get("_property_epc_match_address")
|
|
||||||
register_numbers_compatible = bool(
|
|
||||||
register_address and _numbers_compatible(query, register_address)
|
|
||||||
)
|
|
||||||
epc_numbers_compatible = bool(
|
|
||||||
epc_address and _numbers_compatible(query, epc_address)
|
|
||||||
)
|
|
||||||
if not (register_numbers_compatible or epc_numbers_compatible):
|
|
||||||
continue
|
|
||||||
|
|
||||||
register_score = _address_score(query, register_address)
|
|
||||||
epc_score = _address_score(query, epc_address)
|
|
||||||
base_score = max(register_score, epc_score)
|
|
||||||
if base_score == 0:
|
|
||||||
continue
|
|
||||||
|
|
||||||
score = float(base_score)
|
|
||||||
score += _enum_bonus(
|
|
||||||
listing.get("_actual_property_type"),
|
|
||||||
candidate.get("pp_property_type"),
|
|
||||||
exact=7.0,
|
|
||||||
mismatch=-8.0,
|
|
||||||
)
|
|
||||||
score += _enum_bonus(
|
|
||||||
listing.get("_actual_leasehold_freehold"),
|
|
||||||
candidate.get("duration"),
|
|
||||||
exact=3.0,
|
|
||||||
mismatch=-3.0,
|
|
||||||
)
|
|
||||||
score += _ratio_bonus(
|
|
||||||
listing.get("_actual_total_floor_area"),
|
|
||||||
candidate.get("total_floor_area"),
|
|
||||||
pct=0.15,
|
|
||||||
cap=8.0,
|
|
||||||
)
|
|
||||||
score += _rooms_bonus(
|
|
||||||
listing.get("_actual_number_habitable_rooms"),
|
|
||||||
candidate.get("number_habitable_rooms"),
|
|
||||||
)
|
|
||||||
score += _ratio_bonus(
|
|
||||||
listing.get("_actual_asking_price"),
|
|
||||||
candidate.get("latest_price"),
|
|
||||||
pct=0.25,
|
|
||||||
cap=3.0,
|
|
||||||
)
|
|
||||||
matched_field = (
|
|
||||||
"pp_address" if register_score >= epc_score else "epc_address"
|
|
||||||
)
|
|
||||||
scored.append((score, base_score, candidate, matched_field))
|
|
||||||
|
|
||||||
if not scored:
|
|
||||||
return None
|
|
||||||
scored.sort(key=lambda item: item[0], reverse=True)
|
|
||||||
top = scored[0]
|
|
||||||
runner_up = scored[1][0] if len(scored) > 1 else None
|
|
||||||
margin = top[0] - runner_up if runner_up is not None else top[0]
|
|
||||||
score_threshold = (
|
|
||||||
_PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS
|
|
||||||
if listing_has_numbers
|
|
||||||
else _PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS
|
|
||||||
)
|
)
|
||||||
address_threshold = (
|
if result is None:
|
||||||
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS
|
|
||||||
if listing_has_numbers
|
|
||||||
else _PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS
|
|
||||||
)
|
|
||||||
if (
|
|
||||||
top[0] < score_threshold
|
|
||||||
or top[1] < address_threshold
|
|
||||||
or margin < _PROPERTY_MATCH_MIN_MARGIN
|
|
||||||
):
|
|
||||||
return None
|
return None
|
||||||
|
candidate, score, method, field = result
|
||||||
candidate = top[2]
|
matched_field = {
|
||||||
|
"_property_match_address": "pp_address",
|
||||||
|
"_property_epc_match_address": "epc_address",
|
||||||
|
}.get(field, method)
|
||||||
return {
|
return {
|
||||||
"_listing_idx": listing["_listing_idx"],
|
"_listing_idx": listing["_listing_idx"],
|
||||||
"_matched_postcode": candidate.get("postcode"),
|
"_matched_postcode": candidate.get("postcode"),
|
||||||
"_matched_pp_address": candidate.get("pp_address"),
|
"_matched_pp_address": candidate.get("pp_address"),
|
||||||
"_property_match_score": round(top[0], 1),
|
"_property_match_score": round(score, 1),
|
||||||
"_property_match_address_score": top[1],
|
"_property_match_method": method,
|
||||||
"_property_match_margin": round(margin, 1),
|
"_property_match_field": matched_field,
|
||||||
"_property_match_field": top[3],
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1280,23 +1251,32 @@ def _match_listing_properties(
|
||||||
if listing_matches.is_empty() or property_candidates.is_empty():
|
if listing_matches.is_empty() or property_candidates.is_empty():
|
||||||
return _empty_listing_property_matches()
|
return _empty_listing_property_matches()
|
||||||
|
|
||||||
buckets = _property_candidates_by_postcode(property_candidates)
|
buckets, uprn_index = _index_candidates(
|
||||||
|
property_candidates, "_property_match_postcode", "uprn"
|
||||||
|
)
|
||||||
best_matches = []
|
best_matches = []
|
||||||
for listing in listing_matches.iter_rows(named=True):
|
for listing in listing_matches.iter_rows(named=True):
|
||||||
postcode = listing.get("_listing_match_postcode")
|
postcode = listing.get("_listing_match_postcode")
|
||||||
if not postcode:
|
bucket = buckets.get(postcode, []) if postcode else []
|
||||||
continue
|
match = _best_listing_property_candidate(listing, uprn_index, bucket)
|
||||||
match = _best_listing_property_candidate(listing, buckets.get(postcode, []))
|
|
||||||
if match is not None:
|
if match is not None:
|
||||||
best_matches.append(match)
|
best_matches.append(match)
|
||||||
|
|
||||||
if not best_matches:
|
if not best_matches:
|
||||||
return _empty_listing_property_matches()
|
return _empty_listing_property_matches()
|
||||||
|
|
||||||
|
# When two listings claim the same property, keep the most authoritative
|
||||||
|
# match: an exact UPRN match always wins over a fuzzy address match (both can
|
||||||
|
# score 100, so method must break the tie before score and listing index).
|
||||||
matches = pl.DataFrame(best_matches, schema=_listing_property_match_schema())
|
matches = pl.DataFrame(best_matches, schema=_listing_property_match_schema())
|
||||||
return (
|
return (
|
||||||
matches.sort(
|
matches.sort(
|
||||||
["_property_match_score", "_listing_idx"], descending=[True, False]
|
[
|
||||||
|
pl.col("_property_match_method") == "uprn",
|
||||||
|
"_property_match_score",
|
||||||
|
"_listing_idx",
|
||||||
|
],
|
||||||
|
descending=[True, True, False],
|
||||||
)
|
)
|
||||||
.unique(
|
.unique(
|
||||||
["_matched_postcode", "_matched_pp_address"],
|
["_matched_postcode", "_matched_pp_address"],
|
||||||
|
|
@ -1307,133 +1287,19 @@ def _match_listing_properties(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, list[dict]]:
|
def _best_direct_epc_candidate(
|
||||||
buckets: dict[str, list[dict]] = {}
|
listing: dict, uprn_index: dict[str, dict], candidates: list[dict]
|
||||||
for row in candidates.iter_rows(named=True):
|
) -> dict | None:
|
||||||
postcode = row.get("_direct_epc_match_postcode")
|
result = _best_listing_match(
|
||||||
if postcode:
|
listing.get("_listing_uprn"),
|
||||||
buckets.setdefault(postcode, []).append(row)
|
listing.get("_listing_match_address"),
|
||||||
return buckets
|
uprn_index,
|
||||||
|
candidates,
|
||||||
|
["_direct_epc_match_address"],
|
||||||
def _epc_postcode_tree(
|
|
||||||
candidates: pl.DataFrame,
|
|
||||||
) -> tuple[cKDTree | None, list[str]]:
|
|
||||||
postcode_points = (
|
|
||||||
candidates.select(
|
|
||||||
"_direct_epc_match_postcode",
|
|
||||||
"_direct_epc_east",
|
|
||||||
"_direct_epc_north",
|
|
||||||
)
|
|
||||||
.drop_nulls()
|
|
||||||
.filter(
|
|
||||||
pl.col("_direct_epc_east").is_finite()
|
|
||||||
& pl.col("_direct_epc_north").is_finite()
|
|
||||||
)
|
|
||||||
.unique("_direct_epc_match_postcode")
|
|
||||||
)
|
)
|
||||||
if postcode_points.is_empty():
|
if result is None:
|
||||||
return None, []
|
|
||||||
coords = np.column_stack(
|
|
||||||
[
|
|
||||||
postcode_points["_direct_epc_east"].to_numpy(),
|
|
||||||
postcode_points["_direct_epc_north"].to_numpy(),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
return cKDTree(coords), postcode_points["_direct_epc_match_postcode"].to_list()
|
|
||||||
|
|
||||||
|
|
||||||
def _candidate_postcodes_for_listing(
|
|
||||||
listing: dict,
|
|
||||||
postcode_tree: cKDTree | None,
|
|
||||||
postcode_values: list[str],
|
|
||||||
) -> list[str]:
|
|
||||||
postcodes: list[str] = []
|
|
||||||
exact = listing.get("_listing_match_postcode")
|
|
||||||
if exact:
|
|
||||||
postcodes.append(exact)
|
|
||||||
|
|
||||||
if postcode_tree is None:
|
|
||||||
return postcodes
|
|
||||||
|
|
||||||
east = listing.get("_listing_east")
|
|
||||||
north = listing.get("_listing_north")
|
|
||||||
try:
|
|
||||||
east_f = float(east)
|
|
||||||
north_f = float(north)
|
|
||||||
except (TypeError, ValueError):
|
|
||||||
return postcodes
|
|
||||||
if not np.isfinite(east_f) or not np.isfinite(north_f):
|
|
||||||
return postcodes
|
|
||||||
|
|
||||||
k = min(_DIRECT_EPC_NEAREST_POSTCODES, len(postcode_values))
|
|
||||||
distances, indices = postcode_tree.query(
|
|
||||||
[east_f, north_f],
|
|
||||||
k=k,
|
|
||||||
distance_upper_bound=_DIRECT_EPC_NEARBY_RADIUS_M,
|
|
||||||
)
|
|
||||||
distances = np.atleast_1d(distances)
|
|
||||||
indices = np.atleast_1d(indices)
|
|
||||||
seen = set(postcodes)
|
|
||||||
for distance, idx in zip(distances, indices, strict=False):
|
|
||||||
if not np.isfinite(distance) or idx >= len(postcode_values):
|
|
||||||
continue
|
|
||||||
postcode = postcode_values[int(idx)]
|
|
||||||
if postcode not in seen:
|
|
||||||
postcodes.append(postcode)
|
|
||||||
seen.add(postcode)
|
|
||||||
return postcodes
|
|
||||||
|
|
||||||
|
|
||||||
def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None:
|
|
||||||
query = listing.get("_listing_match_address")
|
|
||||||
if not query:
|
|
||||||
return None
|
return None
|
||||||
|
candidate, score, method, _field = result
|
||||||
listing_has_numbers = _has_number(query)
|
|
||||||
scored: list[tuple[float, int, dict]] = []
|
|
||||||
for candidate in candidates:
|
|
||||||
address = candidate.get("_direct_epc_match_address")
|
|
||||||
if listing_has_numbers and not _numbers_compatible(query, address or ""):
|
|
||||||
continue
|
|
||||||
base_score = _address_score(query, address)
|
|
||||||
if base_score == 0:
|
|
||||||
continue
|
|
||||||
|
|
||||||
score = float(base_score)
|
|
||||||
score += _enum_bonus(
|
|
||||||
listing.get("_actual_property_type"),
|
|
||||||
candidate.get("_direct_epc_canonical_property_type"),
|
|
||||||
exact=6.0,
|
|
||||||
mismatch=-6.0,
|
|
||||||
)
|
|
||||||
score += _ratio_bonus(
|
|
||||||
listing.get("_actual_total_floor_area"),
|
|
||||||
candidate.get("_direct_total_floor_area"),
|
|
||||||
pct=0.12,
|
|
||||||
cap=8.0,
|
|
||||||
)
|
|
||||||
score += _rooms_bonus(
|
|
||||||
listing.get("_actual_number_habitable_rooms"),
|
|
||||||
candidate.get("_direct_number_habitable_rooms"),
|
|
||||||
)
|
|
||||||
scored.append((score, base_score, candidate))
|
|
||||||
|
|
||||||
if not scored:
|
|
||||||
return None
|
|
||||||
scored.sort(key=lambda item: item[0], reverse=True)
|
|
||||||
top = scored[0]
|
|
||||||
runner_up = scored[1][0] if len(scored) > 1 else None
|
|
||||||
margin = top[0] - runner_up if runner_up is not None else top[0]
|
|
||||||
threshold = (
|
|
||||||
_DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS
|
|
||||||
if listing_has_numbers
|
|
||||||
else _DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS
|
|
||||||
)
|
|
||||||
if top[0] < threshold or margin < _DIRECT_EPC_MATCH_MIN_MARGIN:
|
|
||||||
return None
|
|
||||||
|
|
||||||
candidate = top[2]
|
|
||||||
return {
|
return {
|
||||||
"_listing_idx": listing["_listing_idx"],
|
"_listing_idx": listing["_listing_idx"],
|
||||||
"_direct_epc_address": candidate.get("_direct_epc_address"),
|
"_direct_epc_address": candidate.get("_direct_epc_address"),
|
||||||
|
|
@ -1452,8 +1318,8 @@ def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict |
|
||||||
),
|
),
|
||||||
"_direct_was_council_house": candidate.get("_direct_was_council_house"),
|
"_direct_was_council_house": candidate.get("_direct_was_council_house"),
|
||||||
"_direct_epc_match_status": "matched",
|
"_direct_epc_match_status": "matched",
|
||||||
"_direct_epc_match_score": round(top[0], 1),
|
"_direct_epc_match_score": round(score, 1),
|
||||||
"_direct_epc_match_margin": round(margin, 1),
|
"_direct_epc_match_method": method,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1463,25 +1329,14 @@ def _match_direct_epc(
|
||||||
if listing_matches.is_empty() or epc_candidates.is_empty():
|
if listing_matches.is_empty() or epc_candidates.is_empty():
|
||||||
return _empty_direct_epc_matches()
|
return _empty_direct_epc_matches()
|
||||||
|
|
||||||
buckets = _epc_candidates_by_postcode(epc_candidates)
|
buckets, uprn_index = _index_candidates(
|
||||||
postcode_tree, postcode_values = _epc_postcode_tree(epc_candidates)
|
epc_candidates, "_direct_epc_match_postcode", "_direct_epc_uprn"
|
||||||
|
)
|
||||||
matches = []
|
matches = []
|
||||||
for listing in listing_matches.iter_rows(named=True):
|
for listing in listing_matches.iter_rows(named=True):
|
||||||
candidate_postcodes = _candidate_postcodes_for_listing(
|
postcode = listing.get("_listing_match_postcode")
|
||||||
listing, postcode_tree, postcode_values
|
bucket = buckets.get(postcode, []) if postcode else []
|
||||||
)
|
match = _best_direct_epc_candidate(listing, uprn_index, bucket)
|
||||||
candidate_rows: list[dict] = []
|
|
||||||
seen_rows: set[int] = set()
|
|
||||||
for postcode in candidate_postcodes:
|
|
||||||
for candidate in buckets.get(postcode, []):
|
|
||||||
row = candidate.get("_direct_epc_row")
|
|
||||||
if row in seen_rows:
|
|
||||||
continue
|
|
||||||
candidate_rows.append(candidate)
|
|
||||||
if row is not None:
|
|
||||||
seen_rows.add(row)
|
|
||||||
match = _best_direct_epc_candidate(listing, candidate_rows)
|
|
||||||
if match is not None:
|
if match is not None:
|
||||||
matches.append(match)
|
matches.append(match)
|
||||||
|
|
||||||
|
|
@ -1493,7 +1348,6 @@ def _match_direct_epc(
|
||||||
def _enrich_listings_with_direct_epc(
|
def _enrich_listings_with_direct_epc(
|
||||||
listings: pl.DataFrame,
|
listings: pl.DataFrame,
|
||||||
epc_path: Path | None,
|
epc_path: Path | None,
|
||||||
arcgis_path: Path,
|
|
||||||
) -> pl.DataFrame:
|
) -> pl.DataFrame:
|
||||||
if epc_path is None:
|
if epc_path is None:
|
||||||
return _ensure_direct_epc_columns(listings)
|
return _ensure_direct_epc_columns(listings)
|
||||||
|
|
@ -1513,7 +1367,7 @@ def _enrich_listings_with_direct_epc(
|
||||||
prefix="direct_listing_epc_", dir=local_tmp_dir()
|
prefix="direct_listing_epc_", dir=local_tmp_dir()
|
||||||
) as tmpdir:
|
) as tmpdir:
|
||||||
epc_candidates = _load_direct_epc_candidates(
|
epc_candidates = _load_direct_epc_candidates(
|
||||||
epc_path, arcgis_path, listing_outcodes, Path(tmpdir)
|
epc_path, listing_outcodes, Path(tmpdir)
|
||||||
)
|
)
|
||||||
print(f"Direct listing EPC candidates: {epc_candidates.height}")
|
print(f"Direct listing EPC candidates: {epc_candidates.height}")
|
||||||
direct_matches = _match_direct_epc(listing_matches, epc_candidates)
|
direct_matches = _match_direct_epc(listing_matches, epc_candidates)
|
||||||
|
|
@ -1604,7 +1458,7 @@ def _integrate_listings(
|
||||||
"""
|
"""
|
||||||
listings = _load_listings_for_merge(listings_path, arcgis_path)
|
listings = _load_listings_for_merge(listings_path, arcgis_path)
|
||||||
print(f"Listings loaded: {listings.height}")
|
print(f"Listings loaded: {listings.height}")
|
||||||
listings = _enrich_listings_with_direct_epc(listings, epc_path, arcgis_path)
|
listings = _enrich_listings_with_direct_epc(listings, epc_path)
|
||||||
|
|
||||||
overlay_columns = [dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES]
|
overlay_columns = [dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES]
|
||||||
listing_attachment_columns = [
|
listing_attachment_columns = [
|
||||||
|
|
@ -1660,6 +1514,14 @@ def _finalize_listings(df: pl.DataFrame) -> pl.DataFrame:
|
||||||
"""Project the post-rename wide frame down to enriched-listing rows."""
|
"""Project the post-rename wide frame down to enriched-listing rows."""
|
||||||
df = df.filter(pl.col(_LISTING_FLAG_COLUMN).is_not_null())
|
df = df.filter(pl.col(_LISTING_FLAG_COLUMN).is_not_null())
|
||||||
|
|
||||||
|
# A matched listing's overlay attaches to every wide row sharing its
|
||||||
|
# (postcode, pp_address). The terminated-postcode remap can collapse several
|
||||||
|
# distinct wide rows onto one such key, which would otherwise emit one duplicate
|
||||||
|
# listing per collapsed row. Each listing matches exactly one (postcode,
|
||||||
|
# pp_address) and each seed row carries a unique URL, so keeping a single row per
|
||||||
|
# listing URL collapses only that fan-out and never merges distinct listings.
|
||||||
|
df = df.unique(subset=[_LISTING_FLAG_COLUMN], keep="first", maintain_order=True)
|
||||||
|
|
||||||
df = df.with_columns(
|
df = df.with_columns(
|
||||||
pl.col("_actual_listing_url").alias("Listing URL"),
|
pl.col("_actual_listing_url").alias("Listing URL"),
|
||||||
pl.col("_actual_listing_date").alias("Listing date"),
|
pl.col("_actual_listing_date").alias("Listing date"),
|
||||||
|
|
@ -1750,7 +1612,6 @@ def _build(
|
||||||
broadband_path: Path,
|
broadband_path: Path,
|
||||||
conservation_areas_path: Path,
|
conservation_areas_path: Path,
|
||||||
rental_prices_path: Path,
|
rental_prices_path: Path,
|
||||||
lsoa_population_path: Path,
|
|
||||||
median_age_path: Path,
|
median_age_path: Path,
|
||||||
election_results_path: Path,
|
election_results_path: Path,
|
||||||
tree_density_postcodes_path: Path | None = None,
|
tree_density_postcodes_path: Path | None = None,
|
||||||
|
|
@ -1881,8 +1742,10 @@ def _build(
|
||||||
how="left",
|
how="left",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Crime is counted spatially per postcode (incidents within 50m of the
|
||||||
|
# postcode boundary), so it joins on postcode rather than LSOA.
|
||||||
crime = pl.scan_parquet(crime_path)
|
crime = pl.scan_parquet(crime_path)
|
||||||
wide = wide.join(crime, left_on="lsoa21", right_on="LSOA code", how="left")
|
wide = wide.join(crime, on="postcode", how="left")
|
||||||
|
|
||||||
wide = wide.with_columns(
|
wide = wide.with_columns(
|
||||||
pl.sum_horizontal(
|
pl.sum_horizontal(
|
||||||
|
|
@ -1905,17 +1768,6 @@ def _build(
|
||||||
).alias("minor_crime_avg_yr"),
|
).alias("minor_crime_avg_yr"),
|
||||||
)
|
)
|
||||||
|
|
||||||
lsoa_pop = pl.scan_parquet(lsoa_population_path)
|
|
||||||
wide = wide.join(lsoa_pop, on="lsoa21", how="left")
|
|
||||||
wide = wide.with_columns(
|
|
||||||
pl.when(pl.col("population") > 0)
|
|
||||||
.then((pl.col("serious_crime_avg_yr") / pl.col("population") * 1000).round(1))
|
|
||||||
.alias("serious_crime_per_1k"),
|
|
||||||
pl.when(pl.col("population") > 0)
|
|
||||||
.then((pl.col("minor_crime_avg_yr") / pl.col("population") * 1000).round(1))
|
|
||||||
.alias("minor_crime_per_1k"),
|
|
||||||
).drop("population")
|
|
||||||
|
|
||||||
median_age = pl.scan_parquet(median_age_path)
|
median_age = pl.scan_parquet(median_age_path)
|
||||||
wide = wide.join(median_age, on="lsoa21", how="left")
|
wide = wide.join(median_age, on="lsoa21", how="left")
|
||||||
|
|
||||||
|
|
@ -2082,8 +1934,6 @@ def _build(
|
||||||
"max_download_speed": "Max available download speed (Mbps)",
|
"max_download_speed": "Max available download speed (Mbps)",
|
||||||
"serious_crime_avg_yr": "Serious crime (avg/yr)",
|
"serious_crime_avg_yr": "Serious crime (avg/yr)",
|
||||||
"minor_crime_avg_yr": "Minor crime (avg/yr)",
|
"minor_crime_avg_yr": "Minor crime (avg/yr)",
|
||||||
"serious_crime_per_1k": "Serious crime per 1k residents (avg/yr)",
|
|
||||||
"minor_crime_per_1k": "Minor crime per 1k residents (avg/yr)",
|
|
||||||
"mean_monthly_rent": "Estimated monthly rent",
|
"mean_monthly_rent": "Estimated monthly rent",
|
||||||
"floor_height": "Interior height (m)",
|
"floor_height": "Interior height (m)",
|
||||||
"was_council_house": "Former council house",
|
"was_council_house": "Former council house",
|
||||||
|
|
@ -2189,12 +2039,6 @@ def main():
|
||||||
required=True,
|
required=True,
|
||||||
help="ONS rental prices by LA and bedroom count parquet file",
|
help="ONS rental prices by LA and bedroom count parquet file",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
|
||||||
"--lsoa-population",
|
|
||||||
type=Path,
|
|
||||||
required=True,
|
|
||||||
help="Census 2021 population by LSOA parquet file",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--median-age",
|
"--median-age",
|
||||||
type=Path,
|
type=Path,
|
||||||
|
|
@ -2279,7 +2123,6 @@ def main():
|
||||||
broadband_path=args.broadband,
|
broadband_path=args.broadband,
|
||||||
conservation_areas_path=args.conservation_areas,
|
conservation_areas_path=args.conservation_areas,
|
||||||
rental_prices_path=args.rental_prices,
|
rental_prices_path=args.rental_prices,
|
||||||
lsoa_population_path=args.lsoa_population,
|
|
||||||
median_age_path=args.median_age,
|
median_age_path=args.median_age,
|
||||||
election_results_path=args.election_results,
|
election_results_path=args.election_results,
|
||||||
tree_density_postcodes_path=args.tree_density_postcodes,
|
tree_density_postcodes_path=args.tree_density_postcodes,
|
||||||
|
|
|
||||||
|
|
@ -376,7 +376,7 @@ def main() -> None:
|
||||||
"--pmtiles-bin", type=Path, default=Path("property-data/pmtiles")
|
"--pmtiles-bin", type=Path, default=Path("property-data/pmtiles")
|
||||||
)
|
)
|
||||||
parser.add_argument("--pmtiles-version", default="1.22.3")
|
parser.add_argument("--pmtiles-version", default="1.22.3")
|
||||||
parser.add_argument("--min-zoom", type=int, default=13)
|
parser.add_argument("--min-zoom", type=int, default=12)
|
||||||
parser.add_argument("--max-zoom", type=int, default=14)
|
parser.add_argument("--max-zoom", type=int, default=14)
|
||||||
parser.add_argument("--tile-size", type=int, default=256)
|
parser.add_argument("--tile-size", type=int, default=256)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
|
||||||
|
|
@ -22,6 +22,12 @@ def main() -> None:
|
||||||
description="Generate postcode boundary polygons from OA + INSPIRE + UPRN data"
|
description="Generate postcode boundary polygons from OA + INSPIRE + UPRN data"
|
||||||
)
|
)
|
||||||
parser.add_argument("--uprn", type=Path, required=True, help="UPRN lookup parquet")
|
parser.add_argument("--uprn", type=Path, required=True, help="UPRN lookup parquet")
|
||||||
|
parser.add_argument(
|
||||||
|
"--arcgis",
|
||||||
|
type=Path,
|
||||||
|
default=None,
|
||||||
|
help="Optional ArcGIS postcode parquet used to remap terminated postcodes",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--oa-boundaries", type=Path, required=True, help="OA boundaries GeoPackage"
|
"--oa-boundaries", type=Path, required=True, help="OA boundaries GeoPackage"
|
||||||
)
|
)
|
||||||
|
|
@ -46,7 +52,7 @@ def main() -> None:
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
|
|
||||||
oa_geoms = load_oa_boundaries(args.oa_boundaries)
|
oa_geoms = load_oa_boundaries(args.oa_boundaries)
|
||||||
uprn_df, uprn_offsets = load_uprns(args.uprn)
|
uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
|
||||||
|
|
||||||
# Phase 2: Parse/load INSPIRE
|
# Phase 2: Parse/load INSPIRE
|
||||||
print()
|
print()
|
||||||
|
|
|
||||||
105
pipeline/transform/postcode_boundaries/loader.py
Normal file
105
pipeline/transform/postcode_boundaries/loader.py
Normal file
|
|
@ -0,0 +1,105 @@
|
||||||
|
"""Load per-district postcode boundary GeoJSONs as EPSG:27700 polygons.
|
||||||
|
|
||||||
|
The postcode-boundary pipeline (:mod:`output`) writes one WGS84 GeoJSON per
|
||||||
|
postcode district under ``units/{district}.geojson``, each feature carrying a
|
||||||
|
``postcodes`` (full unit string, e.g. "AL1 1AG") property. Spatial transforms
|
||||||
|
that test points against postcode geometry want those polygons back in British
|
||||||
|
National Grid (EPSG:27700) so buffers/distances are in metres.
|
||||||
|
|
||||||
|
:func:`load_postcode_polygons` reads the files, reprojects WGS84→27700, repairs
|
||||||
|
invalid rings, and returns parallel ``(postcodes, polygons)`` arrays sorted by
|
||||||
|
postcode so callers can use the array index as a stable postcode id -- the same
|
||||||
|
"buffer index == postcode index" convention used by ``tree_density``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import shapely
|
||||||
|
from pyproj import Transformer
|
||||||
|
|
||||||
|
|
||||||
|
def _read_district(
    path: Path, transformer: Transformer
) -> tuple[np.ndarray, np.ndarray]:
    """Read one district GeoJSON and return ``(postcodes, geometries)``.

    Args:
        path: District GeoJSON file; each feature must carry a
            ``properties.postcodes`` value and a ``geometry``.
        transformer: pyproj transformer applied to every vertex. The loader in
            this module passes an EPSG:4326 -> EPSG:27700 transformer.

    Returns:
        Two parallel object arrays: the ``postcodes`` property of each feature
        and the matching reprojected geometry. Both are empty when the file has
        no features.
    """
    with path.open() as handle:
        features = json.load(handle).get("features", [])

    # Nothing to reproject: hand back two empty object arrays.
    if len(features) == 0:
        no_postcodes = np.empty(0, dtype=object)
        no_geoms = np.empty(0, dtype=object)
        return no_postcodes, no_geoms

    postcode_values = [item["properties"]["postcodes"] for item in features]
    postcodes = np.array(postcode_values, dtype=object)

    # shapely parses GeoJSON text, so re-serialise each geometry mapping.
    encoded_geoms = [json.dumps(item["geometry"]) for item in features]
    geoms = shapely.from_geojson(np.array(encoded_geoms, dtype=object))

    # One pyproj call covers every vertex of every geometry; the transformed
    # coordinates are then written back into the geometries.
    vertices = shapely.get_coordinates(geoms)
    if vertices.size > 0:
        xs, ys = transformer.transform(vertices[:, 0], vertices[:, 1])
        geoms = shapely.set_coordinates(geoms, np.column_stack([xs, ys]))

    # Repair only the geometries that fail the validity check.
    needs_repair = np.logical_not(shapely.is_valid(geoms))
    if needs_repair.any():
        geoms[needs_repair] = shapely.make_valid(geoms[needs_repair])

    return postcodes, geoms
|
||||||
|
|
||||||
|
|
||||||
|
def load_postcode_polygons(
    units_dir: Path, max_postcodes: int | None = None
) -> tuple[np.ndarray, np.ndarray]:
    """Load every postcode polygon under ``units_dir`` in EPSG:27700.

    Args:
        units_dir: Directory holding the per-district ``*.geojson`` files.
        max_postcodes: Optional cap (intended for testing). District files are
            read in sorted filename order only until at least this many
            features have been collected, and the result is truncated to the
            first ``max_postcodes`` postcodes in sorted order.

    Returns:
        ``(postcodes, polygons)`` as parallel object arrays sorted by postcode
        with duplicate postcodes removed, so the array index can serve as a
        stable postcode id.

    Raises:
        FileNotFoundError: ``units_dir`` contains no ``*.geojson`` file.
        ValueError: the files were found but none contained a feature.
    """
    units_dir = Path(units_dir)
    district_files = sorted(units_dir.glob("*.geojson"))
    if len(district_files) == 0:
        raise FileNotFoundError(f"No postcode-boundary GeoJSONs found in {units_dir}")

    to_bng = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
    capped = max_postcodes is not None

    loaded: list[tuple[np.ndarray, np.ndarray]] = []
    collected = 0
    for district_file in district_files:
        codes, shapes = _read_district(district_file, to_bng)
        if len(codes) == 0:
            continue
        loaded.append((codes, shapes))
        collected += len(codes)
        # Stop reading further districts once the cap has been reached.
        if capped and collected >= max_postcodes:
            break

    if len(loaded) == 0:
        raise ValueError(f"No postcode features found in {units_dir}")

    postcodes = np.concatenate([codes for codes, _ in loaded])
    geoms = np.concatenate([shapes for _, shapes in loaded])

    # A stable sort by postcode makes "index == postcode id" deterministic.
    ordering = np.argsort(postcodes, kind="stable")
    postcodes, geoms = postcodes[ordering], geoms[ordering]

    # Defensive dedupe: keep the first occurrence of each postcode (a postcode
    # is expected to live in exactly one district file).
    _, keep = np.unique(postcodes, return_index=True)
    postcodes, geoms = postcodes[keep], geoms[keep]

    if capped and len(postcodes) > max_postcodes:
        postcodes = postcodes[:max_postcodes]
        geoms = geoms[:max_postcodes]

    print(f"Loaded {len(postcodes):,} postcode polygons from {units_dir}")
    return postcodes, geoms
|
||||||
|
|
@ -121,6 +121,50 @@ class TestWhitespacePostcodes:
|
||||||
loaded_df, _ = load_uprns(path)
|
loaded_df, _ = load_uprns(path)
|
||||||
assert len(loaded_df) == 0
|
assert len(loaded_df) == 0
|
||||||
|
|
||||||
|
def test_non_english_oas_excluded(self, tmp_path):
    """Rows whose OA code is not ``E``-prefixed are dropped by ``load_uprns``."""
    # Two UPRN rows: one with an ``E``-prefixed OA code, one ``W``-prefixed.
    rows = {
        "GRIDGB1E": [500010, 300010],
        "GRIDGB1N": [180010, 220010],
        "PCDS": ["AA1 1AA", "CF1 1AA"],
        "OA21CD": ["E00000001", "W00000001"],
    }
    source = tmp_path / "uprn.parquet"
    pl.DataFrame(rows).write_parquet(source)

    frame, oa_offsets = load_uprns(source)

    # Only the ``E`` output area and its postcode should remain.
    assert set(oa_offsets) == {"E00000001"}
    assert frame["PCDS"].to_list() == ["AA1 1AA"]
|
||||||
|
|
||||||
|
def test_terminated_postcodes_are_remapped(self, tmp_path):
    """A UPRN on a terminated postcode is loaded under its replacement.

    The UPRN row stores the postcode in lower case, and the ArcGIS table marks
    that postcode with a ``doterm`` date next to a second postcode without one.
    ``load_uprns`` is expected to return the second postcode.
    """
    uprn_path = tmp_path / "uprn.parquet"
    pl.DataFrame(
        {
            "GRIDGB1E": [500010],
            "GRIDGB1N": [180010],
            "PCDS": ["aa1 1aa"],
            "OA21CD": ["E00000001"],
        }
    ).write_parquet(uprn_path)

    arcgis_path = tmp_path / "arcgis.parquet"
    pl.DataFrame(
        {
            "pcds": ["AA1 1AA", "AA1 1AB"],
            "east1m": [500010, 500030],
            "north1m": [180010, 180020],
            "doterm": ["2020-01-01", None],
            "ctry25cd": ["E92000001", "E92000001"],
        }
    ).write_parquet(arcgis_path)

    frame, _unused_offsets = load_uprns(uprn_path, arcgis_path)

    assert frame["PCDS"].to_list() == ["AA1 1AB"]
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Bug 3: Voronoi deduplication is first-seen-wins
|
# Bug 3: Voronoi deduplication is first-seen-wins
|
||||||
|
|
|
||||||
|
|
@ -4,11 +4,18 @@ import numpy as np
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
from pipeline.local_temp import local_tmp_dir
|
from pipeline.local_temp import local_tmp_dir
|
||||||
|
from pipeline.utils.postcode_mapping import build_postcode_mapping
|
||||||
|
|
||||||
from .memory import release_memory
|
from .memory import release_memory
|
||||||
|
|
||||||
|
|
||||||
def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
|
def _canonical_postcode_expr(name: str) -> pl.Expr:
|
||||||
|
return pl.col(name).str.strip_chars().str.to_uppercase()
|
||||||
|
|
||||||
|
|
||||||
|
def load_uprns(
|
||||||
|
uprn_path: Path, arcgis_path: Path | None = None
|
||||||
|
) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
|
||||||
"""Load UPRNs as a sorted polars DataFrame with OA offset lookup.
|
"""Load UPRNs as a sorted polars DataFrame with OA offset lookup.
|
||||||
|
|
||||||
Returns (df, offsets) where offsets[oa_code] = (start_row, end_row).
|
Returns (df, offsets) where offsets[oa_code] = (start_row, end_row).
|
||||||
|
|
@ -17,29 +24,46 @@ def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
print("Loading UPRN lookup...")
|
print("Loading UPRN lookup...")
|
||||||
|
mapping = None
|
||||||
|
if arcgis_path is not None:
|
||||||
|
mapping = (
|
||||||
|
build_postcode_mapping(arcgis_path)
|
||||||
|
.with_columns(
|
||||||
|
_canonical_postcode_expr("old_postcode").alias("old_postcode"),
|
||||||
|
_canonical_postcode_expr("new_postcode").alias("new_postcode"),
|
||||||
|
)
|
||||||
|
.unique("old_postcode")
|
||||||
|
)
|
||||||
|
|
||||||
# Sort via streaming sink to avoid polars doubling memory during in-memory sort
|
# Sort via streaming sink to avoid polars doubling memory during in-memory sort
|
||||||
with tempfile.NamedTemporaryFile(
|
with tempfile.NamedTemporaryFile(
|
||||||
suffix=".parquet", delete=False, dir=local_tmp_dir()
|
suffix=".parquet", delete=False, dir=local_tmp_dir()
|
||||||
) as tmp:
|
) as tmp:
|
||||||
tmp_path = Path(tmp.name)
|
tmp_path = Path(tmp.name)
|
||||||
(
|
uprns = (
|
||||||
pl.scan_parquet(uprn_path)
|
pl.scan_parquet(uprn_path)
|
||||||
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
|
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
|
||||||
.filter(~pl.col("OA21CD").str.starts_with("S"))
|
.filter(pl.col("OA21CD").str.starts_with("E"))
|
||||||
.filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null())
|
.filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null())
|
||||||
.with_columns(pl.col("PCDS").str.strip_chars())
|
.with_columns(_canonical_postcode_expr("PCDS").alias("PCDS"))
|
||||||
.filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
|
.filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
|
||||||
.sort("OA21CD")
|
|
||||||
.sink_parquet(tmp_path)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if mapping is not None and mapping.height > 0:
|
||||||
|
uprns = (
|
||||||
|
uprns.join(mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left")
|
||||||
|
.with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
|
||||||
|
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
|
||||||
|
)
|
||||||
|
|
||||||
|
uprns.sort("OA21CD").sink_parquet(tmp_path)
|
||||||
release_memory()
|
release_memory()
|
||||||
|
|
||||||
# Read the sorted data — only one copy in memory (~2GB)
|
# Read the sorted data — only one copy in memory (~2GB)
|
||||||
df = pl.read_parquet(tmp_path)
|
df = pl.read_parquet(tmp_path)
|
||||||
tmp_path.unlink()
|
tmp_path.unlink()
|
||||||
n = len(df)
|
n = len(df)
|
||||||
print(f" Loaded {n:,} UPRNs (England & Wales)")
|
print(f" Loaded {n:,} UPRNs (England)")
|
||||||
|
|
||||||
# Compute OA group offsets using polars (avoids 37M Python string creation)
|
# Compute OA group offsets using polars (avoids 37M Python string creation)
|
||||||
boundary_df = (
|
boundary_df = (
|
||||||
|
|
|
||||||
138
pipeline/transform/property_border_tiles.py
Normal file
138
pipeline/transform/property_border_tiles.py
Normal file
|
|
@ -0,0 +1,138 @@
|
||||||
|
"""Build PMTiles polygon tiles for the INSPIRE property-border overlay.
|
||||||
|
|
||||||
|
Reads the HM Land Registry INSPIRE Index Polygons (per-local-authority GML ZIPs
|
||||||
|
in EPSG:27700), reprojects each parcel to WGS84, and tiles the outlines with
|
||||||
|
tippecanoe. The dashboard serves the resulting archive through
|
||||||
|
``/api/overlays/property-borders`` and renders it as thin outlines only at the
|
||||||
|
postcode zoom level.
|
||||||
|
|
||||||
|
The same ZIPs are already downloaded for postcode-boundary generation; this
|
||||||
|
target re-uses :func:`parse_inspire_zip` to stay self-contained and is wired to
|
||||||
|
the ``$(INSPIRE_STAMP)`` make dependency rather than the boundary cache.
|
||||||
|
|
||||||
|
Data: HM Land Registry INSPIRE Index Polygons, Open Government Licence v3.0.
|
||||||
|
Boundaries are indicative "general boundaries", not the legal extent of title.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import shapely
|
||||||
|
from pyproj import Transformer
|
||||||
|
from shapely.geometry import Polygon
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
|
from pipeline.transform.postcode_boundaries.inspire import parse_inspire_zip
|
||||||
|
|
||||||
|
|
||||||
|
def _require_tippecanoe() -> str:
    """Return the path of the ``tippecanoe`` executable found on ``PATH``.

    Raises:
        RuntimeError: ``shutil.which`` could not locate ``tippecanoe``.
    """
    located = shutil.which("tippecanoe")
    if located is not None:
        return located
    raise RuntimeError(
        "tippecanoe is required to build property border PMTiles. "
        "Install tippecanoe and rerun this target."
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _write_property_geojsonseq(inspire_dir: Path, output_path: Path) -> int:
    """Write every INSPIRE parcel to ``output_path`` as WGS84 GeoJSONSeq.

    One feature is written per line with an empty ``properties`` object; the
    overlay only draws outlines, so attributes are dropped to keep tiles small.
    Each ZIP is parsed, reprojected and encoded as a single batch so memory is
    bounded by one archive at a time.

    Args:
        inspire_dir: Directory containing the INSPIRE ``*.zip`` archives.
        output_path: Destination file, opened for writing (truncated).

    Returns:
        The number of features written.

    Raises:
        RuntimeError: ``inspire_dir`` contains no ``*.zip`` file.
    """
    reproject = Transformer.from_crs(
        "EPSG:27700", "EPSG:4326", always_xy=True
    ).transform
    archives = sorted(inspire_dir.glob("*.zip"))
    if len(archives) == 0:
        raise RuntimeError(f"No INSPIRE ZIP files found in {inspire_dir}")

    feature_prefix = '{"type":"Feature","properties":{},"geometry":'
    written = 0
    with output_path.open("w") as sink:
        for archive in tqdm(archives, desc="INSPIRE ZIPs", unit="file"):
            # Each ring is an Nx2 sequence of (easting, northing) pairs.
            rings = parse_inspire_zip(archive)
            if not rings:
                continue

            parcels = np.array([Polygon(ring) for ring in rings], dtype=object)
            # interleaved=False: the callable receives separate x and y arrays
            # once per batch rather than one point at a time.
            parcels = shapely.transform(parcels, reproject, interleaved=False)

            for encoded in shapely.to_geojson(parcels):
                sink.write(feature_prefix + encoded + "}\n")
                written += 1

    return written
|
||||||
|
|
||||||
|
|
||||||
|
def build_property_border_tiles(
    inspire_dir: Path,
    output_path: Path,
    min_zoom: int,
    max_zoom: int,
) -> None:
    """Tile the INSPIRE parcel outlines into ``output_path`` with tippecanoe.

    Args:
        inspire_dir: Directory containing the INSPIRE ZIP archives.
        output_path: Destination archive; its parent directory is created if
            missing and an existing output is overwritten (``--force``).
        min_zoom: Value passed to tippecanoe ``--minimum-zoom``.
        max_zoom: Value passed to tippecanoe ``--maximum-zoom``.

    Raises:
        RuntimeError: tippecanoe is not installed, or no INSPIRE ZIPs exist.
        subprocess.CalledProcessError: tippecanoe exited with a non-zero status.
    """
    tippecanoe = _require_tippecanoe()
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # The intermediate GeoJSONSeq and tippecanoe's scratch files share one
    # temporary directory that is removed when the block exits.
    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
        ndjson_path = Path(tmp) / "property_borders.geojsonseq"
        feature_count = _write_property_geojsonseq(inspire_dir, ndjson_path)
        print(f"Writing {feature_count:,} INSPIRE parcel polygons")

        command = [tippecanoe, "--force"]
        command += ["--output", str(output_path)]
        command += ["--layer", "property_borders"]
        command += ["--minimum-zoom", str(min_zoom)]
        command += ["--maximum-zoom", str(max_zoom)]
        # Borders are only meaningful at street level; thin the densest
        # tiles at low zoom but keep full geometry at max zoom.
        command += [
            "--drop-smallest-as-needed",
            "--simplify-only-low-zooms",
            "--extend-zooms-if-still-dropping",
        ]
        command += ["--temporary-directory", tmp]
        command.append(str(ndjson_path))

        subprocess.run(command, check=True)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Parse the command line and build the property-border PMTiles archive."""
    parser = argparse.ArgumentParser(description=__doc__)

    # Required paths.
    required_paths = (
        ("--inspire", "INSPIRE ZIP directory"),
        ("--output", "Output .pmtiles path"),
    )
    for flag, help_text in required_paths:
        parser.add_argument(flag, type=Path, required=True, help=help_text)

    # Optional zoom range.
    for flag, default_zoom in (("--min-zoom", 12), ("--max-zoom", 16)):
        parser.add_argument(flag, type=int, default=default_zoom)

    options = parser.parse_args()

    build_property_border_tiles(
        inspire_dir=options.inspire,
        output_path=options.output,
        min_zoom=options.min_zoom,
        max_zoom=options.max_zoom,
    )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
147
pipeline/transform/test_crime_spatial.py
Normal file
147
pipeline/transform/test_crime_spatial.py
Normal file
|
|
@ -0,0 +1,147 @@
|
||||||
|
import json
|
||||||
|
|
||||||
|
import polars as pl
|
||||||
|
from pyproj import Transformer
|
||||||
|
|
||||||
|
from pipeline.transform.crime_spatial import transform_crime_spatial
|
||||||
|
|
||||||
|
_TO_WGS84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
|
||||||
|
|
||||||
|
# Header row used for the street-level crime CSV fixtures (12 columns).
_CSV_HEADER = (
    "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,"
    "LSOA code,LSOA name,Crime type,Last outcome category,Context"
)
|
||||||
|
|
||||||
|
|
||||||
|
def _bng_to_wgs84(x: float, y: float) -> tuple[float, float]:
    """Convert an EPSG:27700 easting/northing to an EPSG:4326 ``(lon, lat)`` pair.

    Uses the module-level ``_TO_WGS84`` transformer, which is built with
    ``always_xy=True`` so the result is ordered longitude first.
    """
    lon, lat = _TO_WGS84.transform(x, y)
    return lon, lat
|
||||||
|
|
||||||
|
|
||||||
|
def _square_feature(postcode: str, x0: float, y0: float, x1: float, y1: float) -> dict:
    """Return a GeoJSON polygon feature for an axis-aligned rectangle.

    The corners are given as EPSG:27700 coordinates and converted to lon/lat
    for the output geometry. ``postcode`` is stored verbatim under
    ``"postcodes"`` and with spaces removed under ``"mapit_code"``.
    """
    # Closed ring: the first corner is repeated at the end.
    ring = [(x0, y0), (x1, y0), (x1, y1), (x0, y1), (x0, y0)]
    coords = [list(_bng_to_wgs84(x, y)) for x, y in ring]
    return {
        "type": "Feature",
        "properties": {"postcodes": postcode, "mapit_code": postcode.replace(" ", "")},
        "geometry": {"type": "Polygon", "coordinates": [coords]},
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _write_boundaries(units_dir, features_by_district: dict[str, list[dict]]) -> None:
    """Write one ``<district>.geojson`` FeatureCollection per district.

    ``units_dir`` is created (including missing parents) and must not already
    exist. Each key of ``features_by_district`` becomes a file name and its
    value the collection's ``features`` list.
    """
    units_dir.mkdir(parents=True)
    for district, features in features_by_district.items():
        collection = {"type": "FeatureCollection", "features": features}
        # Explicit encoding keeps the fixture independent of the platform locale.
        (units_dir / f"{district}.geojson").write_text(
            json.dumps(collection), encoding="utf-8"
        )
|
||||||
|
|
||||||
|
|
||||||
|
def _crime_row(month: str, x: float | None, y: float | None, crime_type: str) -> str:
    """Return one CSV data row matching the ``_CSV_HEADER`` column order.

    ``x``/``y`` are EPSG:27700 coordinates converted to lon/lat for the row;
    when either is ``None`` the Longitude and Latitude fields are left empty.
    The Crime ID and Context fields are always empty.
    """
    if x is None or y is None:
        lon, lat = "", ""
    else:
        lon, lat = _bng_to_wgs84(x, y)
    return f",{month},F,F,{lon},{lat},On or near X,E01000001,L,{crime_type},U,"
|
||||||
|
|
||||||
|
|
||||||
|
def _write_month(crime_dir, month: str, rows: list[str]) -> None:
    """Write ``<crime_dir>/<month>/<month>-test-force-street.csv``.

    The month directory is created (including missing parents) and must not
    already exist. The file holds ``_CSV_HEADER`` followed by ``rows``, one
    per line, with a trailing newline.
    """
    month_dir = crime_dir / month
    month_dir.mkdir(parents=True)
    body = "\n".join([_CSV_HEADER, *rows]) + "\n"
    # Explicit encoding keeps the fixture independent of the platform locale.
    (month_dir / f"{month}-test-force-street.csv").write_text(body, encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def test_buffer_overlap_counts_for_each_postcode(tmp_path):
    """A crime inside two overlapping postcode buffers is counted for both."""
    units = tmp_path / "units"
    # A and B sit 70m apart; their +50m buffers overlap in x in [1030, 1060].
    _write_boundaries(
        units,
        {
            "AB1": [
                _square_feature("AB1 1AA", 1000, 1000, 1010, 1010),
                _square_feature("AB1 1AB", 1080, 1000, 1090, 1010),
                _square_feature("AB1 1AC", 5000, 5000, 5010, 5010),
            ]
        },
    )

    crime = tmp_path / "crime"
    _write_month(
        crime,
        "2024-01",
        [
            # In the overlap: 35m east of A, 35m west of B -> counts for both.
            _crime_row("2024-01", 1045, 1005, "Burglary"),
            # 49m east of C's edge -> inside C's buffer.
            _crime_row("2024-01", 5059, 5005, "Robbery"),
            # 51m east of C's edge -> outside every buffer.
            _crime_row("2024-01", 5061, 5005, "Robbery"),
            # No coordinate -> dropped entirely.
            _crime_row("2024-01", None, None, "Anti-social behaviour"),
        ],
    )

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime, units, output, by_year)

    rows = {
        r["postcode"]: r
        for r in pl.read_parquet(output).to_dicts()
    }
    # Single month -> annualised x12.
    assert rows["AB1 1AA"]["Burglary (avg/yr)"] == 12.0
    assert rows["AB1 1AB"]["Burglary (avg/yr)"] == 12.0
    assert rows["AB1 1AA"]["Robbery (avg/yr)"] == 0.0
    # Only the 49m robbery counts for C; the 51m one and the blank row do not.
    assert rows["AB1 1AC"]["Robbery (avg/yr)"] == 12.0
    assert rows["AB1 1AC"]["Burglary (avg/yr)"] == 0.0
    # Anti-social behaviour had no coordinate -> nobody gets it.
    assert all(r["Anti-social behaviour (avg/yr)"] == 0.0 for r in rows.values())
|
||||||
|
|
||||||
|
|
||||||
|
def test_by_year_annualises_and_rolls_up(tmp_path):
    """Per-year counts are annualised by months present and rolled up by group."""
    units = tmp_path / "units"
    _write_boundaries(
        units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
    )

    crime = tmp_path / "crime"
    # Point at the centre of AB1 1AA, well inside its buffer.
    _write_month(
        crime,
        "2023-01",
        [
            _crime_row("2023-01", 1005, 1005, "Burglary"),
            _crime_row("2023-01", 1005, 1005, "Robbery"),
        ],
    )
    _write_month(crime, "2024-01", [_crime_row("2024-01", 1005, 1005, "Burglary")])
    _write_month(
        crime,
        "2024-02",
        [
            _crime_row("2024-02", 1005, 1005, "Burglary"),
            _crime_row("2024-02", 1005, 1005, "Anti-social behaviour"),
        ],
    )

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime, units, output, by_year)

    by_year_df = pl.read_parquet(by_year)
    assert by_year_df.height == 1
    cols = set(by_year_df.columns)
    assert {"Burglary (by year)", "Serious crime (by year)", "Minor crime (by year)"} <= cols

    row = by_year_df.row(0, named=True)
    burglary = sorted(row["Burglary (by year)"], key=lambda r: r["year"])
    # 2023: 1 burglary in 1 month -> 12/yr; 2024: 2 in 2 months -> 12/yr.
    assert burglary == [
        {"year": 2023, "count": 12.0},
        {"year": 2024, "count": 12.0},
    ]
    serious = {p["year"]: p["count"] for p in row["Serious crime (by year)"]}
    # 2023 serious = Burglary(12) + Robbery(12) = 24; 2024 = Burglary(12).
    assert serious[2023] == 24.0
    assert serious[2024] == 12.0
|
||||||
|
|
@ -24,6 +24,7 @@ def _row(**overrides: str) -> dict[str, str]:
|
||||||
row = {
|
row = {
|
||||||
"address": "1 Example Street",
|
"address": "1 Example Street",
|
||||||
"postcode": " aa1 1aa ",
|
"postcode": " aa1 1aa ",
|
||||||
|
"uprn": "100012345678",
|
||||||
"current_energy_rating": "c",
|
"current_energy_rating": "c",
|
||||||
"potential_energy_rating": "b",
|
"potential_energy_rating": "b",
|
||||||
"property_type": "House",
|
"property_type": "House",
|
||||||
|
|
@ -52,6 +53,7 @@ def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path):
|
||||||
{
|
{
|
||||||
"epc_address": "1 Example Street",
|
"epc_address": "1 Example Street",
|
||||||
"epc_postcode": "AA1 1AA",
|
"epc_postcode": "AA1 1AA",
|
||||||
|
"uprn": "100012345678",
|
||||||
"current_energy_rating": "C",
|
"current_energy_rating": "C",
|
||||||
"potential_energy_rating": "B",
|
"potential_energy_rating": "B",
|
||||||
"epc_property_type": "House",
|
"epc_property_type": "House",
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,8 @@ from pipeline.transform.merge import (
|
||||||
_finalize_listings,
|
_finalize_listings,
|
||||||
_integrate_listings,
|
_integrate_listings,
|
||||||
_match_direct_epc,
|
_match_direct_epc,
|
||||||
|
_match_listing_properties,
|
||||||
|
_normalize_uprn,
|
||||||
_is_dynamic_poi_metric_column,
|
_is_dynamic_poi_metric_column,
|
||||||
_less_deprived_percentile_expr,
|
_less_deprived_percentile_expr,
|
||||||
_load_conservation_area_geometries,
|
_load_conservation_area_geometries,
|
||||||
|
|
@ -68,6 +70,15 @@ def test_conservation_area_feature_is_area_level() -> None:
|
||||||
assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
|
assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
|
||||||
|
|
||||||
|
|
||||||
|
def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
    """Area columns expose raw crime counts and no per-1k-residents variants."""
    # Crime is now a raw spatial count per postcode; the per-1k-residents
    # variants were dropped along with the LSOA population denominator.
    assert "Serious crime (avg/yr)" in _AREA_COLUMNS
    assert "Minor crime (avg/yr)" in _AREA_COLUMNS
    assert "Serious crime per 1k residents (avg/yr)" not in _AREA_COLUMNS
    assert "Minor crime per 1k residents (avg/yr)" not in _AREA_COLUMNS
|
||||||
|
|
||||||
|
|
||||||
def test_listed_building_feature_is_property_level() -> None:
|
def test_listed_building_feature_is_property_level() -> None:
|
||||||
assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS
|
assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS
|
||||||
|
|
||||||
|
|
@ -471,71 +482,166 @@ def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
|
||||||
assert seed["was_council_house"].to_list() == ["No"]
|
assert seed["was_council_house"].to_list() == ["No"]
|
||||||
|
|
||||||
|
|
||||||
def test_match_direct_epc_considers_nearby_postcodes() -> None:
|
_DIRECT_EPC_CANDIDATE_SCHEMA = {
|
||||||
listing_matches = pl.DataFrame(
|
"_direct_epc_row": pl.UInt32,
|
||||||
{
|
"_direct_epc_match_address": pl.Utf8,
|
||||||
"_listing_idx": [0],
|
"_direct_epc_match_postcode": pl.Utf8,
|
||||||
"_listing_match_address": ["1 EXAMPLE ROAD"],
|
"_direct_epc_outcode": pl.Utf8,
|
||||||
"_listing_match_postcode": ["AA11AA"],
|
"_direct_epc_canonical_property_type": pl.Utf8,
|
||||||
"_listing_east": [1000.0],
|
"_direct_epc_uprn": pl.Utf8,
|
||||||
"_listing_north": [1000.0],
|
"_direct_epc_address": pl.Utf8,
|
||||||
"_actual_property_type": ["Terraced"],
|
"_direct_current_energy_rating": pl.Utf8,
|
||||||
"_actual_total_floor_area": [100.0],
|
"_direct_potential_energy_rating": pl.Utf8,
|
||||||
"_actual_number_habitable_rooms": [4],
|
"_direct_total_floor_area": pl.Float64,
|
||||||
},
|
"_direct_number_habitable_rooms": pl.Int16,
|
||||||
schema={
|
"_direct_floor_height": pl.Float64,
|
||||||
"_listing_idx": pl.UInt32,
|
"_direct_construction_age_band": pl.UInt16,
|
||||||
"_listing_match_address": pl.Utf8,
|
"_direct_is_construction_date_approximate": pl.UInt8,
|
||||||
"_listing_match_postcode": pl.Utf8,
|
"_direct_was_council_house": pl.Utf8,
|
||||||
"_listing_east": pl.Float64,
|
}
|
||||||
"_listing_north": pl.Float64,
|
|
||||||
"_actual_property_type": pl.Utf8,
|
_LISTING_MATCH_SCHEMA = {
|
||||||
"_actual_total_floor_area": pl.Float64,
|
"_listing_idx": pl.UInt32,
|
||||||
"_actual_number_habitable_rooms": pl.Int16,
|
"_listing_match_address": pl.Utf8,
|
||||||
},
|
"_listing_match_postcode": pl.Utf8,
|
||||||
)
|
"_listing_uprn": pl.Utf8,
|
||||||
epc_candidates = pl.DataFrame(
|
}
|
||||||
{
|
|
||||||
"_direct_epc_row": [0],
|
|
||||||
"_direct_epc_match_address": ["1 EXAMPLE ROAD"],
|
def _direct_epc_candidates(rows: list[dict]) -> pl.DataFrame:
|
||||||
"_direct_epc_match_postcode": ["BB11BB"],
|
base = {
|
||||||
"_direct_epc_east": [1020.0],
|
"_direct_epc_row": 0,
|
||||||
"_direct_epc_north": [1010.0],
|
"_direct_epc_match_address": "1 EXAMPLE ROAD",
|
||||||
"_direct_epc_canonical_property_type": ["Terraced"],
|
"_direct_epc_match_postcode": "AA11AA",
|
||||||
"_direct_epc_address": ["1, Example Road"],
|
"_direct_epc_outcode": "AA1",
|
||||||
"_direct_current_energy_rating": ["C"],
|
"_direct_epc_canonical_property_type": "Terraced",
|
||||||
"_direct_potential_energy_rating": ["B"],
|
"_direct_epc_uprn": None,
|
||||||
"_direct_total_floor_area": [101.0],
|
"_direct_epc_address": "1, Example Road",
|
||||||
"_direct_number_habitable_rooms": [4],
|
"_direct_current_energy_rating": "C",
|
||||||
"_direct_floor_height": [2.5],
|
"_direct_potential_energy_rating": "B",
|
||||||
"_direct_construction_age_band": [1930],
|
"_direct_total_floor_area": 101.0,
|
||||||
"_direct_is_construction_date_approximate": [1],
|
"_direct_number_habitable_rooms": 4,
|
||||||
"_direct_was_council_house": ["No"],
|
"_direct_floor_height": 2.5,
|
||||||
},
|
"_direct_construction_age_band": 1930,
|
||||||
schema={
|
"_direct_is_construction_date_approximate": 1,
|
||||||
"_direct_epc_row": pl.UInt32,
|
"_direct_was_council_house": "No",
|
||||||
"_direct_epc_match_address": pl.Utf8,
|
}
|
||||||
"_direct_epc_match_postcode": pl.Utf8,
|
return pl.DataFrame(
|
||||||
"_direct_epc_east": pl.Float64,
|
[{**base, **row} for row in rows], schema=_DIRECT_EPC_CANDIDATE_SCHEMA
|
||||||
"_direct_epc_north": pl.Float64,
|
|
||||||
"_direct_epc_canonical_property_type": pl.Utf8,
|
|
||||||
"_direct_epc_address": pl.Utf8,
|
|
||||||
"_direct_current_energy_rating": pl.Utf8,
|
|
||||||
"_direct_potential_energy_rating": pl.Utf8,
|
|
||||||
"_direct_total_floor_area": pl.Float64,
|
|
||||||
"_direct_number_habitable_rooms": pl.Int16,
|
|
||||||
"_direct_floor_height": pl.Float64,
|
|
||||||
"_direct_construction_age_band": pl.UInt16,
|
|
||||||
"_direct_is_construction_date_approximate": pl.UInt8,
|
|
||||||
"_direct_was_council_house": pl.Utf8,
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
|
|
||||||
matches = _match_direct_epc(listing_matches, epc_candidates)
|
|
||||||
|
def _listing_matches(rows: list[dict]) -> pl.DataFrame:
    """Build a listing frame typed by ``_LISTING_MATCH_SCHEMA``.

    Each entry of ``rows`` overrides the default column values in ``base``;
    columns it omits keep the defaults.
    """
    base = {
        "_listing_idx": 0,
        "_listing_match_address": "1 EXAMPLE ROAD",
        "_listing_match_postcode": "AA11AA",
        "_listing_uprn": None,
    }
    return pl.DataFrame([{**base, **row} for row in rows], schema=_LISTING_MATCH_SCHEMA)
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_direct_epc_matches_by_uprn_across_postcodes() -> None:
    """A shared UPRN matches listing and EPC even when their postcodes differ."""
    # UPRN is matched globally (not within a postcode bucket), so a listing
    # whose detail-page postcode is slightly off still resolves to the right
    # EPC certificate by its UPRN.
    matches = _match_direct_epc(
        _listing_matches(
            [{"_listing_uprn": "100000000001", "_listing_match_postcode": "ZZ99ZZ"}]
        ),
        _direct_epc_candidates(
            [{"_direct_epc_uprn": "100000000001", "_direct_epc_match_postcode": "AA11AA"}]
        ),
    )

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["1, Example Road"]
    assert matches["_direct_epc_match_method"].to_list() == ["uprn"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_direct_epc_matches_by_address_in_same_postcode() -> None:
    """Without a UPRN, an identical address in the same postcode still matches."""
    matches = _match_direct_epc(
        _listing_matches([{"_listing_match_address": "1 EXAMPLE ROAD"}]),
        _direct_epc_candidates([{"_direct_epc_match_address": "1 EXAMPLE ROAD"}]),
    )

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["1, Example Road"]
    assert matches["_direct_epc_match_method"].to_list() == ["address"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_uprn_handles_types_and_floats() -> None:
    """``_normalize_uprn`` accepts str/int/integral float and rejects the rest."""
    assert _normalize_uprn(None) is None
    assert _normalize_uprn("") is None
    assert _normalize_uprn(" 100012345678 ") == "100012345678"
    assert _normalize_uprn(100012345678) == "100012345678"
    # An integral float normalises to its digits, NOT "1230".
    assert _normalize_uprn(123.0) == "123"
    # Non-integral / NaN floats are rejected rather than mangled.
    assert _normalize_uprn(1.5) is None
    assert _normalize_uprn(float("nan")) is None
|
||||||
|
|
||||||
|
|
||||||
|
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
    """Build an all-string property candidate frame for matching tests.

    Each entry of ``rows`` overrides the default column values in ``base``;
    columns it omits keep the defaults.
    """
    base = {
        "postcode": "AA1 1AA",
        "pp_address": "1 Example Road",
        "_property_match_postcode": "AA11AA",
        "_property_match_address": "1 EXAMPLE ROAD",
        "_property_epc_match_address": "1 EXAMPLE ROAD",
        "uprn": None,
    }
    return pl.DataFrame(
        [{**base, **row} for row in rows],
        schema={
            "postcode": pl.Utf8,
            "pp_address": pl.Utf8,
            "_property_match_postcode": pl.Utf8,
            "_property_match_address": pl.Utf8,
            "_property_epc_match_address": pl.Utf8,
            "uprn": pl.Utf8,
        },
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_listing_properties_uprn_wins_dedup_tie() -> None:
    """When two listings claim one property, the UPRN match is the one kept."""
    # Two listings claim the same property: one by UPRN, one by exact address
    # (both score 100). The UPRN match must win even though it has the higher
    # _listing_idx (which would otherwise break the tie the wrong way).
    listings = _listing_matches(
        [
            {
                "_listing_idx": 5,
                "_listing_uprn": "100000000001",
                "_listing_match_address": "SOMETHING ELSE",
            },
            {
                "_listing_idx": 1,
                "_listing_uprn": None,
                "_listing_match_address": "1 EXAMPLE ROAD",
            },
        ]
    )
    matches = _match_listing_properties(
        listings, _property_candidates([{"uprn": "100000000001"}])
    )

    assert matches.height == 1
    assert matches["_listing_idx"].to_list() == [5]
    assert matches["_property_match_method"].to_list() == ["uprn"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_direct_epc_does_not_match_other_postcode_without_uprn() -> None:
    """An EPC in a different postcode with no shared UPRN yields no match."""
    # Matching is by postcode/UPRN/street — never by coordinate proximity — so a
    # same-street EPC in a different postcode with no shared UPRN is skipped.
    matches = _match_direct_epc(
        _listing_matches([{"_listing_match_postcode": "AA11AA"}]),
        _direct_epc_candidates(
            [{"_direct_epc_match_postcode": "BB22BB", "_direct_epc_uprn": None}]
        ),
    )

    assert matches.height == 0
|
||||||
|
|
||||||
|
|
||||||
def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -> None:
|
def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -> None:
|
||||||
|
|
@ -588,11 +694,72 @@ def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -
|
||||||
assert other["_actual_listing_url"].to_list() == [None]
|
assert other["_actual_listing_url"].to_list() == [None]
|
||||||
|
|
||||||
|
|
||||||
def test_integrate_listings_rejects_low_confidence_no_number_match(tmp_path) -> None:
|
def test_integrate_listings_matches_by_uprn_over_address(tmp_path) -> None:
|
||||||
|
# The listing's address deliberately does not match the property's, but the
|
||||||
|
# shared UPRN drives an exact match anyway (UPRN beats fuzzy street).
|
||||||
listings_path = tmp_path / "listings.parquet"
|
listings_path = tmp_path / "listings.parquet"
|
||||||
arcgis_path = tmp_path / "arcgis.parquet"
|
arcgis_path = tmp_path / "arcgis.parquet"
|
||||||
_sample_listings_frame().with_columns(
|
_sample_listings_frame().with_columns(
|
||||||
pl.lit("Rose Cottage High Street").alias("Address per Property Register"),
|
pl.lit("Totally Different Road").alias("Address per Property Register"),
|
||||||
|
pl.lit("100000000009").alias("UPRN"),
|
||||||
|
).write_parquet(listings_path)
|
||||||
|
_stub_arcgis(arcgis_path)
|
||||||
|
wide = pl.DataFrame(
|
||||||
|
{
|
||||||
|
"postcode": ["SW1A 1AA"],
|
||||||
|
"pp_address": ["1 Example Road"],
|
||||||
|
"uprn": ["100000000009"],
|
||||||
|
"pp_property_type": ["Terraced"],
|
||||||
|
"duration": ["Freehold"],
|
||||||
|
"total_floor_area": [90.0],
|
||||||
|
"number_habitable_rooms": [4],
|
||||||
|
"latest_price": [600_000],
|
||||||
|
"epc_address": ["1 Example Road"],
|
||||||
|
"current_energy_rating": ["C"],
|
||||||
|
"potential_energy_rating": ["B"],
|
||||||
|
"floor_height": [2.4],
|
||||||
|
"construction_age_band": [1930],
|
||||||
|
"is_construction_date_approximate": [1],
|
||||||
|
"was_council_house": ["No"],
|
||||||
|
},
|
||||||
|
schema={
|
||||||
|
"postcode": pl.Utf8,
|
||||||
|
"pp_address": pl.Utf8,
|
||||||
|
"uprn": pl.Utf8,
|
||||||
|
"pp_property_type": pl.Utf8,
|
||||||
|
"duration": pl.Utf8,
|
||||||
|
"total_floor_area": pl.Float64,
|
||||||
|
"number_habitable_rooms": pl.Int16,
|
||||||
|
"latest_price": pl.Int64,
|
||||||
|
"epc_address": pl.Utf8,
|
||||||
|
"current_energy_rating": pl.Utf8,
|
||||||
|
"potential_energy_rating": pl.Utf8,
|
||||||
|
"floor_height": pl.Float64,
|
||||||
|
"construction_age_band": pl.UInt16,
|
||||||
|
"is_construction_date_approximate": pl.UInt8,
|
||||||
|
"was_council_house": pl.Utf8,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
integrated = _integrate_listings(
|
||||||
|
wide.lazy(), listings_path, arcgis_path, epc_path=None
|
||||||
|
).collect()
|
||||||
|
|
||||||
|
matched = integrated.filter(pl.col("pp_address") == "1 Example Road")
|
||||||
|
# The listing overlay attached to the UPRN-matched property row.
|
||||||
|
assert matched["_actual_listing_url"].to_list() == ["https://example.test/abc"]
|
||||||
|
# No spurious seed row for the listing's (non-matching) address.
|
||||||
|
assert "Totally Different Road" not in integrated["pp_address"].to_list()
|
||||||
|
|
||||||
|
|
||||||
|
def test_integrate_listings_seeds_listing_with_unmatched_street(tmp_path) -> None:
|
||||||
|
# A number-less listing whose street is not the property's street (and which
|
||||||
|
# shares no UPRN) must not be force-matched onto it; it becomes its own seed
|
||||||
|
# row instead of stamping the wrong property's overlay.
|
||||||
|
listings_path = tmp_path / "listings.parquet"
|
||||||
|
arcgis_path = tmp_path / "arcgis.parquet"
|
||||||
|
_sample_listings_frame().with_columns(
|
||||||
|
pl.lit("Juniper Crescent").alias("Address per Property Register"),
|
||||||
).write_parquet(listings_path)
|
).write_parquet(listings_path)
|
||||||
_stub_arcgis(arcgis_path)
|
_stub_arcgis(arcgis_path)
|
||||||
wide = pl.DataFrame(
|
wide = pl.DataFrame(
|
||||||
|
|
@ -635,7 +802,7 @@ def test_integrate_listings_rejects_low_confidence_no_number_match(tmp_path) ->
|
||||||
).collect()
|
).collect()
|
||||||
|
|
||||||
existing = integrated.filter(pl.col("pp_address") == "Old Cottage High Street")
|
existing = integrated.filter(pl.col("pp_address") == "Old Cottage High Street")
|
||||||
seed = integrated.filter(pl.col("pp_address") == "Rose Cottage High Street")
|
seed = integrated.filter(pl.col("pp_address") == "Juniper Crescent")
|
||||||
assert existing["_actual_listing_url"].to_list() == [None]
|
assert existing["_actual_listing_url"].to_list() == [None]
|
||||||
assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"]
|
assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"]
|
||||||
|
|
||||||
|
|
@ -731,3 +898,77 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
|
||||||
# Overlay scaffolding is dropped.
|
# Overlay scaffolding is dropped.
|
||||||
for src, dst, _dt in _LISTING_OVERLAY_SOURCES:
|
for src, dst, _dt in _LISTING_OVERLAY_SOURCES:
|
||||||
assert dst not in finalized.columns, src
|
assert dst not in finalized.columns, src
|
||||||
|
|
||||||
|
|
||||||
|
def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
    """Two wide rows carrying the same listing URL finalize to a single row."""
    # The terminated-postcode remap can collapse two distinct wide rows onto the same
    # (postcode, pp_address), so a single matched listing attaches to both. Finalize
    # must emit one row per listing URL, not one per collapsed wide row.
    df = pl.DataFrame(
        {
            "Postcode": ["SW1A 1AA", "SW1A 1AA"],
            "Address per Property Register": ["1 Example Road", "1 Example Road"],
            "Address per EPC": ["1 Example Road", "1 Example Road"],
            "Date of last transaction": [1990.0, 1995.0],
            "lat": [51.5, 51.5],
            "lon": [-0.1, -0.1],
            "Total floor area (sqm)": [100.0, 95.0],
            "Number of bedrooms & living rooms": [3, 3],
            "Property type": ["Terraced", "Terraced"],
            "Leasehold/Freehold": ["Leasehold", "Leasehold"],
            "Last known price": [500_000, 480_000],
            "Street tree density percentile": [42.0, 42.0],
            # Same listing URL on both collapsed rows — the fan-out to fix.
            "_actual_listing_url": ["url0", "url0"],
            "_actual_asking_price": [600_000, 600_000],
            "_actual_asking_price_per_sqm": [5_000, 5_000],
            "_actual_listing_date": [None, None],
            "_actual_listing_status": ["For sale", "For sale"],
            "_actual_listing_features": [["Garden"], ["Garden"]],
            "_actual_bedrooms": [3, 3],
            "_actual_bathrooms": [1, 1],
            "_actual_price_qualifier": ["", ""],
            "_actual_property_sub_type": ["Mid-Terrace", "Mid-Terrace"],
            "_actual_lat": [51.51, 51.51],
            "_actual_lon": [-0.11, -0.11],
            "_actual_total_floor_area": [110.0, 110.0],
            "_actual_number_habitable_rooms": [4, 4],
            "_actual_property_type": ["Terraced", "Terraced"],
            "_actual_leasehold_freehold": ["Freehold", "Freehold"],
        },
        schema={
            "Postcode": pl.Utf8,
            "Address per Property Register": pl.Utf8,
            "Address per EPC": pl.Utf8,
            "Date of last transaction": pl.Float64,
            "lat": pl.Float64,
            "lon": pl.Float64,
            "Total floor area (sqm)": pl.Float64,
            "Number of bedrooms & living rooms": pl.Int16,
            "Property type": pl.Utf8,
            "Leasehold/Freehold": pl.Utf8,
            "Last known price": pl.Int64,
            "Street tree density percentile": pl.Float32,
            "_actual_listing_url": pl.Utf8,
            "_actual_asking_price": pl.Int64,
            "_actual_asking_price_per_sqm": pl.Int32,
            "_actual_listing_date": pl.Datetime("us"),
            "_actual_listing_status": pl.Utf8,
            "_actual_listing_features": pl.List(pl.Utf8),
            "_actual_bedrooms": pl.Int32,
            "_actual_bathrooms": pl.Int32,
            "_actual_price_qualifier": pl.Utf8,
            "_actual_property_sub_type": pl.Utf8,
            "_actual_lat": pl.Float64,
            "_actual_lon": pl.Float64,
            "_actual_total_floor_area": pl.Float64,
            "_actual_number_habitable_rooms": pl.Int16,
            "_actual_property_type": pl.Utf8,
            "_actual_leasehold_freehold": pl.Utf8,
        },
    )

    finalized = _finalize_listings(df)

    assert finalized.height == 1
    assert finalized["Listing URL"].to_list() == ["url0"]
|
||||||
|
|
|
||||||
|
|
@ -1,19 +1,83 @@
|
||||||
|
import math
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
import polars as pl
|
import polars as pl
|
||||||
import pytest
|
import pytest
|
||||||
|
import shapely
|
||||||
|
|
||||||
from pipeline.transform.tree_density import (
|
from pipeline.transform.tree_density import (
|
||||||
STREET_TREE_COVERAGE_COL,
|
STREET_TREE_COVERAGE_COL,
|
||||||
STREET_TREE_DENSITY_COL,
|
STREET_TREE_DENSITY_COL,
|
||||||
|
_add_nfi_batch,
|
||||||
_coverage_percentile_expr,
|
_coverage_percentile_expr,
|
||||||
_metric_columns,
|
_metric_columns,
|
||||||
|
_postcode_buffers,
|
||||||
_postcode_density_percentile_col,
|
_postcode_density_percentile_col,
|
||||||
_with_postcode_density_percentiles,
|
_with_postcode_density_percentiles,
|
||||||
_write_street_rollups,
|
_write_street_rollups,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_nfi_accumulation_adds_only_clipped_overlap_area() -> None:
    """``_add_nfi_batch`` adds only the woodland area clipped to each buffer."""
    radius_m = 50
    points = pl.DataFrame({"postcode": ["A", "B"], "x": [0.0, 1000.0], "y": [0.0, 0.0]})
    circles, tree = _postcode_buffers(points, radius_m)
    buffer_area = math.pi * radius_m * radius_m

    # A large woodland square centred on postcode A fully covers A's circle.
    canopy_area = np.zeros(2)
    feature_count = np.zeros(2, dtype=np.uint32)
    big = shapely.box(-500, -500, 500, 500)  # 1,000,000 sqm parcel
    _add_nfi_batch(
        np.array([big], dtype=object),
        np.array(["Woodland"], dtype=object),
        circles,
        tree,
        canopy_area,
        feature_count,
        radius_m,
    )
    # Only the clipped circle area is added (the 32-gon buffer approximates the
    # circle to ~1%), NOT the full 1,000,000 sqm polygon.
    assert canopy_area[0] == pytest.approx(buffer_area, rel=1e-2)
    assert canopy_area[0] <= buffer_area  # never exceeds the buffer area
    assert canopy_area[1] == 0.0  # postcode B is 1km away, no overlap
    assert feature_count.tolist() == [1, 0]

    # A large parcel that only slivers into B's circle must add only the sliver,
    # not its full area -- the failure mode the old centroid path could not avoid.
    canopy_area = np.zeros(2)
    feature_count = np.zeros(2, dtype=np.uint32)
    sliver = shapely.box(1040, -500, 2000, 500)  # left edge 10m inside B's circle
    _add_nfi_batch(
        np.array([sliver], dtype=object),
        np.array(["Woodland"], dtype=object),
        circles,
        tree,
        canopy_area,
        feature_count,
        radius_m,
    )
    assert canopy_area[0] == 0.0
    assert 0.0 < canopy_area[1] < buffer_area  # tiny segment, far below 1M sqm

    # Non-woodland categories contribute nothing.
    canopy_area = np.zeros(2)
    feature_count = np.zeros(2, dtype=np.uint32)
    _add_nfi_batch(
        np.array([big], dtype=object),
        np.array(["Non woodland"], dtype=object),
        circles,
        tree,
        canopy_area,
        feature_count,
        radius_m,
    )
    assert canopy_area.tolist() == [0.0, 0.0]
    assert feature_count.tolist() == [0, 0]
|
||||||
|
|
||||||
|
|
||||||
def test_coverage_percentile_expr_ranks_higher_coverage_higher() -> None:
|
def test_coverage_percentile_expr_ranks_higher_coverage_higher() -> None:
|
||||||
df = pl.DataFrame({"coverage": [0.0, 5.0, 10.0, None]})
|
df = pl.DataFrame({"coverage": [0.0, 5.0, 10.0, None]})
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,16 @@
|
||||||
"""Derive street-scale tree density metrics from Forest Research TOW data.
|
"""Derive street-scale tree density metrics from Forest Research TOW + NFI data.
|
||||||
|
|
||||||
The Forest Research Trees Outside Woodland release is an Esri File Geodatabase
|
The Forest Research Trees Outside Woodland release is an Esri File Geodatabase
|
||||||
inside property-data/FR_TOW_V1_ALL.zip. This transformer computes a compact
|
inside property-data/FR_TOW_V1_ALL.zip. This transformer computes a compact
|
||||||
postcode-level metric from the tree polygons, then optionally rolls that up to
|
postcode-level metric from the tree polygons, then optionally rolls that up to
|
||||||
Price Paid street names so the dashboard can answer "what is this address's
|
Price Paid street names so the dashboard can answer "what is this address's
|
||||||
street like?" without loading the full geodatabase at runtime.
|
street like?" without loading the full geodatabase at runtime.
|
||||||
|
|
||||||
|
TOW only covers trees *outside* woodland, so the National Forest Inventory (NFI)
|
||||||
|
woodland layer is optionally unioned in. TOW canopy is accumulated by centroid
|
||||||
|
proximity (tiny crowns), while large NFI woodland parcels are accumulated by
|
||||||
|
true buffer-clipped intersection area so they cannot saturate a postcode from
|
||||||
|
mere centroid proximity.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
@ -22,7 +28,6 @@ import shapely
|
||||||
from scipy.spatial import cKDTree
|
from scipy.spatial import cKDTree
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_TOW_TYPES = ("Lone Tree", "Group of Trees")
|
|
||||||
TOW_GDB_NAME = "FR_TOW_V1_ALL.gdb"
|
TOW_GDB_NAME = "FR_TOW_V1_ALL.gdb"
|
||||||
STREET_TREE_DENSITY_COL = "Street tree density percentile"
|
STREET_TREE_DENSITY_COL = "Street tree density percentile"
|
||||||
STREET_TREE_COVERAGE_COL = "Street tree coverage (%)"
|
STREET_TREE_COVERAGE_COL = "Street tree coverage (%)"
|
||||||
|
|
@ -32,6 +37,14 @@ POSTCODE_AREA_COL = "Tree canopy area within {radius}m (sqm)"
|
||||||
POSTCODE_COUNT_COL = "Tree features within {radius}m"
|
POSTCODE_COUNT_COL = "Tree features within {radius}m"
|
||||||
POSTCODE_HEIGHT_COL = "Mean TOW height within {radius}m (m)"
|
POSTCODE_HEIGHT_COL = "Mean TOW height within {radius}m (m)"
|
||||||
|
|
||||||
|
# National Forest Inventory (NFI) woodland — the geometric complement of TOW.
|
||||||
|
# NFI ships as a zipped shapefile of woodland parcels (>=0.5 ha) in EPSG:27700.
|
||||||
|
# Field names are from the NFI Woodland England 2022 release; re-check on bumps.
|
||||||
|
NFI_CATEGORY_COL = "CATEGORY"
|
||||||
|
NFI_WOODLAND_VALUE = "Woodland"
|
||||||
|
NFI_TYPE_COL = "IFT_IOA"
|
||||||
|
NFI_AREA_HA_COL = "Area_ha"
|
||||||
|
|
||||||
|
|
||||||
def _safe_extract_zip(zip_path: Path, extract_dir: Path, force: bool) -> Path:
|
def _safe_extract_zip(zip_path: Path, extract_dir: Path, force: bool) -> Path:
|
||||||
"""Extract the TOW zip and return the extracted .gdb path."""
|
"""Extract the TOW zip and return the extracted .gdb path."""
|
||||||
|
|
@ -83,12 +96,60 @@ def _tow_dataset_path(
|
||||||
return str(_safe_extract_zip(zip_path, extract_dir, force_extract))
|
return str(_safe_extract_zip(zip_path, extract_dir, force_extract))
|
||||||
|
|
||||||
|
|
||||||
def _where_for_tow_types(tow_types: tuple[str, ...] | None) -> str | None:
|
def _safe_extract_zip_dir(zip_path: Path, extract_dir: Path, force: bool) -> Path:
|
||||||
if not tow_types:
|
"""Extract an arbitrary zip into extract_dir and return the directory."""
|
||||||
return None
|
if extract_dir.exists() and not force:
|
||||||
escaped = [tow_type.replace("'", "''") for tow_type in tow_types]
|
print(f"Using existing extraction directory: {extract_dir}")
|
||||||
values = ", ".join(f"'{tow_type}'" for tow_type in escaped)
|
return extract_dir
|
||||||
return f"Woodland_Type IN ({values})"
|
if extract_dir.exists():
|
||||||
|
shutil.rmtree(extract_dir)
|
||||||
|
|
||||||
|
tmp_dir = extract_dir.with_name(f".{extract_dir.name}.tmp")
|
||||||
|
if tmp_dir.exists():
|
||||||
|
shutil.rmtree(tmp_dir)
|
||||||
|
tmp_dir.mkdir(parents=True)
|
||||||
|
|
||||||
|
root = tmp_dir.resolve()
|
||||||
|
print(f"Extracting {zip_path} to {extract_dir}...")
|
||||||
|
with zipfile.ZipFile(zip_path) as archive:
|
||||||
|
for member in archive.infolist():
|
||||||
|
target = (tmp_dir / member.filename).resolve()
|
||||||
|
if root != target and root not in target.parents:
|
||||||
|
raise ValueError(f"Unsafe path in zip archive: {member.filename}")
|
||||||
|
if member.is_dir():
|
||||||
|
target.mkdir(parents=True, exist_ok=True)
|
||||||
|
continue
|
||||||
|
target.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
with archive.open(member) as source, target.open("wb") as dest:
|
||||||
|
shutil.copyfileobj(source, dest, length=1024 * 1024)
|
||||||
|
|
||||||
|
tmp_dir.rename(extract_dir)
|
||||||
|
print(f"Extracted archive: {extract_dir}")
|
||||||
|
return extract_dir
|
||||||
|
|
||||||
|
|
||||||
|
def _nfi_dataset_path(
|
||||||
|
zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool
|
||||||
|
) -> str:
|
||||||
|
"""Resolve the NFI woodland shapefile path, extracting the zip if needed."""
|
||||||
|
if use_vsizip:
|
||||||
|
return f"/vsizip/{zip_path.resolve()}"
|
||||||
|
extracted = _safe_extract_zip_dir(zip_path, extract_dir, force_extract)
|
||||||
|
shapefiles = sorted(extracted.rglob("*.shp"))
|
||||||
|
if not shapefiles:
|
||||||
|
raise FileNotFoundError(f"No .shp found inside {zip_path}")
|
||||||
|
return str(shapefiles[0])
|
||||||
|
|
||||||
|
|
||||||
|
def _geometry_column(metadata: dict, column_names: list[str]) -> str:
|
||||||
|
"""Resolve the geometry column name from pyogrio Arrow metadata."""
|
||||||
|
geometry_name = metadata.get("geometry_name")
|
||||||
|
if geometry_name:
|
||||||
|
return str(geometry_name)
|
||||||
|
for name in ("wkb_geometry", "geometry", "geom"):
|
||||||
|
if name in column_names:
|
||||||
|
return name
|
||||||
|
return column_names[-1]
|
||||||
|
|
||||||
|
|
||||||
def _postcode_points(arcgis_path: Path, max_postcodes: int | None) -> pl.DataFrame:
|
def _postcode_points(arcgis_path: Path, max_postcodes: int | None) -> pl.DataFrame:
|
||||||
|
|
@ -172,26 +233,20 @@ def _accumulate_tree_metrics(
|
||||||
dataset_path: str,
|
dataset_path: str,
|
||||||
points: pl.DataFrame,
|
points: pl.DataFrame,
|
||||||
radius_m: int,
|
radius_m: int,
|
||||||
tow_types: tuple[str, ...] | None,
|
|
||||||
batch_size: int,
|
batch_size: int,
|
||||||
layer_names: tuple[str, ...] | None,
|
layer_names: tuple[str, ...] | None,
|
||||||
max_features_per_layer: int | None,
|
max_features_per_layer: int | None,
|
||||||
workers: int,
|
workers: int,
|
||||||
) -> pl.DataFrame:
|
canopy_area: np.ndarray,
|
||||||
|
feature_count: np.ndarray,
|
||||||
|
height_weighted_sum: np.ndarray,
|
||||||
|
height_weight: np.ndarray,
|
||||||
|
) -> None:
|
||||||
xy = points.select("x", "y").to_numpy()
|
xy = points.select("x", "y").to_numpy()
|
||||||
tree = cKDTree(xy)
|
tree = cKDTree(xy)
|
||||||
n_points = points.height
|
|
||||||
|
|
||||||
canopy_area = np.zeros(n_points, dtype=np.float64)
|
|
||||||
feature_count = np.zeros(n_points, dtype=np.uint32)
|
|
||||||
height_weighted_sum = np.zeros(n_points, dtype=np.float64)
|
|
||||||
height_weight = np.zeros(n_points, dtype=np.float64)
|
|
||||||
|
|
||||||
where = _where_for_tow_types(tow_types)
|
|
||||||
layers = _layers(dataset_path, layer_names)
|
layers = _layers(dataset_path, layer_names)
|
||||||
print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
|
print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
|
||||||
if where:
|
|
||||||
print(f"TOW type filter: {where}")
|
|
||||||
|
|
||||||
columns = ["Woodland_Type", "TOW_Area_M", "MEANHT"]
|
columns = ["Woodland_Type", "TOW_Area_M", "MEANHT"]
|
||||||
total_features_seen = 0
|
total_features_seen = 0
|
||||||
|
|
@ -206,7 +261,6 @@ def _accumulate_tree_metrics(
|
||||||
dataset_path,
|
dataset_path,
|
||||||
layer=layer,
|
layer=layer,
|
||||||
columns=columns,
|
columns=columns,
|
||||||
where=where,
|
|
||||||
batch_size=batch_size,
|
batch_size=batch_size,
|
||||||
use_pyarrow=True,
|
use_pyarrow=True,
|
||||||
) as (_meta, reader):
|
) as (_meta, reader):
|
||||||
|
|
@ -297,6 +351,132 @@ def _accumulate_tree_metrics(
|
||||||
f"{total_features_used:,} features with usable centroids"
|
f"{total_features_used:,} features with usable centroids"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _postcode_buffers(
    points: pl.DataFrame, radius_m: int
) -> tuple[np.ndarray, shapely.STRtree]:
    """Return one buffer polygon per postcode point plus an STRtree over them.

    Buffers are built from the ``x``/``y`` columns in the row order of
    ``points``, so position ``i`` in the returned array (and in the tree)
    corresponds to row ``i`` of ``points``.
    """
    coordinates = points.select("x", "y").to_numpy()
    centres = shapely.points(coordinates)
    # quad_segs=8 approximates each circle with 8 segments per quarter turn.
    buffers = shapely.buffer(centres, radius_m, quad_segs=8)
    index = shapely.STRtree(buffers)
    return buffers, index
|
||||||
|
|
||||||
|
|
||||||
|
def _add_nfi_batch(
    geoms: np.ndarray,
    category: np.ndarray,
    circles: np.ndarray,
    tree: shapely.STRtree,
    canopy_area: np.ndarray,
    feature_count: np.ndarray,
    radius_m: int,
) -> None:
    """Add NFI woodland into the shared arrays by true buffer-clipped area.

    Unlike the TOW centroid path, this clips each woodland polygon to each
    nearby postcode circle and adds only area(polygon ∩ circle); a large parcel
    therefore cannot saturate a postcode from mere centroid proximity, and a
    buffer-filling parcel whose centroid is outside the radius is not missed.

    Args:
        geoms: Shapely geometries for one batch (may contain missing/empty).
        category: Per-feature category values, aligned with ``geoms``; only
            rows equal to ``NFI_WOODLAND_VALUE`` are used.
        circles: Postcode buffer polygons; index == postcode index.
        tree: STRtree built over ``circles`` (same order), as returned by
            ``_postcode_buffers``.
        canopy_area: Per-postcode area accumulator, updated in place.
        feature_count: Per-postcode feature counter, updated in place.
        radius_m: Buffer radius. Kept for interface compatibility; the radius
            is already baked into ``circles`` and is not needed for the query.
    """
    keep = (category == NFI_WOODLAND_VALUE) & ~shapely.is_missing(geoms)
    geoms = geoms[keep]
    if geoms.size:
        geoms = geoms[~shapely.is_empty(geoms)]
    if geoms.size == 0:
        return

    # The tree indexes the already-buffered circles, so a plain "intersects"
    # test is the exact candidate set. Querying with dwithin(radius_m) on top
    # of the buffers would reach out to 2 * radius from each postcode and only
    # add pairs whose clipped area is zero.
    nfi_index, postcode_index = tree.query(geoms, predicate="intersects")
    if nfi_index.size == 0:
        return

    clipped_area = shapely.area(
        shapely.intersection(geoms[nfi_index], circles[postcode_index])
    )
    # Pairs that merely touch have zero overlap and must not count as a feature.
    positive = clipped_area > 0
    postcode_index = postcode_index[positive]
    clipped_area = clipped_area[positive]

    # np.add.at accumulates correctly when a postcode index repeats.
    np.add.at(canopy_area, postcode_index, clipped_area)
    np.add.at(feature_count, postcode_index, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def _accumulate_nfi_metrics(
    dataset_path: str,
    circles: np.ndarray,
    tree: shapely.STRtree,
    canopy_area: np.ndarray,
    feature_count: np.ndarray,
    radius_m: int,
    batch_size: int,
    max_nfi_features: int | None,
) -> None:
    """Stream every NFI layer in Arrow batches into the shared accumulators.

    Each batch's category and WKB geometry columns are pulled out as object
    arrays and handed to ``_add_nfi_batch``, which updates ``canopy_area`` and
    ``feature_count`` in place. ``max_nfi_features`` caps the total number of
    rows read across all layers (``None`` means no cap).
    """

    def _as_objects(batch, position: int) -> np.ndarray:
        """Materialise one Arrow column as a NumPy object array."""
        return np.asarray(
            batch.column(position).to_numpy(zero_copy_only=False),
            dtype=object,
        )

    layer_list = _layers(dataset_path, None)
    print(f"Processing {len(layer_list)} NFI layer(s): {', '.join(layer_list)}")

    # Only the woodland flag is read besides geometry; area comes from
    # clipping against the postcode buffers rather than from the file.
    wanted_columns = [NFI_CATEGORY_COL]
    rows_read = 0

    for layer in layer_list:
        with pyogrio.open_arrow(
            dataset_path,
            layer=layer,
            columns=wanted_columns,
            batch_size=batch_size,
            use_pyarrow=True,
        ) as (meta, reader):
            batch_number = 0
            for batch in reader:
                batch_number += 1
                if max_nfi_features is not None:
                    budget = max_nfi_features - rows_read
                    if budget <= 0:
                        break
                    if batch.num_rows > budget:
                        batch = batch.slice(0, budget)

                rows_read += batch.num_rows
                field_names = batch.schema.names
                geometry_name = _geometry_column(meta, field_names)
                category_values = _as_objects(
                    batch, field_names.index(NFI_CATEGORY_COL)
                )
                wkb_values = _as_objects(batch, field_names.index(geometry_name))
                _add_nfi_batch(
                    shapely.from_wkb(wkb_values),
                    category_values,
                    circles,
                    tree,
                    canopy_area,
                    feature_count,
                    radius_m,
                )
                # Progress line on the first batch of a layer and every 25th.
                if batch_number == 1 or batch_number % 25 == 0:
                    print(f"  NFI batch {batch_number:,}: {rows_read:,} rows read")
|
||||||
|
|
||||||
|
|
||||||
|
def _finalize_metrics(
|
||||||
|
points: pl.DataFrame,
|
||||||
|
canopy_area: np.ndarray,
|
||||||
|
feature_count: np.ndarray,
|
||||||
|
height_weighted_sum: np.ndarray,
|
||||||
|
height_weight: np.ndarray,
|
||||||
|
radius_m: int,
|
||||||
|
) -> pl.DataFrame:
|
||||||
|
n_points = points.height
|
||||||
density_col, area_col, count_col, height_col = _metric_columns(radius_m)
|
density_col, area_col, count_col, height_col = _metric_columns(radius_m)
|
||||||
buffer_area = math.pi * radius_m * radius_m
|
buffer_area = math.pi * radius_m * radius_m
|
||||||
density_pct = np.minimum(canopy_area / buffer_area * 100.0, 100.0)
|
density_pct = np.minimum(canopy_area / buffer_area * 100.0, 100.0)
|
||||||
|
|
@ -518,6 +698,18 @@ def main() -> None:
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Read the geodatabase directly from the zip instead of extracting it",
|
help="Read the geodatabase directly from the zip instead of extracting it",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--nfi-zip",
|
||||||
|
type=Path,
|
||||||
|
default=Path("property-data/NFI_WOODLAND_ENGLAND.zip"),
|
||||||
|
help="Optional NFI woodland shapefile zip to union with TOW (skipped if absent)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--nfi-extract-dir",
|
||||||
|
type=Path,
|
||||||
|
default=Path("property-data/nfi_woodland_england"),
|
||||||
|
help="Directory where the NFI zip is extracted",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--arcgis",
|
"--arcgis",
|
||||||
type=Path,
|
type=Path,
|
||||||
|
|
@ -554,11 +746,6 @@ def main() -> None:
|
||||||
default=50,
|
default=50,
|
||||||
help="Radius around each postcode centroid used as the street-scale buffer",
|
help="Radius around each postcode centroid used as the street-scale buffer",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
|
||||||
"--tow-types",
|
|
||||||
default=",".join(DEFAULT_TOW_TYPES),
|
|
||||||
help='Comma-separated Woodland_Type values to include, or "all"',
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--layers",
|
"--layers",
|
||||||
default=None,
|
default=None,
|
||||||
|
|
@ -588,6 +775,12 @@ def main() -> None:
|
||||||
default=None,
|
default=None,
|
||||||
help="Testing only: process at most N TOW features per layer",
|
help="Testing only: process at most N TOW features per layer",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-nfi-features",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Testing only: process at most N NFI woodland features",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if (args.output_streets or args.output_addresses) and args.price_paid is None:
|
if (args.output_streets or args.output_addresses) and args.price_paid is None:
|
||||||
|
|
@ -600,18 +793,53 @@ def main() -> None:
|
||||||
args.tow_zip, args.extract_dir, args.force_extract, args.use_vsizip
|
args.tow_zip, args.extract_dir, args.force_extract, args.use_vsizip
|
||||||
)
|
)
|
||||||
points = _postcode_points(args.arcgis, args.max_postcodes)
|
points = _postcode_points(args.arcgis, args.max_postcodes)
|
||||||
tow_types = _parse_csv_arg(args.tow_types)
|
|
||||||
layer_names = _parse_csv_arg(args.layers)
|
layer_names = _parse_csv_arg(args.layers)
|
||||||
|
|
||||||
postcode_metrics = _accumulate_tree_metrics(
|
n_points = points.height
|
||||||
|
canopy_area = np.zeros(n_points, dtype=np.float64)
|
||||||
|
feature_count = np.zeros(n_points, dtype=np.uint32)
|
||||||
|
height_weighted_sum = np.zeros(n_points, dtype=np.float64)
|
||||||
|
height_weight = np.zeros(n_points, dtype=np.float64)
|
||||||
|
|
||||||
|
_accumulate_tree_metrics(
|
||||||
dataset_path=dataset_path,
|
dataset_path=dataset_path,
|
||||||
points=points,
|
points=points,
|
||||||
radius_m=args.radius_m,
|
radius_m=args.radius_m,
|
||||||
tow_types=tow_types,
|
|
||||||
batch_size=args.batch_size,
|
batch_size=args.batch_size,
|
||||||
layer_names=layer_names,
|
layer_names=layer_names,
|
||||||
max_features_per_layer=args.max_features_per_layer,
|
max_features_per_layer=args.max_features_per_layer,
|
||||||
workers=args.workers,
|
workers=args.workers,
|
||||||
|
canopy_area=canopy_area,
|
||||||
|
feature_count=feature_count,
|
||||||
|
height_weighted_sum=height_weighted_sum,
|
||||||
|
height_weight=height_weight,
|
||||||
|
)
|
||||||
|
|
||||||
|
if args.nfi_zip is not None and args.nfi_zip.exists():
|
||||||
|
nfi_path = _nfi_dataset_path(
|
||||||
|
args.nfi_zip, args.nfi_extract_dir, args.force_extract, args.use_vsizip
|
||||||
|
)
|
||||||
|
circles, nfi_tree = _postcode_buffers(points, args.radius_m)
|
||||||
|
_accumulate_nfi_metrics(
|
||||||
|
dataset_path=nfi_path,
|
||||||
|
circles=circles,
|
||||||
|
tree=nfi_tree,
|
||||||
|
canopy_area=canopy_area,
|
||||||
|
feature_count=feature_count,
|
||||||
|
radius_m=args.radius_m,
|
||||||
|
batch_size=args.batch_size,
|
||||||
|
max_nfi_features=args.max_nfi_features,
|
||||||
|
)
|
||||||
|
elif args.nfi_zip is not None:
|
||||||
|
print(f"NFI zip not found, skipping woodland union: {args.nfi_zip}")
|
||||||
|
|
||||||
|
postcode_metrics = _finalize_metrics(
|
||||||
|
points,
|
||||||
|
canopy_area,
|
||||||
|
feature_count,
|
||||||
|
height_weighted_sum,
|
||||||
|
height_weight,
|
||||||
|
args.radius_m,
|
||||||
)
|
)
|
||||||
postcode_metrics = _with_postcode_density_percentiles(
|
postcode_metrics = _with_postcode_density_percentiles(
|
||||||
postcode_metrics, args.radius_m
|
postcode_metrics, args.radius_m
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
"""Build PMTiles polygon tiles for the Trees Outside Woodland overlay."""
|
"""Build PMTiles polygon tiles for the Trees Outside Woodland + NFI overlay."""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
@ -16,10 +16,14 @@ from pyproj import Transformer
|
||||||
|
|
||||||
from pipeline.local_temp import local_tmp_dir
|
from pipeline.local_temp import local_tmp_dir
|
||||||
from pipeline.transform.tree_density import (
|
from pipeline.transform.tree_density import (
|
||||||
DEFAULT_TOW_TYPES,
|
NFI_AREA_HA_COL,
|
||||||
|
NFI_CATEGORY_COL,
|
||||||
|
NFI_TYPE_COL,
|
||||||
|
NFI_WOODLAND_VALUE,
|
||||||
|
_geometry_column,
|
||||||
_layers,
|
_layers,
|
||||||
|
_nfi_dataset_path,
|
||||||
_tow_dataset_path,
|
_tow_dataset_path,
|
||||||
_where_for_tow_types,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -55,17 +59,13 @@ def _number_or_none(value) -> float | int | None:
|
||||||
def _write_tree_geojsonseq(
|
def _write_tree_geojsonseq(
|
||||||
dataset_path: str,
|
dataset_path: str,
|
||||||
output_path: Path,
|
output_path: Path,
|
||||||
tow_types: tuple[str, ...],
|
|
||||||
batch_size: int,
|
batch_size: int,
|
||||||
layer_names: tuple[str, ...] | None,
|
layer_names: tuple[str, ...] | None,
|
||||||
max_features_per_layer: int | None,
|
max_features_per_layer: int | None,
|
||||||
) -> int:
|
) -> int:
|
||||||
to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
|
to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
|
||||||
where = _where_for_tow_types(tow_types)
|
|
||||||
layers = _layers(dataset_path, layer_names)
|
layers = _layers(dataset_path, layer_names)
|
||||||
print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
|
print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
|
||||||
if where:
|
|
||||||
print(f"TOW type filter: {where}")
|
|
||||||
|
|
||||||
columns = [
|
columns = [
|
||||||
"TOW_ID",
|
"TOW_ID",
|
||||||
|
|
@ -88,7 +88,6 @@ def _write_tree_geojsonseq(
|
||||||
dataset_path,
|
dataset_path,
|
||||||
layer=layer,
|
layer=layer,
|
||||||
columns=columns,
|
columns=columns,
|
||||||
where=where,
|
|
||||||
batch_size=batch_size,
|
batch_size=batch_size,
|
||||||
use_pyarrow=True,
|
use_pyarrow=True,
|
||||||
) as (_meta, reader):
|
) as (_meta, reader):
|
||||||
|
|
@ -136,6 +135,7 @@ def _write_tree_geojsonseq(
|
||||||
|
|
||||||
for idx, geometry_json in zip(valid_indexes, geometries_json):
|
for idx, geometry_json in zip(valid_indexes, geometries_json):
|
||||||
properties = {
|
properties = {
|
||||||
|
"source": "tow",
|
||||||
"tow_id": str(tow_id[idx]) if tow_id is not None else "",
|
"tow_id": str(tow_id[idx]) if tow_id is not None else "",
|
||||||
"woodland_type": (
|
"woodland_type": (
|
||||||
str(woodland_type[idx])
|
str(woodland_type[idx])
|
||||||
|
|
@ -176,11 +176,105 @@ def _write_tree_geojsonseq(
|
||||||
return feature_count
|
return feature_count
|
||||||
|
|
||||||
|
|
||||||
|
def _append_nfi_geojsonseq(
    dataset_path: str,
    output_path: Path,
    batch_size: int,
    max_nfi_features: int | None,
) -> int:
    """Append NFI woodland polygons to the same GeoJSONSeq as the TOW features.

    Rows whose category equals ``NFI_WOODLAND_VALUE`` are reprojected from
    EPSG:27700 to EPSG:4326 and written one GeoJSON feature per line, using the
    same property keys as the TOW features (height and lidar fields are
    ``None``). Rows with a missing or empty geometry are skipped, matching the
    filtering done on the density side.

    Args:
        dataset_path: NFI dataset path to open with pyogrio.
        output_path: Existing GeoJSONSeq file; opened in append mode.
        batch_size: Arrow batch size passed to pyogrio.
        max_nfi_features: Cap on rows read across all layers, or ``None``.

    Returns:
        The number of features written.
    """
    to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
    layers = _layers(dataset_path, None)
    print(f"Processing {len(layers)} NFI layer(s): {', '.join(layers)}")

    columns = [NFI_CATEGORY_COL, NFI_TYPE_COL, NFI_AREA_HA_COL]
    feature_count = 0
    features_seen = 0

    with output_path.open("a") as file:
        for layer in layers:
            with pyogrio.open_arrow(
                dataset_path,
                layer=layer,
                columns=columns,
                batch_size=batch_size,
                use_pyarrow=True,
            ) as (meta, reader):
                for batch in reader:
                    if max_nfi_features is not None:
                        remaining = max_nfi_features - features_seen
                        if remaining <= 0:
                            break
                        if batch.num_rows > remaining:
                            batch = batch.slice(0, remaining)

                    features_seen += batch.num_rows
                    names = batch.schema.names
                    geometry_column = _geometry_column(meta, names)
                    category = np.asarray(
                        batch.column(names.index(NFI_CATEGORY_COL)).to_numpy(
                            zero_copy_only=False
                        ),
                        dtype=object,
                    )
                    geometry = np.asarray(
                        batch.column(names.index(geometry_column)).to_numpy(
                            zero_copy_only=False
                        ),
                        dtype=object,
                    )
                    valid_indexes = np.flatnonzero(category == NFI_WOODLAND_VALUE)
                    if valid_indexes.size == 0:
                        continue

                    geometries = shapely.from_wkb(geometry[valid_indexes])
                    # Drop null/empty geometries: a missing geometry would
                    # otherwise reach json.loads() as None and raise.
                    usable = ~(
                        shapely.is_missing(geometries) | shapely.is_empty(geometries)
                    )
                    if not usable.any():
                        continue
                    valid_indexes = valid_indexes[usable]
                    geometries = geometries[usable]

                    woodland_type = _column_or_none(batch, names, NFI_TYPE_COL)
                    area_ha = _column_or_none(batch, names, NFI_AREA_HA_COL)

                    geometries = shapely.transform(
                        geometries,
                        to_wgs84.transform,
                        interleaved=False,
                    )
                    geometries_json = shapely.to_geojson(geometries)

                    for idx, geometry_json in zip(valid_indexes, geometries_json):
                        # NOTE(review): guards a null area in case
                        # _column_or_none yields None for nulls - confirm.
                        raw_area_ha = area_ha[idx] if area_ha is not None else None
                        area_sqm = (
                            _number_or_none(raw_area_ha * 10000.0)
                            if raw_area_ha is not None
                            else None
                        )
                        properties = {
                            "source": "nfi",
                            "tow_id": "",
                            "woodland_type": (
                                str(woodland_type[idx])
                                if woodland_type is not None
                                else ""
                            ),
                            "area_sqm": area_sqm,
                            "mean_height_m": None,
                            "min_height_m": None,
                            "max_height_m": None,
                            "lidar_year": None,
                            "source_layer": layer,
                        }
                        feature = {
                            "type": "Feature",
                            "geometry": json.loads(geometry_json),
                            "properties": properties,
                        }
                        file.write(json.dumps(feature, separators=(",", ":")) + "\n")
                        feature_count += 1

    return feature_count
|
||||||
|
|
||||||
|
|
||||||
def build_tree_overlay_tiles(
|
def build_tree_overlay_tiles(
|
||||||
tow_zip: Path,
|
tow_zip: Path,
|
||||||
output_path: Path,
|
output_path: Path,
|
||||||
extract_dir: Path,
|
extract_dir: Path,
|
||||||
tow_types: tuple[str, ...],
|
|
||||||
batch_size: int,
|
batch_size: int,
|
||||||
layer_names: tuple[str, ...] | None,
|
layer_names: tuple[str, ...] | None,
|
||||||
max_features_per_layer: int | None,
|
max_features_per_layer: int | None,
|
||||||
|
|
@ -188,6 +282,9 @@ def build_tree_overlay_tiles(
|
||||||
max_zoom: int,
|
max_zoom: int,
|
||||||
force_extract: bool,
|
force_extract: bool,
|
||||||
use_vsizip: bool,
|
use_vsizip: bool,
|
||||||
|
nfi_zip: Path | None = None,
|
||||||
|
nfi_extract_dir: Path = Path("property-data/nfi_woodland_england"),
|
||||||
|
max_nfi_features: int | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
tippecanoe = _require_tippecanoe()
|
tippecanoe = _require_tippecanoe()
|
||||||
dataset_path = _tow_dataset_path(tow_zip, extract_dir, force_extract, use_vsizip)
|
dataset_path = _tow_dataset_path(tow_zip, extract_dir, force_extract, use_vsizip)
|
||||||
|
|
@ -198,13 +295,26 @@ def build_tree_overlay_tiles(
|
||||||
feature_count = _write_tree_geojsonseq(
|
feature_count = _write_tree_geojsonseq(
|
||||||
dataset_path,
|
dataset_path,
|
||||||
ndjson_path,
|
ndjson_path,
|
||||||
tow_types,
|
|
||||||
batch_size,
|
batch_size,
|
||||||
layer_names,
|
layer_names,
|
||||||
max_features_per_layer,
|
max_features_per_layer,
|
||||||
)
|
)
|
||||||
print(f"Writing {feature_count:,} TOW polygon features")
|
print(f"Writing {feature_count:,} TOW polygon features")
|
||||||
|
|
||||||
|
if nfi_zip is not None and nfi_zip.exists():
|
||||||
|
nfi_path = _nfi_dataset_path(
|
||||||
|
nfi_zip, nfi_extract_dir, force_extract, use_vsizip
|
||||||
|
)
|
||||||
|
nfi_count = _append_nfi_geojsonseq(
|
||||||
|
nfi_path,
|
||||||
|
ndjson_path,
|
||||||
|
batch_size,
|
||||||
|
max_nfi_features,
|
||||||
|
)
|
||||||
|
print(f"Writing {nfi_count:,} NFI woodland polygon features")
|
||||||
|
elif nfi_zip is not None:
|
||||||
|
print(f"NFI zip not found, skipping woodland union: {nfi_zip}")
|
||||||
|
|
||||||
subprocess.run(
|
subprocess.run(
|
||||||
[
|
[
|
||||||
tippecanoe,
|
tippecanoe,
|
||||||
|
|
@ -237,26 +347,32 @@ def main() -> None:
|
||||||
default=Path("property-data/fr_tow_v1_all"),
|
default=Path("property-data/fr_tow_v1_all"),
|
||||||
help="Directory used to extract the FileGDB",
|
help="Directory used to extract the FileGDB",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
|
||||||
"--tow-type",
|
|
||||||
action="append",
|
|
||||||
dest="tow_types",
|
|
||||||
help="Woodland_Type to include; repeatable. Defaults to TOW outside-woodland classes.",
|
|
||||||
)
|
|
||||||
parser.add_argument("--batch-size", type=int, default=50_000)
|
parser.add_argument("--batch-size", type=int, default=50_000)
|
||||||
parser.add_argument("--layer", action="append", dest="layers")
|
parser.add_argument("--layer", action="append", dest="layers")
|
||||||
parser.add_argument("--max-features-per-layer", type=int)
|
parser.add_argument("--max-features-per-layer", type=int)
|
||||||
parser.add_argument("--min-zoom", type=int, default=15)
|
parser.add_argument("--min-zoom", type=int, default=12)
|
||||||
parser.add_argument("--max-zoom", type=int, default=17)
|
parser.add_argument("--max-zoom", type=int, default=17)
|
||||||
parser.add_argument("--force-extract", action="store_true")
|
parser.add_argument("--force-extract", action="store_true")
|
||||||
parser.add_argument("--use-vsizip", action="store_true")
|
parser.add_argument("--use-vsizip", action="store_true")
|
||||||
|
parser.add_argument(
|
||||||
|
"--nfi-zip",
|
||||||
|
type=Path,
|
||||||
|
default=None,
|
||||||
|
help="Optional NFI woodland shapefile zip to union into the overlay",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--nfi-extract-dir",
|
||||||
|
type=Path,
|
||||||
|
default=Path("property-data/nfi_woodland_england"),
|
||||||
|
help="Directory used to extract the NFI zip",
|
||||||
|
)
|
||||||
|
parser.add_argument("--max-nfi-features", type=int)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
build_tree_overlay_tiles(
|
build_tree_overlay_tiles(
|
||||||
tow_zip=args.tow_zip,
|
tow_zip=args.tow_zip,
|
||||||
output_path=args.output,
|
output_path=args.output,
|
||||||
extract_dir=args.extract_dir,
|
extract_dir=args.extract_dir,
|
||||||
tow_types=tuple(args.tow_types or DEFAULT_TOW_TYPES),
|
|
||||||
batch_size=args.batch_size,
|
batch_size=args.batch_size,
|
||||||
layer_names=tuple(args.layers) if args.layers else None,
|
layer_names=tuple(args.layers) if args.layers else None,
|
||||||
max_features_per_layer=args.max_features_per_layer,
|
max_features_per_layer=args.max_features_per_layer,
|
||||||
|
|
@ -264,6 +380,9 @@ def main() -> None:
|
||||||
max_zoom=args.max_zoom,
|
max_zoom=args.max_zoom,
|
||||||
force_extract=args.force_extract,
|
force_extract=args.force_extract,
|
||||||
use_vsizip=args.use_vsizip,
|
use_vsizip=args.use_vsizip,
|
||||||
|
nfi_zip=args.nfi_zip,
|
||||||
|
nfi_extract_dir=args.nfi_extract_dir,
|
||||||
|
max_nfi_features=args.max_nfi_features,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import json
|
||||||
import sys
|
import sys
|
||||||
import zipfile
|
import zipfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
@ -76,6 +77,24 @@ def _split_glob(spec: str) -> tuple[Path, str]:
|
||||||
return Path(base), pattern
|
return Path(base), pattern
|
||||||
|
|
||||||
|
|
||||||
|
def _split_pair(spec: str, label: str) -> tuple[Path, Path]:
|
||||||
|
if "::" not in spec:
|
||||||
|
raise argparse.ArgumentTypeError(
|
||||||
|
f"{spec!r} must use LEFT::RIGHT for {label}"
|
||||||
|
)
|
||||||
|
left, right = spec.split("::", 1)
|
||||||
|
if not left or not right:
|
||||||
|
raise argparse.ArgumentTypeError(f"{spec!r} must include both paths")
|
||||||
|
return Path(left), Path(right)
|
||||||
|
|
||||||
|
|
||||||
|
def _canonical_postcode(value: object) -> str:
|
||||||
|
compact = "".join(str(value).split()).upper()
|
||||||
|
if len(compact) >= 5:
|
||||||
|
return f"{compact[:-3]} {compact[-3:]}"
|
||||||
|
return compact
|
||||||
|
|
||||||
|
|
||||||
def _matched_files(spec: str) -> tuple[Path, str, list[Path]]:
|
def _matched_files(spec: str) -> tuple[Path, str, list[Path]]:
|
||||||
base, pattern = _split_glob(spec)
|
base, pattern = _split_glob(spec)
|
||||||
if not base.exists():
|
if not base.exists():
|
||||||
|
|
@ -105,6 +124,79 @@ def _failures_for_zip_glob(spec: str) -> list[str]:
|
||||||
return failures
|
return failures
|
||||||
|
|
||||||
|
|
||||||
|
def _postcode_column(columns: list[str]) -> str | None:
|
||||||
|
for name in ("postcode", "Postcode", "pcds", "PCDS"):
|
||||||
|
if name in columns:
|
||||||
|
return name
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _parquet_postcodes(path: Path) -> set[str]:
    """Return the canonical postcodes found in a parquet file.

    The postcode column is located with ``_postcode_column``; null values are
    dropped and every remaining value is normalised with
    ``_canonical_postcode``. Values that normalise to an empty string are
    excluded.

    Raises:
        ValueError: If the file has no recognised postcode column.
    """
    frame = pl.scan_parquet(path)
    column = _postcode_column(frame.collect_schema().names())
    if column is None:
        raise ValueError(f"{path}: missing postcode column")
    values = (
        frame.select(pl.col(column).drop_nulls().unique())
        .collect()
        .get_column(column)
        .to_list()
    )
    # Canonicalise each value once, then drop the empty result, instead of
    # normalising every value twice (once to filter, once to keep).
    postcodes = {_canonical_postcode(value) for value in values}
    postcodes.discard("")
    return postcodes
|
||||||
|
|
||||||
|
|
||||||
|
def _boundary_postcodes(path: Path) -> set[str]:
    """Collect canonical postcodes from the boundary GeoJSON files under ``path``.

    Files are read from ``path / "units"`` when that directory exists,
    otherwise from ``path`` itself. For every feature, the ``postcodes``
    property (when not ``None``) is normalised with ``_canonical_postcode``
    and kept if the result is non-empty.
    """
    nested = path / "units"
    search_dir = nested if nested.is_dir() else path
    found: set[str] = set()
    for geojson_file in sorted(search_dir.glob("*.geojson")):
        with geojson_file.open("r", encoding="utf-8") as stream:
            collection = json.load(stream)
        for feature in collection.get("features", []):
            # A feature may carry "properties": null, hence the `or {}`.
            raw = (feature.get("properties") or {}).get("postcodes")
            if raw is None:
                continue
            canonical = _canonical_postcode(raw)
            if canonical:
                found.add(canonical)
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def _sample(values: set[str], *, limit: int = 10) -> str:
    """Return a short, deterministic sample of *values* for error messages.

    Args:
        values: the strings to sample from.
        limit: maximum number of entries to include (default 10, matching
            the previous hard-coded sample size). Non-positive limits yield
            an empty string.

    Returns:
        The *limit* smallest values in ascending order, joined by ``", "``.
    """
    import heapq

    # nsmallest avoids fully sorting a potentially very large mismatch set
    # just to show a handful of entries; the result is already ascending.
    return ", ".join(heapq.nsmallest(max(limit, 0), values))
|
||||||
|
|
||||||
|
|
||||||
|
def _failures_for_postcode_boundary_match(spec: str) -> list[str]:
    """Check that parquet postcodes and boundary postcodes are the same set.

    *spec* is split into a parquet path and a boundaries path by
    ``_split_pair`` (the CLI help describes the form as ``PARQUET::DIR``).
    Both inputs are first validated with ``_failures_for_parquet`` and
    ``_failures_for_dir``; if either reports problems, those are returned
    without comparing postcodes.

    Returns:
        A list of human-readable failure messages; empty when the two
        postcode sets match exactly and the boundary set is non-empty.
    """
    parquet_path, boundaries_path = _split_pair(spec, "postcode boundary matching")

    precheck = _failures_for_parquet(parquet_path) + _failures_for_dir(boundaries_path)
    if precheck:
        return precheck

    # Any error while loading either side is reported as a validation
    # failure rather than propagated.
    try:
        from_parquet = _parquet_postcodes(parquet_path)
        from_boundaries = _boundary_postcodes(boundaries_path)
    except Exception as exc:
        return [f"{parquet_path} / {boundaries_path}: postcode match check failed: {exc}"]

    lacking_boundary = from_parquet - from_boundaries
    lacking_parquet_row = from_boundaries - from_parquet

    problems: list[str] = []
    if not from_boundaries:
        problems.append(f"{boundaries_path}: no boundary postcodes found")
    if lacking_boundary:
        problems.append(
            f"{boundaries_path}: {len(lacking_boundary):,} postcodes from {parquet_path} "
            f"are missing boundaries; sample: {_sample(lacking_boundary)}"
        )
    if lacking_parquet_row:
        problems.append(
            f"{boundaries_path}: {len(lacking_parquet_row):,} boundary postcodes are absent from "
            f"{parquet_path}; sample: {_sample(lacking_parquet_row)}"
        )
    return problems
|
||||||
|
|
||||||
|
|
||||||
def main() -> int:
|
def main() -> int:
|
||||||
parser = argparse.ArgumentParser(description=__doc__)
|
parser = argparse.ArgumentParser(description=__doc__)
|
||||||
parser.add_argument("--file", action="append", default=[], type=Path)
|
parser.add_argument("--file", action="append", default=[], type=Path)
|
||||||
|
|
@ -123,6 +215,12 @@ def main() -> int:
|
||||||
default=[],
|
default=[],
|
||||||
help="Require at least one readable zip matching BASE::PATTERN",
|
help="Require at least one readable zip matching BASE::PATTERN",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--postcode-boundary-match",
|
||||||
|
action="append",
|
||||||
|
default=[],
|
||||||
|
help="Require postcode parquet keys to exactly match boundary GeoJSON postcodes: PARQUET::DIR",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
failures: list[str] = []
|
failures: list[str] = []
|
||||||
|
|
@ -138,6 +236,8 @@ def main() -> int:
|
||||||
failures.extend(_failures_for_glob(spec))
|
failures.extend(_failures_for_glob(spec))
|
||||||
for spec in args.zip_glob:
|
for spec in args.zip_glob:
|
||||||
failures.extend(_failures_for_zip_glob(spec))
|
failures.extend(_failures_for_zip_glob(spec))
|
||||||
|
for spec in args.postcode_boundary_match:
|
||||||
|
failures.extend(_failures_for_postcode_boundary_match(spec))
|
||||||
|
|
||||||
if failures:
|
if failures:
|
||||||
print("Output validation failed:", file=sys.stderr)
|
print("Output validation failed:", file=sys.stderr)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue