perfect-postcode/pipeline/download/satellite_highres.py
2026-06-06 10:45:45 +01:00

528 lines
19 KiB
Python

"""Build a high-resolution England aerial PMTiles archive from EA Vertical Aerial Photography.
The Environment Agency / Defra Vertical Aerial Photography (VAP) archive is open
(OGL v3.0) RGB orthophotography at 10-50 cm, distributed as 5 km ECW tiles on the
British National Grid. There is no public imagery tile service, so we mirror the
Sentinel-2 ``satellite.pmtiles`` approach: query the Defra survey download API for
an area of interest, pick the best RGB capture per OS tile, download and decode the
ECW rasters, re-tile them into Web-Mercator raster tiles, and bake a single PMTiles
archive that the server stacks *over* the Sentinel-2 base where coverage exists.
ECW decoding needs a GDAL build that includes the (free, read-only) ERDAS ECW/JP2
SDK, which is not present in the rasterio wheel. The mosaic + tiling step therefore
runs inside a GDAL-with-ECW Docker image (see ``docker/gdal-ecw/Dockerfile``); the
rest of the pipeline is plain Python plus the ``pmtiles`` CLI.
"""
from __future__ import annotations
import argparse
import json
import re
import shutil
import sqlite3
import subprocess
import tempfile
import urllib.error
import urllib.request
import zipfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from pipeline.download.tiles import ensure_pmtiles_cli
from pipeline.local_temp import local_tmp_dir
# Defra Data Services Platform survey download API (reverse-engineered from the
# environment.data.gov.uk/survey front-end; no official API is documented).
SEARCH_URL = (
"https://environment.data.gov.uk/backend/catalog/api/tiles/collections/survey/search"
)
SURVEY_PAGE_URL = "https://environment.data.gov.uk/survey"
# Static public key baked into the survey page JS. May rotate -- we try to scrape a
# fresh one from the page and only fall back to this literal.
DEFAULT_SUBSCRIPTION_KEY = "dspui"
SUBSCRIPTION_KEY_RE = re.compile(r"subscription-key=([A-Za-z0-9]+)")
# True-colour RGB product only (skip IRRGB near-infra-red and Night Time variants).
VAP_RGB_PRODUCT = "vertical_aerial_photography_tiles_rgb"
# Greater London bounding box (lon/lat). The API only returns tiles where coverage
# exists, so a generous bbox is fine -- it does not force blank downloads.
DEFAULT_AOI: dict = {
"type": "Polygon",
"coordinates": [
[
[-0.55, 51.25],
[0.30, 51.25],
[0.30, 51.70],
[-0.55, 51.70],
[-0.55, 51.25],
]
],
}
DEFAULT_MIN_ZOOM = 14
DEFAULT_MAX_ZOOM = 19
# GDAL image with the ECW driver. The official OSGeo image does not ship ECW, so
# this defaults to the locally-built image from docker/gdal-ecw/Dockerfile.
DEFAULT_GDAL_IMAGE = "perfect-postcode/gdal-ecw:latest"
USER_AGENT = "perfect-postcode-satellite-highres/1.0"
ATTRIBUTION_TEMPLATE = (
"Environment Agency Vertical Aerial Photography - "
"© Environment Agency copyright and/or database right {year}. "
"All rights reserved. Licensed under the Open Government Licence v3.0."
)
@dataclass(frozen=True)
class VapTile:
"""One survey download record from the Defra search API."""
product_id: str
year: int
resolution_m: float
os_tile_id: str
uri: str
label: str
def parse_search_results(payload: dict) -> list[VapTile]:
"""Turn a raw search-API JSON payload into typed records."""
tiles: list[VapTile] = []
for result in payload.get("results", []):
try:
tiles.append(
VapTile(
product_id=result["product"]["id"],
year=int(result["year"]["id"]),
resolution_m=float(result["resolution"]["id"]),
os_tile_id=result["tile"]["id"],
uri=result["uri"],
label=result.get("label", ""),
)
)
except (KeyError, TypeError, ValueError):
# Skip malformed records rather than failing the whole search.
continue
return tiles
def select_best_rgb_tiles(tiles: list[VapTile]) -> list[VapTile]:
"""Pick one RGB capture per OS tile: finest resolution, then latest year.
Pure function -- the unit test exercises this against a real-shaped payload.
"""
best: dict[str, VapTile] = {}
for tile in tiles:
if tile.product_id != VAP_RGB_PRODUCT:
continue
current = best.get(tile.os_tile_id)
if current is None or _is_better(tile, current):
best[tile.os_tile_id] = tile
return [best[key] for key in sorted(best)]
def _is_better(candidate: VapTile, incumbent: VapTile) -> bool:
"""Finer resolution wins; ties broken by the most recent survey year."""
if candidate.resolution_m != incumbent.resolution_m:
return candidate.resolution_m < incumbent.resolution_m
return candidate.year > incumbent.year
def _http_get(url: str, timeout: float) -> bytes:
req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
with urllib.request.urlopen(req, timeout=timeout) as response:
return response.read()
def resolve_subscription_key(explicit: str | None, timeout: float = 30.0) -> str:
"""Use an explicit key, else scrape the survey page JS, else the known default."""
if explicit:
return explicit
try:
page = _http_get(SURVEY_PAGE_URL, timeout).decode("utf-8", "ignore")
match = SUBSCRIPTION_KEY_RE.search(page)
if match:
return match.group(1)
# The key usually lives in a referenced JS chunk; scan the largest one.
for chunk in re.findall(r'src="(/_next/static/[^"]+\.js)"', page):
js = _http_get(f"https://environment.data.gov.uk{chunk}", timeout)
match = SUBSCRIPTION_KEY_RE.search(js.decode("utf-8", "ignore"))
if match:
return match.group(1)
except (urllib.error.URLError, TimeoutError, ConnectionError) as err:
print(f"Could not scrape subscription key ({err}); using default", flush=True)
return DEFAULT_SUBSCRIPTION_KEY
def search_vap_tiles(aoi: dict, timeout: float = 60.0) -> list[VapTile]:
"""POST the area-of-interest polygon and return the RGB tiles to download."""
body = json.dumps(aoi).encode("utf-8")
req = urllib.request.Request(
SEARCH_URL,
data=body,
headers={
"Content-Type": "application/geo+json",
"Referer": SURVEY_PAGE_URL,
"User-Agent": USER_AGENT,
},
method="POST",
)
with urllib.request.urlopen(req, timeout=timeout) as response:
payload = json.load(response)
selected = select_best_rgb_tiles(parse_search_results(payload))
print(
f"Search returned {payload.get('count', 0)} records; "
f"selected {len(selected)} RGB tile(s)",
flush=True,
)
return selected
def _download_and_extract(
tile: VapTile, ecw_dir: Path, key: str, timeout: float, retries: int
) -> list[Path]:
"""Download one survey zip and extract its ECW raster(s).
The Defra endpoint sometimes answers a download with a 200 whose body is an
error/HTML page or a truncated stream rather than a ZIP (rate limiting, a
rotated key, a transient backend hiccup). We treat a non-ZIP body the same as
a network error -- retry it -- and, if every attempt fails, skip just this
tile rather than aborting the whole multi-tile run: missing coverage simply
shows the Sentinel-2 base through, and the caller still hard-fails if *no*
ECWs were extracted at all.
"""
url = f"{tile.uri}?subscription-key={key}"
zip_path = ecw_dir / f"{tile.os_tile_id}.zip"
last_err: Exception | None = None
for attempt in range(retries + 1):
try:
with urllib.request.urlopen(
urllib.request.Request(url, headers={"User-Agent": USER_AGENT}),
timeout=timeout,
) as response, zip_path.open("wb") as out:
shutil.copyfileobj(response, out, length=1 << 20)
return _extract_ecws(zip_path, tile, ecw_dir)
except (
urllib.error.URLError,
TimeoutError,
ConnectionError,
zipfile.BadZipFile,
) as err:
last_err = err
zip_path.unlink(missing_ok=True)
print(
f" {tile.os_tile_id}: download failed after {retries + 1} attempt(s), "
f"skipping ({last_err})",
flush=True,
)
return []
def _extract_ecws(zip_path: Path, tile: VapTile, ecw_dir: Path) -> list[Path]:
"""Extract every ECW raster from a downloaded survey zip; delete the zip."""
extracted: list[Path] = []
with zipfile.ZipFile(zip_path) as archive:
for member in archive.infolist():
if member.is_dir() or not member.filename.lower().endswith(".ecw"):
continue
target = ecw_dir / f"{tile.os_tile_id}_{Path(member.filename).name}"
with archive.open(member) as src, target.open("wb") as dst:
shutil.copyfileobj(src, dst, length=1 << 20)
extracted.append(target)
zip_path.unlink(missing_ok=True)
if not extracted:
print(f" {tile.os_tile_id}: no ECW in archive (skipped)", flush=True)
return extracted
def download_tiles(
tiles: list[VapTile],
ecw_dir: Path,
key: str,
max_workers: int,
timeout: float,
retries: int,
) -> list[Path]:
"""Download every selected tile concurrently; return all extracted ECW paths."""
ecw_dir.mkdir(parents=True, exist_ok=True)
ecw_paths: list[Path] = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = {
executor.submit(
_download_and_extract, tile, ecw_dir, key, timeout, retries
): tile
for tile in tiles
}
done = 0
for future in as_completed(futures):
tile = futures[future]
ecw_paths.extend(future.result())
done += 1
print(
f"Downloaded {done}/{len(tiles)} tiles "
f"(latest: {tile.os_tile_id} {tile.resolution_m}m {tile.year})",
flush=True,
)
return ecw_paths
def _build_tiles_with_gdal(
work_dir: Path,
gdal_image: str,
min_zoom: int,
max_zoom: int,
jobs: int,
webp_quality: int,
) -> Path:
"""Mosaic the ECW rasters and emit XYZ WebP tiles inside the GDAL-with-ECW image.
Returns the host path of the generated ``xyz`` directory. We use lossy WebP with
an alpha channel: ~6x smaller than lossless PNG for photographic imagery while
keeping transparency, so coverage gaps stay see-through and the Sentinel-2 base
shows through them.
"""
xyz_dir = work_dir / "xyz"
# EA "RGB" ECWs are 4-band RGBA (band 4 is a constant-255 validity/alpha mask),
# so we build a plain 4-band VRT (no -addalpha, which would make a 5th band and
# exceed PNG's 4-band limit). We then:
# * force EPSG:27700 -- the pixels are already British National Grid, and the
# EPSG code lets PROJ apply the OSTN15 datum shift (grid ships in the image)
# for metre-accurate reprojection to Web Mercator;
# * label band 4 as alpha so gdal2tiles writes transparent PNGs. Inter-block
# gaps the VRT fills with 0 then read as alpha=0 (transparent), letting the
# Sentinel-2 base show through wherever VAP coverage is missing.
script = (
"set -euo pipefail; "
"cd /work; "
"gdalbuildvrt -resolution highest mosaic.vrt ecw/*.ecw; "
"gdal_edit.py -a_srs EPSG:27700 "
"-colorinterp_1 red -colorinterp_2 green -colorinterp_3 blue "
"-colorinterp_4 alpha mosaic.vrt; "
f"gdal2tiles.py --xyz --zoom={min_zoom}-{max_zoom} "
f"--processes={jobs} --resampling=average --webviewer=none "
f"--tiledriver=WEBP --webp-quality={webp_quality} "
"mosaic.vrt xyz"
)
subprocess.run(
[
"docker",
"run",
"--rm",
"-v",
f"{work_dir.resolve()}:/work",
gdal_image,
"bash",
"-c",
script,
],
check=True,
)
if not xyz_dir.exists():
raise RuntimeError("gdal2tiles produced no output directory")
return xyz_dir
def _pack_xyz_to_mbtiles(
xyz_dir: Path,
mbtiles_path: Path,
bounds: tuple[float, float, float, float],
min_zoom: int,
max_zoom: int,
attribution: str,
) -> int:
"""Pack a gdal2tiles XYZ WebP directory into an MBTiles SQLite file (TMS rows)."""
if mbtiles_path.exists():
mbtiles_path.unlink()
conn = sqlite3.connect(mbtiles_path)
try:
conn.execute("PRAGMA journal_mode = WAL")
conn.execute("PRAGMA synchronous = NORMAL")
conn.execute("CREATE TABLE metadata (name TEXT, value TEXT)")
conn.execute(
"CREATE TABLE tiles (zoom_level INTEGER, tile_column INTEGER, "
"tile_row INTEGER, tile_data BLOB)"
)
conn.execute(
"CREATE UNIQUE INDEX tile_index ON tiles "
"(zoom_level, tile_column, tile_row)"
)
conn.executemany(
"INSERT INTO metadata (name, value) VALUES (?, ?)",
[
("name", "EA Vertical Aerial Photography"),
("type", "overlay"),
("version", "1"),
("description", "Environment Agency high-resolution aerial imagery"),
("format", "webp"),
("attribution", attribution),
("bounds", ",".join(f"{value:.6f}" for value in bounds)),
("minzoom", str(min_zoom)),
("maxzoom", str(max_zoom)),
],
)
inserted = 0
for zoom_dir in sorted(xyz_dir.iterdir()):
if not zoom_dir.is_dir() or not zoom_dir.name.isdigit():
continue
zoom = int(zoom_dir.name)
for col_dir in zoom_dir.iterdir():
if not col_dir.is_dir() or not col_dir.name.isdigit():
continue
col = int(col_dir.name)
for tile_file in col_dir.glob("*.webp"):
if not tile_file.stem.isdigit():
continue
row = int(tile_file.stem)
tms_row = (1 << zoom) - 1 - row
conn.execute(
"INSERT OR REPLACE INTO tiles VALUES (?, ?, ?, ?)",
(zoom, col, tms_row, tile_file.read_bytes()),
)
inserted += 1
if inserted % 5000 == 0:
conn.commit()
print(f" packed {inserted:,} tiles", flush=True)
conn.commit()
finally:
conn.close()
return inserted
def build_satellite_highres_tiles(
output_path: Path,
pmtiles_bin: Path,
pmtiles_version: str,
aoi: dict,
min_zoom: int,
max_zoom: int,
gdal_image: str,
subscription_key: str | None,
max_workers: int,
timeout: float,
retries: int,
jobs: int,
webp_quality: int,
) -> None:
if min_zoom > max_zoom:
raise ValueError("--min-zoom must be <= --max-zoom")
output_path.parent.mkdir(parents=True, exist_ok=True)
ensure_pmtiles_cli(pmtiles_bin, pmtiles_version)
tiles = search_vap_tiles(aoi)
if not tiles:
raise RuntimeError("No RGB Vertical Aerial Photography tiles for the AOI")
key = resolve_subscription_key(subscription_key)
attribution = ATTRIBUTION_TEMPLATE.format(year=max(tile.year for tile in tiles))
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
work_dir = Path(tmp)
ecw_dir = work_dir / "ecw"
ecw_paths = download_tiles(
tiles, ecw_dir, key, max_workers, timeout, retries
)
if not ecw_paths:
raise RuntimeError("No ECW rasters were extracted from the downloads")
xyz_dir = _build_tiles_with_gdal(
work_dir, gdal_image, min_zoom, max_zoom, jobs, webp_quality
)
mbtiles_path = work_dir / "satellite_highres.mbtiles"
bounds = _aoi_bounds(aoi)
inserted = _pack_xyz_to_mbtiles(
xyz_dir, mbtiles_path, bounds, min_zoom, max_zoom, attribution
)
if inserted == 0:
raise RuntimeError("Tiling produced no tiles to pack")
print(f"Packed {inserted:,} tiles into MBTiles", flush=True)
subprocess.run(
[str(pmtiles_bin), "convert", str(mbtiles_path), str(output_path), "--force"],
check=True,
)
size_mb = output_path.stat().st_size / (1024 * 1024)
print(f"Wrote {output_path} ({size_mb:.1f} MB) -- {attribution}", flush=True)
def _aoi_bounds(aoi: dict) -> tuple[float, float, float, float]:
coords = [point for ring in aoi["coordinates"] for point in ring]
lons = [point[0] for point in coords]
lats = [point[1] for point in coords]
return min(lons), min(lats), max(lons), max(lats)
def _load_aoi(path: Path | None) -> dict:
if path is None:
return DEFAULT_AOI
data = json.loads(path.read_text())
if data.get("type") == "FeatureCollection":
return data["features"][0]["geometry"]
if data.get("type") == "Feature":
return data["geometry"]
return data
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--output", type=Path, required=True)
parser.add_argument("--pmtiles-bin", type=Path, default=Path("property-data/pmtiles"))
parser.add_argument("--pmtiles-version", default="1.22.3")
parser.add_argument(
"--aoi-geojson",
type=Path,
default=None,
help="GeoJSON Polygon/Feature/FeatureCollection for the area of interest "
"(default: Greater London)",
)
parser.add_argument("--min-zoom", type=int, default=DEFAULT_MIN_ZOOM)
parser.add_argument("--max-zoom", type=int, default=DEFAULT_MAX_ZOOM)
parser.add_argument(
"--gdal-image",
default=DEFAULT_GDAL_IMAGE,
help="Docker image with a GDAL that has the ECW driver",
)
parser.add_argument(
"--subscription-key",
default=None,
help="Override the Defra survey API key (default: scrape, then 'dspui')",
)
parser.add_argument("--max-workers", type=int, default=4)
parser.add_argument("--timeout", type=float, default=600.0)
parser.add_argument("--retries", type=int, default=3)
parser.add_argument(
"--jobs",
type=int,
default=8,
help="Parallel processes for gdal2tiles",
)
parser.add_argument(
"--webp-quality",
type=int,
default=85,
help="WebP tile quality (1-100); lower is smaller",
)
args = parser.parse_args()
build_satellite_highres_tiles(
output_path=args.output,
pmtiles_bin=args.pmtiles_bin,
pmtiles_version=args.pmtiles_version,
aoi=_load_aoi(args.aoi_geojson),
min_zoom=args.min_zoom,
max_zoom=args.max_zoom,
gdal_image=args.gdal_image,
subscription_key=args.subscription_key,
max_workers=max(1, args.max_workers),
timeout=args.timeout,
retries=max(0, args.retries),
jobs=max(1, args.jobs),
webp_quality=args.webp_quality,
)
if __name__ == "__main__":
main()