"""Build a high-resolution England aerial PMTiles archive from EA Vertical Aerial Photography. The Environment Agency / Defra Vertical Aerial Photography (VAP) archive is open (OGL v3.0) RGB orthophotography at 10-50 cm, distributed as 5 km ECW tiles on the British National Grid. There is no public imagery tile service, so we mirror the Sentinel-2 ``satellite.pmtiles`` approach: query the Defra survey download API for an area of interest, pick the best RGB capture per OS tile, download and decode the ECW rasters, re-tile them into Web-Mercator raster tiles, and bake a single PMTiles archive that the server stacks *over* the Sentinel-2 base where coverage exists. ECW decoding needs a GDAL build that includes the (free, read-only) ERDAS ECW/JP2 SDK, which is not present in the rasterio wheel. The mosaic + tiling step therefore runs inside a GDAL-with-ECW Docker image (see ``docker/gdal-ecw/Dockerfile``); the rest of the pipeline is plain Python plus the ``pmtiles`` CLI. """ from __future__ import annotations import argparse import json import re import shutil import sqlite3 import subprocess import tempfile import urllib.error import urllib.request import zipfile from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass from pathlib import Path from pipeline.download.tiles import ensure_pmtiles_cli from pipeline.local_temp import local_tmp_dir # Defra Data Services Platform survey download API (reverse-engineered from the # environment.data.gov.uk/survey front-end; no official API is documented). SEARCH_URL = ( "https://environment.data.gov.uk/backend/catalog/api/tiles/collections/survey/search" ) SURVEY_PAGE_URL = "https://environment.data.gov.uk/survey" # Static public key baked into the survey page JS. May rotate -- we try to scrape a # fresh one from the page and only fall back to this literal. DEFAULT_SUBSCRIPTION_KEY = "dspui" SUBSCRIPTION_KEY_RE = re.compile(r"subscription-key=([A-Za-z0-9]+)") # True-colour RGB product only (skip IRRGB near-infra-red and Night Time variants). VAP_RGB_PRODUCT = "vertical_aerial_photography_tiles_rgb" # Greater London bounding box (lon/lat). The API only returns tiles where coverage # exists, so a generous bbox is fine -- it does not force blank downloads. DEFAULT_AOI: dict = { "type": "Polygon", "coordinates": [ [ [-0.55, 51.25], [0.30, 51.25], [0.30, 51.70], [-0.55, 51.70], [-0.55, 51.25], ] ], } DEFAULT_MIN_ZOOM = 14 DEFAULT_MAX_ZOOM = 19 # GDAL image with the ECW driver. The official OSGeo image does not ship ECW, so # this defaults to the locally-built image from docker/gdal-ecw/Dockerfile. DEFAULT_GDAL_IMAGE = "perfect-postcode/gdal-ecw:latest" USER_AGENT = "perfect-postcode-satellite-highres/1.0" ATTRIBUTION_TEMPLATE = ( "Environment Agency Vertical Aerial Photography - " "© Environment Agency copyright and/or database right {year}. " "All rights reserved. Licensed under the Open Government Licence v3.0." ) @dataclass(frozen=True) class VapTile: """One survey download record from the Defra search API.""" product_id: str year: int resolution_m: float os_tile_id: str uri: str label: str def parse_search_results(payload: dict) -> list[VapTile]: """Turn a raw search-API JSON payload into typed records.""" tiles: list[VapTile] = [] for result in payload.get("results", []): try: tiles.append( VapTile( product_id=result["product"]["id"], year=int(result["year"]["id"]), resolution_m=float(result["resolution"]["id"]), os_tile_id=result["tile"]["id"], uri=result["uri"], label=result.get("label", ""), ) ) except (KeyError, TypeError, ValueError): # Skip malformed records rather than failing the whole search. continue return tiles def select_best_rgb_tiles(tiles: list[VapTile]) -> list[VapTile]: """Pick one RGB capture per OS tile: finest resolution, then latest year. Pure function -- the unit test exercises this against a real-shaped payload. """ best: dict[str, VapTile] = {} for tile in tiles: if tile.product_id != VAP_RGB_PRODUCT: continue current = best.get(tile.os_tile_id) if current is None or _is_better(tile, current): best[tile.os_tile_id] = tile return [best[key] for key in sorted(best)] def _is_better(candidate: VapTile, incumbent: VapTile) -> bool: """Finer resolution wins; ties broken by the most recent survey year.""" if candidate.resolution_m != incumbent.resolution_m: return candidate.resolution_m < incumbent.resolution_m return candidate.year > incumbent.year def _http_get(url: str, timeout: float) -> bytes: req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT}) with urllib.request.urlopen(req, timeout=timeout) as response: return response.read() def resolve_subscription_key(explicit: str | None, timeout: float = 30.0) -> str: """Use an explicit key, else scrape the survey page JS, else the known default.""" if explicit: return explicit try: page = _http_get(SURVEY_PAGE_URL, timeout).decode("utf-8", "ignore") match = SUBSCRIPTION_KEY_RE.search(page) if match: return match.group(1) # The key usually lives in a referenced JS chunk; scan the largest one. for chunk in re.findall(r'src="(/_next/static/[^"]+\.js)"', page): js = _http_get(f"https://environment.data.gov.uk{chunk}", timeout) match = SUBSCRIPTION_KEY_RE.search(js.decode("utf-8", "ignore")) if match: return match.group(1) except (urllib.error.URLError, TimeoutError, ConnectionError) as err: print(f"Could not scrape subscription key ({err}); using default", flush=True) return DEFAULT_SUBSCRIPTION_KEY def search_vap_tiles(aoi: dict, timeout: float = 60.0) -> list[VapTile]: """POST the area-of-interest polygon and return the RGB tiles to download.""" body = json.dumps(aoi).encode("utf-8") req = urllib.request.Request( SEARCH_URL, data=body, headers={ "Content-Type": "application/geo+json", "Referer": SURVEY_PAGE_URL, "User-Agent": USER_AGENT, }, method="POST", ) with urllib.request.urlopen(req, timeout=timeout) as response: payload = json.load(response) selected = select_best_rgb_tiles(parse_search_results(payload)) print( f"Search returned {payload.get('count', 0)} records; " f"selected {len(selected)} RGB tile(s)", flush=True, ) return selected def _download_and_extract( tile: VapTile, ecw_dir: Path, key: str, timeout: float, retries: int ) -> list[Path]: """Download one survey zip and extract its ECW raster(s). The Defra endpoint sometimes answers a download with a 200 whose body is an error/HTML page or a truncated stream rather than a ZIP (rate limiting, a rotated key, a transient backend hiccup). We treat a non-ZIP body the same as a network error -- retry it -- and, if every attempt fails, skip just this tile rather than aborting the whole multi-tile run: missing coverage simply shows the Sentinel-2 base through, and the caller still hard-fails if *no* ECWs were extracted at all. """ url = f"{tile.uri}?subscription-key={key}" zip_path = ecw_dir / f"{tile.os_tile_id}.zip" last_err: Exception | None = None for attempt in range(retries + 1): try: with urllib.request.urlopen( urllib.request.Request(url, headers={"User-Agent": USER_AGENT}), timeout=timeout, ) as response, zip_path.open("wb") as out: shutil.copyfileobj(response, out, length=1 << 20) return _extract_ecws(zip_path, tile, ecw_dir) except ( urllib.error.URLError, TimeoutError, ConnectionError, zipfile.BadZipFile, ) as err: last_err = err zip_path.unlink(missing_ok=True) print( f" {tile.os_tile_id}: download failed after {retries + 1} attempt(s), " f"skipping ({last_err})", flush=True, ) return [] def _extract_ecws(zip_path: Path, tile: VapTile, ecw_dir: Path) -> list[Path]: """Extract every ECW raster from a downloaded survey zip; delete the zip.""" extracted: list[Path] = [] with zipfile.ZipFile(zip_path) as archive: for member in archive.infolist(): if member.is_dir() or not member.filename.lower().endswith(".ecw"): continue target = ecw_dir / f"{tile.os_tile_id}_{Path(member.filename).name}" with archive.open(member) as src, target.open("wb") as dst: shutil.copyfileobj(src, dst, length=1 << 20) extracted.append(target) zip_path.unlink(missing_ok=True) if not extracted: print(f" {tile.os_tile_id}: no ECW in archive (skipped)", flush=True) return extracted def download_tiles( tiles: list[VapTile], ecw_dir: Path, key: str, max_workers: int, timeout: float, retries: int, ) -> list[Path]: """Download every selected tile concurrently; return all extracted ECW paths.""" ecw_dir.mkdir(parents=True, exist_ok=True) ecw_paths: list[Path] = [] with ThreadPoolExecutor(max_workers=max_workers) as executor: futures = { executor.submit( _download_and_extract, tile, ecw_dir, key, timeout, retries ): tile for tile in tiles } done = 0 for future in as_completed(futures): tile = futures[future] ecw_paths.extend(future.result()) done += 1 print( f"Downloaded {done}/{len(tiles)} tiles " f"(latest: {tile.os_tile_id} {tile.resolution_m}m {tile.year})", flush=True, ) return ecw_paths def _build_tiles_with_gdal( work_dir: Path, gdal_image: str, min_zoom: int, max_zoom: int, jobs: int, webp_quality: int, ) -> Path: """Mosaic the ECW rasters and emit XYZ WebP tiles inside the GDAL-with-ECW image. Returns the host path of the generated ``xyz`` directory. We use lossy WebP with an alpha channel: ~6x smaller than lossless PNG for photographic imagery while keeping transparency, so coverage gaps stay see-through and the Sentinel-2 base shows through them. """ xyz_dir = work_dir / "xyz" # EA "RGB" ECWs are 4-band RGBA (band 4 is a constant-255 validity/alpha mask), # so we build a plain 4-band VRT (no -addalpha, which would make a 5th band and # exceed PNG's 4-band limit). We then: # * force EPSG:27700 -- the pixels are already British National Grid, and the # EPSG code lets PROJ apply the OSTN15 datum shift (grid ships in the image) # for metre-accurate reprojection to Web Mercator; # * label band 4 as alpha so gdal2tiles writes transparent PNGs. Inter-block # gaps the VRT fills with 0 then read as alpha=0 (transparent), letting the # Sentinel-2 base show through wherever VAP coverage is missing. script = ( "set -euo pipefail; " "cd /work; " "gdalbuildvrt -resolution highest mosaic.vrt ecw/*.ecw; " "gdal_edit.py -a_srs EPSG:27700 " "-colorinterp_1 red -colorinterp_2 green -colorinterp_3 blue " "-colorinterp_4 alpha mosaic.vrt; " f"gdal2tiles.py --xyz --zoom={min_zoom}-{max_zoom} " f"--processes={jobs} --resampling=average --webviewer=none " f"--tiledriver=WEBP --webp-quality={webp_quality} " "mosaic.vrt xyz" ) subprocess.run( [ "docker", "run", "--rm", "-v", f"{work_dir.resolve()}:/work", gdal_image, "bash", "-c", script, ], check=True, ) if not xyz_dir.exists(): raise RuntimeError("gdal2tiles produced no output directory") return xyz_dir def _pack_xyz_to_mbtiles( xyz_dir: Path, mbtiles_path: Path, bounds: tuple[float, float, float, float], min_zoom: int, max_zoom: int, attribution: str, ) -> int: """Pack a gdal2tiles XYZ WebP directory into an MBTiles SQLite file (TMS rows).""" if mbtiles_path.exists(): mbtiles_path.unlink() conn = sqlite3.connect(mbtiles_path) try: conn.execute("PRAGMA journal_mode = WAL") conn.execute("PRAGMA synchronous = NORMAL") conn.execute("CREATE TABLE metadata (name TEXT, value TEXT)") conn.execute( "CREATE TABLE tiles (zoom_level INTEGER, tile_column INTEGER, " "tile_row INTEGER, tile_data BLOB)" ) conn.execute( "CREATE UNIQUE INDEX tile_index ON tiles " "(zoom_level, tile_column, tile_row)" ) conn.executemany( "INSERT INTO metadata (name, value) VALUES (?, ?)", [ ("name", "EA Vertical Aerial Photography"), ("type", "overlay"), ("version", "1"), ("description", "Environment Agency high-resolution aerial imagery"), ("format", "webp"), ("attribution", attribution), ("bounds", ",".join(f"{value:.6f}" for value in bounds)), ("minzoom", str(min_zoom)), ("maxzoom", str(max_zoom)), ], ) inserted = 0 for zoom_dir in sorted(xyz_dir.iterdir()): if not zoom_dir.is_dir() or not zoom_dir.name.isdigit(): continue zoom = int(zoom_dir.name) for col_dir in zoom_dir.iterdir(): if not col_dir.is_dir() or not col_dir.name.isdigit(): continue col = int(col_dir.name) for tile_file in col_dir.glob("*.webp"): if not tile_file.stem.isdigit(): continue row = int(tile_file.stem) tms_row = (1 << zoom) - 1 - row conn.execute( "INSERT OR REPLACE INTO tiles VALUES (?, ?, ?, ?)", (zoom, col, tms_row, tile_file.read_bytes()), ) inserted += 1 if inserted % 5000 == 0: conn.commit() print(f" packed {inserted:,} tiles", flush=True) conn.commit() finally: conn.close() return inserted def build_satellite_highres_tiles( output_path: Path, pmtiles_bin: Path, pmtiles_version: str, aoi: dict, min_zoom: int, max_zoom: int, gdal_image: str, subscription_key: str | None, max_workers: int, timeout: float, retries: int, jobs: int, webp_quality: int, ) -> None: if min_zoom > max_zoom: raise ValueError("--min-zoom must be <= --max-zoom") output_path.parent.mkdir(parents=True, exist_ok=True) ensure_pmtiles_cli(pmtiles_bin, pmtiles_version) tiles = search_vap_tiles(aoi) if not tiles: raise RuntimeError("No RGB Vertical Aerial Photography tiles for the AOI") key = resolve_subscription_key(subscription_key) attribution = ATTRIBUTION_TEMPLATE.format(year=max(tile.year for tile in tiles)) with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp: work_dir = Path(tmp) ecw_dir = work_dir / "ecw" ecw_paths = download_tiles( tiles, ecw_dir, key, max_workers, timeout, retries ) if not ecw_paths: raise RuntimeError("No ECW rasters were extracted from the downloads") xyz_dir = _build_tiles_with_gdal( work_dir, gdal_image, min_zoom, max_zoom, jobs, webp_quality ) mbtiles_path = work_dir / "satellite_highres.mbtiles" bounds = _aoi_bounds(aoi) inserted = _pack_xyz_to_mbtiles( xyz_dir, mbtiles_path, bounds, min_zoom, max_zoom, attribution ) if inserted == 0: raise RuntimeError("Tiling produced no tiles to pack") print(f"Packed {inserted:,} tiles into MBTiles", flush=True) subprocess.run( [str(pmtiles_bin), "convert", str(mbtiles_path), str(output_path), "--force"], check=True, ) size_mb = output_path.stat().st_size / (1024 * 1024) print(f"Wrote {output_path} ({size_mb:.1f} MB) -- {attribution}", flush=True) def _aoi_bounds(aoi: dict) -> tuple[float, float, float, float]: coords = [point for ring in aoi["coordinates"] for point in ring] lons = [point[0] for point in coords] lats = [point[1] for point in coords] return min(lons), min(lats), max(lons), max(lats) def _load_aoi(path: Path | None) -> dict: if path is None: return DEFAULT_AOI data = json.loads(path.read_text()) if data.get("type") == "FeatureCollection": return data["features"][0]["geometry"] if data.get("type") == "Feature": return data["geometry"] return data def main() -> None: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--output", type=Path, required=True) parser.add_argument("--pmtiles-bin", type=Path, default=Path("property-data/pmtiles")) parser.add_argument("--pmtiles-version", default="1.22.3") parser.add_argument( "--aoi-geojson", type=Path, default=None, help="GeoJSON Polygon/Feature/FeatureCollection for the area of interest " "(default: Greater London)", ) parser.add_argument("--min-zoom", type=int, default=DEFAULT_MIN_ZOOM) parser.add_argument("--max-zoom", type=int, default=DEFAULT_MAX_ZOOM) parser.add_argument( "--gdal-image", default=DEFAULT_GDAL_IMAGE, help="Docker image with a GDAL that has the ECW driver", ) parser.add_argument( "--subscription-key", default=None, help="Override the Defra survey API key (default: scrape, then 'dspui')", ) parser.add_argument("--max-workers", type=int, default=4) parser.add_argument("--timeout", type=float, default=600.0) parser.add_argument("--retries", type=int, default=3) parser.add_argument( "--jobs", type=int, default=8, help="Parallel processes for gdal2tiles", ) parser.add_argument( "--webp-quality", type=int, default=85, help="WebP tile quality (1-100); lower is smaller", ) args = parser.parse_args() build_satellite_highres_tiles( output_path=args.output, pmtiles_bin=args.pmtiles_bin, pmtiles_version=args.pmtiles_version, aoi=_load_aoi(args.aoi_geojson), min_zoom=args.min_zoom, max_zoom=args.max_zoom, gdal_image=args.gdal_image, subscription_key=args.subscription_key, max_workers=max(1, args.max_workers), timeout=args.timeout, retries=max(0, args.retries), jobs=max(1, args.jobs), webp_quality=args.webp_quality, ) if __name__ == "__main__": main()