505 lines
18 KiB
Python
505 lines
18 KiB
Python
"""Build a high-resolution England aerial PMTiles archive from EA Vertical Aerial Photography.
|
|
|
|
The Environment Agency / Defra Vertical Aerial Photography (VAP) archive is open
|
|
(OGL v3.0) RGB orthophotography at 10-50 cm, distributed as 5 km ECW tiles on the
|
|
British National Grid. There is no public imagery tile service, so we mirror the
|
|
Sentinel-2 ``satellite.pmtiles`` approach: query the Defra survey download API for
|
|
an area of interest, pick the best RGB capture per OS tile, download and decode the
|
|
ECW rasters, re-tile them into Web-Mercator raster tiles, and bake a single PMTiles
|
|
archive that the server stacks *over* the Sentinel-2 base where coverage exists.
|
|
|
|
ECW decoding needs a GDAL build that includes the (free, read-only) ERDAS ECW/JP2
|
|
SDK, which is not present in the rasterio wheel. The mosaic + tiling step therefore
|
|
runs inside a GDAL-with-ECW Docker image (see ``docker/gdal-ecw/Dockerfile``); the
|
|
rest of the pipeline is plain Python plus the ``pmtiles`` CLI.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import shutil
|
|
import sqlite3
|
|
import subprocess
|
|
import tempfile
|
|
import urllib.error
|
|
import urllib.request
|
|
import zipfile
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
from pipeline.download.tiles import ensure_pmtiles_cli
|
|
from pipeline.local_temp import local_tmp_dir
|
|
|
|
# Defra Data Services Platform survey download API (reverse-engineered from the
|
|
# environment.data.gov.uk/survey front-end; no official API is documented).
|
|
SEARCH_URL = (
|
|
"https://environment.data.gov.uk/backend/catalog/api/tiles/collections/survey/search"
|
|
)
|
|
SURVEY_PAGE_URL = "https://environment.data.gov.uk/survey"
|
|
# Static public key baked into the survey page JS. May rotate -- we try to scrape a
|
|
# fresh one from the page and only fall back to this literal.
|
|
DEFAULT_SUBSCRIPTION_KEY = "dspui"
|
|
SUBSCRIPTION_KEY_RE = re.compile(r"subscription-key=([A-Za-z0-9]+)")
|
|
|
|
# True-colour RGB product only (skip IRRGB near-infra-red and Night Time variants).
|
|
VAP_RGB_PRODUCT = "vertical_aerial_photography_tiles_rgb"
|
|
|
|
# Greater London bounding box (lon/lat). The API only returns tiles where coverage
|
|
# exists, so a generous bbox is fine -- it does not force blank downloads.
|
|
DEFAULT_AOI: dict = {
|
|
"type": "Polygon",
|
|
"coordinates": [
|
|
[
|
|
[-0.55, 51.25],
|
|
[0.30, 51.25],
|
|
[0.30, 51.70],
|
|
[-0.55, 51.70],
|
|
[-0.55, 51.25],
|
|
]
|
|
],
|
|
}
|
|
|
|
DEFAULT_MIN_ZOOM = 14
|
|
DEFAULT_MAX_ZOOM = 19
|
|
# GDAL image with the ECW driver. The official OSGeo image does not ship ECW, so
|
|
# this defaults to the locally-built image from docker/gdal-ecw/Dockerfile.
|
|
DEFAULT_GDAL_IMAGE = "perfect-postcode/gdal-ecw:latest"
|
|
USER_AGENT = "perfect-postcode-satellite-highres/1.0"
|
|
ATTRIBUTION_TEMPLATE = (
|
|
"Environment Agency Vertical Aerial Photography - "
|
|
"© Environment Agency copyright and/or database right {year}. "
|
|
"All rights reserved. Licensed under the Open Government Licence v3.0."
|
|
)
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class VapTile:
|
|
"""One survey download record from the Defra search API."""
|
|
|
|
product_id: str
|
|
year: int
|
|
resolution_m: float
|
|
os_tile_id: str
|
|
uri: str
|
|
label: str
|
|
|
|
|
|
def parse_search_results(payload: dict) -> list[VapTile]:
|
|
"""Turn a raw search-API JSON payload into typed records."""
|
|
tiles: list[VapTile] = []
|
|
for result in payload.get("results", []):
|
|
try:
|
|
tiles.append(
|
|
VapTile(
|
|
product_id=result["product"]["id"],
|
|
year=int(result["year"]["id"]),
|
|
resolution_m=float(result["resolution"]["id"]),
|
|
os_tile_id=result["tile"]["id"],
|
|
uri=result["uri"],
|
|
label=result.get("label", ""),
|
|
)
|
|
)
|
|
except (KeyError, TypeError, ValueError):
|
|
# Skip malformed records rather than failing the whole search.
|
|
continue
|
|
return tiles
|
|
|
|
|
|
def select_best_rgb_tiles(tiles: list[VapTile]) -> list[VapTile]:
|
|
"""Pick one RGB capture per OS tile: finest resolution, then latest year.
|
|
|
|
Pure function -- the unit test exercises this against a real-shaped payload.
|
|
"""
|
|
best: dict[str, VapTile] = {}
|
|
for tile in tiles:
|
|
if tile.product_id != VAP_RGB_PRODUCT:
|
|
continue
|
|
current = best.get(tile.os_tile_id)
|
|
if current is None or _is_better(tile, current):
|
|
best[tile.os_tile_id] = tile
|
|
return [best[key] for key in sorted(best)]
|
|
|
|
|
|
def _is_better(candidate: VapTile, incumbent: VapTile) -> bool:
|
|
"""Finer resolution wins; ties broken by the most recent survey year."""
|
|
if candidate.resolution_m != incumbent.resolution_m:
|
|
return candidate.resolution_m < incumbent.resolution_m
|
|
return candidate.year > incumbent.year
|
|
|
|
|
|
def _http_get(url: str, timeout: float) -> bytes:
|
|
req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
|
|
with urllib.request.urlopen(req, timeout=timeout) as response:
|
|
return response.read()
|
|
|
|
|
|
def resolve_subscription_key(explicit: str | None, timeout: float = 30.0) -> str:
|
|
"""Use an explicit key, else scrape the survey page JS, else the known default."""
|
|
if explicit:
|
|
return explicit
|
|
try:
|
|
page = _http_get(SURVEY_PAGE_URL, timeout).decode("utf-8", "ignore")
|
|
match = SUBSCRIPTION_KEY_RE.search(page)
|
|
if match:
|
|
return match.group(1)
|
|
# The key usually lives in a referenced JS chunk; scan the largest one.
|
|
for chunk in re.findall(r'src="(/_next/static/[^"]+\.js)"', page):
|
|
js = _http_get(f"https://environment.data.gov.uk{chunk}", timeout)
|
|
match = SUBSCRIPTION_KEY_RE.search(js.decode("utf-8", "ignore"))
|
|
if match:
|
|
return match.group(1)
|
|
except (urllib.error.URLError, TimeoutError, ConnectionError) as err:
|
|
print(f"Could not scrape subscription key ({err}); using default", flush=True)
|
|
return DEFAULT_SUBSCRIPTION_KEY
|
|
|
|
|
|
def search_vap_tiles(aoi: dict, timeout: float = 60.0) -> list[VapTile]:
|
|
"""POST the area-of-interest polygon and return the RGB tiles to download."""
|
|
body = json.dumps(aoi).encode("utf-8")
|
|
req = urllib.request.Request(
|
|
SEARCH_URL,
|
|
data=body,
|
|
headers={
|
|
"Content-Type": "application/geo+json",
|
|
"Referer": SURVEY_PAGE_URL,
|
|
"User-Agent": USER_AGENT,
|
|
},
|
|
method="POST",
|
|
)
|
|
with urllib.request.urlopen(req, timeout=timeout) as response:
|
|
payload = json.load(response)
|
|
selected = select_best_rgb_tiles(parse_search_results(payload))
|
|
print(
|
|
f"Search returned {payload.get('count', 0)} records; "
|
|
f"selected {len(selected)} RGB tile(s)",
|
|
flush=True,
|
|
)
|
|
return selected
|
|
|
|
|
|
def _download_and_extract(
|
|
tile: VapTile, ecw_dir: Path, key: str, timeout: float, retries: int
|
|
) -> list[Path]:
|
|
"""Download one survey zip and extract its ECW raster(s)."""
|
|
url = f"{tile.uri}?subscription-key={key}"
|
|
zip_path = ecw_dir / f"{tile.os_tile_id}.zip"
|
|
last_error: Exception | None = None
|
|
for attempt in range(retries + 1):
|
|
try:
|
|
with urllib.request.urlopen(
|
|
urllib.request.Request(url, headers={"User-Agent": USER_AGENT}),
|
|
timeout=timeout,
|
|
) as response, zip_path.open("wb") as out:
|
|
shutil.copyfileobj(response, out, length=1 << 20)
|
|
break
|
|
except (urllib.error.URLError, TimeoutError, ConnectionError) as err:
|
|
last_error = err
|
|
if attempt == retries:
|
|
raise RuntimeError(f"Failed to download {url}: {err}") from err
|
|
extracted: list[Path] = []
|
|
with zipfile.ZipFile(zip_path) as archive:
|
|
for member in archive.infolist():
|
|
if member.is_dir() or not member.filename.lower().endswith(".ecw"):
|
|
continue
|
|
target = ecw_dir / f"{tile.os_tile_id}_{Path(member.filename).name}"
|
|
with archive.open(member) as src, target.open("wb") as dst:
|
|
shutil.copyfileobj(src, dst, length=1 << 20)
|
|
extracted.append(target)
|
|
zip_path.unlink(missing_ok=True)
|
|
if not extracted:
|
|
print(f" {tile.os_tile_id}: no ECW in archive (skipped)", flush=True)
|
|
return extracted
|
|
|
|
|
|
def download_tiles(
|
|
tiles: list[VapTile],
|
|
ecw_dir: Path,
|
|
key: str,
|
|
max_workers: int,
|
|
timeout: float,
|
|
retries: int,
|
|
) -> list[Path]:
|
|
"""Download every selected tile concurrently; return all extracted ECW paths."""
|
|
ecw_dir.mkdir(parents=True, exist_ok=True)
|
|
ecw_paths: list[Path] = []
|
|
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
futures = {
|
|
executor.submit(
|
|
_download_and_extract, tile, ecw_dir, key, timeout, retries
|
|
): tile
|
|
for tile in tiles
|
|
}
|
|
done = 0
|
|
for future in as_completed(futures):
|
|
tile = futures[future]
|
|
ecw_paths.extend(future.result())
|
|
done += 1
|
|
print(
|
|
f"Downloaded {done}/{len(tiles)} tiles "
|
|
f"(latest: {tile.os_tile_id} {tile.resolution_m}m {tile.year})",
|
|
flush=True,
|
|
)
|
|
return ecw_paths
|
|
|
|
|
|
def _build_tiles_with_gdal(
|
|
work_dir: Path,
|
|
gdal_image: str,
|
|
min_zoom: int,
|
|
max_zoom: int,
|
|
jobs: int,
|
|
webp_quality: int,
|
|
) -> Path:
|
|
"""Mosaic the ECW rasters and emit XYZ WebP tiles inside the GDAL-with-ECW image.
|
|
|
|
Returns the host path of the generated ``xyz`` directory. We use lossy WebP with
|
|
an alpha channel: ~6x smaller than lossless PNG for photographic imagery while
|
|
keeping transparency, so coverage gaps stay see-through and the Sentinel-2 base
|
|
shows through them.
|
|
"""
|
|
xyz_dir = work_dir / "xyz"
|
|
# EA "RGB" ECWs are 4-band RGBA (band 4 is a constant-255 validity/alpha mask),
|
|
# so we build a plain 4-band VRT (no -addalpha, which would make a 5th band and
|
|
# exceed PNG's 4-band limit). We then:
|
|
# * force EPSG:27700 -- the pixels are already British National Grid, and the
|
|
# EPSG code lets PROJ apply the OSTN15 datum shift (grid ships in the image)
|
|
# for metre-accurate reprojection to Web Mercator;
|
|
# * label band 4 as alpha so gdal2tiles writes transparent PNGs. Inter-block
|
|
# gaps the VRT fills with 0 then read as alpha=0 (transparent), letting the
|
|
# Sentinel-2 base show through wherever VAP coverage is missing.
|
|
script = (
|
|
"set -euo pipefail; "
|
|
"cd /work; "
|
|
"gdalbuildvrt -resolution highest mosaic.vrt ecw/*.ecw; "
|
|
"gdal_edit.py -a_srs EPSG:27700 "
|
|
"-colorinterp_1 red -colorinterp_2 green -colorinterp_3 blue "
|
|
"-colorinterp_4 alpha mosaic.vrt; "
|
|
f"gdal2tiles.py --xyz --zoom={min_zoom}-{max_zoom} "
|
|
f"--processes={jobs} --resampling=average --webviewer=none "
|
|
f"--tiledriver=WEBP --webp-quality={webp_quality} "
|
|
"mosaic.vrt xyz"
|
|
)
|
|
subprocess.run(
|
|
[
|
|
"docker",
|
|
"run",
|
|
"--rm",
|
|
"-v",
|
|
f"{work_dir.resolve()}:/work",
|
|
gdal_image,
|
|
"bash",
|
|
"-c",
|
|
script,
|
|
],
|
|
check=True,
|
|
)
|
|
if not xyz_dir.exists():
|
|
raise RuntimeError("gdal2tiles produced no output directory")
|
|
return xyz_dir
|
|
|
|
|
|
def _pack_xyz_to_mbtiles(
|
|
xyz_dir: Path,
|
|
mbtiles_path: Path,
|
|
bounds: tuple[float, float, float, float],
|
|
min_zoom: int,
|
|
max_zoom: int,
|
|
attribution: str,
|
|
) -> int:
|
|
"""Pack a gdal2tiles XYZ WebP directory into an MBTiles SQLite file (TMS rows)."""
|
|
if mbtiles_path.exists():
|
|
mbtiles_path.unlink()
|
|
conn = sqlite3.connect(mbtiles_path)
|
|
try:
|
|
conn.execute("PRAGMA journal_mode = WAL")
|
|
conn.execute("PRAGMA synchronous = NORMAL")
|
|
conn.execute("CREATE TABLE metadata (name TEXT, value TEXT)")
|
|
conn.execute(
|
|
"CREATE TABLE tiles (zoom_level INTEGER, tile_column INTEGER, "
|
|
"tile_row INTEGER, tile_data BLOB)"
|
|
)
|
|
conn.execute(
|
|
"CREATE UNIQUE INDEX tile_index ON tiles "
|
|
"(zoom_level, tile_column, tile_row)"
|
|
)
|
|
conn.executemany(
|
|
"INSERT INTO metadata (name, value) VALUES (?, ?)",
|
|
[
|
|
("name", "EA Vertical Aerial Photography"),
|
|
("type", "overlay"),
|
|
("version", "1"),
|
|
("description", "Environment Agency high-resolution aerial imagery"),
|
|
("format", "webp"),
|
|
("attribution", attribution),
|
|
("bounds", ",".join(f"{value:.6f}" for value in bounds)),
|
|
("minzoom", str(min_zoom)),
|
|
("maxzoom", str(max_zoom)),
|
|
],
|
|
)
|
|
inserted = 0
|
|
for zoom_dir in sorted(xyz_dir.iterdir()):
|
|
if not zoom_dir.is_dir() or not zoom_dir.name.isdigit():
|
|
continue
|
|
zoom = int(zoom_dir.name)
|
|
for col_dir in zoom_dir.iterdir():
|
|
if not col_dir.is_dir() or not col_dir.name.isdigit():
|
|
continue
|
|
col = int(col_dir.name)
|
|
for tile_file in col_dir.glob("*.webp"):
|
|
if not tile_file.stem.isdigit():
|
|
continue
|
|
row = int(tile_file.stem)
|
|
tms_row = (1 << zoom) - 1 - row
|
|
conn.execute(
|
|
"INSERT OR REPLACE INTO tiles VALUES (?, ?, ?, ?)",
|
|
(zoom, col, tms_row, tile_file.read_bytes()),
|
|
)
|
|
inserted += 1
|
|
if inserted % 5000 == 0:
|
|
conn.commit()
|
|
print(f" packed {inserted:,} tiles", flush=True)
|
|
conn.commit()
|
|
finally:
|
|
conn.close()
|
|
return inserted
|
|
|
|
|
|
def build_satellite_highres_tiles(
|
|
output_path: Path,
|
|
pmtiles_bin: Path,
|
|
pmtiles_version: str,
|
|
aoi: dict,
|
|
min_zoom: int,
|
|
max_zoom: int,
|
|
gdal_image: str,
|
|
subscription_key: str | None,
|
|
max_workers: int,
|
|
timeout: float,
|
|
retries: int,
|
|
jobs: int,
|
|
webp_quality: int,
|
|
) -> None:
|
|
if min_zoom > max_zoom:
|
|
raise ValueError("--min-zoom must be <= --max-zoom")
|
|
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
ensure_pmtiles_cli(pmtiles_bin, pmtiles_version)
|
|
|
|
tiles = search_vap_tiles(aoi)
|
|
if not tiles:
|
|
raise RuntimeError("No RGB Vertical Aerial Photography tiles for the AOI")
|
|
key = resolve_subscription_key(subscription_key)
|
|
attribution = ATTRIBUTION_TEMPLATE.format(year=max(tile.year for tile in tiles))
|
|
|
|
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
|
|
work_dir = Path(tmp)
|
|
ecw_dir = work_dir / "ecw"
|
|
ecw_paths = download_tiles(
|
|
tiles, ecw_dir, key, max_workers, timeout, retries
|
|
)
|
|
if not ecw_paths:
|
|
raise RuntimeError("No ECW rasters were extracted from the downloads")
|
|
|
|
xyz_dir = _build_tiles_with_gdal(
|
|
work_dir, gdal_image, min_zoom, max_zoom, jobs, webp_quality
|
|
)
|
|
|
|
mbtiles_path = work_dir / "satellite_highres.mbtiles"
|
|
bounds = _aoi_bounds(aoi)
|
|
inserted = _pack_xyz_to_mbtiles(
|
|
xyz_dir, mbtiles_path, bounds, min_zoom, max_zoom, attribution
|
|
)
|
|
if inserted == 0:
|
|
raise RuntimeError("Tiling produced no tiles to pack")
|
|
print(f"Packed {inserted:,} tiles into MBTiles", flush=True)
|
|
|
|
subprocess.run(
|
|
[str(pmtiles_bin), "convert", str(mbtiles_path), str(output_path), "--force"],
|
|
check=True,
|
|
)
|
|
|
|
size_mb = output_path.stat().st_size / (1024 * 1024)
|
|
print(f"Wrote {output_path} ({size_mb:.1f} MB) -- {attribution}", flush=True)
|
|
|
|
|
|
def _aoi_bounds(aoi: dict) -> tuple[float, float, float, float]:
|
|
coords = [point for ring in aoi["coordinates"] for point in ring]
|
|
lons = [point[0] for point in coords]
|
|
lats = [point[1] for point in coords]
|
|
return min(lons), min(lats), max(lons), max(lats)
|
|
|
|
|
|
def _load_aoi(path: Path | None) -> dict:
|
|
if path is None:
|
|
return DEFAULT_AOI
|
|
data = json.loads(path.read_text())
|
|
if data.get("type") == "FeatureCollection":
|
|
return data["features"][0]["geometry"]
|
|
if data.get("type") == "Feature":
|
|
return data["geometry"]
|
|
return data
|
|
|
|
|
|
def main() -> None:
|
|
parser = argparse.ArgumentParser(description=__doc__)
|
|
parser.add_argument("--output", type=Path, required=True)
|
|
parser.add_argument("--pmtiles-bin", type=Path, default=Path("property-data/pmtiles"))
|
|
parser.add_argument("--pmtiles-version", default="1.22.3")
|
|
parser.add_argument(
|
|
"--aoi-geojson",
|
|
type=Path,
|
|
default=None,
|
|
help="GeoJSON Polygon/Feature/FeatureCollection for the area of interest "
|
|
"(default: Greater London)",
|
|
)
|
|
parser.add_argument("--min-zoom", type=int, default=DEFAULT_MIN_ZOOM)
|
|
parser.add_argument("--max-zoom", type=int, default=DEFAULT_MAX_ZOOM)
|
|
parser.add_argument(
|
|
"--gdal-image",
|
|
default=DEFAULT_GDAL_IMAGE,
|
|
help="Docker image with a GDAL that has the ECW driver",
|
|
)
|
|
parser.add_argument(
|
|
"--subscription-key",
|
|
default=None,
|
|
help="Override the Defra survey API key (default: scrape, then 'dspui')",
|
|
)
|
|
parser.add_argument("--max-workers", type=int, default=4)
|
|
parser.add_argument("--timeout", type=float, default=600.0)
|
|
parser.add_argument("--retries", type=int, default=3)
|
|
parser.add_argument(
|
|
"--jobs",
|
|
type=int,
|
|
default=8,
|
|
help="Parallel processes for gdal2tiles",
|
|
)
|
|
parser.add_argument(
|
|
"--webp-quality",
|
|
type=int,
|
|
default=85,
|
|
help="WebP tile quality (1-100); lower is smaller",
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
build_satellite_highres_tiles(
|
|
output_path=args.output,
|
|
pmtiles_bin=args.pmtiles_bin,
|
|
pmtiles_version=args.pmtiles_version,
|
|
aoi=_load_aoi(args.aoi_geojson),
|
|
min_zoom=args.min_zoom,
|
|
max_zoom=args.max_zoom,
|
|
gdal_image=args.gdal_image,
|
|
subscription_key=args.subscription_key,
|
|
max_workers=max(1, args.max_workers),
|
|
timeout=args.timeout,
|
|
retries=max(0, args.retries),
|
|
jobs=max(1, args.jobs),
|
|
webp_quality=args.webp_quality,
|
|
)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|