# perfect-postcode/pipeline/download/satellite_tiles.py
# 2026-05-28 21:48:35 +01:00
#
# 432 lines
# 14 KiB
# Python

"""Download Sentinel-2 cloudless satellite tiles into a local PMTiles archive."""
from __future__ import annotations
import argparse
import email.utils
import http.client
import math
import sqlite3
import subprocess
import tempfile
import threading
import time
import urllib.error
import urllib.parse
import urllib.request
from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from pipeline.download.tiles import ensure_pmtiles_cli
from pipeline.local_temp import local_tmp_dir
# URL template for one tile; {z}, {x} and {y} are substituted with str.format.
DEFAULT_TILE_URL = (
    "https://tiles.maps.eox.at/wmts/1.0.0/s2cloudless_3857/default/"
    "GoogleMapsCompatible/{z}/{y}/{x}.jpg"
)

# Default download extent, ordered (west, south, east, north) in degrees.
DEFAULT_BBOX = (-10.5, 49.0, 5.0, 61.0)

# Default inclusive zoom range.
DEFAULT_MIN_ZOOM = 5
DEFAULT_MAX_ZOOM = 13

# Default pause, in seconds, applied after a 403 from the EOX tile host.
DEFAULT_RETRY_COOLDOWN = 15.0

# Sent as the User-Agent header on every tile request.
USER_AGENT = "perfect-postcode-satellite-tiles/1.0"

# HTTP status codes retried with a short exponential backoff.
RETRYABLE_HTTP_STATUS = {
    408,
    429,
    500,
    502,
    503,
    504,
}

# Written to the "attribution" row of the MBTiles metadata table.
ATTRIBUTION = (
    "Sentinel-2 cloudless - https://s2maps.eu by EOX IT Services GmbH "
    "(Contains modified Copernicus Sentinel data 2024)"
)
@dataclass(frozen=True)
class Tile:
    """Immutable, hashable address of a single map tile."""

    # Zoom level of the tile.
    zoom: int
    # Tile column index at that zoom level.
    x: int
    # Tile row index at that zoom level (as used in the source URL).
    y: int
class _DownloadThrottle:
def __init__(self, min_request_interval: float) -> None:
self._min_request_interval = max(0.0, min_request_interval)
self._next_request_at = 0.0
self._lock = threading.Lock()
def wait(self) -> None:
while True:
with self._lock:
now = time.monotonic()
wait_for = self._next_request_at - now
if wait_for <= 0:
if self._min_request_interval:
self._next_request_at = now + self._min_request_interval
return
time.sleep(min(wait_for, 1.0))
def defer(self, delay: float) -> bool:
if delay <= 0:
return False
target = time.monotonic() + delay
with self._lock:
should_announce = target > self._next_request_at + 1.0
self._next_request_at = max(self._next_request_at, target)
return should_announce
def _lonlat_to_tile(lon: float, lat: float, zoom: int) -> tuple[int, int]:
lat = max(min(lat, 85.05112878), -85.05112878)
n = 1 << zoom
x = int(math.floor((lon + 180.0) / 360.0 * n))
y = int(
math.floor((1.0 - math.asinh(math.tan(math.radians(lat))) / math.pi) / 2.0 * n)
)
return min(max(x, 0), n - 1), min(max(y, 0), n - 1)
def _tile_ranges(
    bbox: tuple[float, float, float, float], zoom: int
) -> tuple[range, range]:
    """Return the inclusive column and row ranges covering *bbox* at *zoom*.

    *bbox* is ordered ``(west, south, east, north)``. The result is
    ``(x_range, y_range)``.
    """
    west, south, east, north = bbox
    # The north-west corner supplies the lower bound of both ranges and the
    # south-east corner the upper bound.
    first_column, first_row = _lonlat_to_tile(west, north, zoom)
    last_column, last_row = _lonlat_to_tile(east, south, zoom)
    columns = range(first_column, last_column + 1)
    rows = range(first_row, last_row + 1)
    return columns, rows
def _iter_tiles(
bbox: tuple[float, float, float, float], min_zoom: int, max_zoom: int
):
for zoom in range(min_zoom, max_zoom + 1):
x_range, y_range = _tile_ranges(bbox, zoom)
for x in x_range:
for y in y_range:
yield Tile(zoom=zoom, x=x, y=y)
def _tile_count(
bbox: tuple[float, float, float, float], min_zoom: int, max_zoom: int
) -> int:
count = 0
for zoom in range(min_zoom, max_zoom + 1):
x_range, y_range = _tile_ranges(bbox, zoom)
count += len(x_range) * len(y_range)
return count
def _is_eox_tile_url(url: str) -> bool:
host = urllib.parse.urlparse(url).hostname or ""
return host == "tiles.maps.eox.at" or host.endswith(".tiles.maps.eox.at")
def _retry_after_seconds(headers) -> float | None:
raw = None
if headers is not None:
raw = headers.get("retry-after") or headers.get("Retry-After")
if not raw:
return None
try:
return max(0.0, float(raw))
except ValueError:
pass
try:
retry_at = email.utils.parsedate_to_datetime(raw)
except (TypeError, ValueError):
return None
if retry_at.tzinfo is None:
retry_at = retry_at.replace(tzinfo=timezone.utc)
return max(0.0, (retry_at - datetime.now(timezone.utc)).total_seconds())
def _http_retry_delay(
err: urllib.error.HTTPError,
url: str,
attempt: int,
retry_cooldown: float,
) -> float | None:
if err.code in {204, 404}:
return None
retry_after = _retry_after_seconds(err.headers)
if retry_after is not None:
return retry_after
if err.code == 403 and _is_eox_tile_url(url):
return retry_cooldown
if err.code in RETRYABLE_HTTP_STATUS:
return min(2.0, 0.25 * (2**attempt))
return None
def _fetch_tile(
    tile: Tile,
    source_url: str,
    timeout: float,
    retries: int,
    throttle: _DownloadThrottle,
    retry_cooldown: float,
) -> tuple[Tile, bytes | None]:
    """Download one tile, retrying transient failures.

    Each attempt first waits on *throttle*. A retry delay is not slept here;
    it is applied by deferring the throttle, so it holds back every caller
    sharing that throttle.

    Returns:
        ``(tile, data)`` on success, or ``(tile, None)`` when the source has
        no tile (HTTP 204/404 or an empty body).

    Raises:
        RuntimeError: on a non-retryable HTTP error, or once all
            ``retries + 1`` attempts have failed.
    """
    url = source_url.format(z=tile.zoom, x=tile.x, y=tile.y)
    last_error: Exception | None = None

    for attempt in range(retries + 1):
        try:
            throttle.wait()
            request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
            with urllib.request.urlopen(request, timeout=timeout) as response:
                content_type = response.headers.get("content-type", "")
                payload = response.read()
            if not payload:
                return tile, None
            if content_type.lower().startswith("image/"):
                return tile, payload
            # Raised inside the try on purpose: a non-image body is handled
            # by the generic retry branch below.
            raise RuntimeError(
                f"Unexpected content type for {url}: {content_type or 'unknown'}"
            )
        except urllib.error.HTTPError as err:
            # Must precede the URLError clause, since HTTPError subclasses it.
            if err.code in {204, 404}:
                return tile, None
            retry_delay = _http_retry_delay(err, url, attempt, retry_cooldown)
            if retry_delay is None:
                raise RuntimeError(
                    f"Failed to download satellite tile {url}: {err}"
                ) from err
            last_error = err
        except (
            TimeoutError,
            urllib.error.URLError,
            ConnectionError,
            http.client.HTTPException,
            RuntimeError,
        ) as err:
            last_error = err
            retry_delay = min(2.0, 0.25 * (2**attempt))

        if attempt >= retries:
            # Final attempt failed; there is no further retry to delay for.
            break
        announce = throttle.defer(retry_delay)
        if announce and retry_delay >= 5.0:
            print(
                f"Satellite tile source returned {last_error}; "
                f"pausing downloads for {retry_delay:.0f}s before retrying",
                flush=True,
            )

    assert last_error is not None
    raise RuntimeError(f"Failed to download satellite tile {url}: {last_error}") from last_error
def _create_mbtiles(
    mbtiles_path: Path,
    bbox: tuple[float, float, float, float],
    min_zoom: int,
    max_zoom: int,
    source_url: str,
    max_workers: int,
    timeout: float,
    retries: int,
    retry_cooldown: float,
    min_request_interval: float,
) -> int:
    """Download all tiles covering *bbox* into a fresh MBTiles (SQLite) file.

    Any existing file at *mbtiles_path* is removed first. Downloads run on a
    thread pool that shares one ``_DownloadThrottle``; at most
    ``max_workers * 4`` fetches are queued or running at any time. All SQLite
    writes happen on the calling thread. Progress is committed and printed
    every 1000 completed tiles and once more when the last tile completes.

    Args:
        mbtiles_path: Destination of the MBTiles database.
        bbox: ``(west, south, east, north)`` extent.
        min_zoom: Lowest zoom level to download (inclusive).
        max_zoom: Highest zoom level to download (inclusive).
        source_url: URL template passed to ``_fetch_tile``.
        max_workers: Size of the download thread pool.
        timeout: Per-request timeout in seconds.
        retries: Number of retries per tile after the first attempt.
        retry_cooldown: Pause in seconds used for EOX rate-limit responses.
        min_request_interval: Minimum seconds between requests across workers.

    Returns:
        The number of tiles stored; tiles for which the source returned no
        data are counted as completed but not stored.

    Raises:
        RuntimeError: propagated from ``_fetch_tile`` when a tile cannot be
            downloaded. Fetches that have not started yet are cancelled so the
            error surfaces without draining the remaining queue.
    """
    if mbtiles_path.exists():
        mbtiles_path.unlink()

    inserted = 0
    completed = 0
    tiles = _iter_tiles(bbox, min_zoom, max_zoom)
    pending: set[Future[tuple[Tile, bytes | None]]] = set()
    queue_size = max_workers * 4
    throttle = _DownloadThrottle(min_request_interval=min_request_interval)

    def submit_next(executor: ThreadPoolExecutor) -> bool:
        """Queue the next tile, if any; return False once tiles are exhausted."""
        try:
            tile = next(tiles)
        except StopIteration:
            return False
        # ``pending`` is looked up at call time, so this always adds to the
        # set most recently returned by ``wait`` below.
        pending.add(
            executor.submit(
                _fetch_tile,
                tile,
                source_url,
                timeout,
                retries,
                throttle,
                retry_cooldown,
            )
        )
        return True

    conn = sqlite3.connect(mbtiles_path)
    try:
        # Everything after connect() sits inside the try so the connection is
        # closed even when schema or metadata setup fails.
        conn.execute("PRAGMA journal_mode = WAL")
        conn.execute("PRAGMA synchronous = NORMAL")
        conn.execute("CREATE TABLE metadata (name TEXT, value TEXT)")
        conn.execute(
            "CREATE TABLE tiles (zoom_level INTEGER, tile_column INTEGER, "
            "tile_row INTEGER, tile_data BLOB)"
        )
        conn.execute(
            "CREATE UNIQUE INDEX tile_index ON tiles (zoom_level, tile_column, tile_row)"
        )
        conn.executemany(
            "INSERT INTO metadata (name, value) VALUES (?, ?)",
            [
                ("name", "Sentinel-2 cloudless satellite basemap"),
                ("type", "baselayer"),
                ("version", "1"),
                ("description", "Sentinel-2 cloudless Web Mercator satellite imagery"),
                ("format", "jpg"),
                ("attribution", ATTRIBUTION),
                ("bounds", ",".join(f"{value:.6f}" for value in bbox)),
                ("minzoom", str(min_zoom)),
                ("maxzoom", str(max_zoom)),
            ],
        )

        total = _tile_count(bbox, min_zoom, max_zoom)

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            try:
                for _ in range(queue_size):
                    if not submit_next(executor):
                        break
                while pending:
                    done, pending = wait(pending, return_when=FIRST_COMPLETED)
                    for future in done:
                        tile, tile_data = future.result()
                        completed += 1
                        if tile_data is not None:
                            # Rows are stored bottom-up, so flip the source row index.
                            tms_y = (1 << tile.zoom) - 1 - tile.y
                            conn.execute(
                                "INSERT OR REPLACE INTO tiles VALUES (?, ?, ?, ?)",
                                (tile.zoom, tile.x, tms_y, tile_data),
                            )
                            inserted += 1
                        # Keep the queue topped up: one new fetch per finished one.
                        submit_next(executor)
                        if completed % 1000 == 0 or completed == total:
                            conn.commit()
                            print(
                                f"Downloaded {completed:,}/{total:,} satellite tiles "
                                f"({inserted:,} stored)",
                                flush=True,
                            )
            except BaseException:
                # Leaving the ``with`` block waits for the pool to finish.
                # Cancel fetches that have not started so a failure (or
                # Ctrl-C) is not held up by the rest of the queue.
                for future in pending:
                    future.cancel()
                raise
    finally:
        try:
            conn.commit()
        finally:
            conn.close()
    return inserted
def build_satellite_tiles(
    output_path: Path,
    pmtiles_bin: Path,
    pmtiles_version: str,
    bbox: tuple[float, float, float, float],
    min_zoom: int,
    max_zoom: int,
    source_url: str,
    max_workers: int,
    timeout: float,
    retries: int,
    retry_cooldown: float,
    min_request_interval: float,
) -> None:
    """Download satellite tiles and convert them into a PMTiles archive.

    Tiles are first collected into a temporary MBTiles file, which is then
    converted to *output_path* by running ``pmtiles convert --force``.

    Args:
        output_path: Destination PMTiles file; parent directories are created.
        pmtiles_bin: Path of the pmtiles CLI, passed to ``ensure_pmtiles_cli``.
        pmtiles_version: CLI version, passed to ``ensure_pmtiles_cli``.
        bbox: ``(west, south, east, north)`` with west < east and south < north.
        min_zoom: Lowest zoom level (inclusive); must be >= 0.
        max_zoom: Highest zoom level (inclusive); must be >= ``min_zoom``.
        source_url: Tile URL template with ``{z}``, ``{x}``, ``{y}`` fields.
        max_workers: Number of download threads.
        timeout: Per-request timeout in seconds.
        retries: Number of retries per tile after the first attempt.
        retry_cooldown: Pause in seconds used for EOX rate-limit responses.
        min_request_interval: Minimum seconds between requests across workers.

    Raises:
        ValueError: if the zoom range or bounding box is invalid.
        RuntimeError: if no tile was stored, or a tile download failed.
        subprocess.CalledProcessError: if the pmtiles conversion fails.
    """
    if min_zoom > max_zoom:
        raise ValueError("--min-zoom must be <= --max-zoom")
    if min_zoom < 0:
        # Tile indices are derived with ``1 << zoom``, which rejects negative
        # zooms with an opaque "negative shift count" error. Fail up front,
        # before any directory, CLI or download work is done.
        raise ValueError("--min-zoom must be >= 0")
    if len(bbox) != 4 or bbox[0] >= bbox[2] or bbox[1] >= bbox[3]:
        raise ValueError("--bbox must be west,south,east,north")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    ensure_pmtiles_cli(pmtiles_bin, pmtiles_version)

    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
        mbtiles_path = Path(tmp) / "satellite.mbtiles"
        tile_count = _create_mbtiles(
            mbtiles_path=mbtiles_path,
            bbox=bbox,
            min_zoom=min_zoom,
            max_zoom=max_zoom,
            source_url=source_url,
            max_workers=max_workers,
            timeout=timeout,
            retries=retries,
            retry_cooldown=retry_cooldown,
            min_request_interval=min_request_interval,
        )
        if tile_count == 0:
            raise RuntimeError("Satellite tile download produced no tiles")
        # The conversion has to run while the temporary directory still exists.
        subprocess.run(
            [
                str(pmtiles_bin),
                "convert",
                str(mbtiles_path),
                str(output_path),
                "--force",
            ],
            check=True,
        )

    size_mb = output_path.stat().st_size / (1024 * 1024)
    print(f"Wrote {output_path} ({size_mb:.1f} MB)", flush=True)
def _parse_bbox(raw: str) -> tuple[float, float, float, float]:
parts = [float(part.strip()) for part in raw.split(",")]
if len(parts) != 4:
raise argparse.ArgumentTypeError("bbox must contain four comma-separated numbers")
return parts[0], parts[1], parts[2], parts[3]
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the satellite tile downloader."""
    parser = argparse.ArgumentParser(description=__doc__)
    add = parser.add_argument

    # Output and tooling.
    add("--output", type=Path, required=True)
    add("--pmtiles-bin", type=Path, default=Path("property-data/pmtiles"))
    add("--pmtiles-version", default="1.22.3")

    # Area and zoom range to download.
    add("--bbox", type=_parse_bbox, default=DEFAULT_BBOX)
    add("--min-zoom", type=int, default=DEFAULT_MIN_ZOOM)
    add("--max-zoom", type=int, default=DEFAULT_MAX_ZOOM)

    # Source and download behaviour.
    add("--source-url", default=DEFAULT_TILE_URL)
    add("--max-workers", type=int, default=8)
    add("--timeout", type=float, default=20.0)
    add("--retries", type=int, default=3)
    add(
        "--retry-cooldown",
        type=float,
        default=DEFAULT_RETRY_COOLDOWN,
        help="Seconds to pause all workers after an EOX rate-limit response",
    )
    add(
        "--min-request-interval",
        type=float,
        default=0.0,
        help="Minimum seconds between tile requests across all workers",
    )
    return parser


def main() -> None:
    """Parse command-line arguments and build the satellite PMTiles archive.

    Numeric tuning options are clamped to sane minimums (at least one worker,
    no negative retries, cooldown or request interval) before being passed on.
    """
    args = _build_arg_parser().parse_args()

    worker_count = max(1, args.max_workers)
    retry_count = max(0, args.retries)
    cooldown = max(0.0, args.retry_cooldown)
    request_interval = max(0.0, args.min_request_interval)

    build_satellite_tiles(
        output_path=args.output,
        pmtiles_bin=args.pmtiles_bin,
        pmtiles_version=args.pmtiles_version,
        bbox=args.bbox,
        min_zoom=args.min_zoom,
        max_zoom=args.max_zoom,
        source_url=args.source_url,
        max_workers=worker_count,
        timeout=args.timeout,
        retries=retry_count,
        retry_cooldown=cooldown,
        min_request_interval=request_interval,
    )


if __name__ == "__main__":
    main()