"""Download Sentinel-2 cloudless satellite tiles into a local PMTiles archive.""" from __future__ import annotations import argparse import email.utils import http.client import math import sqlite3 import subprocess import tempfile import threading import time import urllib.error import urllib.parse import urllib.request from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path from pipeline.download.tiles import ensure_pmtiles_cli from pipeline.local_temp import local_tmp_dir DEFAULT_TILE_URL = ( "https://tiles.maps.eox.at/wmts/1.0.0/s2cloudless_3857/default/" "GoogleMapsCompatible/{z}/{y}/{x}.jpg" ) DEFAULT_BBOX = (-10.5, 49.0, 5.0, 61.0) DEFAULT_MIN_ZOOM = 5 DEFAULT_MAX_ZOOM = 13 DEFAULT_RETRY_COOLDOWN = 15.0 USER_AGENT = "perfect-postcode-satellite-tiles/1.0" RETRYABLE_HTTP_STATUS = {408, 429, 500, 502, 503, 504} ATTRIBUTION = ( "Sentinel-2 cloudless - https://s2maps.eu by EOX IT Services GmbH " "(Contains modified Copernicus Sentinel data 2024)" ) @dataclass(frozen=True) class Tile: zoom: int x: int y: int class _DownloadThrottle: def __init__(self, min_request_interval: float) -> None: self._min_request_interval = max(0.0, min_request_interval) self._next_request_at = 0.0 self._lock = threading.Lock() def wait(self) -> None: while True: with self._lock: now = time.monotonic() wait_for = self._next_request_at - now if wait_for <= 0: if self._min_request_interval: self._next_request_at = now + self._min_request_interval return time.sleep(min(wait_for, 1.0)) def defer(self, delay: float) -> bool: if delay <= 0: return False target = time.monotonic() + delay with self._lock: should_announce = target > self._next_request_at + 1.0 self._next_request_at = max(self._next_request_at, target) return should_announce def _lonlat_to_tile(lon: float, lat: float, zoom: int) -> tuple[int, int]: lat = max(min(lat, 85.05112878), -85.05112878) n = 1 << zoom x = int(math.floor((lon + 180.0) / 360.0 * n)) y = int( math.floor((1.0 - math.asinh(math.tan(math.radians(lat))) / math.pi) / 2.0 * n) ) return min(max(x, 0), n - 1), min(max(y, 0), n - 1) def _tile_ranges( bbox: tuple[float, float, float, float], zoom: int ) -> tuple[range, range]: west, south, east, north = bbox min_x, min_y = _lonlat_to_tile(west, north, zoom) max_x, max_y = _lonlat_to_tile(east, south, zoom) return range(min_x, max_x + 1), range(min_y, max_y + 1) def _iter_tiles( bbox: tuple[float, float, float, float], min_zoom: int, max_zoom: int ): for zoom in range(min_zoom, max_zoom + 1): x_range, y_range = _tile_ranges(bbox, zoom) for x in x_range: for y in y_range: yield Tile(zoom=zoom, x=x, y=y) def _tile_count( bbox: tuple[float, float, float, float], min_zoom: int, max_zoom: int ) -> int: count = 0 for zoom in range(min_zoom, max_zoom + 1): x_range, y_range = _tile_ranges(bbox, zoom) count += len(x_range) * len(y_range) return count def _is_eox_tile_url(url: str) -> bool: host = urllib.parse.urlparse(url).hostname or "" return host == "tiles.maps.eox.at" or host.endswith(".tiles.maps.eox.at") def _retry_after_seconds(headers) -> float | None: raw = None if headers is not None: raw = headers.get("retry-after") or headers.get("Retry-After") if not raw: return None try: return max(0.0, float(raw)) except ValueError: pass try: retry_at = email.utils.parsedate_to_datetime(raw) except (TypeError, ValueError): return None if retry_at.tzinfo is None: retry_at = retry_at.replace(tzinfo=timezone.utc) return max(0.0, (retry_at - datetime.now(timezone.utc)).total_seconds()) def _http_retry_delay( err: urllib.error.HTTPError, url: str, attempt: int, retry_cooldown: float, ) -> float | None: if err.code in {204, 404}: return None retry_after = _retry_after_seconds(err.headers) if retry_after is not None: return retry_after if err.code == 403 and _is_eox_tile_url(url): return retry_cooldown if err.code in RETRYABLE_HTTP_STATUS: return min(2.0, 0.25 * (2**attempt)) return None def _fetch_tile( tile: Tile, source_url: str, timeout: float, retries: int, throttle: _DownloadThrottle, retry_cooldown: float, ) -> tuple[Tile, bytes | None]: url = source_url.format(z=tile.zoom, x=tile.x, y=tile.y) last_error: Exception | None = None for attempt in range(retries + 1): try: throttle.wait() req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT}) with urllib.request.urlopen(req, timeout=timeout) as response: content_type = response.headers.get("content-type", "") data = response.read() if not data: return tile, None if not content_type.lower().startswith("image/"): raise RuntimeError( f"Unexpected content type for {url}: {content_type or 'unknown'}" ) return tile, data except urllib.error.HTTPError as err: if err.code in {204, 404}: return tile, None retry_delay = _http_retry_delay(err, url, attempt, retry_cooldown) if retry_delay is None: raise RuntimeError( f"Failed to download satellite tile {url}: {err}" ) from err last_error = err except ( TimeoutError, urllib.error.URLError, ConnectionError, http.client.HTTPException, RuntimeError, ) as err: last_error = err retry_delay = min(2.0, 0.25 * (2**attempt)) if attempt < retries: if throttle.defer(retry_delay) and retry_delay >= 5.0: print( f"Satellite tile source returned {last_error}; " f"pausing downloads for {retry_delay:.0f}s before retrying", flush=True, ) assert last_error is not None raise RuntimeError(f"Failed to download satellite tile {url}: {last_error}") from last_error def _create_mbtiles( mbtiles_path: Path, bbox: tuple[float, float, float, float], min_zoom: int, max_zoom: int, source_url: str, max_workers: int, timeout: float, retries: int, retry_cooldown: float, min_request_interval: float, ) -> int: if mbtiles_path.exists(): mbtiles_path.unlink() conn = sqlite3.connect(mbtiles_path) conn.execute("PRAGMA journal_mode = WAL") conn.execute("PRAGMA synchronous = NORMAL") conn.execute("CREATE TABLE metadata (name TEXT, value TEXT)") conn.execute( "CREATE TABLE tiles (zoom_level INTEGER, tile_column INTEGER, " "tile_row INTEGER, tile_data BLOB)" ) conn.execute( "CREATE UNIQUE INDEX tile_index ON tiles (zoom_level, tile_column, tile_row)" ) conn.executemany( "INSERT INTO metadata (name, value) VALUES (?, ?)", [ ("name", "Sentinel-2 cloudless satellite basemap"), ("type", "baselayer"), ("version", "1"), ("description", "Sentinel-2 cloudless Web Mercator satellite imagery"), ("format", "jpg"), ("attribution", ATTRIBUTION), ("bounds", ",".join(f"{value:.6f}" for value in bbox)), ("minzoom", str(min_zoom)), ("maxzoom", str(max_zoom)), ], ) total = _tile_count(bbox, min_zoom, max_zoom) inserted = 0 completed = 0 submitted = 0 tiles = iter(_iter_tiles(bbox, min_zoom, max_zoom)) pending: set[Future[tuple[Tile, bytes | None]]] = set() queue_size = max_workers * 4 throttle = _DownloadThrottle(min_request_interval=min_request_interval) def submit_next(executor: ThreadPoolExecutor) -> bool: nonlocal submitted try: tile = next(tiles) except StopIteration: return False pending.add( executor.submit( _fetch_tile, tile, source_url, timeout, retries, throttle, retry_cooldown, ) ) submitted += 1 return True try: with ThreadPoolExecutor(max_workers=max_workers) as executor: for _ in range(queue_size): if not submit_next(executor): break while pending: done, pending = wait(pending, return_when=FIRST_COMPLETED) for future in done: tile, tile_data = future.result() completed += 1 if tile_data is not None: tms_y = (1 << tile.zoom) - 1 - tile.y conn.execute( "INSERT OR REPLACE INTO tiles VALUES (?, ?, ?, ?)", (tile.zoom, tile.x, tms_y, tile_data), ) inserted += 1 submit_next(executor) if completed % 1000 == 0 or completed == total: conn.commit() print( f"Downloaded {completed:,}/{total:,} satellite tiles " f"({inserted:,} stored)", flush=True, ) finally: conn.commit() conn.close() return inserted def build_satellite_tiles( output_path: Path, pmtiles_bin: Path, pmtiles_version: str, bbox: tuple[float, float, float, float], min_zoom: int, max_zoom: int, source_url: str, max_workers: int, timeout: float, retries: int, retry_cooldown: float, min_request_interval: float, ) -> None: if min_zoom > max_zoom: raise ValueError("--min-zoom must be <= --max-zoom") if len(bbox) != 4 or bbox[0] >= bbox[2] or bbox[1] >= bbox[3]: raise ValueError("--bbox must be west,south,east,north") output_path.parent.mkdir(parents=True, exist_ok=True) ensure_pmtiles_cli(pmtiles_bin, pmtiles_version) with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp: mbtiles_path = Path(tmp) / "satellite.mbtiles" tile_count = _create_mbtiles( mbtiles_path=mbtiles_path, bbox=bbox, min_zoom=min_zoom, max_zoom=max_zoom, source_url=source_url, max_workers=max_workers, timeout=timeout, retries=retries, retry_cooldown=retry_cooldown, min_request_interval=min_request_interval, ) if tile_count == 0: raise RuntimeError("Satellite tile download produced no tiles") subprocess.run( [ str(pmtiles_bin), "convert", str(mbtiles_path), str(output_path), "--force", ], check=True, ) size_mb = output_path.stat().st_size / (1024 * 1024) print(f"Wrote {output_path} ({size_mb:.1f} MB)", flush=True) def _parse_bbox(raw: str) -> tuple[float, float, float, float]: parts = [float(part.strip()) for part in raw.split(",")] if len(parts) != 4: raise argparse.ArgumentTypeError("bbox must contain four comma-separated numbers") return parts[0], parts[1], parts[2], parts[3] def main() -> None: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--output", type=Path, required=True) parser.add_argument( "--pmtiles-bin", type=Path, default=Path("property-data/pmtiles") ) parser.add_argument("--pmtiles-version", default="1.22.3") parser.add_argument("--bbox", type=_parse_bbox, default=DEFAULT_BBOX) parser.add_argument("--min-zoom", type=int, default=DEFAULT_MIN_ZOOM) parser.add_argument("--max-zoom", type=int, default=DEFAULT_MAX_ZOOM) parser.add_argument("--source-url", default=DEFAULT_TILE_URL) parser.add_argument("--max-workers", type=int, default=8) parser.add_argument("--timeout", type=float, default=20.0) parser.add_argument("--retries", type=int, default=3) parser.add_argument( "--retry-cooldown", type=float, default=DEFAULT_RETRY_COOLDOWN, help="Seconds to pause all workers after an EOX rate-limit response", ) parser.add_argument( "--min-request-interval", type=float, default=0.0, help="Minimum seconds between tile requests across all workers", ) args = parser.parse_args() build_satellite_tiles( output_path=args.output, pmtiles_bin=args.pmtiles_bin, pmtiles_version=args.pmtiles_version, bbox=args.bbox, min_zoom=args.min_zoom, max_zoom=args.max_zoom, source_url=args.source_url, max_workers=max(1, args.max_workers), timeout=args.timeout, retries=max(0, args.retries), retry_cooldown=max(0.0, args.retry_cooldown), min_request_interval=max(0.0, args.min_request_interval), ) if __name__ == "__main__": main()