432 lines
14 KiB
Python
432 lines
14 KiB
Python
"""Download Sentinel-2 cloudless satellite tiles into a local PMTiles archive."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import email.utils
|
|
import http.client
|
|
import math
|
|
import sqlite3
|
|
import subprocess
|
|
import tempfile
|
|
import threading
|
|
import time
|
|
import urllib.error
|
|
import urllib.parse
|
|
import urllib.request
|
|
from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
from pipeline.download.tiles import ensure_pmtiles_cli
|
|
from pipeline.local_temp import local_tmp_dir
|
|
|
|
# URL template for a single tile. The {z}, {y} and {x} placeholders are filled
# in with str.format() for every tile that is requested.
DEFAULT_TILE_URL = (
    "https://tiles.maps.eox.at/wmts/1.0.0/s2cloudless_3857/default/"
    "GoogleMapsCompatible/{z}/{y}/{x}.jpg"
)

# Default download area, ordered west, south, east, north (degrees).
DEFAULT_BBOX = (-10.5, 49.0, 5.0, 61.0)

# Default inclusive zoom range for --min-zoom / --max-zoom.
DEFAULT_MIN_ZOOM = 5
DEFAULT_MAX_ZOOM = 13

# Default for --retry-cooldown, in seconds.
DEFAULT_RETRY_COOLDOWN = 15.0

# Sent as the User-Agent header on every tile request.
USER_AGENT = "perfect-postcode-satellite-tiles/1.0"

# HTTP status codes that are retried with a short exponential backoff.
RETRYABLE_HTTP_STATUS = {408, 429, 500, 502, 503, 504}

# Written to the "attribution" row of the MBTiles metadata table.
ATTRIBUTION = (
    "Sentinel-2 cloudless - https://s2maps.eu by EOX IT Services GmbH "
    "(Contains modified Copernicus Sentinel data 2024)"
)
|
|
|
|
|
|
@dataclass(frozen=True)
class Tile:
    """Immutable, hashable address of one map tile."""

    # Zoom level; substituted for {z} in the source URL template.
    zoom: int
    # Tile column; substituted for {x} in the source URL template.
    x: int
    # Tile row as requested from the source ({y}); it is flipped to a TMS row
    # only when the tile is written to the MBTiles database.
    y: int
|
|
|
|
|
|
class _DownloadThrottle:
    """Shared gate that spaces out requests and can pause every caller at once.

    One instance is handed to all download workers. ``wait`` blocks until the
    next request may start; ``defer`` pushes that moment further into the
    future for everybody.
    """

    def __init__(self, min_request_interval: float) -> None:
        # Negative intervals are treated as "no spacing".
        self._min_request_interval = max(0.0, min_request_interval)
        # Monotonic-clock timestamp before which no request may start.
        self._next_request_at = 0.0
        self._lock = threading.Lock()

    def _reserve_slot(self) -> float:
        """Claim the next request slot if it is free.

        Returns 0.0 when the slot was claimed, otherwise the number of seconds
        (> 0) that remain until the gate opens.
        """
        with self._lock:
            current = time.monotonic()
            remaining = self._next_request_at - current
            if remaining > 0:
                return remaining
            if self._min_request_interval:
                self._next_request_at = current + self._min_request_interval
            return 0.0

    def wait(self) -> None:
        """Block until this caller is allowed to issue a request."""
        remaining = self._reserve_slot()
        while remaining > 0:
            # Sleep outside the lock, at most one second at a time, then
            # re-check because the gate may have been moved in the meantime.
            time.sleep(min(remaining, 1.0))
            remaining = self._reserve_slot()

    def defer(self, delay: float) -> bool:
        """Keep the gate closed for at least ``delay`` more seconds.

        Returns True only when this call moved the gate more than one second
        beyond where it already was, so callers can log a pause once instead
        of once per worker. Non-positive delays are ignored and return False.
        """
        if delay <= 0:
            return False

        resume_at = time.monotonic() + delay
        with self._lock:
            previous = self._next_request_at
            if resume_at > previous:
                self._next_request_at = resume_at
            return resume_at > previous + 1.0
|
|
|
|
|
|
def _lonlat_to_tile(lon: float, lat: float, zoom: int) -> tuple[int, int]:
    """Return the (x, y) tile indices containing a lon/lat point at ``zoom``.

    Latitude is clamped to +/-85.05112878 degrees before projecting, and both
    indices are clamped to the valid range ``0 .. 2**zoom - 1``.
    """
    lat_limit = 85.05112878
    lat = max(min(lat, lat_limit), -lat_limit)

    tiles_per_axis = 1 << zoom
    last_index = tiles_per_axis - 1

    x_fraction = (lon + 180.0) / 360.0
    mercator_y = math.asinh(math.tan(math.radians(lat)))
    y_fraction = (1.0 - mercator_y / math.pi) / 2.0

    column = math.floor(x_fraction * tiles_per_axis)
    row = math.floor(y_fraction * tiles_per_axis)

    return min(max(column, 0), last_index), min(max(row, 0), last_index)
|
|
|
|
|
|
def _tile_ranges(
    bbox: tuple[float, float, float, float], zoom: int
) -> tuple[range, range]:
    """Return the inclusive tile column and row ranges covering ``bbox``.

    ``bbox`` is ordered west, south, east, north. The north-west corner gives
    the smallest indices and the south-east corner the largest.
    """
    west, south, east, north = bbox
    left, top = _lonlat_to_tile(west, north, zoom)
    right, bottom = _lonlat_to_tile(east, south, zoom)
    columns = range(left, right + 1)
    rows = range(top, bottom + 1)
    return columns, rows
|
|
|
|
|
|
def _iter_tiles(
    bbox: tuple[float, float, float, float], min_zoom: int, max_zoom: int
):
    """Lazily yield every ``Tile`` covering ``bbox`` for the zoom range.

    Zoom levels are visited in ascending order (both ends inclusive); within
    a level, tiles are produced column by column.
    """
    for level in range(min_zoom, max_zoom + 1):
        columns, rows = _tile_ranges(bbox, level)
        yield from (
            Tile(zoom=level, x=column, y=row) for column in columns for row in rows
        )
|
|
|
|
|
|
def _tile_count(
    bbox: tuple[float, float, float, float], min_zoom: int, max_zoom: int
) -> int:
    """Return how many tiles cover ``bbox`` across the inclusive zoom range."""
    per_level = (
        _tile_ranges(bbox, level) for level in range(min_zoom, max_zoom + 1)
    )
    return sum(len(columns) * len(rows) for columns, rows in per_level)
|
|
|
|
|
|
def _is_eox_tile_url(url: str) -> bool:
    """Return True when ``url`` points at tiles.maps.eox.at or a subdomain of it."""
    hostname = urllib.parse.urlparse(url).hostname
    if not hostname:
        return False
    eox_host = "tiles.maps.eox.at"
    if hostname == eox_host:
        return True
    # The leading dot stops look-alike hosts such as "eviltiles.maps.eox.at".
    return hostname.endswith(".tiles.maps.eox.at")
|
|
|
|
|
|
def _retry_after_seconds(headers) -> float | None:
    """Parse a ``Retry-After`` header into a non-negative delay in seconds.

    ``headers`` may be None or any mapping-like object with a ``get`` method.
    The header value may be a number of seconds or an HTTP date; a date
    without a timezone is treated as UTC.

    Returns None when the header is missing, empty, unparseable, or numeric
    but not finite. Non-finite values ("inf", "nan", "1e999") are rejected
    because an infinite delay would be handed to the shared throttle and
    stall every download worker forever.
    """
    raw = None
    if headers is not None:
        raw = headers.get("retry-after") or headers.get("Retry-After")
    if not raw:
        return None

    try:
        seconds = float(raw)
    except ValueError:
        # Not a plain number; fall through and try the HTTP-date form.
        pass
    else:
        if not math.isfinite(seconds):
            return None
        return max(0.0, seconds)

    try:
        retry_at = email.utils.parsedate_to_datetime(raw)
    except (TypeError, ValueError):
        # Depending on the Python version either exception signals bad input.
        return None

    if retry_at.tzinfo is None:
        retry_at = retry_at.replace(tzinfo=timezone.utc)
    # Dates in the past mean "retry now" rather than a negative delay.
    return max(0.0, (retry_at - datetime.now(timezone.utc)).total_seconds())
|
|
|
|
|
|
def _http_retry_delay(
    err: urllib.error.HTTPError,
    url: str,
    attempt: int,
    retry_cooldown: float,
) -> float | None:
    """Decide how long to wait before retrying after an HTTP error.

    Returns the delay in seconds, or None when the request should not be
    retried. Checks are applied in this order:

    1. 204 and 404 are never retried.
    2. A parseable ``Retry-After`` header wins, whatever the status.
    3. A 403 from the EOX tile host uses ``retry_cooldown``.
    4. Statuses in ``RETRYABLE_HTTP_STATUS`` use exponential backoff starting
       at 0.25s and capped at 2s, based on the zero-based ``attempt``.
    """
    status = err.code
    if status in {204, 404}:
        return None

    server_delay = _retry_after_seconds(err.headers)
    if server_delay is not None:
        return server_delay

    if status == 403 and _is_eox_tile_url(url):
        return retry_cooldown

    if status not in RETRYABLE_HTTP_STATUS:
        return None

    backoff = 0.25 * (2**attempt)
    return min(2.0, backoff)
|
|
|
|
|
|
def _fetch_tile(
    tile: Tile,
    source_url: str,
    timeout: float,
    retries: int,
    throttle: _DownloadThrottle,
    retry_cooldown: float,
) -> tuple[Tile, bytes | None]:
    """Download one tile, retrying transient failures.

    Args:
        tile: Tile to fetch; its zoom/x/y fill the ``{z}``/``{x}``/``{y}``
            placeholders of ``source_url``.
        source_url: URL template for the tile source.
        timeout: Timeout in seconds passed to ``urlopen``.
        retries: Extra attempts after the first one. Values below zero are
            treated as zero, so at least one request is always made.
        throttle: Shared throttle; waited on before every request and
            deferred after every retryable failure.
        retry_cooldown: Delay used for an EOX 403 response.

    Returns:
        ``(tile, data)`` with the image bytes, or ``(tile, None)`` when the
        source has no tile (HTTP 204/404 or an empty body).

    Raises:
        RuntimeError: on a non-retryable HTTP error, or when all attempts
            are exhausted.
    """
    url = source_url.format(z=tile.zoom, x=tile.x, y=tile.y)
    # Guarantee one attempt even if a caller passes a negative retry count;
    # otherwise the loop would not run and there would be no error to report.
    attempts = max(0, retries) + 1
    last_error: Exception | None = None

    for attempt in range(attempts):
        try:
            throttle.wait()
            req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
            with urllib.request.urlopen(req, timeout=timeout) as response:
                content_type = response.headers.get("content-type", "")
                data = response.read()
            if not data:
                return tile, None
            if not content_type.lower().startswith("image/"):
                # Raised inside the try on purpose: the RuntimeError handler
                # below treats a non-image body as a retryable failure.
                raise RuntimeError(
                    f"Unexpected content type for {url}: {content_type or 'unknown'}"
                )
            return tile, data
        except urllib.error.HTTPError as err:
            # HTTPError doubles as the response object and owns its socket.
            # Release it now; code and headers stay readable after closing.
            try:
                err.close()
            except OSError:
                pass  # best-effort cleanup; the HTTP error itself is what matters
            if err.code in {204, 404}:
                return tile, None
            retry_delay = _http_retry_delay(err, url, attempt, retry_cooldown)
            if retry_delay is None:
                raise RuntimeError(
                    f"Failed to download satellite tile {url}: {err}"
                ) from err
            last_error = err
        except (
            TimeoutError,
            urllib.error.URLError,
            ConnectionError,
            http.client.HTTPException,
            RuntimeError,
        ) as err:
            last_error = err
            retry_delay = min(2.0, 0.25 * (2**attempt))

        if attempt + 1 < attempts:
            # defer() pauses every worker; only announce long pauses, and only
            # from the worker that actually extended the pause.
            if throttle.defer(retry_delay) and retry_delay >= 5.0:
                print(
                    f"Satellite tile source returned {last_error}; "
                    f"pausing downloads for {retry_delay:.0f}s before retrying",
                    flush=True,
                )

    raise RuntimeError(f"Failed to download satellite tile {url}: {last_error}") from last_error
|
|
|
|
|
|
def _write_mbtiles_schema(
    conn: sqlite3.Connection,
    bbox: tuple[float, float, float, float],
    min_zoom: int,
    max_zoom: int,
) -> None:
    """Create the MBTiles tables, the unique tile index and the metadata rows."""
    conn.execute("PRAGMA journal_mode = WAL")
    conn.execute("PRAGMA synchronous = NORMAL")
    conn.execute("CREATE TABLE metadata (name TEXT, value TEXT)")
    conn.execute(
        "CREATE TABLE tiles (zoom_level INTEGER, tile_column INTEGER, "
        "tile_row INTEGER, tile_data BLOB)"
    )
    conn.execute(
        "CREATE UNIQUE INDEX tile_index ON tiles (zoom_level, tile_column, tile_row)"
    )
    conn.executemany(
        "INSERT INTO metadata (name, value) VALUES (?, ?)",
        [
            ("name", "Sentinel-2 cloudless satellite basemap"),
            ("type", "baselayer"),
            ("version", "1"),
            ("description", "Sentinel-2 cloudless Web Mercator satellite imagery"),
            ("format", "jpg"),
            ("attribution", ATTRIBUTION),
            ("bounds", ",".join(f"{value:.6f}" for value in bbox)),
            ("minzoom", str(min_zoom)),
            ("maxzoom", str(max_zoom)),
        ],
    )


def _create_mbtiles(
    mbtiles_path: Path,
    bbox: tuple[float, float, float, float],
    min_zoom: int,
    max_zoom: int,
    source_url: str,
    max_workers: int,
    timeout: float,
    retries: int,
    retry_cooldown: float,
    min_request_interval: float,
) -> int:
    """Download every tile for ``bbox`` into a fresh MBTiles database.

    Any existing file at ``mbtiles_path`` is deleted first. Tiles are fetched
    by a thread pool of ``max_workers`` threads with at most
    ``max_workers * 4`` downloads queued at a time; all SQLite writes happen
    on the calling thread. Work is committed every 1000 completed tiles, and
    again on exit, including when a download fails.

    Returns:
        The number of tiles actually stored (tiles the source does not have
        are counted as completed but not stored).

    Raises:
        RuntimeError: propagated from ``_fetch_tile`` when a tile cannot be
            downloaded. Downloads that have not started yet are cancelled so
            the error surfaces without first working through the queue.
    """
    if mbtiles_path.exists():
        mbtiles_path.unlink()

    total = _tile_count(bbox, min_zoom, max_zoom)
    inserted = 0
    completed = 0
    tiles = _iter_tiles(bbox, min_zoom, max_zoom)
    pending: set[Future[tuple[Tile, bytes | None]]] = set()
    queue_size = max_workers * 4
    throttle = _DownloadThrottle(min_request_interval=min_request_interval)

    def submit_next(executor: ThreadPoolExecutor) -> bool:
        """Queue the next tile download; return False once tiles run out."""
        try:
            tile = next(tiles)
        except StopIteration:
            return False
        pending.add(
            executor.submit(
                _fetch_tile,
                tile,
                source_url,
                timeout,
                retries,
                throttle,
                retry_cooldown,
            )
        )
        return True

    conn = sqlite3.connect(mbtiles_path)
    try:
        # Inside the try so the connection is closed even if setup fails.
        _write_mbtiles_schema(conn, bbox, min_zoom, max_zoom)

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            try:
                # Prime the queue, then top it up by one for every finished tile.
                for _ in range(queue_size):
                    if not submit_next(executor):
                        break

                while pending:
                    done, pending = wait(pending, return_when=FIRST_COMPLETED)
                    for future in done:
                        tile, tile_data = future.result()
                        completed += 1
                        if tile_data is not None:
                            # MBTiles rows use TMS numbering (origin at the
                            # bottom), so flip the downloaded row index.
                            tms_y = (1 << tile.zoom) - 1 - tile.y
                            conn.execute(
                                "INSERT OR REPLACE INTO tiles VALUES (?, ?, ?, ?)",
                                (tile.zoom, tile.x, tms_y, tile_data),
                            )
                            inserted += 1

                        submit_next(executor)

                        if completed % 1000 == 0 or completed == total:
                            conn.commit()
                            print(
                                f"Downloaded {completed:,}/{total:,} satellite tiles "
                                f"({inserted:,} stored)",
                                flush=True,
                            )
            except BaseException:
                # Leaving the executor context waits for every queued future.
                # Cancel those that have not started so a failure (or Ctrl-C)
                # is reported promptly instead of after more downloads.
                for future in pending:
                    future.cancel()
                raise
    finally:
        try:
            conn.commit()
        finally:
            # Close even if the final commit itself fails.
            conn.close()

    return inserted
|
|
|
|
|
|
def build_satellite_tiles(
    output_path: Path,
    pmtiles_bin: Path,
    pmtiles_version: str,
    bbox: tuple[float, float, float, float],
    min_zoom: int,
    max_zoom: int,
    source_url: str,
    max_workers: int,
    timeout: float,
    retries: int,
    retry_cooldown: float,
    min_request_interval: float,
) -> None:
    """Download satellite tiles and convert them into a PMTiles archive.

    Tiles are first collected in a temporary MBTiles database, which is then
    converted to ``output_path`` by running ``pmtiles_bin convert --force``.

    Raises:
        ValueError: if the zoom range is inverted or ``bbox`` is not a
            west,south,east,north box with west < east and south < north.
        RuntimeError: if no tile could be stored.
        subprocess.CalledProcessError: if the conversion command fails.
    """
    if min_zoom > max_zoom:
        raise ValueError("--min-zoom must be <= --max-zoom")
    bbox_invalid = len(bbox) != 4 or bbox[0] >= bbox[2] or bbox[1] >= bbox[3]
    if bbox_invalid:
        raise ValueError("--bbox must be west,south,east,north")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    ensure_pmtiles_cli(pmtiles_bin, pmtiles_version)

    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as workdir:
        mbtiles_path = Path(workdir) / "satellite.mbtiles"
        stored = _create_mbtiles(
            mbtiles_path=mbtiles_path,
            bbox=bbox,
            min_zoom=min_zoom,
            max_zoom=max_zoom,
            source_url=source_url,
            max_workers=max_workers,
            timeout=timeout,
            retries=retries,
            retry_cooldown=retry_cooldown,
            min_request_interval=min_request_interval,
        )
        if stored == 0:
            raise RuntimeError("Satellite tile download produced no tiles")

        convert_command = [
            str(pmtiles_bin),
            "convert",
            str(mbtiles_path),
            str(output_path),
            "--force",
        ]
        subprocess.run(convert_command, check=True)

    output_bytes = output_path.stat().st_size
    size_mb = output_bytes / (1024 * 1024)
    print(f"Wrote {output_path} ({size_mb:.1f} MB)", flush=True)
|
|
|
|
|
|
def _parse_bbox(raw: str) -> tuple[float, float, float, float]:
    """Parse a ``west,south,east,north`` string for use as an argparse type.

    Whitespace around each number is ignored.

    Raises:
        argparse.ArgumentTypeError: if there are not exactly four parts, a
            part is not a number, or a number is NaN or infinite. Raising
            this type (rather than letting ``ValueError`` escape) makes
            argparse show the message below instead of a generic
            "invalid _parse_bbox value" error.
    """
    pieces = raw.split(",")
    if len(pieces) != 4:
        raise argparse.ArgumentTypeError("bbox must contain four comma-separated numbers")

    try:
        west, south, east, north = (float(piece.strip()) for piece in pieces)
    except ValueError:
        raise argparse.ArgumentTypeError(
            "bbox must contain four comma-separated numbers"
        ) from None

    # NaN/inf would pass the later ordering checks and only fail deep inside
    # the tile maths, so reject them here with a clear message.
    if not all(math.isfinite(value) for value in (west, south, east, north)):
        raise argparse.ArgumentTypeError("bbox values must be finite numbers")

    return west, south, east, north
|
|
|
|
|
|
def main() -> None:
    """Parse command-line options and build the satellite tile archive."""
    parser = argparse.ArgumentParser(description=__doc__)
    add = parser.add_argument

    add("--output", type=Path, required=True)
    add("--pmtiles-bin", type=Path, default=Path("property-data/pmtiles"))
    add("--pmtiles-version", default="1.22.3")
    add("--bbox", type=_parse_bbox, default=DEFAULT_BBOX)
    add("--min-zoom", type=int, default=DEFAULT_MIN_ZOOM)
    add("--max-zoom", type=int, default=DEFAULT_MAX_ZOOM)
    add("--source-url", default=DEFAULT_TILE_URL)
    add("--max-workers", type=int, default=8)
    add("--timeout", type=float, default=20.0)
    add("--retries", type=int, default=3)
    add(
        "--retry-cooldown",
        type=float,
        default=DEFAULT_RETRY_COOLDOWN,
        help="Seconds to pause all workers after an EOX rate-limit response",
    )
    add(
        "--min-request-interval",
        type=float,
        default=0.0,
        help="Minimum seconds between tile requests across all workers",
    )
    options = parser.parse_args()

    # Clamp numeric options to sane lower bounds rather than rejecting them.
    worker_count = max(1, options.max_workers)
    retry_count = max(0, options.retries)
    cooldown = max(0.0, options.retry_cooldown)
    request_interval = max(0.0, options.min_request_interval)

    build_satellite_tiles(
        output_path=options.output,
        pmtiles_bin=options.pmtiles_bin,
        pmtiles_version=options.pmtiles_version,
        bbox=options.bbox,
        min_zoom=options.min_zoom,
        max_zoom=options.max_zoom,
        source_url=options.source_url,
        max_workers=worker_count,
        timeout=options.timeout,
        retries=retry_count,
        retry_cooldown=cooldown,
        min_request_interval=request_interval,
    )


if __name__ == "__main__":
    main()
|