diff --git a/pipeline/download/satellite_highres.py b/pipeline/download/satellite_highres.py index 0de2d72..379fd6e 100644 --- a/pipeline/download/satellite_highres.py +++ b/pipeline/download/satellite_highres.py @@ -183,9 +183,19 @@ def search_vap_tiles(aoi: dict, timeout: float = 60.0) -> list[VapTile]: def _download_and_extract( tile: VapTile, ecw_dir: Path, key: str, timeout: float, retries: int ) -> list[Path]: - """Download one survey zip and extract its ECW raster(s).""" + """Download one survey zip and extract its ECW raster(s). + + The Defra endpoint sometimes answers a download with a 200 whose body is an + error/HTML page or a truncated stream rather than a ZIP (rate limiting, a + rotated key, a transient backend hiccup). We treat a non-ZIP body the same as + a network error -- retry it -- and, if every attempt fails, skip just this + tile rather than aborting the whole multi-tile run: missing coverage simply + shows the Sentinel-2 base through, and the caller still hard-fails if *no* + ECWs were extracted at all. + """ url = f"{tile.uri}?subscription-key={key}" zip_path = ecw_dir / f"{tile.os_tile_id}.zip" + last_err: Exception | None = None for attempt in range(retries + 1): try: with urllib.request.urlopen( @@ -193,10 +203,25 @@ def _download_and_extract( timeout=timeout, ) as response, zip_path.open("wb") as out: shutil.copyfileobj(response, out, length=1 << 20) - break - except (urllib.error.URLError, TimeoutError, ConnectionError) as err: - if attempt == retries: - raise RuntimeError(f"Failed to download {url}: {err}") from err + return _extract_ecws(zip_path, tile, ecw_dir) + except ( + urllib.error.URLError, + TimeoutError, + ConnectionError, + zipfile.BadZipFile, + ) as err: + last_err = err + zip_path.unlink(missing_ok=True) + print( + f" {tile.os_tile_id}: download failed after {retries + 1} attempt(s), " + f"skipping ({last_err})", + flush=True, + ) + return [] + + +def _extract_ecws(zip_path: Path, tile: VapTile, ecw_dir: Path) -> list[Path]: + """Extract every ECW raster from a downloaded survey zip; delete the zip.""" extracted: list[Path] = [] with zipfile.ZipFile(zip_path) as archive: for member in archive.infolist(): diff --git a/pipeline/download/test_satellite_highres.py b/pipeline/download/test_satellite_highres.py index dc84fa3..f678931 100644 --- a/pipeline/download/test_satellite_highres.py +++ b/pipeline/download/test_satellite_highres.py @@ -1,3 +1,6 @@ +import io +import zipfile + from pipeline.download import satellite_highres from pipeline.download.satellite_highres import ( VapTile, @@ -95,3 +98,76 @@ def test_select_best_rgb_breaks_resolution_ties_by_year() -> None: def test_select_best_rgb_empty_when_no_rgb() -> None: payload = {"results": [_result("lidar_composite_dtm", "2022", "1", "TQ2575")]} assert select_best_rgb_tiles(parse_search_results(payload)) == [] + + +_TILE = VapTile( + product_id=satellite_highres.VAP_RGB_PRODUCT, + year=2008, + resolution_m=0.4, + os_tile_id="TQ2575", + uri="https://example.invalid/tile", + label="t", +) + + +def _zip_with_ecw() -> bytes: + """A minimal valid survey zip carrying one ECW member.""" + buffer = io.BytesIO() + with zipfile.ZipFile(buffer, "w") as archive: + archive.writestr("TQ2575.ecw", b"ecw-pixels") + archive.writestr("readme.txt", b"ignored") + return buffer.getvalue() + + +def _fake_urlopen(bodies: list[bytes]): + """Return a urlopen stand-in that yields each body in turn as the response.""" + queue = list(bodies) + + def opener(_request, timeout=None): # noqa: ANN001 - matches urlopen signature + return io.BytesIO(queue.pop(0)) + + return opener + + +def test_download_extracts_ecw_from_valid_zip(tmp_path, monkeypatch) -> None: + monkeypatch.setattr( + satellite_highres.urllib.request, + "urlopen", + _fake_urlopen([_zip_with_ecw()]), + ) + paths = satellite_highres._download_and_extract( + _TILE, tmp_path, key="k", timeout=1.0, retries=2 + ) + assert [p.name for p in paths] == ["TQ2575_TQ2575.ecw"] + assert paths[0].read_bytes() == b"ecw-pixels" + # The transient zip is cleaned up regardless. + assert not (tmp_path / "TQ2575.zip").exists() + + +def test_download_retries_past_a_corrupt_body(tmp_path, monkeypatch) -> None: + # First response is a non-zip error page; the retry serves a real zip. + monkeypatch.setattr( + satellite_highres.urllib.request, + "urlopen", + _fake_urlopen([b"rate limited", _zip_with_ecw()]), + ) + paths = satellite_highres._download_and_extract( + _TILE, tmp_path, key="k", timeout=1.0, retries=2 + ) + assert [p.name for p in paths] == ["TQ2575_TQ2575.ecw"] + + +def test_download_skips_tile_when_all_attempts_are_corrupt( + tmp_path, monkeypatch +) -> None: + # Every attempt returns a non-zip body: the tile is skipped, not raised. + monkeypatch.setattr( + satellite_highres.urllib.request, + "urlopen", + _fake_urlopen([b"not a zip"] * 3), + ) + paths = satellite_highres._download_and_extract( + _TILE, tmp_path, key="k", timeout=1.0, retries=2 + ) + assert paths == [] + assert list(tmp_path.glob("*")) == [] # no leftover zip or partial ecw