Fix flake

This commit is contained in:
Andras Schmelczer 2026-06-06 10:45:45 +01:00
parent df63764a9f
commit 3e307c2fff
2 changed files with 106 additions and 5 deletions

View file

@ -183,9 +183,19 @@ def search_vap_tiles(aoi: dict, timeout: float = 60.0) -> list[VapTile]:
def _download_and_extract(
tile: VapTile, ecw_dir: Path, key: str, timeout: float, retries: int
) -> list[Path]:
"""Download one survey zip and extract its ECW raster(s)."""
"""Download one survey zip and extract its ECW raster(s).
The Defra endpoint sometimes answers a download with a 200 whose body is an
error/HTML page or a truncated stream rather than a ZIP (rate limiting, a
rotated key, a transient backend hiccup). We treat a non-ZIP body the same as
a network error -- retry it -- and, if every attempt fails, skip just this
tile rather than aborting the whole multi-tile run: missing coverage simply
shows the Sentinel-2 base through, and the caller still hard-fails if *no*
ECWs were extracted at all.
"""
url = f"{tile.uri}?subscription-key={key}"
zip_path = ecw_dir / f"{tile.os_tile_id}.zip"
last_err: Exception | None = None
for attempt in range(retries + 1):
try:
with urllib.request.urlopen(
@ -193,10 +203,25 @@ def _download_and_extract(
timeout=timeout,
) as response, zip_path.open("wb") as out:
shutil.copyfileobj(response, out, length=1 << 20)
break
except (urllib.error.URLError, TimeoutError, ConnectionError) as err:
if attempt == retries:
raise RuntimeError(f"Failed to download {url}: {err}") from err
return _extract_ecws(zip_path, tile, ecw_dir)
except (
urllib.error.URLError,
TimeoutError,
ConnectionError,
zipfile.BadZipFile,
) as err:
last_err = err
zip_path.unlink(missing_ok=True)
print(
f" {tile.os_tile_id}: download failed after {retries + 1} attempt(s), "
f"skipping ({last_err})",
flush=True,
)
return []
def _extract_ecws(zip_path: Path, tile: VapTile, ecw_dir: Path) -> list[Path]:
"""Extract every ECW raster from a downloaded survey zip; delete the zip."""
extracted: list[Path] = []
with zipfile.ZipFile(zip_path) as archive:
for member in archive.infolist():

View file

@ -1,3 +1,6 @@
import io
import zipfile
from pipeline.download import satellite_highres
from pipeline.download.satellite_highres import (
VapTile,
@ -95,3 +98,76 @@ def test_select_best_rgb_breaks_resolution_ties_by_year() -> None:
def test_select_best_rgb_empty_when_no_rgb() -> None:
payload = {"results": [_result("lidar_composite_dtm", "2022", "1", "TQ2575")]}
assert select_best_rgb_tiles(parse_search_results(payload)) == []
_TILE = VapTile(
product_id=satellite_highres.VAP_RGB_PRODUCT,
year=2008,
resolution_m=0.4,
os_tile_id="TQ2575",
uri="https://example.invalid/tile",
label="t",
)
def _zip_with_ecw() -> bytes:
"""A minimal valid survey zip carrying one ECW member."""
buffer = io.BytesIO()
with zipfile.ZipFile(buffer, "w") as archive:
archive.writestr("TQ2575.ecw", b"ecw-pixels")
archive.writestr("readme.txt", b"ignored")
return buffer.getvalue()
def _fake_urlopen(bodies: list[bytes]):
"""Return a urlopen stand-in that yields each body in turn as the response."""
queue = list(bodies)
def opener(_request, timeout=None): # noqa: ANN001 - matches urlopen signature
return io.BytesIO(queue.pop(0))
return opener
def test_download_extracts_ecw_from_valid_zip(tmp_path, monkeypatch) -> None:
monkeypatch.setattr(
satellite_highres.urllib.request,
"urlopen",
_fake_urlopen([_zip_with_ecw()]),
)
paths = satellite_highres._download_and_extract(
_TILE, tmp_path, key="k", timeout=1.0, retries=2
)
assert [p.name for p in paths] == ["TQ2575_TQ2575.ecw"]
assert paths[0].read_bytes() == b"ecw-pixels"
# The transient zip is cleaned up regardless.
assert not (tmp_path / "TQ2575.zip").exists()
def test_download_retries_past_a_corrupt_body(tmp_path, monkeypatch) -> None:
# First response is a non-zip error page; the retry serves a real zip.
monkeypatch.setattr(
satellite_highres.urllib.request,
"urlopen",
_fake_urlopen([b"<html>rate limited</html>", _zip_with_ecw()]),
)
paths = satellite_highres._download_and_extract(
_TILE, tmp_path, key="k", timeout=1.0, retries=2
)
assert [p.name for p in paths] == ["TQ2575_TQ2575.ecw"]
def test_download_skips_tile_when_all_attempts_are_corrupt(
tmp_path, monkeypatch
) -> None:
# Every attempt returns a non-zip body: the tile is skipped, not raised.
monkeypatch.setattr(
satellite_highres.urllib.request,
"urlopen",
_fake_urlopen([b"not a zip"] * 3),
)
paths = satellite_highres._download_and_extract(
_TILE, tmp_path, key="k", timeout=1.0, retries=2
)
assert paths == []
assert list(tmp_path.glob("*")) == [] # no leftover zip or partial ecw