perfect-postcode/pipeline/download/test_satellite_highres.py
2026-06-06 10:45:45 +01:00

173 lines
6.1 KiB
Python

import io
import zipfile
from pipeline.download import satellite_highres
from pipeline.download.satellite_highres import (
VapTile,
parse_search_results,
select_best_rgb_tiles,
)
def _result(product: str, year: str, resolution: str, tile: str) -> dict:
"""One search-API record in the real response shape."""
return {
"product": {"id": product, "label": product},
"year": {"id": year, "label": year},
"resolution": {"id": resolution, "label": f"{resolution}m"},
"tile": {"id": tile, "label": tile},
"label": f"{product}-{year}-{resolution}m-{tile}",
"uri": (
"https://environment.data.gov.uk/tiles/collections/survey/"
f"{product}/{year}/{resolution}/{tile}"
),
}
# Fixture mirroring a real Greater-London search response. TQ2575 has a single
# RGB capture (0.4m, 2008); TQ3080 has two (0.1m from 2008, 0.25m from 2011).
# The Night Time, LIDAR and IRRGB records are non-RGB products that selection
# must ignore.
SAMPLE_PAYLOAD = {
    "count": 6,
    "results": [
        _result(product, year, resolution, tile)
        for product, year, resolution, tile in (
            ("vertical_aerial_photography_tiles_rgb", "2008", "0.4", "TQ2575"),
            ("vertical_aerial_photography_tiles_night_time", "2012", "0.2", "TQ2575"),
            ("lidar_composite_dtm", "2022", "1", "TQ2575"),
            # TQ3080's two RGB captures: one finer but older, one coarser but newer.
            ("vertical_aerial_photography_tiles_rgb", "2008", "0.1", "TQ3080"),
            ("vertical_aerial_photography_tiles_rgb", "2011", "0.25", "TQ3080"),
            ("vertical_aerial_photography_tiles_irrgb", "2012", "0.5", "TQ3080"),
        )
    ],
}
def test_parse_search_results_skips_malformed_records() -> None:
    """A record missing required fields is dropped; the valid one is parsed."""
    well_formed = _result(
        "vertical_aerial_photography_tiles_rgb", "2008", "0.4", "TQ2575"
    )
    # Carries only a product id: no year, resolution, tile or uri.
    malformed = {"product": {"id": "broken"}}

    parsed = parse_search_results({"results": [well_formed, malformed]})

    # Year and resolution are expected back as numbers, not the input strings.
    expected = VapTile(
        product_id="vertical_aerial_photography_tiles_rgb",
        year=2008,
        resolution_m=0.4,
        os_tile_id="TQ2575",
        uri=(
            "https://environment.data.gov.uk/tiles/collections/survey/"
            "vertical_aerial_photography_tiles_rgb/2008/0.4/TQ2575"
        ),
        label="vertical_aerial_photography_tiles_rgb-2008-0.4m-TQ2575",
    )
    assert len(parsed) == 1
    assert parsed[0] == expected
def test_select_best_rgb_filters_non_rgb_products() -> None:
    """Every selected tile belongs to the RGB aerial-photography product."""
    chosen = select_best_rgb_tiles(parse_search_results(SAMPLE_PAYLOAD))
    product_ids = {tile.product_id for tile in chosen}
    assert product_ids == {satellite_highres.VAP_RGB_PRODUCT}
def test_select_best_rgb_one_tile_per_os_square() -> None:
    """Selection returns exactly one tile for each OS square in the fixture."""
    candidates = parse_search_results(SAMPLE_PAYLOAD)
    square_ids = [tile.os_tile_id for tile in select_best_rgb_tiles(candidates)]
    square_ids.sort()
    assert square_ids == ["TQ2575", "TQ3080"]
def test_select_best_rgb_prefers_finest_resolution_then_latest_year() -> None:
    """The finest resolution wins per square, even over a newer survey."""
    candidates = parse_search_results(SAMPLE_PAYLOAD)
    by_square = {}
    for tile in select_best_rgb_tiles(candidates):
        by_square[tile.os_tile_id] = tile

    # TQ2575 offers a single RGB capture, so there is nothing to choose between.
    assert by_square["TQ2575"].resolution_m == 0.4

    # TQ3080: the 0.1m capture is selected although it is the older (2008) survey.
    winner = by_square["TQ3080"]
    assert winner.resolution_m == 0.1
    assert winner.year == 2008
def test_select_best_rgb_breaks_resolution_ties_by_year() -> None:
    """With resolution equal across captures, the latest year is selected."""
    rgb = satellite_highres.VAP_RGB_PRODUCT
    # Three captures of one square at the same 0.25m resolution, newest in the
    # middle so the result cannot depend on input order.
    candidates = [
        VapTile(rgb, survey_year, 0.25, "TQ0101", "u", tag)
        for survey_year, tag in ((2009, "a"), (2018, "b"), (2015, "c"))
    ]

    winners = select_best_rgb_tiles(candidates)

    assert len(winners) == 1
    assert winners[0].year == 2018
def test_select_best_rgb_empty_when_no_rgb() -> None:
    """A payload holding only a LIDAR record selects no tiles at all."""
    lidar_only = {
        "results": [_result("lidar_composite_dtm", "2022", "1", "TQ2575")],
    }
    parsed = parse_search_results(lidar_only)
    assert select_best_rgb_tiles(parsed) == []
# Shared tile handed to the download tests below. The URI is a placeholder:
# those tests replace ``urlopen`` with a fake, so it is not fetched by them.
_TILE = VapTile(
    os_tile_id="TQ2575",
    product_id=satellite_highres.VAP_RGB_PRODUCT,
    resolution_m=0.4,
    year=2008,
    label="t",
    uri="https://example.invalid/tile",
)
def _zip_with_ecw() -> bytes:
"""A minimal valid survey zip carrying one ECW member."""
buffer = io.BytesIO()
with zipfile.ZipFile(buffer, "w") as archive:
archive.writestr("TQ2575.ecw", b"ecw-pixels")
archive.writestr("readme.txt", b"ignored")
return buffer.getvalue()
def _fake_urlopen(bodies: list[bytes]):
"""Return a urlopen stand-in that yields each body in turn as the response."""
queue = list(bodies)
def opener(_request, timeout=None): # noqa: ANN001 - matches urlopen signature
return io.BytesIO(queue.pop(0))
return opener
def test_download_extracts_ecw_from_valid_zip(tmp_path, monkeypatch) -> None:
    """A valid zip yields only its ECW member, and no zip is left behind."""
    opener = _fake_urlopen([_zip_with_ecw()])
    monkeypatch.setattr(satellite_highres.urllib.request, "urlopen", opener)

    extracted = satellite_highres._download_and_extract(
        _TILE, tmp_path, key="k", timeout=1.0, retries=2
    )

    # The archive's readme.txt must not show up among the returned paths.
    names = [path.name for path in extracted]
    assert names == ["TQ2575_TQ2575.ecw"]
    assert extracted[0].read_bytes() == b"ecw-pixels"

    # The downloaded zip is transient and must not linger after extraction.
    leftover_zip = tmp_path / "TQ2575.zip"
    assert not leftover_zip.exists()
def test_download_retries_past_a_corrupt_body(tmp_path, monkeypatch) -> None:
    """A non-zip first response is retried and the follow-up zip is extracted."""
    # Served in order: an HTML error page first, then a genuine zip.
    responses = [b"<html>rate limited</html>", _zip_with_ecw()]
    monkeypatch.setattr(
        satellite_highres.urllib.request, "urlopen", _fake_urlopen(responses)
    )

    extracted = satellite_highres._download_and_extract(
        _TILE, tmp_path, key="k", timeout=1.0, retries=2
    )

    names = [path.name for path in extracted]
    assert names == ["TQ2575_TQ2575.ecw"]
def test_download_skips_tile_when_all_attempts_are_corrupt(
    tmp_path, monkeypatch
) -> None:
    """If every response is a non-zip body, the tile is skipped without raising."""
    # Three bad bodies are queued for retries=2 (presumably one initial
    # attempt plus two retries).
    bad_bodies = [b"not a zip"] * 3
    monkeypatch.setattr(
        satellite_highres.urllib.request, "urlopen", _fake_urlopen(bad_bodies)
    )

    extracted = satellite_highres._download_and_extract(
        _TILE, tmp_path, key="k", timeout=1.0, retries=2
    )

    assert extracted == []
    # Neither a leftover zip nor a partial ecw may remain in tmp_path.
    leftovers = list(tmp_path.glob("*"))
    assert leftovers == []