scraping and data
This commit is contained in:
parent
d98819b569
commit
8688b7475e
43 changed files with 4920 additions and 531 deletions
|
|
@ -4,7 +4,10 @@ Downloads GML files for all local authorities from the INSPIRE download page.
|
|||
Each ZIP contains a GML file with title extent polygons for that authority.
|
||||
|
||||
Source: https://use-land-property-data.service.gov.uk/datasets/inspire/download
|
||||
License: INSPIRE End User Licence
|
||||
License: Open Government Licence v3.0 (since 1 July 2020, under the PSGA).
|
||||
Requires HM Land Registry + Ordnance Survey (AC0000851063) attribution; see
|
||||
the conditions page at the source URL. Boundaries are indicative "general
|
||||
boundaries", not the legal extent of title.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
|
|||
505
pipeline/download/satellite_highres.py
Normal file
505
pipeline/download/satellite_highres.py
Normal file
|
|
@ -0,0 +1,505 @@
|
|||
"""Build a high-resolution England aerial PMTiles archive from EA Vertical Aerial Photography.
|
||||
|
||||
The Environment Agency / Defra Vertical Aerial Photography (VAP) archive is open
|
||||
(OGL v3.0) RGB orthophotography at 10-50 cm, distributed as 5 km ECW tiles on the
|
||||
British National Grid. There is no public imagery tile service, so we mirror the
|
||||
Sentinel-2 ``satellite.pmtiles`` approach: query the Defra survey download API for
|
||||
an area of interest, pick the best RGB capture per OS tile, download and decode the
|
||||
ECW rasters, re-tile them into Web-Mercator raster tiles, and bake a single PMTiles
|
||||
archive that the server stacks *over* the Sentinel-2 base where coverage exists.
|
||||
|
||||
ECW decoding needs a GDAL build that includes the (free, read-only) ERDAS ECW/JP2
|
||||
SDK, which is not present in the rasterio wheel. The mosaic + tiling step therefore
|
||||
runs inside a GDAL-with-ECW Docker image (see ``docker/gdal-ecw/Dockerfile``); the
|
||||
rest of the pipeline is plain Python plus the ``pmtiles`` CLI.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import shutil
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import tempfile
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
import zipfile
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from pipeline.download.tiles import ensure_pmtiles_cli
|
||||
from pipeline.local_temp import local_tmp_dir
|
||||
|
||||
# Defra Data Services Platform survey download API (reverse-engineered from the
|
||||
# environment.data.gov.uk/survey front-end; no official API is documented).
|
||||
SEARCH_URL = (
|
||||
"https://environment.data.gov.uk/backend/catalog/api/tiles/collections/survey/search"
|
||||
)
|
||||
SURVEY_PAGE_URL = "https://environment.data.gov.uk/survey"
|
||||
# Static public key baked into the survey page JS. May rotate -- we try to scrape a
|
||||
# fresh one from the page and only fall back to this literal.
|
||||
DEFAULT_SUBSCRIPTION_KEY = "dspui"
|
||||
SUBSCRIPTION_KEY_RE = re.compile(r"subscription-key=([A-Za-z0-9]+)")
|
||||
|
||||
# True-colour RGB product only (skip IRRGB near-infra-red and Night Time variants).
|
||||
VAP_RGB_PRODUCT = "vertical_aerial_photography_tiles_rgb"
|
||||
|
||||
# Greater London bounding box (lon/lat). The API only returns tiles where coverage
|
||||
# exists, so a generous bbox is fine -- it does not force blank downloads.
|
||||
DEFAULT_AOI: dict = {
|
||||
"type": "Polygon",
|
||||
"coordinates": [
|
||||
[
|
||||
[-0.55, 51.25],
|
||||
[0.30, 51.25],
|
||||
[0.30, 51.70],
|
||||
[-0.55, 51.70],
|
||||
[-0.55, 51.25],
|
||||
]
|
||||
],
|
||||
}
|
||||
|
||||
DEFAULT_MIN_ZOOM = 14
|
||||
DEFAULT_MAX_ZOOM = 19
|
||||
# GDAL image with the ECW driver. The official OSGeo image does not ship ECW, so
|
||||
# this defaults to the locally-built image from docker/gdal-ecw/Dockerfile.
|
||||
DEFAULT_GDAL_IMAGE = "perfect-postcode/gdal-ecw:latest"
|
||||
USER_AGENT = "perfect-postcode-satellite-highres/1.0"
|
||||
ATTRIBUTION_TEMPLATE = (
|
||||
"Environment Agency Vertical Aerial Photography - "
|
||||
"© Environment Agency copyright and/or database right {year}. "
|
||||
"All rights reserved. Licensed under the Open Government Licence v3.0."
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class VapTile:
|
||||
"""One survey download record from the Defra search API."""
|
||||
|
||||
product_id: str
|
||||
year: int
|
||||
resolution_m: float
|
||||
os_tile_id: str
|
||||
uri: str
|
||||
label: str
|
||||
|
||||
|
||||
def parse_search_results(payload: dict) -> list[VapTile]:
|
||||
"""Turn a raw search-API JSON payload into typed records."""
|
||||
tiles: list[VapTile] = []
|
||||
for result in payload.get("results", []):
|
||||
try:
|
||||
tiles.append(
|
||||
VapTile(
|
||||
product_id=result["product"]["id"],
|
||||
year=int(result["year"]["id"]),
|
||||
resolution_m=float(result["resolution"]["id"]),
|
||||
os_tile_id=result["tile"]["id"],
|
||||
uri=result["uri"],
|
||||
label=result.get("label", ""),
|
||||
)
|
||||
)
|
||||
except (KeyError, TypeError, ValueError):
|
||||
# Skip malformed records rather than failing the whole search.
|
||||
continue
|
||||
return tiles
|
||||
|
||||
|
||||
def select_best_rgb_tiles(tiles: list[VapTile]) -> list[VapTile]:
|
||||
"""Pick one RGB capture per OS tile: finest resolution, then latest year.
|
||||
|
||||
Pure function -- the unit test exercises this against a real-shaped payload.
|
||||
"""
|
||||
best: dict[str, VapTile] = {}
|
||||
for tile in tiles:
|
||||
if tile.product_id != VAP_RGB_PRODUCT:
|
||||
continue
|
||||
current = best.get(tile.os_tile_id)
|
||||
if current is None or _is_better(tile, current):
|
||||
best[tile.os_tile_id] = tile
|
||||
return [best[key] for key in sorted(best)]
|
||||
|
||||
|
||||
def _is_better(candidate: VapTile, incumbent: VapTile) -> bool:
|
||||
"""Finer resolution wins; ties broken by the most recent survey year."""
|
||||
if candidate.resolution_m != incumbent.resolution_m:
|
||||
return candidate.resolution_m < incumbent.resolution_m
|
||||
return candidate.year > incumbent.year
|
||||
|
||||
|
||||
def _http_get(url: str, timeout: float) -> bytes:
|
||||
req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
|
||||
with urllib.request.urlopen(req, timeout=timeout) as response:
|
||||
return response.read()
|
||||
|
||||
|
||||
def resolve_subscription_key(explicit: str | None, timeout: float = 30.0) -> str:
|
||||
"""Use an explicit key, else scrape the survey page JS, else the known default."""
|
||||
if explicit:
|
||||
return explicit
|
||||
try:
|
||||
page = _http_get(SURVEY_PAGE_URL, timeout).decode("utf-8", "ignore")
|
||||
match = SUBSCRIPTION_KEY_RE.search(page)
|
||||
if match:
|
||||
return match.group(1)
|
||||
# The key usually lives in a referenced JS chunk; scan the largest one.
|
||||
for chunk in re.findall(r'src="(/_next/static/[^"]+\.js)"', page):
|
||||
js = _http_get(f"https://environment.data.gov.uk{chunk}", timeout)
|
||||
match = SUBSCRIPTION_KEY_RE.search(js.decode("utf-8", "ignore"))
|
||||
if match:
|
||||
return match.group(1)
|
||||
except (urllib.error.URLError, TimeoutError, ConnectionError) as err:
|
||||
print(f"Could not scrape subscription key ({err}); using default", flush=True)
|
||||
return DEFAULT_SUBSCRIPTION_KEY
|
||||
|
||||
|
||||
def search_vap_tiles(aoi: dict, timeout: float = 60.0) -> list[VapTile]:
|
||||
"""POST the area-of-interest polygon and return the RGB tiles to download."""
|
||||
body = json.dumps(aoi).encode("utf-8")
|
||||
req = urllib.request.Request(
|
||||
SEARCH_URL,
|
||||
data=body,
|
||||
headers={
|
||||
"Content-Type": "application/geo+json",
|
||||
"Referer": SURVEY_PAGE_URL,
|
||||
"User-Agent": USER_AGENT,
|
||||
},
|
||||
method="POST",
|
||||
)
|
||||
with urllib.request.urlopen(req, timeout=timeout) as response:
|
||||
payload = json.load(response)
|
||||
selected = select_best_rgb_tiles(parse_search_results(payload))
|
||||
print(
|
||||
f"Search returned {payload.get('count', 0)} records; "
|
||||
f"selected {len(selected)} RGB tile(s)",
|
||||
flush=True,
|
||||
)
|
||||
return selected
|
||||
|
||||
|
||||
def _download_and_extract(
|
||||
tile: VapTile, ecw_dir: Path, key: str, timeout: float, retries: int
|
||||
) -> list[Path]:
|
||||
"""Download one survey zip and extract its ECW raster(s)."""
|
||||
url = f"{tile.uri}?subscription-key={key}"
|
||||
zip_path = ecw_dir / f"{tile.os_tile_id}.zip"
|
||||
last_error: Exception | None = None
|
||||
for attempt in range(retries + 1):
|
||||
try:
|
||||
with urllib.request.urlopen(
|
||||
urllib.request.Request(url, headers={"User-Agent": USER_AGENT}),
|
||||
timeout=timeout,
|
||||
) as response, zip_path.open("wb") as out:
|
||||
shutil.copyfileobj(response, out, length=1 << 20)
|
||||
break
|
||||
except (urllib.error.URLError, TimeoutError, ConnectionError) as err:
|
||||
last_error = err
|
||||
if attempt == retries:
|
||||
raise RuntimeError(f"Failed to download {url}: {err}") from err
|
||||
extracted: list[Path] = []
|
||||
with zipfile.ZipFile(zip_path) as archive:
|
||||
for member in archive.infolist():
|
||||
if member.is_dir() or not member.filename.lower().endswith(".ecw"):
|
||||
continue
|
||||
target = ecw_dir / f"{tile.os_tile_id}_{Path(member.filename).name}"
|
||||
with archive.open(member) as src, target.open("wb") as dst:
|
||||
shutil.copyfileobj(src, dst, length=1 << 20)
|
||||
extracted.append(target)
|
||||
zip_path.unlink(missing_ok=True)
|
||||
if not extracted:
|
||||
print(f" {tile.os_tile_id}: no ECW in archive (skipped)", flush=True)
|
||||
return extracted
|
||||
|
||||
|
||||
def download_tiles(
|
||||
tiles: list[VapTile],
|
||||
ecw_dir: Path,
|
||||
key: str,
|
||||
max_workers: int,
|
||||
timeout: float,
|
||||
retries: int,
|
||||
) -> list[Path]:
|
||||
"""Download every selected tile concurrently; return all extracted ECW paths."""
|
||||
ecw_dir.mkdir(parents=True, exist_ok=True)
|
||||
ecw_paths: list[Path] = []
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
futures = {
|
||||
executor.submit(
|
||||
_download_and_extract, tile, ecw_dir, key, timeout, retries
|
||||
): tile
|
||||
for tile in tiles
|
||||
}
|
||||
done = 0
|
||||
for future in as_completed(futures):
|
||||
tile = futures[future]
|
||||
ecw_paths.extend(future.result())
|
||||
done += 1
|
||||
print(
|
||||
f"Downloaded {done}/{len(tiles)} tiles "
|
||||
f"(latest: {tile.os_tile_id} {tile.resolution_m}m {tile.year})",
|
||||
flush=True,
|
||||
)
|
||||
return ecw_paths
|
||||
|
||||
|
||||
def _build_tiles_with_gdal(
|
||||
work_dir: Path,
|
||||
gdal_image: str,
|
||||
min_zoom: int,
|
||||
max_zoom: int,
|
||||
jobs: int,
|
||||
webp_quality: int,
|
||||
) -> Path:
|
||||
"""Mosaic the ECW rasters and emit XYZ WebP tiles inside the GDAL-with-ECW image.
|
||||
|
||||
Returns the host path of the generated ``xyz`` directory. We use lossy WebP with
|
||||
an alpha channel: ~6x smaller than lossless PNG for photographic imagery while
|
||||
keeping transparency, so coverage gaps stay see-through and the Sentinel-2 base
|
||||
shows through them.
|
||||
"""
|
||||
xyz_dir = work_dir / "xyz"
|
||||
# EA "RGB" ECWs are 4-band RGBA (band 4 is a constant-255 validity/alpha mask),
|
||||
# so we build a plain 4-band VRT (no -addalpha, which would make a 5th band and
|
||||
# exceed PNG's 4-band limit). We then:
|
||||
# * force EPSG:27700 -- the pixels are already British National Grid, and the
|
||||
# EPSG code lets PROJ apply the OSTN15 datum shift (grid ships in the image)
|
||||
# for metre-accurate reprojection to Web Mercator;
|
||||
# * label band 4 as alpha so gdal2tiles writes transparent PNGs. Inter-block
|
||||
# gaps the VRT fills with 0 then read as alpha=0 (transparent), letting the
|
||||
# Sentinel-2 base show through wherever VAP coverage is missing.
|
||||
script = (
|
||||
"set -euo pipefail; "
|
||||
"cd /work; "
|
||||
"gdalbuildvrt -resolution highest mosaic.vrt ecw/*.ecw; "
|
||||
"gdal_edit.py -a_srs EPSG:27700 "
|
||||
"-colorinterp_1 red -colorinterp_2 green -colorinterp_3 blue "
|
||||
"-colorinterp_4 alpha mosaic.vrt; "
|
||||
f"gdal2tiles.py --xyz --zoom={min_zoom}-{max_zoom} "
|
||||
f"--processes={jobs} --resampling=average --webviewer=none "
|
||||
f"--tiledriver=WEBP --webp-quality={webp_quality} "
|
||||
"mosaic.vrt xyz"
|
||||
)
|
||||
subprocess.run(
|
||||
[
|
||||
"docker",
|
||||
"run",
|
||||
"--rm",
|
||||
"-v",
|
||||
f"{work_dir.resolve()}:/work",
|
||||
gdal_image,
|
||||
"bash",
|
||||
"-c",
|
||||
script,
|
||||
],
|
||||
check=True,
|
||||
)
|
||||
if not xyz_dir.exists():
|
||||
raise RuntimeError("gdal2tiles produced no output directory")
|
||||
return xyz_dir
|
||||
|
||||
|
||||
def _pack_xyz_to_mbtiles(
|
||||
xyz_dir: Path,
|
||||
mbtiles_path: Path,
|
||||
bounds: tuple[float, float, float, float],
|
||||
min_zoom: int,
|
||||
max_zoom: int,
|
||||
attribution: str,
|
||||
) -> int:
|
||||
"""Pack a gdal2tiles XYZ WebP directory into an MBTiles SQLite file (TMS rows)."""
|
||||
if mbtiles_path.exists():
|
||||
mbtiles_path.unlink()
|
||||
conn = sqlite3.connect(mbtiles_path)
|
||||
try:
|
||||
conn.execute("PRAGMA journal_mode = WAL")
|
||||
conn.execute("PRAGMA synchronous = NORMAL")
|
||||
conn.execute("CREATE TABLE metadata (name TEXT, value TEXT)")
|
||||
conn.execute(
|
||||
"CREATE TABLE tiles (zoom_level INTEGER, tile_column INTEGER, "
|
||||
"tile_row INTEGER, tile_data BLOB)"
|
||||
)
|
||||
conn.execute(
|
||||
"CREATE UNIQUE INDEX tile_index ON tiles "
|
||||
"(zoom_level, tile_column, tile_row)"
|
||||
)
|
||||
conn.executemany(
|
||||
"INSERT INTO metadata (name, value) VALUES (?, ?)",
|
||||
[
|
||||
("name", "EA Vertical Aerial Photography"),
|
||||
("type", "overlay"),
|
||||
("version", "1"),
|
||||
("description", "Environment Agency high-resolution aerial imagery"),
|
||||
("format", "webp"),
|
||||
("attribution", attribution),
|
||||
("bounds", ",".join(f"{value:.6f}" for value in bounds)),
|
||||
("minzoom", str(min_zoom)),
|
||||
("maxzoom", str(max_zoom)),
|
||||
],
|
||||
)
|
||||
inserted = 0
|
||||
for zoom_dir in sorted(xyz_dir.iterdir()):
|
||||
if not zoom_dir.is_dir() or not zoom_dir.name.isdigit():
|
||||
continue
|
||||
zoom = int(zoom_dir.name)
|
||||
for col_dir in zoom_dir.iterdir():
|
||||
if not col_dir.is_dir() or not col_dir.name.isdigit():
|
||||
continue
|
||||
col = int(col_dir.name)
|
||||
for tile_file in col_dir.glob("*.webp"):
|
||||
if not tile_file.stem.isdigit():
|
||||
continue
|
||||
row = int(tile_file.stem)
|
||||
tms_row = (1 << zoom) - 1 - row
|
||||
conn.execute(
|
||||
"INSERT OR REPLACE INTO tiles VALUES (?, ?, ?, ?)",
|
||||
(zoom, col, tms_row, tile_file.read_bytes()),
|
||||
)
|
||||
inserted += 1
|
||||
if inserted % 5000 == 0:
|
||||
conn.commit()
|
||||
print(f" packed {inserted:,} tiles", flush=True)
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
return inserted
|
||||
|
||||
|
||||
def build_satellite_highres_tiles(
|
||||
output_path: Path,
|
||||
pmtiles_bin: Path,
|
||||
pmtiles_version: str,
|
||||
aoi: dict,
|
||||
min_zoom: int,
|
||||
max_zoom: int,
|
||||
gdal_image: str,
|
||||
subscription_key: str | None,
|
||||
max_workers: int,
|
||||
timeout: float,
|
||||
retries: int,
|
||||
jobs: int,
|
||||
webp_quality: int,
|
||||
) -> None:
|
||||
if min_zoom > max_zoom:
|
||||
raise ValueError("--min-zoom must be <= --max-zoom")
|
||||
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
ensure_pmtiles_cli(pmtiles_bin, pmtiles_version)
|
||||
|
||||
tiles = search_vap_tiles(aoi)
|
||||
if not tiles:
|
||||
raise RuntimeError("No RGB Vertical Aerial Photography tiles for the AOI")
|
||||
key = resolve_subscription_key(subscription_key)
|
||||
attribution = ATTRIBUTION_TEMPLATE.format(year=max(tile.year for tile in tiles))
|
||||
|
||||
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
|
||||
work_dir = Path(tmp)
|
||||
ecw_dir = work_dir / "ecw"
|
||||
ecw_paths = download_tiles(
|
||||
tiles, ecw_dir, key, max_workers, timeout, retries
|
||||
)
|
||||
if not ecw_paths:
|
||||
raise RuntimeError("No ECW rasters were extracted from the downloads")
|
||||
|
||||
xyz_dir = _build_tiles_with_gdal(
|
||||
work_dir, gdal_image, min_zoom, max_zoom, jobs, webp_quality
|
||||
)
|
||||
|
||||
mbtiles_path = work_dir / "satellite_highres.mbtiles"
|
||||
bounds = _aoi_bounds(aoi)
|
||||
inserted = _pack_xyz_to_mbtiles(
|
||||
xyz_dir, mbtiles_path, bounds, min_zoom, max_zoom, attribution
|
||||
)
|
||||
if inserted == 0:
|
||||
raise RuntimeError("Tiling produced no tiles to pack")
|
||||
print(f"Packed {inserted:,} tiles into MBTiles", flush=True)
|
||||
|
||||
subprocess.run(
|
||||
[str(pmtiles_bin), "convert", str(mbtiles_path), str(output_path), "--force"],
|
||||
check=True,
|
||||
)
|
||||
|
||||
size_mb = output_path.stat().st_size / (1024 * 1024)
|
||||
print(f"Wrote {output_path} ({size_mb:.1f} MB) -- {attribution}", flush=True)
|
||||
|
||||
|
||||
def _aoi_bounds(aoi: dict) -> tuple[float, float, float, float]:
|
||||
coords = [point for ring in aoi["coordinates"] for point in ring]
|
||||
lons = [point[0] for point in coords]
|
||||
lats = [point[1] for point in coords]
|
||||
return min(lons), min(lats), max(lons), max(lats)
|
||||
|
||||
|
||||
def _load_aoi(path: Path | None) -> dict:
|
||||
if path is None:
|
||||
return DEFAULT_AOI
|
||||
data = json.loads(path.read_text())
|
||||
if data.get("type") == "FeatureCollection":
|
||||
return data["features"][0]["geometry"]
|
||||
if data.get("type") == "Feature":
|
||||
return data["geometry"]
|
||||
return data
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--output", type=Path, required=True)
|
||||
parser.add_argument("--pmtiles-bin", type=Path, default=Path("property-data/pmtiles"))
|
||||
parser.add_argument("--pmtiles-version", default="1.22.3")
|
||||
parser.add_argument(
|
||||
"--aoi-geojson",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="GeoJSON Polygon/Feature/FeatureCollection for the area of interest "
|
||||
"(default: Greater London)",
|
||||
)
|
||||
parser.add_argument("--min-zoom", type=int, default=DEFAULT_MIN_ZOOM)
|
||||
parser.add_argument("--max-zoom", type=int, default=DEFAULT_MAX_ZOOM)
|
||||
parser.add_argument(
|
||||
"--gdal-image",
|
||||
default=DEFAULT_GDAL_IMAGE,
|
||||
help="Docker image with a GDAL that has the ECW driver",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--subscription-key",
|
||||
default=None,
|
||||
help="Override the Defra survey API key (default: scrape, then 'dspui')",
|
||||
)
|
||||
parser.add_argument("--max-workers", type=int, default=4)
|
||||
parser.add_argument("--timeout", type=float, default=600.0)
|
||||
parser.add_argument("--retries", type=int, default=3)
|
||||
parser.add_argument(
|
||||
"--jobs",
|
||||
type=int,
|
||||
default=8,
|
||||
help="Parallel processes for gdal2tiles",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--webp-quality",
|
||||
type=int,
|
||||
default=85,
|
||||
help="WebP tile quality (1-100); lower is smaller",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
build_satellite_highres_tiles(
|
||||
output_path=args.output,
|
||||
pmtiles_bin=args.pmtiles_bin,
|
||||
pmtiles_version=args.pmtiles_version,
|
||||
aoi=_load_aoi(args.aoi_geojson),
|
||||
min_zoom=args.min_zoom,
|
||||
max_zoom=args.max_zoom,
|
||||
gdal_image=args.gdal_image,
|
||||
subscription_key=args.subscription_key,
|
||||
max_workers=max(1, args.max_workers),
|
||||
timeout=args.timeout,
|
||||
retries=max(0, args.retries),
|
||||
jobs=max(1, args.jobs),
|
||||
webp_quality=args.webp_quality,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
97
pipeline/download/test_satellite_highres.py
Normal file
97
pipeline/download/test_satellite_highres.py
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
from pipeline.download import satellite_highres
|
||||
from pipeline.download.satellite_highres import (
|
||||
VapTile,
|
||||
parse_search_results,
|
||||
select_best_rgb_tiles,
|
||||
)
|
||||
|
||||
|
||||
def _result(product: str, year: str, resolution: str, tile: str) -> dict:
|
||||
"""One search-API record in the real response shape."""
|
||||
return {
|
||||
"product": {"id": product, "label": product},
|
||||
"year": {"id": year, "label": year},
|
||||
"resolution": {"id": resolution, "label": f"{resolution}m"},
|
||||
"tile": {"id": tile, "label": tile},
|
||||
"label": f"{product}-{year}-{resolution}m-{tile}",
|
||||
"uri": (
|
||||
"https://environment.data.gov.uk/tiles/collections/survey/"
|
||||
f"{product}/{year}/{resolution}/{tile}"
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
# Mirrors a real Greater-London response: RGB at 0.4m (2008) and 0.1m (2011),
|
||||
# plus Night Time and LIDAR products that must be ignored.
|
||||
SAMPLE_PAYLOAD = {
|
||||
"count": 6,
|
||||
"results": [
|
||||
_result("vertical_aerial_photography_tiles_rgb", "2008", "0.4", "TQ2575"),
|
||||
_result("vertical_aerial_photography_tiles_night_time", "2012", "0.2", "TQ2575"),
|
||||
_result("lidar_composite_dtm", "2022", "1", "TQ2575"),
|
||||
# TQ3080 has two RGB captures: a finer-but-older and a coarser-but-newer.
|
||||
_result("vertical_aerial_photography_tiles_rgb", "2008", "0.1", "TQ3080"),
|
||||
_result("vertical_aerial_photography_tiles_rgb", "2011", "0.25", "TQ3080"),
|
||||
_result("vertical_aerial_photography_tiles_irrgb", "2012", "0.5", "TQ3080"),
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def test_parse_search_results_skips_malformed_records() -> None:
|
||||
payload = {
|
||||
"results": [
|
||||
_result("vertical_aerial_photography_tiles_rgb", "2008", "0.4", "TQ2575"),
|
||||
{"product": {"id": "broken"}}, # missing year/resolution/tile/uri
|
||||
]
|
||||
}
|
||||
tiles = parse_search_results(payload)
|
||||
assert len(tiles) == 1
|
||||
assert tiles[0] == VapTile(
|
||||
product_id="vertical_aerial_photography_tiles_rgb",
|
||||
year=2008,
|
||||
resolution_m=0.4,
|
||||
os_tile_id="TQ2575",
|
||||
uri="https://environment.data.gov.uk/tiles/collections/survey/"
|
||||
"vertical_aerial_photography_tiles_rgb/2008/0.4/TQ2575",
|
||||
label="vertical_aerial_photography_tiles_rgb-2008-0.4m-TQ2575",
|
||||
)
|
||||
|
||||
|
||||
def test_select_best_rgb_filters_non_rgb_products() -> None:
|
||||
selected = select_best_rgb_tiles(parse_search_results(SAMPLE_PAYLOAD))
|
||||
assert {tile.product_id for tile in selected} == {
|
||||
satellite_highres.VAP_RGB_PRODUCT
|
||||
}
|
||||
|
||||
|
||||
def test_select_best_rgb_one_tile_per_os_square() -> None:
|
||||
selected = select_best_rgb_tiles(parse_search_results(SAMPLE_PAYLOAD))
|
||||
assert sorted(tile.os_tile_id for tile in selected) == ["TQ2575", "TQ3080"]
|
||||
|
||||
|
||||
def test_select_best_rgb_prefers_finest_resolution_then_latest_year() -> None:
|
||||
selected = {
|
||||
tile.os_tile_id: tile
|
||||
for tile in select_best_rgb_tiles(parse_search_results(SAMPLE_PAYLOAD))
|
||||
}
|
||||
# TQ2575: only one RGB capture.
|
||||
assert selected["TQ2575"].resolution_m == 0.4
|
||||
# TQ3080: finest resolution (0.1m) wins even though it is the older survey.
|
||||
assert selected["TQ3080"].resolution_m == 0.1
|
||||
assert selected["TQ3080"].year == 2008
|
||||
|
||||
|
||||
def test_select_best_rgb_breaks_resolution_ties_by_year() -> None:
|
||||
tiles = [
|
||||
VapTile(satellite_highres.VAP_RGB_PRODUCT, 2009, 0.25, "TQ0101", "u", "a"),
|
||||
VapTile(satellite_highres.VAP_RGB_PRODUCT, 2018, 0.25, "TQ0101", "u", "b"),
|
||||
VapTile(satellite_highres.VAP_RGB_PRODUCT, 2015, 0.25, "TQ0101", "u", "c"),
|
||||
]
|
||||
selected = select_best_rgb_tiles(tiles)
|
||||
assert len(selected) == 1
|
||||
assert selected[0].year == 2018
|
||||
|
||||
|
||||
def test_select_best_rgb_empty_when_no_rgb() -> None:
|
||||
payload = {"results": [_result("lidar_composite_dtm", "2022", "1", "TQ2575")]}
|
||||
assert select_best_rgb_tiles(parse_search_results(payload)) == []
|
||||
Loading…
Add table
Add a link
Reference in a new issue