perfect-postcode/pipeline/utils/download.py
Andras Schmelczer f59d01227b
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 15s
CI / Check (push) Failing after 1m58s
Split up
2026-06-12 21:51:37 +01:00

131 lines
4.7 KiB
Python

"""Shared download and extraction helpers for pipeline scripts."""
import time
import zipfile
from io import BytesIO
from pathlib import Path
import httpx
import polars as pl
from tqdm import tqdm
# Number of Census 2021 LSOAs (NOMIS geography TYPE151) whose code has an E
# prefix. The Census 2021 geography is frozen, so a NOMIS England-level
# download must yield exactly this many LSOAs; a smaller count means the
# download was truncated.
ENGLAND_LSOA_COUNT_2021 = 33_755
def download(url: str, output_path: Path, *, timeout: float = 120) -> None:
    """Stream-download a URL to a local file with a tqdm progress bar.

    The body is written to a sibling temporary file and only renamed onto
    ``output_path`` once the whole response has been received, so a failed or
    interrupted transfer never leaves a truncated file at ``output_path``.

    Args:
        url: Address to fetch with GET; redirects are followed.
        output_path: Destination file. Its parent directory must already
            exist.
        timeout: Read timeout in seconds. The remaining httpx timeouts are
            fixed at 30 seconds.

    Raises:
        httpx.HTTPStatusError: If the final response has a 4xx/5xx status.
    """
    tmp_path = output_path.with_name(f"{output_path.stem}.tmp{output_path.suffix}")
    try:
        with httpx.stream(
            "GET",
            url,
            follow_redirects=True,
            timeout=httpx.Timeout(30.0, read=timeout),
        ) as response:
            response.raise_for_status()  # pyright: ignore[reportUnusedCallResult]
            total = int(response.headers.get("content-length", 0))
            with (
                open(tmp_path, "wb") as f,
                tqdm(
                    total=total or None,
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    desc=output_path.name,
                ) as pbar,
            ):
                # Content-Length counts bytes on the wire, while iter_bytes()
                # yields decoded bytes; advance the bar by wire bytes so it
                # stays consistent with `total` for compressed responses.
                received = response.num_bytes_downloaded
                for chunk in response.iter_bytes(chunk_size=8192):
                    f.write(chunk)
                    pbar.update(response.num_bytes_downloaded - received)
                    received = response.num_bytes_downloaded
        # Same-directory rename: the destination only ever sees a full file.
        tmp_path.replace(output_path)
    finally:
        # No-op after a successful replace; removes the partial file otherwise.
        tmp_path.unlink(missing_ok=True)
def extract_zip(zip_path: Path, extract_dir: Path) -> None:
    """Unpack every member of ``zip_path`` beneath ``extract_dir``.

    The target directory, including any missing parents, is created before the
    archive is opened; an already-existing directory is reused.
    """
    extract_dir.mkdir(exist_ok=True, parents=True)
    with zipfile.ZipFile(zip_path, mode="r") as archive:
        archive.extractall(path=extract_dir)
def download_nomis_csv(base_url: str, *, page_size: int = 25_000) -> pl.DataFrame:
    """Download a NOMIS CSV dataset, paging with recordoffset/RecordLimit.

    The page size is sent explicitly as ``RecordLimit``: last-page detection is
    ``rows < page_size``, so relying on NOMIS's implicit default would silently
    truncate the dataset to one page if that default ever differed.

    Args:
        base_url: NOMIS CSV endpoint URL. The paging parameters are appended
            with ``&`` when it already has a query string, ``?`` otherwise.
        page_size: Rows requested per page; must be at least 1.

    Returns:
        All fetched pages concatenated in fetch order.

    Raises:
        ValueError: If ``page_size`` is less than 1.
        httpx.HTTPStatusError: If a page request returns a 4xx/5xx status.
        RuntimeError: If no rows were returned at all.
    """
    if page_size < 1:
        # With a non-positive page size the "short page" test below can never
        # be true, so the loop would never reach its last-page condition.
        raise ValueError(f"page_size must be >= 1, got {page_size}")

    separator = "&" if "?" in base_url else "?"
    frames: list[pl.DataFrame] = []
    offset = 0
    # One client for all pages so the connection is reused between requests.
    with httpx.Client(follow_redirects=True, timeout=120) as client:
        while True:
            url = (
                f"{base_url}{separator}"
                f"RecordLimit={page_size}&recordoffset={offset}"
            )
            response = client.get(url)
            response.raise_for_status()  # pyright: ignore[reportUnusedCallResult]
            if not response.content:
                break
            chunk = pl.read_csv(BytesIO(response.content))
            if chunk.height == 0:
                break
            frames.append(chunk)
            print(f" Fetched {chunk.height} rows (offset={offset})")
            # NOTE(review): a short page is taken as the last page; if the
            # server caps RecordLimit below page_size this would stop early —
            # confirm the cap for the credentials in use.
            if chunk.height < page_size:
                break
            offset += page_size
    if not frames:
        raise RuntimeError(f"NOMIS returned no rows for {base_url}")
    return pl.concat(frames)
def download_arcgis_hub_export(
    url: str,
    output_path: Path,
    *,
    expected_geometry: str | None = None,
    poll_interval_s: float = 5,
    poll_timeout_s: float = 600,
) -> int:
    """Download an ArcGIS Hub `api/download/v1` export, handling deferred jobs.

    The endpoint returns HTTP 202 with a JSON status body while the export is
    still being prepared; a plain download would save that placeholder as the
    output file with a success exit code. Poll until the file is ready, then
    validate the result with pyogrio (feature count > 0 and, optionally, a
    geometry-type substring) before moving it into place.

    The payload is staged in a sibling temporary file which is removed again
    if the download or the validation fails, so neither a placeholder nor a
    rejected file is left behind.

    Args:
        url: Export endpoint to poll and download from.
        output_path: Final location of the validated file.
        expected_geometry: If given, a substring that must occur in the
            geometry type reported by pyogrio.
        poll_interval_s: Seconds to wait between polls while the server
            answers 202.
        poll_timeout_s: Seconds after which a 202 answer is treated as a
            failure instead of triggering another poll.

    Returns:
        The feature count reported by pyogrio for the downloaded file.

    Raises:
        TimeoutError: If the export is still pending after ``poll_timeout_s``.
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        ValueError: If the file has no features or the wrong geometry type.
    """
    import pyogrio

    # The temp name keeps the original suffix (presumably so pyogrio can still
    # recognise the format from the extension — verify before renaming).
    tmp_path = output_path.with_name(f"{output_path.stem}.tmp{output_path.suffix}")
    deadline = time.monotonic() + poll_timeout_s
    try:
        with httpx.Client(follow_redirects=True, timeout=300) as client:
            while True:
                with client.stream("GET", url) as response:
                    if response.status_code == 202:
                        # Load the status body so it can be quoted on timeout.
                        response.read()
                        if time.monotonic() > deadline:
                            raise TimeoutError(
                                f"Export did not finish within {poll_timeout_s}s: "
                                f"{response.text}"
                            )
                    else:
                        response.raise_for_status()  # pyright: ignore[reportUnusedCallResult]
                        with tmp_path.open("wb") as fh:
                            for chunk in response.iter_bytes():
                                fh.write(chunk)
                        break
                # Still pending: wait with the response already released.
                time.sleep(poll_interval_s)

        info = pyogrio.read_info(tmp_path)
        features = int(info.get("features", 0))
        geometry_type = str(info.get("geometry_type") or "")
        if features <= 0:
            raise ValueError(
                f"Downloaded file {output_path.name} contains no features"
            )
        if expected_geometry is not None and expected_geometry not in geometry_type:
            raise ValueError(
                f"Expected {expected_geometry!r} geometry in {output_path.name}, "
                f"got {geometry_type!r}"
            )
        tmp_path.replace(output_path)
        return features
    finally:
        # No-op after a successful replace; otherwise drops the staged file.
        tmp_path.unlink(missing_ok=True)