# perfect-postcode/pipeline/utils/download.py

"""Shared download and extraction helpers for pipeline scripts."""

import time
import zipfile
from io import BytesIO
from pathlib import Path

import httpx
import polars as pl
from tqdm import tqdm

# Number of Census 2021 LSOAs (TYPE151) whose code has an E prefix. The Census
# 2021 geography is frozen, so a NOMIS England-level download must yield exactly
# this many LSOAs; a smaller count means the download was truncated.
ENGLAND_LSOA_COUNT_2021: int = 33_755


def download(url: str, output_path: Path, *, timeout: float = 120) -> None:
    """Stream-download a URL to a local file with a tqdm progress bar.

    The body is written to a sibling ``<name>.part`` file and only renamed to
    ``output_path`` once the whole stream has been consumed, so a failed or
    interrupted download never leaves a truncated file at ``output_path``.

    Args:
        url: Address to fetch; redirects are followed.
        output_path: Destination file. It is replaced only on success.
        timeout: Read timeout in seconds. The other httpx timeouts use 30s.

    Raises:
        httpx.HTTPStatusError: If the final response has a 4xx/5xx status.
    """
    part_path = output_path.with_name(f"{output_path.name}.part")
    try:
        with httpx.stream(
            "GET",
            url,
            follow_redirects=True,
            timeout=httpx.Timeout(30.0, read=timeout),
        ) as response:
            response.raise_for_status()  # pyright: ignore[reportUnusedCallResult]
            total = int(response.headers.get("content-length", 0))

            with (
                part_path.open("wb") as f,
                tqdm(
                    total=total or None,
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    desc=output_path.name,
                ) as pbar,
            ):
                # Content-Length counts bytes on the wire, while iter_bytes()
                # yields decoded bytes. Advance the bar by the raw byte counter
                # so it still matches the total for compressed responses.
                raw_seen = response.num_bytes_downloaded
                for chunk in response.iter_bytes(chunk_size=8192):
                    f.write(chunk)
                    pbar.update(response.num_bytes_downloaded - raw_seen)
                    raw_seen = response.num_bytes_downloaded

        # Reached only when the stream finished without raising.
        part_path.replace(output_path)
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        part_path.unlink(missing_ok=True)


def extract_zip(zip_path: Path, extract_dir: Path) -> None:
    """Unpack every member of a ZIP archive beneath ``extract_dir``.

    The target directory, including any missing parents, is created before
    the archive is opened.

    Args:
        zip_path: Path of the archive to read.
        extract_dir: Directory that receives the extracted members.
    """
    extract_dir.mkdir(exist_ok=True, parents=True)
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(path=extract_dir)


def download_nomis_csv(base_url: str, *, page_size: int = 25_000) -> pl.DataFrame:
    """Download a NOMIS CSV dataset, paging with recordoffset/RecordLimit.

    The page size is sent explicitly as ``RecordLimit``: last-page detection is
    ``rows < page_size``, so relying on NOMIS's implicit default would silently
    truncate the dataset to one page if that default ever differed.

    Args:
        base_url: CSV endpoint URL. The paging parameters are appended with
            ``&`` when the URL already contains a query string, else ``?``.
        page_size: Number of rows requested per page; must be at least 1.

    Returns:
        All fetched pages concatenated in request order.

    Raises:
        ValueError: If ``page_size`` is less than 1.
        httpx.HTTPStatusError: If any page request returns a 4xx/5xx status.
        RuntimeError: If no page contained any rows.
    """
    if page_size < 1:
        # A non-positive size can never satisfy the short-page stop condition
        # and would not advance the offset, so the loop below might never end.
        raise ValueError(f"page_size must be at least 1, got {page_size}")

    separator = "&" if "?" in base_url else "?"
    frames = []
    offset = 0
    # One client for all pages so the connection is reused between requests.
    with httpx.Client(follow_redirects=True, timeout=120) as client:
        while True:
            url = (
                f"{base_url}{separator}"
                f"RecordLimit={page_size}&recordoffset={offset}"
            )
            response = client.get(url)
            response.raise_for_status()  # pyright: ignore[reportUnusedCallResult]
            if not response.content:
                break
            chunk = pl.read_csv(BytesIO(response.content))
            if chunk.height == 0:
                break
            frames.append(chunk)
            print(f"  Fetched {chunk.height} rows (offset={offset})")
            if chunk.height < page_size:
                # A short page is the last page.
                break
            offset += page_size

    if not frames:
        raise RuntimeError(f"NOMIS returned no rows for {base_url}")
    return pl.concat(frames)


def download_arcgis_hub_export(
    url: str,
    output_path: Path,
    *,
    expected_geometry: str | None = None,
    poll_interval_s: float = 5,
    poll_timeout_s: float = 600,
) -> int:
    """Download an ArcGIS Hub `api/download/v1` export, handling deferred jobs.

    The endpoint returns HTTP 202 with a JSON status body while the export is
    still being prepared; a plain download would save that placeholder as the
    output file with a success exit code. Poll until the file is ready, then
    validate the result with pyogrio (feature count > 0 and, optionally, a
    geometry-type substring) before moving it into place.

    The download is staged in a sibling ``<stem>.tmp<suffix>`` file, which is
    removed again if polling, the download, or validation fails.

    Args:
        url: Export URL to request; redirects are followed.
        output_path: Final location of the validated file.
        expected_geometry: If given, must be a substring of the geometry type
            reported by pyogrio.
        poll_interval_s: Seconds to wait between polls while the status is 202.
        poll_timeout_s: Seconds after which a further 202 response is treated
            as a failure.

    Returns:
        The feature count reported by pyogrio.

    Raises:
        TimeoutError: If the export is still pending after ``poll_timeout_s``.
        httpx.HTTPStatusError: If the endpoint answers with a 4xx/5xx status.
        ValueError: If the file has no features or the wrong geometry type.
    """
    import pyogrio

    tmp_path = output_path.with_name(f"{output_path.stem}.tmp{output_path.suffix}")
    deadline = time.monotonic() + poll_timeout_s
    try:
        with httpx.Client(follow_redirects=True, timeout=300) as client:
            while True:
                with client.stream("GET", url) as response:
                    if response.status_code != 202:
                        response.raise_for_status()  # pyright: ignore[reportUnusedCallResult]
                        with tmp_path.open("wb") as fh:
                            for chunk in response.iter_bytes():
                                fh.write(chunk)
                        break
                    # Still preparing: load the status body so it can be
                    # quoted in the timeout message after the response closes.
                    response.read()
                    pending_body = response.text

                # Wait outside the response context so the connection is not
                # held open for the duration of the sleep.
                if time.monotonic() > deadline:
                    raise TimeoutError(
                        f"Export did not finish within {poll_timeout_s}s: "
                        f"{pending_body}"
                    )
                time.sleep(poll_interval_s)

        info = pyogrio.read_info(tmp_path)
        features = int(info.get("features", 0))
        geometry_type = str(info.get("geometry_type") or "")
        if features <= 0:
            raise ValueError(
                f"Downloaded file {output_path.name} contains no features"
            )
        if expected_geometry is not None and expected_geometry not in geometry_type:
            raise ValueError(
                f"Expected {expected_geometry!r} geometry in {output_path.name}, "
                f"got {geometry_type!r}"
            )

        tmp_path.replace(output_path)
        return features
    finally:
        # No-op after a successful rename; otherwise removes the staged file.
        tmp_path.unlink(missing_ok=True)