"""Shared download and extraction helpers for pipeline scripts.""" import time import zipfile from io import BytesIO from pathlib import Path import httpx import polars as pl from tqdm import tqdm # Census 2021 LSOAs (TYPE151) with an E prefix. The Census 2021 geography is # frozen, so NOMIS England-level downloads must yield exactly this many LSOAs; # fewer means the download was truncated. ENGLAND_LSOA_COUNT_2021 = 33_755 def download(url: str, output_path: Path, *, timeout: float = 120) -> None: """Stream-download a URL to a local file with a tqdm progress bar.""" with httpx.stream( "GET", url, follow_redirects=True, timeout=httpx.Timeout(30.0, read=timeout), ) as response: response.raise_for_status() # pyright: ignore[reportUnusedCallResult] total = int(response.headers.get("content-length", 0)) with ( open(output_path, "wb") as f, tqdm( total=total or None, unit="B", unit_scale=True, unit_divisor=1024, desc=output_path.name, ) as pbar, ): for chunk in response.iter_bytes(chunk_size=8192): f.write(chunk) pbar.update(len(chunk)) def extract_zip(zip_path: Path, extract_dir: Path) -> None: """Extract a ZIP archive into the given directory.""" extract_dir.mkdir(parents=True, exist_ok=True) with zipfile.ZipFile(zip_path, "r") as zf: zf.extractall(extract_dir) def download_nomis_csv(base_url: str, *, page_size: int = 25_000) -> pl.DataFrame: """Download a NOMIS CSV dataset, paging with recordoffset/RecordLimit. The page size is sent explicitly as ``RecordLimit``: last-page detection is ``rows < page_size``, so relying on NOMIS's implicit default would silently truncate the dataset to one page if that default ever differed. """ frames = [] offset = 0 while True: url = f"{base_url}&RecordLimit={page_size}&recordoffset={offset}" response = httpx.get(url, follow_redirects=True, timeout=120) response.raise_for_status() # pyright: ignore[reportUnusedCallResult] if len(response.content) == 0: break chunk = pl.read_csv(BytesIO(response.content)) if chunk.height == 0: break frames.append(chunk) print(f" Fetched {chunk.height} rows (offset={offset})") if chunk.height < page_size: break offset += page_size if not frames: raise RuntimeError(f"NOMIS returned no rows for {base_url}") return pl.concat(frames) def download_arcgis_hub_export( url: str, output_path: Path, *, expected_geometry: str | None = None, poll_interval_s: float = 5, poll_timeout_s: float = 600, ) -> int: """Download an ArcGIS Hub `api/download/v1` export, handling deferred jobs. The endpoint returns HTTP 202 with a JSON status body while the export is still being prepared; a plain download would save that placeholder as the output file with a success exit code. Poll until the file is ready, then validate the result with pyogrio (feature count > 0 and, optionally, a geometry-type substring) before moving it into place. Returns the feature count. """ import pyogrio tmp_path = output_path.with_name(f"{output_path.stem}.tmp{output_path.suffix}") deadline = time.monotonic() + poll_timeout_s with httpx.Client(follow_redirects=True, timeout=300) as client: while True: with client.stream("GET", url) as response: if response.status_code == 202: response.read() if time.monotonic() > deadline: raise TimeoutError( f"Export did not finish within {poll_timeout_s}s: " f"{response.text}" ) time.sleep(poll_interval_s) continue response.raise_for_status() # pyright: ignore[reportUnusedCallResult] with tmp_path.open("wb") as fh: for chunk in response.iter_bytes(): fh.write(chunk) break info = pyogrio.read_info(tmp_path) features = int(info.get("features", 0)) geometry_type = str(info.get("geometry_type") or "") if features <= 0: raise ValueError(f"Downloaded file {output_path.name} contains no features") if expected_geometry is not None and expected_geometry not in geometry_type: raise ValueError( f"Expected {expected_geometry!r} geometry in {output_path.name}, " f"got {geometry_type!r}" ) tmp_path.replace(output_path) return features