131 lines
4.7 KiB
Python
131 lines
4.7 KiB
Python
"""Shared download and extraction helpers for pipeline scripts."""
|
|
|
|
import time
|
|
import zipfile
|
|
from io import BytesIO
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
import polars as pl
|
|
from tqdm import tqdm
|
|
|
|
# Count of Census 2021 LSOAs (TYPE151) with an E prefix. Since the Census 2021
# geography is frozen, a NOMIS England-level download has to contain exactly
# this many LSOAs; a smaller count means the download was truncated.
ENGLAND_LSOA_COUNT_2021: int = 33_755
|
|
|
|
|
|
def download(url: str, output_path: Path, *, timeout: float = 120) -> None:
    """Stream-download a URL to a local file with a tqdm progress bar.

    The body is written to a sibling ``<stem>.tmp<suffix>`` file and only
    renamed onto ``output_path`` once the stream has been fully consumed, so
    an interrupted or failed transfer never leaves a truncated file where
    later pipeline steps expect a complete one.

    ``timeout`` is the read timeout in seconds; the connect, write and pool
    timeouts are fixed at 30 seconds. Redirects are followed.

    Raises ``httpx.HTTPStatusError`` for 4xx/5xx responses, and propagates any
    transport or filesystem error raised while streaming. The temporary file
    is removed in every failure case.
    """
    tmp_path = output_path.with_name(f"{output_path.stem}.tmp{output_path.suffix}")
    try:
        with httpx.stream(
            "GET",
            url,
            follow_redirects=True,
            timeout=httpx.Timeout(30.0, read=timeout),
        ) as response:
            response.raise_for_status()  # pyright: ignore[reportUnusedCallResult]
            total = int(response.headers.get("content-length", 0))

            with (
                open(tmp_path, "wb") as f,
                tqdm(
                    total=total or None,
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    desc=output_path.name,
                ) as pbar,
            ):
                # Content-Length counts bytes on the wire, while iter_bytes()
                # yields decoded content. Advance the bar by the raw byte
                # counter so it stays in step with `total` for compressed
                # responses.
                seen = response.num_bytes_downloaded
                for chunk in response.iter_bytes(chunk_size=8192):
                    f.write(chunk)
                    pbar.update(response.num_bytes_downloaded - seen)
                    seen = response.num_bytes_downloaded
                # Account for any raw bytes consumed after the last chunk.
                pbar.update(response.num_bytes_downloaded - seen)
        # Only publish the file once the whole body has been written.
        tmp_path.replace(output_path)
    except BaseException:
        # Covers KeyboardInterrupt too: never leave a partial temp file behind.
        tmp_path.unlink(missing_ok=True)
        raise
|
|
|
|
|
|
def extract_zip(zip_path: Path, extract_dir: Path) -> None:
    """Unpack every member of the archive at ``zip_path`` into ``extract_dir``.

    The destination directory is created first, including any missing
    parents; an already existing directory is reused.
    """
    extract_dir.mkdir(exist_ok=True, parents=True)
    archive = zipfile.ZipFile(zip_path, mode="r")
    with archive:
        archive.extractall(path=extract_dir)
|
|
|
|
|
|
def download_nomis_csv(base_url: str, *, page_size: int = 25_000) -> pl.DataFrame:
    """Download a NOMIS CSV dataset, paging with recordoffset/RecordLimit.

    The page size is sent explicitly as ``RecordLimit``: last-page detection is
    ``rows < page_size``, so relying on NOMIS's implicit default would silently
    truncate the dataset to one page if that default ever differed.

    ``base_url`` may or may not already carry a query string; the paging
    parameters are appended with ``&`` or ``?`` accordingly. All pages are
    fetched through a single HTTP client so the connection is reused.

    Returns the pages concatenated in request order.

    Raises ``ValueError`` if ``page_size`` is below 1 (the paging loop could
    not terminate), ``httpx.HTTPStatusError`` for a 4xx/5xx response, and
    ``RuntimeError`` if NOMIS returned no rows at all.
    """
    if page_size < 1:
        # With page_size <= 0 the "rows < page_size" stop condition can never
        # hold and, at 0, the offset would never advance.
        raise ValueError(f"page_size must be at least 1, got {page_size}")

    sep = "&" if "?" in base_url else "?"
    frames = []
    offset = 0
    with httpx.Client(follow_redirects=True, timeout=120) as client:
        while True:
            url = f"{base_url}{sep}RecordLimit={page_size}&recordoffset={offset}"
            response = client.get(url)
            response.raise_for_status()  # pyright: ignore[reportUnusedCallResult]
            if not response.content:
                # Empty body: nothing left past the previous page.
                break
            chunk = pl.read_csv(BytesIO(response.content))
            if chunk.height == 0:
                # Header-only page: the previous page was exactly full.
                break
            frames.append(chunk)
            print(f" Fetched {chunk.height} rows (offset={offset})")
            if chunk.height < page_size:
                # A short page is the last page.
                break
            offset += page_size

    if not frames:
        raise RuntimeError(f"NOMIS returned no rows for {base_url}")
    return pl.concat(frames)
|
|
|
|
|
|
def download_arcgis_hub_export(
    url: str,
    output_path: Path,
    *,
    expected_geometry: str | None = None,
    poll_interval_s: float = 5,
    poll_timeout_s: float = 600,
) -> int:
    """Download an ArcGIS Hub `api/download/v1` export, handling deferred jobs.

    The endpoint returns HTTP 202 with a JSON status body while the export is
    still being prepared; a plain download would save that placeholder as the
    output file with a success exit code. Poll until the file is ready, then
    validate the result with pyogrio (feature count > 0 and, optionally, a
    geometry-type substring) before moving it into place. Returns the feature
    count.

    The download is staged in a sibling ``<stem>.tmp<suffix>`` file. That file
    is removed whenever the function fails (timeout, HTTP error, failed
    validation), so a rejected export is never left next to the real output.

    Raises ``TimeoutError`` if the export is still reporting 202 once
    ``poll_timeout_s`` seconds have elapsed, ``httpx.HTTPStatusError`` for a
    4xx/5xx response, and ``ValueError`` if the file has no features or its
    geometry type does not contain ``expected_geometry``.
    """
    import pyogrio

    tmp_path = output_path.with_name(f"{output_path.stem}.tmp{output_path.suffix}")
    deadline = time.monotonic() + poll_timeout_s
    try:
        with httpx.Client(follow_redirects=True, timeout=300) as client:
            while True:
                with client.stream("GET", url) as response:
                    if response.status_code != 202:
                        response.raise_for_status()  # pyright: ignore[reportUnusedCallResult]
                        with tmp_path.open("wb") as fh:
                            for chunk in response.iter_bytes():
                                fh.write(chunk)
                        break
                    # Still preparing: load the status body so it can be
                    # quoted in the timeout message.
                    response.read()
                    if time.monotonic() > deadline:
                        raise TimeoutError(
                            f"Export did not finish within {poll_timeout_s}s: "
                            f"{response.text}"
                        )
                # Wait only after the 202 response has been closed, so it is
                # not held open for the whole poll interval.
                time.sleep(poll_interval_s)

        info = pyogrio.read_info(tmp_path)
        features = int(info.get("features", 0))
        geometry_type = str(info.get("geometry_type") or "")
        if features <= 0:
            raise ValueError(f"Downloaded file {output_path.name} contains no features")
        if expected_geometry is not None and expected_geometry not in geometry_type:
            raise ValueError(
                f"Expected {expected_geometry!r} geometry in {output_path.name}, "
                f"got {geometry_type!r}"
            )

        # Validation passed: publish the staged file.
        tmp_path.replace(output_path)
    except BaseException:
        # Do not leave a partial or rejected download behind.
        tmp_path.unlink(missing_ok=True)
        raise
    return features
|