# perfect-postcode/pipeline/download/inspire.py

"""Download INSPIRE Index Polygons from HM Land Registry.

Downloads GML files for all local authorities from the INSPIRE download page.
Each ZIP contains a GML file with title extent polygons for that authority.

Source: https://use-land-property-data.service.gov.uk/datasets/inspire/download
License: INSPIRE End User Licence
"""

import argparse
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import httpx
from tqdm import tqdm

# Scheme and host of the download site; site-relative hrefs scraped from the
# index page are appended to this to form absolute URLs.
BASE = "https://use-land-property-data.service.gov.uk"
# Page whose HTML is scanned for links to the per-authority ZIP files.
INDEX_URL = f"{BASE}/datasets/inspire/download"


def get_zip_urls() -> list[str]:
    """Return the absolute URL of every ZIP linked from the INSPIRE index page.

    Fetches ``INDEX_URL`` once, extracts each double-quoted ``href`` that
    points at a ``.zip`` under ``/datasets/inspire/download/``, removes
    duplicates and returns the links sorted, each prefixed with ``BASE``.

    Raises:
        httpx.HTTPStatusError: If the index page responds with a 4xx/5xx status.
    """
    request_headers = {"User-Agent": "Mozilla/5.0", "Accept": "text/html"}
    limits = httpx.Timeout(30.0, read=60)

    # A Client (rather than a bare httpx.get) is used because the site needs a
    # cookie jar that persists across redirects to avoid redirect loops.
    with httpx.Client(
        follow_redirects=True, timeout=limits, headers=request_headers
    ) as session:
        page = session.get(INDEX_URL)
        page.raise_for_status()
        markup = page.text

    link_re = re.compile(r'href="(/datasets/inspire/download/[^"]+\.zip)"')
    unique_paths = {match.group(1) for match in link_re.finditer(markup)}
    return [BASE + path for path in sorted(unique_paths)]


def download_one(url: str, output_dir: Path, client: httpx.Client) -> str:
    """Download a single ZIP file into ``output_dir``.

    The body is streamed to a ``<name>.part`` sibling and renamed into place
    only after the whole response has been written. A run that is interrupted
    or fails midway therefore never leaves a truncated ``<name>`` behind that
    a later run would mistake for a finished download and skip.

    Args:
        url: Absolute URL of the ZIP; the text after the last ``/`` is used
            as the local filename.
        output_dir: Existing directory to write the file into.
        client: Shared HTTP client used to perform the request.

    Returns:
        The filename, or ``"<name> (skipped, exists)"`` when the destination
        file is already present (no request is made in that case).

    Raises:
        httpx.HTTPStatusError: If the server responds with a 4xx/5xx status.
        httpx.RequestError: On connection, timeout or other transport errors.
        OSError: If the file cannot be written or renamed.
    """
    name = url.rsplit("/", 1)[-1]
    dest = output_dir / name
    if dest.exists():
        return f"{name} (skipped, exists)"

    partial = dest.with_name(f"{name}.part")
    try:
        # Stream instead of reading resp.content so a large ZIP is never held
        # in memory in full.
        with client.stream("GET", url) as resp:
            resp.raise_for_status()
            with partial.open("wb") as fh:
                for chunk in resp.iter_bytes():
                    fh.write(chunk)
        # Same-directory rename: the final name only ever refers to a
        # completely written file.
        partial.replace(dest)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial.unlink(missing_ok=True)
    return name


def main() -> None:
    """Parse CLI arguments and download every INSPIRE ZIP in parallel.

    Scrapes the index page for ZIP links, then fetches them into ``--output``
    with a pool of ``--workers`` threads sharing one ``httpx.Client``. A
    download that fails with an HTTP or filesystem error is reported and
    counted, and the remaining files are still processed.

    Raises:
        SystemExit: If ``--workers`` is below 1, if the index page yields no
            ZIP links, or if at least one download failed (non-zero status).
    """
    parser = argparse.ArgumentParser(
        description="Download INSPIRE Index Polygon GML files"
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output directory for downloaded ZIPs",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=8,
        help="Number of parallel downloads (default: 8)",
    )
    args = parser.parse_args()
    if args.workers < 1:
        # ThreadPoolExecutor would otherwise raise a bare ValueError later.
        parser.error("--workers must be at least 1")

    args.output.mkdir(parents=True, exist_ok=True)

    print("Fetching download index...")
    urls = get_zip_urls()
    print(f"Found {len(urls)} files to download")
    if not urls:
        # An empty result means the scrape matched nothing; do not report
        # success for a run that downloaded nothing.
        raise SystemExit("No ZIP links found on the index page; nothing downloaded")

    failed: list[str] = []
    with (
        httpx.Client(
            follow_redirects=True,
            timeout=httpx.Timeout(30.0, read=120),
            headers={"User-Agent": "Mozilla/5.0"},
        ) as client,
        tqdm(total=len(urls), unit="file") as pbar,
    ):
        with ThreadPoolExecutor(max_workers=args.workers) as pool:
            futures = {
                pool.submit(download_one, url, args.output, client): url for url in urls
            }
            for future in as_completed(futures):
                url = futures[future]
                try:
                    result = future.result()
                except (httpx.HTTPError, OSError) as exc:
                    # One bad file must not hide the outcome of the others;
                    # record it and keep consuming results.
                    failed.append(url)
                    tqdm.write(f"FAILED {url}: {exc}")
                    result = f"{url.rsplit('/', 1)[-1]} (failed)"
                pbar.set_postfix_str(result[:40])
                pbar.update(1)

    if failed:
        raise SystemExit(f"{len(failed)} of {len(urls)} downloads failed")
    print(f"Done. {len(urls)} files in {args.output}")


# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()