"""Download INSPIRE Index Polygons from HM Land Registry. Downloads GML files for all local authorities from the INSPIRE download page. Each ZIP contains a GML file with title extent polygons for that authority. Source: https://use-land-property-data.service.gov.uk/datasets/inspire/download License: INSPIRE End User Licence """ import argparse import re from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path import httpx from tqdm import tqdm BASE = "https://use-land-property-data.service.gov.uk" INDEX_URL = f"{BASE}/datasets/inspire/download" def get_zip_urls() -> list[str]: """Scrape the INSPIRE download page for all .zip hrefs.""" # The site requires a cookie jar to avoid redirect loops. with httpx.Client( follow_redirects=True, timeout=httpx.Timeout(30.0, read=60), headers={"User-Agent": "Mozilla/5.0", "Accept": "text/html"}, ) as client: resp = client.get(INDEX_URL) resp.raise_for_status() html = resp.text pattern = r'href="(/datasets/inspire/download/[^"]+\.zip)"' paths = sorted(set(re.findall(pattern, html))) return [f"{BASE}{p}" for p in paths] def download_one(url: str, output_dir: Path, client: httpx.Client) -> str: """Download a single ZIP file. Returns the filename.""" name = url.rsplit("/", 1)[-1] dest = output_dir / name if dest.exists(): return f"{name} (skipped, exists)" resp = client.get(url) resp.raise_for_status() dest.write_bytes(resp.content) return name def main() -> None: parser = argparse.ArgumentParser( description="Download INSPIRE Index Polygon GML files" ) parser.add_argument( "--output", type=Path, required=True, help="Output directory for downloaded ZIPs", ) parser.add_argument( "--workers", type=int, default=8, help="Number of parallel downloads (default: 8)", ) args = parser.parse_args() args.output.mkdir(parents=True, exist_ok=True) print("Fetching download index...") urls = get_zip_urls() print(f"Found {len(urls)} files to download") with ( httpx.Client( follow_redirects=True, timeout=httpx.Timeout(30.0, read=120), headers={"User-Agent": "Mozilla/5.0"}, ) as client, tqdm(total=len(urls), unit="file") as pbar, ): with ThreadPoolExecutor(max_workers=args.workers) as pool: futures = { pool.submit(download_one, url, args.output, client): url for url in urls } for future in as_completed(futures): result = future.result() pbar.set_postfix_str(result[:40]) pbar.update(1) print(f"Done. {len(urls)} files in {args.output}") if __name__ == "__main__": main()