perfect-postcode/pipeline/download/inspire.py

97 lines
2.8 KiB
Python

"""Download INSPIRE Index Polygons from HM Land Registry.
Downloads the ZIP archives for all local authorities from the INSPIRE download page.
Each ZIP contains a GML file with title extent polygons for that authority.
Source: https://use-land-property-data.service.gov.uk/datasets/inspire/download
License: INSPIRE End User Licence
"""
import argparse
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import httpx
from tqdm import tqdm
BASE = "https://use-land-property-data.service.gov.uk"
INDEX_URL = f"{BASE}/datasets/inspire/download"
def get_zip_urls() -> list[str]:
    """Return the absolute URL of every ZIP linked from the INSPIRE index page.

    The index page is fetched once and its HTML is scanned for ``href``
    attributes of the form ``/datasets/inspire/download/<something>.zip``.
    Duplicate links are collapsed and the result is sorted by path before
    each path is prefixed with ``BASE``.

    Returns:
        Sorted, de-duplicated list of absolute ZIP URLs (possibly empty if
        the page contains no matching links).

    Raises:
        httpx.HTTPStatusError: if the index page responds with an error status.
    """
    href_re = r'href="(/datasets/inspire/download/[^"]+\.zip)"'
    request_headers = {"User-Agent": "Mozilla/5.0", "Accept": "text/html"}
    limits = httpx.Timeout(30.0, read=60)

    # Use a Client rather than a one-off request: the site requires a cookie
    # jar to avoid redirect loops, and a Client carries cookies across hops.
    with httpx.Client(
        follow_redirects=True,
        timeout=limits,
        headers=request_headers,
    ) as session:
        page = session.get(INDEX_URL)
        page.raise_for_status()
        body = page.text

    unique_paths = set(re.findall(href_re, body))
    return [f"{BASE}{path}" for path in sorted(unique_paths)]
def download_one(url: str, output_dir: Path, client: httpx.Client) -> str:
    """Download a single ZIP file into *output_dir*.

    The file name is the last path segment of *url*. If a file with that name
    already exists in *output_dir* the download is skipped.

    The body is streamed to a sibling ``<name>.part`` file and renamed to the
    final name only after the whole response has been written. This way an
    interrupted or failed download never leaves a truncated ZIP under the
    final name, which a later run would otherwise skip as "already exists".

    Args:
        url: Absolute URL of the ZIP to fetch.
        output_dir: Existing directory the ZIP is written to.
        client: Shared HTTP client used to perform the request.

    Returns:
        The file name, or ``"<name> (skipped, exists)"`` when nothing was
        downloaded.

    Raises:
        httpx.HTTPError: on transport failures or an error status code.
        OSError: if the file cannot be written or renamed.
    """
    name = url.rsplit("/", 1)[-1]
    dest = output_dir / name
    if dest.exists():
        return f"{name} (skipped, exists)"

    partial = dest.with_name(f"{name}.part")
    try:
        # Stream instead of buffering resp.content so that several parallel
        # workers do not each hold an entire archive in memory.
        with client.stream("GET", url) as resp:
            resp.raise_for_status()
            with partial.open("wb") as fh:
                for chunk in resp.iter_bytes():
                    fh.write(chunk)
        # Rename within the same directory: the final name only ever appears
        # once the content is complete.
        partial.replace(dest)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a stale partial behind.
        partial.unlink(missing_ok=True)
        raise
    return name
def main() -> None:
    """Command-line entry point: download every INSPIRE ZIP in parallel.

    Parses ``--output`` (required target directory, created if missing) and
    ``--workers`` (thread count, default 8), scrapes the list of ZIP URLs via
    ``get_zip_urls`` and fetches them with ``download_one`` on a thread pool,
    showing a progress bar.

    A failed download no longer aborts result handling: each failure is
    recorded, the remaining files are still processed, every failure is
    listed at the end, and the process exits with a non-zero status if any
    download failed.

    Raises:
        SystemExit: on invalid arguments, or when at least one download failed.
    """
    parser = argparse.ArgumentParser(
        description="Download INSPIRE Index Polygon GML files"
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output directory for downloaded ZIPs",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=8,
        help="Number of parallel downloads (default: 8)",
    )
    args = parser.parse_args()
    if args.workers < 1:
        # ThreadPoolExecutor would otherwise raise a bare ValueError later.
        parser.error("--workers must be at least 1")

    args.output.mkdir(parents=True, exist_ok=True)

    print("Fetching download index...")
    urls = get_zip_urls()
    print(f"Found {len(urls)} files to download")

    failures: list[tuple[str, Exception]] = []
    with (
        httpx.Client(
            follow_redirects=True,
            timeout=httpx.Timeout(30.0, read=120),
            headers={"User-Agent": "Mozilla/5.0"},
        ) as client,
        tqdm(total=len(urls), unit="file") as pbar,
        ThreadPoolExecutor(max_workers=args.workers) as pool,
    ):
        futures = {
            pool.submit(download_one, url, args.output, client): url
            for url in urls
        }
        for future in as_completed(futures):
            try:
                result = future.result()
            except (httpx.HTTPError, OSError) as exc:
                # Record and carry on so one bad file does not hide the
                # outcome of all the others.
                failures.append((futures[future], exc))
                result = "failed"
            pbar.set_postfix_str(result[:40])
            pbar.update(1)

    for url, exc in failures:
        print(f"FAILED {url}: {exc!r}")
    print(f"Done. {len(urls) - len(failures)} files in {args.output}")
    if failures:
        # SystemExit with a message prints it to stderr and exits with 1.
        raise SystemExit(f"{len(failures)} of {len(urls)} downloads failed")
# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()