97 lines
2.8 KiB
Python
97 lines
2.8 KiB
Python
"""Download INSPIRE Index Polygons from HM Land Registry.

Downloads the per-authority ZIP archives linked from the INSPIRE download page.
Each ZIP contains a GML file with title extent polygons for that authority.

Source: https://use-land-property-data.service.gov.uk/datasets/inspire/download
License: INSPIRE End User Licence
"""
|
|
|
|
import argparse
|
|
import re
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
from tqdm import tqdm
|
|
|
|
# Origin of the HM Land Registry "Use land and property data" service.
BASE = "https://use-land-property-data.service.gov.uk"
# Page that lists one ZIP download link per local authority.
INDEX_URL = BASE + "/datasets/inspire/download"
|
|
|
|
|
|
def get_zip_urls() -> list[str]:
    """Return the absolute URL of every ZIP linked from the INSPIRE index page.

    Fetches ``INDEX_URL``, extracts each ``href`` that points at a ``.zip``
    under ``/datasets/inspire/download/``, de-duplicates the paths and returns
    them sorted, each prefixed with ``BASE``.

    Raises:
        httpx.HTTPStatusError: if the index page answers with an error status.
    """
    zip_href = re.compile(r'href="(/datasets/inspire/download/[^"]+\.zip)"')
    request_headers = {"User-Agent": "Mozilla/5.0", "Accept": "text/html"}

    # A Client (rather than a bare httpx.get) keeps cookies between the
    # redirect hops; the site needs that to avoid redirect loops.
    with httpx.Client(
        headers=request_headers,
        timeout=httpx.Timeout(30.0, read=60),
        follow_redirects=True,
    ) as session:
        response = session.get(INDEX_URL)
        response.raise_for_status()
        page = response.text

    unique_paths = set(zip_href.findall(page))
    return [BASE + path for path in sorted(unique_paths)]
|
|
|
|
|
|
def download_one(url: str, output_dir: Path, client: httpx.Client) -> str:
    """Download a single ZIP file into *output_dir*.

    The file name is the last path segment of *url*. If a file with that name
    already exists in *output_dir* nothing is fetched.

    The body is streamed into a ``<name>.part`` sibling and renamed onto the
    final name only after the transfer finished. An interrupted or failed
    download therefore never leaves a truncated ZIP that a later run would
    mistake for a finished one.

    Args:
        url: Absolute URL of the ZIP to fetch.
        output_dir: Existing directory that receives the file.
        client: Shared HTTP client used to issue the request.

    Returns:
        The file name, or ``"<name> (skipped, exists)"`` when the file was
        already present.

    Raises:
        httpx.HTTPStatusError: if the server answers with an error status.
        OSError: if the file cannot be written or renamed.
    """
    name = url.rsplit("/", 1)[-1]
    dest = output_dir / name
    if dest.exists():
        return f"{name} (skipped, exists)"

    partial = dest.with_name(f"{name}.part")
    try:
        # Stream in chunks so a large archive is never held in memory at once.
        with client.stream("GET", url) as resp:
            resp.raise_for_status()
            with partial.open("wb") as fh:
                for chunk in resp.iter_bytes():
                    fh.write(chunk)
        # Rename within the same directory: dest appears only when complete.
        partial.replace(dest)
    finally:
        # No-op after a successful rename; removes the leftover on any failure.
        partial.unlink(missing_ok=True)
    return name
|
|
|
|
|
|
def main() -> None:
    """Parse the command line and download every INSPIRE ZIP in parallel.

    Command-line options:
        --output   Directory that receives the ZIPs (created if missing).
        --workers  Number of parallel downloads (default: 8, must be >= 1).

    A failed download does not stop the others: HTTP and file-system errors
    are collected per URL while the progress bar keeps advancing, and are
    reported together at the end.

    Raises:
        SystemExit: with a summary message (exit status 1) when at least one
            download failed, or via ``argparse`` (exit status 2) on invalid
            arguments.
    """
    parser = argparse.ArgumentParser(
        description="Download INSPIRE Index Polygon GML files"
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output directory for downloaded ZIPs",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=8,
        help="Number of parallel downloads (default: 8)",
    )
    args = parser.parse_args()
    # ThreadPoolExecutor rejects max_workers <= 0 with a bare ValueError;
    # turn that into a normal usage error instead.
    if args.workers < 1:
        parser.error("--workers must be at least 1")

    args.output.mkdir(parents=True, exist_ok=True)

    print("Fetching download index...")
    urls = get_zip_urls()
    print(f"Found {len(urls)} files to download")

    failures: list[tuple[str, Exception]] = []
    with (
        httpx.Client(
            follow_redirects=True,
            timeout=httpx.Timeout(30.0, read=120),
            headers={"User-Agent": "Mozilla/5.0"},
        ) as client,
        tqdm(total=len(urls), unit="file") as pbar,
    ):
        with ThreadPoolExecutor(max_workers=args.workers) as pool:
            futures = {
                pool.submit(download_one, url, args.output, client): url
                for url in urls
            }
            for future in as_completed(futures):
                url = futures[future]
                try:
                    result = future.result()
                except (httpx.HTTPError, OSError) as exc:
                    # Record and carry on; one bad file must not hide the rest.
                    failures.append((url, exc))
                    name = url.rsplit("/", 1)[-1]
                    result = f"{name} (failed)"
                pbar.set_postfix_str(result[:40])
                pbar.update(1)

    if failures:
        report = "\n".join(f"  {url}: {exc!r}" for url, exc in failures)
        raise SystemExit(
            f"{len(failures)} of {len(urls)} downloads failed:\n{report}"
        )

    print(f"Done. {len(urls)} files in {args.output}")
|
|
|
|
|
|
# Script entry point: run the downloader only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|