Add postcode mapping
This commit is contained in:
parent
e7f2d1ffc3
commit
4506263e5b
5 changed files with 966 additions and 0 deletions
97
pipeline/download/inspire.py
Normal file
97
pipeline/download/inspire.py
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
"""Download INSPIRE Index Polygons from HM Land Registry.
|
||||
|
||||
Downloads GML files for all local authorities from the INSPIRE download page.
|
||||
Each ZIP contains a GML file with title extent polygons for that authority.
|
||||
|
||||
Source: https://use-land-property-data.service.gov.uk/datasets/inspire/download
|
||||
License: INSPIRE End User Licence
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from tqdm import tqdm
|
||||
|
||||
# Scheme and host of the HM Land Registry service that publishes the dataset.
BASE = "https://use-land-property-data.service.gov.uk"
# HTML page that links to one ZIP per local authority.
INDEX_URL = BASE + "/datasets/inspire/download"
|
||||
|
||||
|
||||
def get_zip_urls() -> list[str]:
    """Return the absolute URL of every ZIP linked from the INSPIRE index page.

    The index page at ``INDEX_URL`` is fetched once and searched for
    double-quoted ``href`` attributes pointing at
    ``/datasets/inspire/download/….zip``. Duplicate links are collapsed and
    the result is sorted, then each path is prefixed with ``BASE``.

    Raises:
        httpx.HTTPStatusError: If the index page answers with an error status.
    """
    # A Client (rather than a bare httpx.get) is used because the site needs
    # cookies to persist across redirects; without them it loops.
    client_options = {
        "follow_redirects": True,
        "timeout": httpx.Timeout(30.0, read=60),
        "headers": {"User-Agent": "Mozilla/5.0", "Accept": "text/html"},
    }
    with httpx.Client(**client_options) as session:
        response = session.get(INDEX_URL)
        response.raise_for_status()
        page = response.text

    href_re = re.compile(r'href="(/datasets/inspire/download/[^"]+\.zip)"')
    unique_paths = {match.group(1) for match in href_re.finditer(page)}
    return [f"{BASE}{path}" for path in sorted(unique_paths)]
|
||||
|
||||
|
||||
def download_one(url: str, output_dir: Path, client: httpx.Client) -> str:
    """Download a single ZIP file into *output_dir* and return its filename.

    The filename is the last path segment of *url*. If a file with that name
    already exists in *output_dir*, nothing is fetched and the returned string
    is the filename followed by ``" (skipped, exists)"``.

    The response body is streamed to a sibling ``<name>.part`` file and only
    renamed onto the final path once it has been written completely. Because
    the existence of the final file is what makes later runs skip it, writing
    there directly would let an interrupted download leave a truncated ZIP
    that is never retried.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
    """
    name = url.rsplit("/", 1)[-1]
    dest = output_dir / name
    if dest.exists():
        return f"{name} (skipped, exists)"

    partial = dest.with_name(f"{name}.part")
    try:
        # Stream instead of reading resp.content so a large archive is never
        # held in memory in full.
        with client.stream("GET", url) as resp:
            resp.raise_for_status()
            with partial.open("wb") as fh:
                for chunk in resp.iter_bytes():
                    fh.write(chunk)
        # Rename within the same directory so the final name only ever
        # refers to a complete file.
        partial.replace(dest)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a half-written file.
        partial.unlink(missing_ok=True)
        raise
    return name
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: download every INSPIRE ZIP into ``--output``.

    Parses ``--output`` (required directory, created if missing) and
    ``--workers`` (thread count, default 8), scrapes the list of ZIP URLs with
    ``get_zip_urls`` and downloads them in parallel with ``download_one``
    through one shared ``httpx.Client``.

    A failed download does not stop the others from being collected. Each
    failure is recorded against its URL, the progress bar keeps advancing, and
    once all downloads have finished the failures are printed and the process
    exits with a non-zero status.

    Raises:
        SystemExit: If at least one download failed.
    """
    parser = argparse.ArgumentParser(
        description="Download INSPIRE Index Polygon GML files"
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output directory for downloaded ZIPs",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=8,
        help="Number of parallel downloads (default: 8)",
    )
    args = parser.parse_args()

    args.output.mkdir(parents=True, exist_ok=True)

    print("Fetching download index...")
    urls = get_zip_urls()
    print(f"Found {len(urls)} files to download")

    failures: list[tuple[str, Exception]] = []
    with (
        httpx.Client(
            follow_redirects=True,
            timeout=httpx.Timeout(30.0, read=120),
            headers={"User-Agent": "Mozilla/5.0"},
        ) as client,
        tqdm(total=len(urls), unit="file") as pbar,
    ):
        with ThreadPoolExecutor(max_workers=args.workers) as pool:
            futures = {
                pool.submit(download_one, url, args.output, client): url
                for url in urls
            }
            for future in as_completed(futures):
                url = futures[future]
                try:
                    result = future.result()
                except (httpx.HTTPError, OSError) as exc:
                    # Record the failure and keep draining the remaining
                    # futures instead of aborting on the first error.
                    failures.append((url, exc))
                    result = f"FAILED {url.rsplit('/', 1)[-1]}"
                pbar.set_postfix_str(result[:40])
                pbar.update(1)

    if failures:
        for url, exc in failures:
            print(f"Failed: {url}: {exc!r}")
        # A string argument makes SystemExit print to stderr and exit with 1.
        raise SystemExit(f"{len(failures)} of {len(urls)} downloads failed")

    print(f"Done. {len(urls)} files in {args.output}")
|
||||
|
||||
|
||||
# Run the downloader only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue