68 lines
2.2 KiB
Python
68 lines
2.2 KiB
Python
"""Download GB postcodes data from MapIt.

Downloads the gb-postcodes-v5.tar.bz2 archive containing UK postcode GeoJSON data
and extracts it to the specified output directory.

Source: https://postcodes-mapit-static.s3.eu-west-2.amazonaws.com/data/gb-postcodes-v5.tar.bz2
"""
|
|
|
|
import argparse
|
|
import shutil
|
|
import tarfile
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
from pipeline.local_temp import local_tmp_dir
|
|
from pipeline.utils import download
|
|
|
|
# Location of the MapIt GB postcodes archive (v5), a bzip2-compressed tarball
# hosted as a static file on S3. main() downloads this file.
URL = "https://postcodes-mapit-static.s3.eu-west-2.amazonaws.com/data/gb-postcodes-v5.tar.bz2"
|
|
|
|
|
|
def extract_tar_bz2(archive_path: Path, extract_dir: Path) -> None:
|
|
"""Extract a tar.bz2 archive into the given directory."""
|
|
extract_dir.mkdir(parents=True, exist_ok=True)
|
|
print(f"Extracting {archive_path.name}...")
|
|
with tarfile.open(archive_path, "r:bz2") as tf:
|
|
tf.extractall(extract_dir, filter="data")
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: fetch the archive and publish its contents.

    Parses a required ``--output`` directory argument, then:

    1. creates a temporary working directory under ``local_tmp_dir()``;
    2. downloads ``URL`` into it (``timeout=600`` is passed to ``download``)
       and unpacks the archive with ``extract_tar_bz2``;
    3. picks what to publish: if extraction produced exactly one entry and it
       is a directory, that directory is used; otherwise the whole extraction
       directory is used;
    4. replaces any existing ``--output`` path with the chosen directory;
    5. prints how many ``*.geojson`` files ended up under ``--output``.
    """
    cli = argparse.ArgumentParser(
        description="Download GB postcodes data from MapIt"
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output directory for extracted GeoJSON files",
    )
    destination: Path = cli.parse_args().output

    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as scratch:
        workdir = Path(scratch)
        tarball = workdir / "gb-postcodes-v5.tar.bz2"
        unpacked = workdir / "extracted"

        download(URL, tarball, timeout=600)
        extract_tar_bz2(tarball, unpacked)

        # The archive is expected to hold a single top-level folder
        # (gb-postcodes-v5); publish that folder rather than its wrapper.
        # Anything else falls back to publishing the extraction root itself.
        entries = list(unpacked.iterdir())
        has_single_root = len(entries) == 1 and entries[0].is_dir()
        payload = entries[0] if has_single_root else unpacked

        # Clear any previous output so the move lands at exactly `destination`
        # instead of being nested inside an existing directory.
        destination.parent.mkdir(parents=True, exist_ok=True)
        if destination.exists():
            shutil.rmtree(destination)
        shutil.move(str(payload), str(destination))

        # Report how many GeoJSON files were published (searched recursively).
        geojson_count = len(list(destination.rglob("*.geojson")))
        print(f"Extracted {geojson_count} GeoJSON files to {destination}")
|
|
|
|
|
|
# Run the download only when this file is executed as a script, so importing
# the module (e.g. to reuse extract_tar_bz2) has no side effects.
if __name__ == "__main__":
    main()
|