"""Download GB postcodes data from MapIt.

Downloads the gb-postcodes-v5.tar.bz2 archive containing UK postcode GeoJSON
data and extracts it to the specified output directory.

Source: https://postcodes-mapit-static.s3.eu-west-2.amazonaws.com/data/gb-postcodes-v5.tar.bz2
"""

import argparse
import shutil
import tarfile
import tempfile
from pathlib import Path

from pipeline.utils import download

URL = "https://postcodes-mapit-static.s3.eu-west-2.amazonaws.com/data/gb-postcodes-v5.tar.bz2"


def extract_tar_bz2(archive_path: Path, extract_dir: Path) -> None:
    """Extract a tar.bz2 archive into the given directory.

    Args:
        archive_path: Path to the bzip2-compressed tar archive.
        extract_dir: Destination directory; created (with parents) if missing.

    Raises:
        tarfile.TarError: If the archive cannot be read, or a member is
            rejected by the extraction filter.
        OSError: If the archive or destination cannot be accessed.
    """
    extract_dir.mkdir(parents=True, exist_ok=True)
    print(f"Extracting {archive_path.name}...")
    with tarfile.open(archive_path, "r:bz2") as tf:
        # The "data" extraction filter is tarfile's safe mode for plain data
        # archives (see the tarfile "Extraction filters" documentation).
        tf.extractall(extract_dir, filter="data")


def _find_payload_dir(extract_dir: Path) -> Path:
    """Return the lone top-level directory in *extract_dir*, else *extract_dir*."""
    entries = list(extract_dir.iterdir())
    if len(entries) == 1 and entries[0].is_dir():
        return entries[0]
    return extract_dir


def _remove_existing(path: Path) -> None:
    """Remove whatever currently occupies *path*; do nothing if it is free.

    ``shutil.rmtree`` only works on real directories: it raises for regular
    files and refuses symlinks. Files and symlinks (including dangling ones,
    for which ``exists()`` is False) are therefore unlinked instead.
    """
    if path.is_dir() and not path.is_symlink():
        shutil.rmtree(path)
    elif path.exists() or path.is_symlink():
        path.unlink()


def main() -> None:
    """Download the archive, extract it, and move the result to ``--output``.

    Anything already present at ``--output`` is removed and replaced by the
    extracted data. Download and extraction happen in a temporary directory
    that is cleaned up on exit.
    """
    parser = argparse.ArgumentParser(
        description="Download GB postcodes data from MapIt"
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output directory for extracted GeoJSON files",
    )
    args = parser.parse_args()

    with tempfile.TemporaryDirectory() as cache_dir:
        cache = Path(cache_dir)
        # Name the scratch file after the URL's last path component so the
        # two cannot drift apart when the dataset version changes.
        archive_path = cache / URL.rsplit("/", 1)[-1]
        extract_dir = cache / "extracted"

        # NOTE(review): `download` comes from pipeline.utils and is presumed
        # to fetch URL into archive_path -- confirm against that helper.
        download(URL, archive_path, timeout=600)
        extract_tar_bz2(archive_path, extract_dir)

        # Find the extracted directory (gb-postcodes-v5); fall back to the
        # extraction root when there is no single wrapper directory.
        source_dir = _find_payload_dir(extract_dir)

        # Move to output directory, replacing any previous contents.
        args.output.parent.mkdir(parents=True, exist_ok=True)
        _remove_existing(args.output)
        shutil.move(str(source_dir), str(args.output))

    # Count extracted files
    geojson_count = sum(1 for _ in args.output.rglob("*.geojson"))
    print(f"Extracted {geojson_count} GeoJSON files to {args.output}")


if __name__ == "__main__":
    main()