perfect-postcode/pipeline/download/postcodes.py
Andras Schmelczer 6cc7288126
Some checks failed
CI / Check (push) Has been cancelled
Build and publish Docker image / build-and-push (push) Has been cancelled
All good
2026-05-18 21:20:10 +01:00

68 lines
2.2 KiB
Python

"""Download GB postcodes data from MapIt.
Downloads the gb-postcodes-v5.tar.bz2 archive containing GB postcode GeoJSON data
and extracts it to the specified output directory.

Source: https://postcodes-mapit-static.s3.eu-west-2.amazonaws.com/data/gb-postcodes-v5.tar.bz2
"""
import argparse
import shutil
import tarfile
import tempfile
from pathlib import Path
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download
# Download location of the bzip2-compressed tarball fetched by main().
URL = "https://postcodes-mapit-static.s3.eu-west-2.amazonaws.com/data/gb-postcodes-v5.tar.bz2"
def extract_tar_bz2(archive_path: Path, extract_dir: Path) -> None:
"""Extract a tar.bz2 archive into the given directory."""
extract_dir.mkdir(parents=True, exist_ok=True)
print(f"Extracting {archive_path.name}...")
with tarfile.open(archive_path, "r:bz2") as tf:
tf.extractall(extract_dir, filter="data")
def main() -> None:
    """Command-line entry point: download, unpack, and publish the archive.

    Parses the required ``--output`` argument, downloads the archive into a
    temporary working directory, unpacks it, and moves the unpacked tree to
    ``--output`` (removing whatever directory is already there). Finally
    prints how many ``*.geojson`` files exist under the output directory.
    """
    cli = argparse.ArgumentParser(
        description="Download GB postcodes data from MapIt"
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output directory for extracted GeoJSON files",
    )
    destination: Path = cli.parse_args().output

    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as workdir:
        workspace = Path(workdir)
        tarball = workspace / "gb-postcodes-v5.tar.bz2"
        unpacked = workspace / "extracted"

        download(URL, tarball, timeout=600)
        extract_tar_bz2(tarball, unpacked)

        # If the archive wraps everything in a single top-level folder, that
        # folder becomes the output; otherwise the whole unpacked tree does.
        entries = list(unpacked.iterdir())
        wrapped = len(entries) == 1 and entries[0].is_dir()
        payload = entries[0] if wrapped else unpacked

        # Replace any previous output. This must happen while the temporary
        # directory still exists, since the payload lives inside it.
        destination.parent.mkdir(parents=True, exist_ok=True)
        if destination.exists():
            shutil.rmtree(destination)
        shutil.move(str(payload), str(destination))

        geojson_count = sum(1 for _ in destination.rglob("*.geojson"))
        print(f"Extracted {geojson_count} GeoJSON files to {destination}")
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()