Download postcode geojson
This commit is contained in:
parent
ae29662c92
commit
6268dbda4d
1 changed files with 67 additions and 0 deletions
67
pipeline/download/postcodes.py
Normal file
67
pipeline/download/postcodes.py
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
"""Download GB postcodes data from MapIt.
|
||||
|
||||
Downloads the gb-postcodes-v5.tar.bz2 archive containing GB postcode GeoJSON data
|
||||
and extracts it to the specified output directory.
|
||||
|
||||
Source: https://postcodes-mapit-static.s3.eu-west-2.amazonaws.com/data/gb-postcodes-v5.tar.bz2
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import shutil
|
||||
import tarfile
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from pipeline.utils import download
|
||||
|
||||
# Static S3 location of the GB postcodes archive (v5, bzip2-compressed tarball).
URL = "https://postcodes-mapit-static.s3.eu-west-2.amazonaws.com/data/gb-postcodes-v5.tar.bz2"
|
||||
|
||||
|
||||
def extract_tar_bz2(archive_path: Path, extract_dir: Path) -> None:
|
||||
"""Extract a tar.bz2 archive into the given directory."""
|
||||
extract_dir.mkdir(parents=True, exist_ok=True)
|
||||
print(f"Extracting {archive_path.name}...")
|
||||
with tarfile.open(archive_path, "r:bz2") as tf:
|
||||
tf.extractall(extract_dir, filter="data")
|
||||
|
||||
|
||||
def main() -> None:
    """Download the GB postcodes archive and install its contents at ``--output``.

    Command-line arguments:
        --output: Directory that will hold the extracted GeoJSON files
            (required). Whatever already exists at this path is removed and
            replaced.

    The archive is fetched (via the project's ``download`` helper) and unpacked
    inside a temporary directory that is discarded afterwards; only the
    extracted tree is moved to ``--output``. The existing output is removed
    only after download and extraction have succeeded.
    """
    parser = argparse.ArgumentParser(
        description="Download GB postcodes data from MapIt"
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output directory for extracted GeoJSON files",
    )
    args = parser.parse_args()
    output: Path = args.output

    with tempfile.TemporaryDirectory() as cache_dir:
        cache = Path(cache_dir)
        archive_path = cache / "gb-postcodes-v5.tar.bz2"
        extract_dir = cache / "extracted"

        download(URL, archive_path, timeout=600)
        extract_tar_bz2(archive_path, extract_dir)

        # If the archive wraps everything in a single top-level directory
        # (e.g. gb-postcodes-v5), use that directory as the payload so the
        # output does not gain an extra nesting level.
        extracted_contents = list(extract_dir.iterdir())
        if len(extracted_contents) == 1 and extracted_contents[0].is_dir():
            source_dir = extracted_contents[0]
        else:
            source_dir = extract_dir

        # Replace whatever is currently at the output path.
        output.parent.mkdir(parents=True, exist_ok=True)
        if output.is_symlink() or output.is_file():
            # shutil.rmtree refuses symlinks and regular files, and a dangling
            # symlink is invisible to exists() yet still blocks the move.
            output.unlink()
        elif output.exists():
            shutil.rmtree(output)
        shutil.move(str(source_dir), str(output))

    # Report how many GeoJSON files ended up in the output tree.
    geojson_count = sum(1 for _ in output.rglob("*.geojson"))
    print(f"Extracted {geojson_count} GeoJSON files to {output}")
|
||||
|
||||
|
||||
# Run the downloader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue