From caf943ed06425b43f89f5ab6db270a52930ed362 Mon Sep 17 00:00:00 2001
From: Andras Schmelczer
Date: Sun, 25 Jan 2026 20:19:10 +0000
Subject: [PATCH] Add post code dataset

---
Review note: download_arcgis_data.py was revised after the commit was
exported, so the blob id on its "index" line no longer matches the content.

 download_arcgis_data.py   | 181 ++++++++++++++++++++++++++++++++++++++
 download_land_registry.py |   2 +-
 2 files changed, 182 insertions(+), 1 deletion(-)
 create mode 100644 download_arcgis_data.py

diff --git a/download_arcgis_data.py b/download_arcgis_data.py
new file mode 100644
index 0000000..1e2d249
--- /dev/null
+++ b/download_arcgis_data.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+"""Download ArcGIS data and convert to Parquet."""
+
+# Run it with:
+# uv run download_arcgis_data.py
+
+import time
+import zipfile
+import httpx
+import polars as pl
+from pathlib import Path
+from tqdm import tqdm
+
+URL = "https://www.arcgis.com/sharing/rest/content/items/077631e063eb4e1ab43575d01381ec33/data"
+
+BASE_DATA_PATH = Path("./data_sources")
+BASE_DATA_PATH.mkdir(exist_ok=True)
+DOWNLOAD_PATH = BASE_DATA_PATH / "arcgis_data.zip"
+EXTRACT_PATH = BASE_DATA_PATH / "arcgis_extracted"
+PARQUET_PATH = BASE_DATA_PATH / "arcgis_data.parquet"
+
+MAX_RETRIES = 3
+
+
+def download_with_progress(url: str, output_path: Path) -> None:
+    """Download a file with a progress bar and retry logic.
+
+    The body is streamed to a temporary ``.part`` file next to *output_path*
+    and only renamed into place once the transfer has completed, so an
+    interrupted download is never mistaken for a finished one on a later run.
+
+    Args:
+        url: Address to fetch; redirects are followed.
+        output_path: Final location of the downloaded file.
+
+    Raises:
+        httpx.HTTPStatusError: The server answered with an error status.
+        httpx.TransportError: The transfer still failed on the last attempt.
+    """
+    partial_path = output_path.with_name(output_path.name + ".part")
+
+    for attempt in range(1, MAX_RETRIES + 1):
+        try:
+            with httpx.stream(
+                "GET",
+                url,
+                follow_redirects=True,
+                timeout=httpx.Timeout(30.0, read=None),
+            ) as response:
+                response.raise_for_status()  # pyright: ignore[reportUnusedCallResult]
+                # Without a Content-Length the size is unknown; tqdm wants
+                # None for that rather than 0.
+                total = int(response.headers.get("content-length", 0)) or None
+
+                with open(partial_path, "wb") as f, tqdm(
+                    total=total,
+                    unit="B",
+                    unit_scale=True,
+                    unit_divisor=1024,
+                    desc="Downloading",
+                ) as pbar:
+                    for chunk in response.iter_bytes(chunk_size=8192):
+                        f.write(chunk)
+                        pbar.update(len(chunk))
+            partial_path.replace(output_path)
+            return  # Success
+        except httpx.TransportError as e:
+            # Base class of connection errors, timeouts and mid-stream drops.
+            if attempt < MAX_RETRIES:
+                wait = 2**attempt
+                print(f"Attempt {attempt} failed: {e}. Retrying in {wait}s...")
+                time.sleep(wait)
+            else:
+                partial_path.unlink(missing_ok=True)
+                raise
+
+
+def extract_zip(zip_path: Path, extract_path: Path) -> list[Path]:
+    """Extract a ZIP archive and return the paths of the extracted files.
+
+    Directory entries are extracted too but left out of the returned list.
+    """
+    print("Extracting ZIP file...")
+    extract_path.mkdir(parents=True, exist_ok=True)
+
+    with zipfile.ZipFile(zip_path, "r") as zf:
+        zf.extractall(extract_path)
+        return [
+            extract_path / info.filename
+            for info in zf.infolist()
+            if not info.is_dir()
+        ]
+
+
+def find_data_file(extract_path: Path) -> Path:
+    """Find the main data file (CSV, XLSX, or similar) in extracted files.
+
+    Extensions are tried in order of preference and compared without regard
+    to case; the largest file with the first matching extension is returned.
+
+    Raises:
+        FileNotFoundError: No file with a supported extension was found.
+    """
+    candidates = [path for path in extract_path.rglob("*") if path.is_file()]
+
+    # Look for common data file extensions
+    for ext in [".csv", ".xlsx", ".xls", ".json", ".geojson"]:
+        files = [path for path in candidates if path.suffix.lower() == ext]
+        if files:
+            # Return the largest file if multiple found
+            return max(files, key=lambda f: f.stat().st_size)
+
+    raise FileNotFoundError(f"No data file found in {extract_path}")
+
+
+def sniff_suffix(data_path: Path) -> str:
+    """Guess the format of a file whose name does not reveal it.
+
+    Returns ".json" when the content starts with "{" or "[" (after an
+    optional UTF-8 BOM and whitespace) and ".csv" otherwise.
+    """
+    with open(data_path, "rb") as f:
+        head = f.read(1024).removeprefix(b"\xef\xbb\xbf").lstrip()
+    return ".json" if head[:1] in (b"{", b"[") else ".csv"
+
+
+def convert_to_parquet(data_path: Path, parquet_path: Path, suffix: str = "") -> None:
+    """Convert a data file to Parquet using Polars.
+
+    Args:
+        data_path: CSV, Excel or JSON file to read.
+        parquet_path: Destination of the ZSTD-compressed Parquet file.
+        suffix: Format to assume, such as ".csv"; when empty, the extension
+            of *data_path* is used.
+
+    Raises:
+        ValueError: The format is not one of the supported ones.
+    """
+    print(f"Converting {data_path.name} to Parquet...")
+
+    suffix = (suffix or data_path.suffix).lower()
+
+    if suffix == ".csv":
+        df = pl.read_csv(data_path, try_parse_dates=True)
+    elif suffix in [".xlsx", ".xls"]:
+        df = pl.read_excel(data_path)
+    elif suffix in [".json", ".geojson"]:
+        df = pl.read_json(data_path)
+    else:
+        raise ValueError(f"Unsupported file format: {suffix}")
+
+    df.write_parquet(parquet_path, compression="zstd")
+    print(f"Saved to {parquet_path}")
+    print(f"Rows: {df.height:,}")
+    print(f"Columns: {df.columns}")
+    print(f"Original size: {data_path.stat().st_size / 1024**2:.1f} MB")
+    print(f"Parquet size: {parquet_path.stat().st_size / 1024**2:.1f} MB")
+
+
+def main() -> None:
+    """Download the dataset if needed and convert it to Parquet."""
+    if not DOWNLOAD_PATH.exists():
+        download_with_progress(URL, DOWNLOAD_PATH)
+    else:
+        print(f"File already exists at {DOWNLOAD_PATH}, skipping download")
+
+    # Check if it's a ZIP file
+    if zipfile.is_zipfile(DOWNLOAD_PATH):
+        extracted_files = extract_zip(DOWNLOAD_PATH, EXTRACT_PATH)
+        print(f"Extracted {len(extracted_files)} files")
+        data_file = find_data_file(EXTRACT_PATH)
+        convert_to_parquet(data_file, PARQUET_PATH)
+    else:
+        # Not a ZIP: the download keeps its ".zip" name, so the format is
+        # detected from the content instead of the extension.
+        suffix = sniff_suffix(DOWNLOAD_PATH)
+        convert_to_parquet(DOWNLOAD_PATH, PARQUET_PATH, suffix=suffix)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/download_land_registry.py b/download_land_registry.py
index e0a1699..c421bab 100644
--- a/download_land_registry.py
+++ b/download_land_registry.py
@@ -2,7 +2,7 @@
 """Download Land Registry price paid data and convert to Parquet."""
 
 # Run it with:
-# uv run --with httpx --with polars --with tqdm python download_land_registry.py
+# uv run download_land_registry.py
 
 # The download failed in this environment due to network restrictions, but the script will work on your local machine. The ~5GB CSV should compress to roughly ~1GB in Parquet format with ZSTD compression.
 