"""Download GEOLYTIX Grocery Retail Points and keep the latest CSV release.""" import argparse import re from pathlib import Path from tempfile import TemporaryDirectory from zipfile import ZipFile import polars as pl from pipeline.utils.download import download GEOLYTIX_RETAIL_POINTS_FILE_ID = "1B8M7m86rQg2sx2TsHhFa2d-x-dZ1DbSy" GEOLYTIX_RETAIL_POINTS_URL = ( "https://drive.usercontent.google.com/download" f"?id={GEOLYTIX_RETAIL_POINTS_FILE_ID}&export=download&confirm=t" ) CSV_NAME_RE = re.compile( r"^geolytix_retailpoints_v(?P\d+)_(?P\d{6})\.csv$" ) REQUIRED_COLUMNS = { "id", "retailer", "fascia", "store_name", "postcode", "long_wgs", "lat_wgs", } def select_latest_csv_name(names: list[str]) -> str: """Return the latest root-level retail points CSV from a ZIP namelist.""" candidates: list[tuple[str, int, str]] = [] for name in names: path = Path(name) if path.parent != Path("."): continue match = CSV_NAME_RE.match(path.name) if not match: continue candidates.append((match.group("release"), int(match.group("version")), name)) if not candidates: raise ValueError("No root-level GEOLYTIX retail points CSV found") return max(candidates)[2] def read_latest_csv(zip_path: Path) -> pl.DataFrame: """Read the latest root-level CSV from a GEOLYTIX ZIP file.""" with ZipFile(zip_path) as zip_file: csv_name = select_latest_csv_name(zip_file.namelist()) with zip_file.open(csv_name) as csv_file: df = pl.read_csv(csv_file, infer_schema_length=10_000) missing = REQUIRED_COLUMNS - set(df.columns) if missing: raise ValueError( f"GEOLYTIX retail points CSV is missing columns: {sorted(missing)}" ) return df def download_geolytix_retail_points(output_path: Path) -> None: """Download the GEOLYTIX ZIP, extract the latest CSV, and write parquet.""" output_path.parent.mkdir(parents=True, exist_ok=True) with TemporaryDirectory(prefix="geolytix_retail_points_") as tmp: zip_path = Path(tmp) / "geolytix_retail_points.zip" download(GEOLYTIX_RETAIL_POINTS_URL, zip_path, timeout=300) df = read_latest_csv(zip_path) df.write_parquet(output_path) size_mb = output_path.stat().st_size / (1024 * 1024) print(f"Wrote {output_path} ({size_mb:.1f} MB, {len(df):,} stores)") def main() -> None: parser = argparse.ArgumentParser( description="Download GEOLYTIX Grocery Retail Points" ) parser.add_argument( "--output", type=Path, required=True, help="Output parquet file path" ) args = parser.parse_args() download_geolytix_retail_points(args.output) if __name__ == "__main__": main()