perfect-postcode/pipeline/download/geolytix_retail_points.py
2026-05-06 23:13:58 +01:00

96 lines
2.8 KiB
Python

"""Download GEOLYTIX Grocery Retail Points and keep the latest CSV release."""
import argparse
import re
from pathlib import Path
from tempfile import TemporaryDirectory
from zipfile import ZipFile
import polars as pl
from pipeline.utils.download import download
# Google Drive file id that is substituted into the download URL below.
GEOLYTIX_RETAIL_POINTS_FILE_ID = "1B8M7m86rQg2sx2TsHhFa2d-x-dZ1DbSy"

# Direct-download URL for the file id above.
GEOLYTIX_RETAIL_POINTS_URL = (
    "https://drive.usercontent.google.com/download"
    f"?id={GEOLYTIX_RETAIL_POINTS_FILE_ID}&export=download&confirm=t"
)

# File-name pattern for a retail points CSV. It captures the numeric
# ``version`` and the six-digit ``release`` stamp; the anchors reject any
# prefix or suffix around the expected name.
CSV_NAME_RE = re.compile(
    r"^geolytix_retailpoints_v(?P<version>\d+)_(?P<release>\d{6})\.csv$"
)

# Columns that must be present in the CSV for it to be accepted.
REQUIRED_COLUMNS = {
    "id",
    "retailer",
    "fascia",
    "store_name",
    "postcode",
    "long_wgs",
    "lat_wgs",
}
def select_latest_csv_name(names: list[str]) -> str:
    """Pick the newest root-level retail points CSV out of a ZIP namelist.

    Only entries that sit directly at the archive root and whose file name
    matches ``CSV_NAME_RE`` are considered. The winner is the entry with the
    greatest release string; ties are broken by the numeric version and then
    by the entry name itself.

    Args:
        names: Archive member names, e.g. the result of ``ZipFile.namelist()``.

    Returns:
        The member name of the selected CSV, exactly as it appeared in
        ``names``.

    Raises:
        ValueError: If no entry qualifies.
    """
    archive_root = Path(".")
    # Each ranking key is (release, version, name). The release is compared
    # as a string, which is safe because the pattern fixes it at six digits;
    # the version has no fixed width, so it is compared as an int.
    ranked = [
        (found.group("release"), int(found.group("version")), entry)
        for entry in names
        if Path(entry).parent == archive_root
        and (found := CSV_NAME_RE.match(Path(entry).name)) is not None
    ]
    if not ranked:
        raise ValueError("No root-level GEOLYTIX retail points CSV found")

    _release, _version, newest = max(ranked)
    return newest
def read_latest_csv(zip_path: Path) -> pl.DataFrame:
    """Load the newest root-level retail points CSV from a GEOLYTIX ZIP.

    Args:
        zip_path: Location of the ZIP archive on disk.

    Returns:
        The CSV contents as a polars DataFrame.

    Raises:
        ValueError: If the archive holds no matching root-level CSV, or if
            the CSV lacks any column listed in ``REQUIRED_COLUMNS``.
    """
    with ZipFile(zip_path) as archive:
        member = select_latest_csv_name(archive.namelist())
        # Hand polars the open archive member as a file object.
        with archive.open(member) as handle:
            # Let polars inspect up to 10,000 rows when inferring dtypes.
            df = pl.read_csv(handle, infer_schema_length=10_000)

    missing = REQUIRED_COLUMNS.difference(df.columns)
    if not missing:
        return df
    raise ValueError(
        f"GEOLYTIX retail points CSV is missing columns: {sorted(missing)}"
    )
def download_geolytix_retail_points(output_path: Path) -> None:
    """Fetch the GEOLYTIX ZIP and store its latest CSV as a parquet file.

    Missing parent directories of ``output_path`` are created first. The
    archive is downloaded into a temporary directory, and a one-line summary
    (path, size in MB, row count) is printed once the parquet is written.

    Args:
        output_path: Destination path of the parquet file.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with TemporaryDirectory(prefix="geolytix_retail_points_") as scratch:
        archive = Path(scratch, "geolytix_retail_points.zip")
        download(GEOLYTIX_RETAIL_POINTS_URL, archive, timeout=300)
        # Parse inside the ``with`` block: the archive is deleted together
        # with the temporary directory as soon as the block exits.
        df = read_latest_csv(archive)

    df.write_parquet(output_path)

    bytes_written = output_path.stat().st_size
    size_mb = bytes_written / (1024 * 1024)
    print(f"Wrote {output_path} ({size_mb:.1f} MB, {len(df):,} stores)")
def main() -> None:
    """Command-line entry point: parse ``--output`` and run the download."""
    cli = argparse.ArgumentParser(
        description="Download GEOLYTIX Grocery Retail Points"
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    options = cli.parse_args()
    download_geolytix_retail_points(options.output)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()