96 lines
2.8 KiB
Python
96 lines
2.8 KiB
Python
"""Download GEOLYTIX Grocery Retail Points and keep the latest CSV release."""
|
|
|
|
import argparse
|
|
import re
|
|
from pathlib import Path
|
|
from tempfile import TemporaryDirectory
|
|
from zipfile import ZipFile
|
|
|
|
import polars as pl
|
|
|
|
from pipeline.utils.download import download
|
|
|
|
|
|
# Google Drive file id of the GEOLYTIX retail points download.
GEOLYTIX_RETAIL_POINTS_FILE_ID = "1B8M7m86rQg2sx2TsHhFa2d-x-dZ1DbSy"

# Direct-download endpoint; the query string below selects the file by id.
_DRIVE_DOWNLOAD_ENDPOINT = "https://drive.usercontent.google.com/download"
GEOLYTIX_RETAIL_POINTS_URL = (
    f"{_DRIVE_DOWNLOAD_ENDPOINT}"
    f"?id={GEOLYTIX_RETAIL_POINTS_FILE_ID}&export=download&confirm=t"
)

# File names look like ``geolytix_retailpoints_v<version>_<release>.csv``:
# ``version`` is one or more digits, ``release`` is exactly six digits
# (presumably a YYYYMM stamp -- confirm against an actual archive).
CSV_NAME_RE = re.compile(
    r"^geolytix_retailpoints_v(?P<version>\d+)_(?P<release>\d{6})\.csv$"
)

# Columns that must be present in the CSV for it to be accepted.
REQUIRED_COLUMNS = {
    # identity
    "id",
    "retailer",
    "fascia",
    "store_name",
    # location
    "postcode",
    "long_wgs",
    "lat_wgs",
}
|
|
|
|
|
|
def select_latest_csv_name(names: list[str]) -> str:
    """Pick the newest root-level retail points CSV out of a ZIP namelist.

    Entries that sit inside a sub-directory, and entries whose file name does
    not match ``CSV_NAME_RE``, are ignored. The remaining entries are ranked
    by release string first, then by numeric version, then by the entry name
    itself; the highest-ranked entry name is returned exactly as given.

    Args:
        names: Archive member names, e.g. the result of ``ZipFile.namelist()``.

    Returns:
        The member name of the highest-ranked matching CSV.

    Raises:
        ValueError: If no entry qualifies.
    """
    root = Path(".")
    # Pair every root-level entry with its (possibly failed) regex match.
    matches = (
        (entry, CSV_NAME_RE.match(Path(entry).name))
        for entry in names
        if Path(entry).parent == root
    )
    # Tuples compare element-wise, so the field order here is the ranking.
    ranked = [
        (found["release"], int(found["version"]), entry)
        for entry, found in matches
        if found
    ]

    if not ranked:
        raise ValueError("No root-level GEOLYTIX retail points CSV found")

    _release, _version, latest = max(ranked)
    return latest
|
|
|
|
|
|
def read_latest_csv(zip_path: Path) -> pl.DataFrame:
    """Load the newest root-level retail points CSV of a ZIP into a DataFrame.

    Args:
        zip_path: Location of the GEOLYTIX ZIP archive on disk.

    Returns:
        The CSV contents as parsed by ``pl.read_csv``.

    Raises:
        ValueError: If the archive holds no matching CSV, or if the parsed
            CSV lacks any of ``REQUIRED_COLUMNS``.
    """
    with ZipFile(zip_path) as archive:
        member = select_latest_csv_name(archive.namelist())
        # Parse straight from the archive member; nothing is extracted to disk.
        with archive.open(member) as handle:
            frame = pl.read_csv(handle, infer_schema_length=10_000)

    absent = REQUIRED_COLUMNS.difference(frame.columns)
    if not absent:
        return frame

    # Sorted so the error message is stable regardless of set ordering.
    raise ValueError(
        f"GEOLYTIX retail points CSV is missing columns: {sorted(absent)}"
    )
|
|
|
|
|
|
def download_geolytix_retail_points(output_path: Path) -> None:
    """Fetch the GEOLYTIX ZIP and store its newest CSV as a parquet file.

    The archive is downloaded into a temporary directory that is removed once
    the CSV has been read. Missing parent directories of ``output_path`` are
    created, and a one-line summary is printed after writing.

    Args:
        output_path: Destination of the parquet file.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with TemporaryDirectory(prefix="geolytix_retail_points_") as scratch_dir:
        archive_path = Path(scratch_dir, "geolytix_retail_points.zip")
        # timeout is handed to the project's download helper
        # (presumably seconds -- confirm against pipeline.utils.download).
        download(GEOLYTIX_RETAIL_POINTS_URL, archive_path, timeout=300)
        stores = read_latest_csv(archive_path)

    stores.write_parquet(output_path)

    megabytes = output_path.stat().st_size / (1024 * 1024)
    print(f"Wrote {output_path} ({megabytes:.1f} MB, {len(stores):,} stores)")
|
|
|
|
|
|
def main() -> None:
    """Parse the command line and run the download.

    Accepts a single required option, ``--output``, which is converted to a
    ``Path`` and passed to ``download_geolytix_retail_points``.
    """
    description = "Download GEOLYTIX Grocery Retail Points"
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    options = cli.parse_args()

    download_geolytix_retail_points(options.output)


if __name__ == "__main__":
    main()
|