More

2026-05-04 17:21:26 +01:00 · 2026-05-04 17:21:26 +01:00 · 05a1f316e1
commit 05a1f316e1
parent cd34ee693f
58 changed files with 3113 additions and 1277 deletions
--- a/pipeline/check_travel_times.py
+++ b/pipeline/check_travel_times.py
@ -221,7 +221,7 @@ def main() -> None:
            deleted = _delete_files(args.travel_times, bad_files)
            print(f"Deleted {deleted}/{len(bad_files)} files.")
        else:
-            print(f"\nRun with --delete to remove these files so R5 can recompute them.")
+            print("\nRun with --delete to remove these files so R5 can recompute them.")
    else:
        print("\nNo corrupted files found.")

--- a/pipeline/download/geolytix_retail_points.py
+++ b/pipeline/download/geolytix_retail_points.py
@ -0,0 +1,98 @@
+"""Download GEOLYTIX Grocery Retail Points and keep the latest CSV release."""
+
+import argparse
+import re
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from zipfile import ZipFile
+
+import polars as pl
+
+from pipeline.utils.download import download
+
+
+# File id interpolated into the Google Drive download URL below.
+GEOLYTIX_RETAIL_POINTS_FILE_ID = "1B8M7m86rQg2sx2TsHhFa2d-x-dZ1DbSy"
+GEOLYTIX_RETAIL_POINTS_URL = (
+    "https://drive.usercontent.google.com/download"
+    f"?id={GEOLYTIX_RETAIL_POINTS_FILE_ID}&export=download&confirm=t"
+)
+
+# Matches CSV file names such as "geolytix_retailpoints_v43_202603.csv" and
+# captures the numeric version plus a six-digit release stamp
+# (presumably YYYYMM -- confirm against the publisher's naming scheme).
+CSV_NAME_RE = re.compile(
+    r"^geolytix_retailpoints_v(?P<version>\d+)_(?P<release>\d{6})\.csv$"
+)
+
+# Columns read_latest_csv requires; it raises ValueError if any is missing.
+REQUIRED_COLUMNS = {
+    "id",
+    "retailer",
+    "fascia",
+    "store_name",
+    "postcode",
+    "long_wgs",
+    "lat_wgs",
+}
+
+
+def select_latest_csv_name(names: list[str]) -> str:
+    """Pick the newest root-level retail points CSV from a ZIP namelist.
+
+    Entries inside sub-directories and entries whose file name does not match
+    ``CSV_NAME_RE`` are ignored. "Newest" is the greatest
+    ``(release, version, name)`` tuple.
+
+    Raises:
+        ValueError: If no root-level entry matches.
+    """
+    root_matches = (
+        (CSV_NAME_RE.match(Path(entry).name), entry)
+        for entry in names
+        if Path(entry).parent == Path(".")
+    )
+    ranked = [
+        (hit.group("release"), int(hit.group("version")), entry)
+        for hit, entry in root_matches
+        if hit
+    ]
+
+    if ranked:
+        # The release stamp is fixed-width, so string order is numeric order.
+        return max(ranked)[2]
+
+    raise ValueError("No root-level GEOLYTIX retail points CSV found")
+
+
+def read_latest_csv(zip_path: Path) -> pl.DataFrame:
+    """Load the newest root-level CSV inside a GEOLYTIX ZIP as a DataFrame.
+
+    Raises:
+        ValueError: If the archive has no matching CSV, or the CSV lacks any
+            of ``REQUIRED_COLUMNS``.
+    """
+    with ZipFile(zip_path) as archive:
+        latest_member = select_latest_csv_name(archive.namelist())
+        with archive.open(latest_member) as handle:
+            frame = pl.read_csv(handle, infer_schema_length=10_000)
+
+    absent = REQUIRED_COLUMNS.difference(frame.columns)
+    if not absent:
+        return frame
+
+    raise ValueError(
+        f"GEOLYTIX retail points CSV is missing columns: {sorted(absent)}"
+    )
+
+
+def download_geolytix_retail_points(output_path: Path) -> None:
+    """Fetch the GEOLYTIX ZIP and store its newest CSV as parquet.
+
+    The archive is downloaded into a temporary directory that is removed once
+    the CSV has been read; only the parquet file at ``output_path`` remains.
+    Parent directories of ``output_path`` are created when missing.
+    """
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with TemporaryDirectory(prefix="geolytix_retail_points_") as scratch_dir:
+        archive_path = Path(scratch_dir, "geolytix_retail_points.zip")
+        download(GEOLYTIX_RETAIL_POINTS_URL, archive_path, timeout=300)
+        # Read fully into memory before the temporary directory disappears.
+        stores = read_latest_csv(archive_path)
+
+    stores.write_parquet(output_path)
+    written_bytes = output_path.stat().st_size
+    size_mb = written_bytes / (1024 * 1024)
+    print(f"Wrote {output_path} ({size_mb:.1f} MB, {len(stores):,} stores)")
+
+
+def main() -> None:
+    """Command-line entry point: parse ``--output`` and run the download."""
+    cli = argparse.ArgumentParser(
+        description="Download GEOLYTIX Grocery Retail Points"
+    )
+    cli.add_argument(
+        "--output", type=Path, required=True, help="Output parquet file path"
+    )
+    options = cli.parse_args()
+    download_geolytix_retail_points(options.output)
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/download/test_geolytix_retail_points.py
+++ b/pipeline/download/test_geolytix_retail_points.py
@ -0,0 +1,41 @@
+from zipfile import ZipFile
+
+import polars as pl
+
+from pipeline.download.geolytix_retail_points import (
+    read_latest_csv,
+    select_latest_csv_name,
+)
+
+
+def test_select_latest_csv_ignores_previous_versions():
+    """Only root-level CSVs compete, even when a nested one looks newer."""
+    archive_members = [
+        "README.txt",
+        "geolytix_retailpoints_v41_202602.csv",
+        "geolytix_retailpoints_v43_202603.csv",
+        "Previous Versions/geolytix_retailpoints_v99_209901.csv",
+    ]
+
+    chosen = select_latest_csv_name(archive_members)
+
+    assert chosen == "geolytix_retailpoints_v43_202603.csv"
+
+
+def test_read_latest_csv_validates_required_columns(tmp_path):
+    """The newest CSV in the archive is the one that is read and validated.
+
+    The older member lacks the required columns, so reading it instead would
+    make ``read_latest_csv`` raise.
+    """
+    expected = pl.DataFrame(
+        {
+            "id": [1],
+            "retailer": ["Waitrose"],
+            "fascia": ["Waitrose"],
+            "store_name": ["Waitrose Test"],
+            "postcode": ["SW1A 1AA"],
+            "long_wgs": [-0.1],
+            "lat_wgs": [51.5],
+        }
+    )
+    archive_path = tmp_path / "retail_points.zip"
+
+    with ZipFile(archive_path, "w") as archive:
+        archive.writestr("geolytix_retailpoints_v1_202401.csv", "not,the,latest\n")
+        with archive.open("geolytix_retailpoints_v2_202402.csv", "w") as member:
+            expected.write_csv(member)
+
+    loaded = read_latest_csv(archive_path)
+
+    assert loaded.to_dicts() == expected.to_dicts()
--- a/pipeline/transform/test_transform_poi.py
+++ b/pipeline/transform/test_transform_poi.py
@ -0,0 +1,59 @@
+import polars as pl
+
+from pipeline.transform.transform_poi import transform_grocery_retail_points
+
+
+def test_transform_grocery_retail_points_outputs_chain_categories():
+    """Retailers map to display categories and doubled apostrophes collapse."""
+    stores = pl.DataFrame(
+        {
+            "id": [101, 102, 103],
+            "retailer": ["Waitrose", "Sainsburys", "The Co-operative Group"],
+            "fascia": ["Waitrose", "Sainsbury's Local", "Co-op Food"],
+            "store_name": ["Waitrose Test", "Sainsbury''s Test", "Co-op Test"],
+            "long_wgs": [-0.141, -0.142, -0.143],
+            "lat_wgs": [51.515, 51.516, 51.517],
+        }
+    )
+    expected_rows = [
+        {
+            "id": "glx-101",
+            "name": "Waitrose Test",
+            "category": "Waitrose",
+            "group": "Groceries",
+            "emoji": "🛒",
+        },
+        {
+            "id": "glx-102",
+            "name": "Sainsbury's Test",
+            "category": "Sainsbury's",
+            "group": "Groceries",
+            "emoji": "🛒",
+        },
+        {
+            "id": "glx-103",
+            "name": "Co-op Test",
+            "category": "Co-op",
+            "group": "Groceries",
+            "emoji": "🛒",
+        },
+    ]
+
+    result = transform_grocery_retail_points(stores)
+    projected = result.select("id", "name", "category", "group", "emoji")
+
+    assert projected.to_dicts() == expected_rows
+
+
+def test_transform_grocery_retail_points_drops_invalid_rows():
+    """A row whose retailer is an empty string is left out of the output."""
+    stores = pl.DataFrame(
+        {
+            "id": [101, 102],
+            "retailer": ["Waitrose", ""],
+            "fascia": ["Waitrose", "Tesco"],
+            "store_name": ["Waitrose Test", "Tesco Test"],
+            "long_wgs": [-0.141, -0.142],
+            "lat_wgs": [51.515, 51.516],
+        }
+    )
+
+    categories = transform_grocery_retail_points(stores)["category"].to_list()
+
+    assert categories == ["Waitrose"]
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -1058,10 +1058,91 @@ NAPTAN_EMOJIS: dict[str, str] = {
 }


+# Retailer names that are all shown under the single "Co-op" display name
+# (they are expanded into GROCERY_RETAILER_DISPLAY_NAMES below).
+COOP_RETAILERS = {
+    "Allendale Co-operative Society",
+    "Central England Co-operative",
+    "Channel Islands Co-operative Society",
+    "Chelmsford Star Co-operative Society",
+    "Clydebank Co-operative",
+    "Coniston Co-operative Society",
+    "East of England Co-operative",
+    "Heart of England Co-operative",
+    "Langdale Co-operative Society",
+    "Lincolnshire Co-operative",
+    "Midcounties Co-operative",
+    "Scottish Midland Co-operative",
+    "Tamworth Co-operative Society",
+    "The Co-operative Group",
+    "The Radstock Co-operative Society",
+    "The Southern Co-operative",
+}
+
+# Raw retailer value -> display name. Retailers without an entry keep their
+# raw name when looked up through normalize_grocery_retailer.
+GROCERY_RETAILER_DISPLAY_NAMES: dict[str, str] = {
+    "Cook": "COOK",
+    "Heron": "Heron Foods",
+    "Marks and Spencer": "M&S",
+    "Sainsburys": "Sainsbury's",
+    **{retailer: "Co-op" for retailer in COOP_RETAILERS},
+}
+
+
+def normalize_grocery_retailer(retailer: str | None) -> str:
+    """Return the display name for a raw retailer value.
+
+    ``None`` becomes an empty string; retailers without an entry in
+    ``GROCERY_RETAILER_DISPLAY_NAMES`` are returned unchanged.
+    """
+    if retailer is not None:
+        return GROCERY_RETAILER_DISPLAY_NAMES.get(retailer, retailer)
+    return ""
+
+
+def transform_grocery_retail_points(
+    grocery_df: pl.DataFrame,
+    boundary_path: Path | None = None,
+) -> pl.DataFrame:
+    """Convert GEOLYTIX Grocery Retail Points into the POI parquet schema.
+
+    Rows with a null id, retailer, latitude or longitude, or with an empty
+    retailer string, are dropped. When ``boundary_path`` is given, the
+    remaining rows are additionally filtered with ``in_england_mask``.
+
+    Args:
+        grocery_df: Raw retail points; must contain the columns ``id``,
+            ``retailer``, ``fascia``, ``store_name``, ``long_wgs`` and
+            ``lat_wgs``.
+        boundary_path: Optional boundary file handed to ``in_england_mask``.
+
+    Returns:
+        A frame with the columns ``id``, ``name``, ``category``, ``group``,
+        ``lat``, ``lng`` and ``emoji``.
+
+    Raises:
+        ValueError: If any required column is missing from ``grocery_df``.
+    """
+    required = {"id", "retailer", "fascia", "store_name", "long_wgs", "lat_wgs"}
+    missing = required - set(grocery_df.columns)
+    if missing:
+        raise ValueError(
+            f"GEOLYTIX retail points missing columns: {sorted(missing)}"
+        )
+
+    df = (
+        grocery_df.select(
+            pl.col("id").cast(pl.String),
+            pl.col("retailer").cast(pl.String),
+            pl.col("fascia").cast(pl.String),
+            pl.col("store_name").cast(pl.String),
+            pl.col("lat_wgs").cast(pl.Float64).alias("lat"),
+            pl.col("long_wgs").cast(pl.Float64).alias("lng"),
+        )
+        .drop_nulls(["id", "retailer", "lat", "lng"])
+        .filter(pl.col("retailer").str.len_chars() > 0)
+    )
+
+    # Skip the boundary test entirely when nothing is left to test.
+    if boundary_path is not None and len(df) > 0:
+        # NOTE(review): in_england_mask is assumed to return one boolean per
+        # input coordinate, in input order -- confirm against its definition.
+        mask = in_england_mask(
+            boundary_path,
+            df["lat"].to_numpy(),
+            df["lng"].to_numpy(),
+        )
+        df = df.filter(pl.Series(mask))
+
+    return df.with_columns(
+        pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
+        # First non-null of store name, fascia, retailer; doubled apostrophes
+        # in the source text are collapsed to a single one.
+        pl.coalesce(["store_name", "fascia", "retailer"])
+        .str.replace_all("''", "'")
+        .alias("name"),
+        # Native dictionary lookup instead of a per-row Python callback:
+        # retailers without an entry keep their raw name, and null retailers
+        # were already dropped above, so the result is unchanged.
+        pl.col("retailer")
+        .replace(GROCERY_RETAILER_DISPLAY_NAMES)
+        .alias("category"),
+        pl.lit("Groceries").alias("group"),
+        pl.lit("🛒").alias("emoji"),
+    ).select("id", "name", "category", "group", "lat", "lng", "emoji")
+
+
 def transform(
    input_path: Path,
    naptan_path: Path | None = None,
    boundary_path: Path | None = None,
+    grocery_retail_points_path: Path | None = None,
 ) -> pl.LazyFrame:
    lf = pl.scan_parquet(input_path)

@ -1123,7 +1204,14 @@ def transform(
        pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
        pl.lit("Public Transport").alias("group"),
    )
-    return pl.concat([lf, naptan], how="diagonal_relaxed")
+
+    frames = [lf, naptan]
+    if grocery_retail_points_path is not None:
+        grocery_df = pl.read_parquet(grocery_retail_points_path)
+        grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
+        frames.append(grocery_pois.lazy())
+
+    return pl.concat(frames, how="diagonal_relaxed")


 def main():
@ -1142,12 +1230,22 @@ def main():
        required=True,
        help="England boundary GeoJSON file",
    )
+    parser.add_argument(
+        "--grocery-retail-points",
+        type=Path,
+        help="GEOLYTIX Grocery Retail Points parquet",
+    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output filtered POIs parquet file"
    )
    args = parser.parse_args()

-    df = transform(args.input, args.naptan, args.boundary).collect(engine="streaming")
+    df = transform(
+        args.input,
+        args.naptan,
+        args.boundary,
+        args.grocery_retail_points,
+    ).collect(engine="streaming")

    df.write_parquet(args.output)