More
Some checks failed
CI / Check (push) Failing after 2m14s
Build and publish Docker image / build-and-push (push) Failing after 2m38s

This commit is contained in:
Andras Schmelczer 2026-05-04 17:21:26 +01:00
parent cd34ee693f
commit 05a1f316e1
58 changed files with 3113 additions and 1277 deletions

View file

@ -221,7 +221,7 @@ def main() -> None:
deleted = _delete_files(args.travel_times, bad_files)
print(f"Deleted {deleted}/{len(bad_files)} files.")
else:
print(f"\nRun with --delete to remove these files so R5 can recompute them.")
print("\nRun with --delete to remove these files so R5 can recompute them.")
else:
print("\nNo corrupted files found.")

View file

@ -0,0 +1,98 @@
"""Download GEOLYTIX Grocery Retail Points and keep the latest CSV release."""
import argparse
import re
from pathlib import Path
from tempfile import TemporaryDirectory
from zipfile import ZipFile
import polars as pl
from pipeline.utils.download import download
# Google Drive file id of the GEOLYTIX retail points ZIP archive.
GEOLYTIX_RETAIL_POINTS_FILE_ID = "1B8M7m86rQg2sx2TsHhFa2d-x-dZ1DbSy"
# Direct-download URL for that file.
# NOTE(review): `confirm=t` presumably bypasses Drive's download confirmation
# page -- confirm against a real download.
GEOLYTIX_RETAIL_POINTS_URL = (
    "https://drive.usercontent.google.com/download"
    f"?id={GEOLYTIX_RETAIL_POINTS_FILE_ID}&export=download&confirm=t"
)
# Matches a retail points CSV file name such as
# "geolytix_retailpoints_v43_202603.csv", capturing the numeric `version` and
# the six-digit `release` stamp (presumably YYYYMM -- TODO confirm).
CSV_NAME_RE = re.compile(
    r"^geolytix_retailpoints_v(?P<version>\d+)_(?P<release>\d{6})\.csv$"
)
# Columns that must be present in the extracted CSV; read_latest_csv raises
# ValueError when any of them is missing.
REQUIRED_COLUMNS = {
    "id",
    "retailer",
    "fascia",
    "store_name",
    "postcode",
    "long_wgs",
    "lat_wgs",
}
def select_latest_csv_name(names: list[str]) -> str:
    """Pick the newest retail points CSV located at the root of a ZIP listing.

    Entries that live inside a sub-directory, or whose file name does not
    match ``CSV_NAME_RE``, are ignored. The remaining entries are ranked by
    release stamp (compared as text), then by numeric version, then by the
    entry name itself; the highest-ranked entry name is returned.

    Raises:
        ValueError: If no root-level entry matches the expected pattern.
    """
    archive_root = Path(".")
    ranked: list[tuple[str, int, str]] = []
    for entry in names:
        entry_path = Path(entry)
        # Only consider files directly at the archive root.
        found = (
            CSV_NAME_RE.match(entry_path.name)
            if entry_path.parent == archive_root
            else None
        )
        if found:
            ranked.append((found.group("release"), int(found.group("version")), entry))

    if not ranked:
        raise ValueError("No root-level GEOLYTIX retail points CSV found")

    _, _, newest = max(ranked)
    return newest
def read_latest_csv(zip_path: Path) -> pl.DataFrame:
    """Load the newest root-level retail points CSV from a GEOLYTIX ZIP archive.

    Raises:
        ValueError: If the archive holds no matching CSV, or the CSV lacks
            any of ``REQUIRED_COLUMNS``.
    """
    with ZipFile(zip_path) as archive:
        latest_member = select_latest_csv_name(archive.namelist())
        with archive.open(latest_member) as member:
            df = pl.read_csv(member, infer_schema_length=10_000)

    missing = REQUIRED_COLUMNS.difference(df.columns)
    if not missing:
        return df
    raise ValueError(
        f"GEOLYTIX retail points CSV is missing columns: {sorted(missing)}"
    )
def download_geolytix_retail_points(output_path: Path) -> None:
    """Fetch the GEOLYTIX archive and store its newest CSV as a parquet file.

    The ZIP is downloaded into a temporary directory that is discarded once
    the CSV has been read. Missing parent directories of ``output_path`` are
    created, and a one-line summary is printed after the parquet is written.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with TemporaryDirectory(prefix="geolytix_retail_points_") as scratch_dir:
        archive_path = Path(scratch_dir, "geolytix_retail_points.zip")
        download(GEOLYTIX_RETAIL_POINTS_URL, archive_path, timeout=300)
        df = read_latest_csv(archive_path)

    df.write_parquet(output_path)
    bytes_per_mb = 1024 * 1024
    size_mb = output_path.stat().st_size / bytes_per_mb
    print(f"Wrote {output_path} ({size_mb:.1f} MB, {len(df):,} stores)")
def main() -> None:
    """Parse the command line and run the download into ``--output``."""
    cli = argparse.ArgumentParser(description="Download GEOLYTIX Grocery Retail Points")
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    options = cli.parse_args()
    download_geolytix_retail_points(options.output)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,41 @@
from zipfile import ZipFile
import polars as pl
from pipeline.download.geolytix_retail_points import (
read_latest_csv,
select_latest_csv_name,
)
def test_select_latest_csv_ignores_previous_versions():
    """A CSV inside a sub-directory never wins, even with a higher version."""
    root_level = [
        "geolytix_retailpoints_v41_202602.csv",
        "geolytix_retailpoints_v43_202603.csv",
    ]
    archived = "Previous Versions/geolytix_retailpoints_v99_209901.csv"
    listing = ["README.txt", *root_level, archived]

    chosen = select_latest_csv_name(listing)

    assert chosen == "geolytix_retailpoints_v43_202603.csv"
def test_read_latest_csv_validates_required_columns(tmp_path):
    """The newest CSV in the archive is the one loaded and it passes validation.

    The older member only has the header ``not,the,latest``, which lacks the
    required columns, so selecting it instead would raise.
    """
    expected = pl.DataFrame(
        {
            "id": [1],
            "retailer": ["Waitrose"],
            "fascia": ["Waitrose"],
            "store_name": ["Waitrose Test"],
            "postcode": ["SW1A 1AA"],
            "long_wgs": [-0.1],
            "lat_wgs": [51.5],
        }
    )
    archive_path = tmp_path / "retail_points.zip"

    with ZipFile(archive_path, "w") as archive:
        archive.writestr("geolytix_retailpoints_v1_202401.csv", "not,the,latest\n")
        with archive.open("geolytix_retailpoints_v2_202402.csv", "w") as member:
            expected.write_csv(member)

    loaded = read_latest_csv(archive_path)

    assert loaded.to_dicts() == expected.to_dicts()

View file

@ -0,0 +1,59 @@
import polars as pl
from pipeline.transform.transform_poi import transform_grocery_retail_points
def test_transform_grocery_retail_points_outputs_chain_categories():
    """Retailers map to chain display names and doubled apostrophes collapse."""
    source = pl.DataFrame(
        {
            "id": [101, 102, 103],
            "retailer": ["Waitrose", "Sainsburys", "The Co-operative Group"],
            "fascia": ["Waitrose", "Sainsbury's Local", "Co-op Food"],
            "store_name": ["Waitrose Test", "Sainsbury''s Test", "Co-op Test"],
            "long_wgs": [-0.141, -0.142, -0.143],
            "lat_wgs": [51.515, 51.516, 51.517],
        }
    )
    # (id, name, category) per expected row; group and emoji are constant.
    expected_rows = [
        ("glx-101", "Waitrose Test", "Waitrose"),
        ("glx-102", "Sainsbury's Test", "Sainsbury's"),
        ("glx-103", "Co-op Test", "Co-op"),
    ]
    expected = [
        {
            "id": poi_id,
            "name": poi_name,
            "category": chain,
            "group": "Groceries",
            "emoji": "🛒",
        }
        for poi_id, poi_name, chain in expected_rows
    ]

    result = transform_grocery_retail_points(source)
    actual = result.select("id", "name", "category", "group", "emoji").to_dicts()

    assert actual == expected
def test_transform_grocery_retail_points_drops_invalid_rows():
    """A row whose retailer is an empty string is removed from the output."""
    source = pl.DataFrame(
        {
            "id": [101, 102],
            "retailer": ["Waitrose", ""],
            "fascia": ["Waitrose", "Tesco"],
            "store_name": ["Waitrose Test", "Tesco Test"],
            "long_wgs": [-0.141, -0.142],
            "lat_wgs": [51.515, 51.516],
        }
    )

    surviving_categories = transform_grocery_retail_points(source)["category"].to_list()

    assert surviving_categories == ["Waitrose"]

View file

@ -1058,10 +1058,91 @@ NAPTAN_EMOJIS: dict[str, str] = {
}
# Retailer names that GROCERY_RETAILER_DISPLAY_NAMES collapses into the single
# display name "Co-op".
COOP_RETAILERS = {
    "Allendale Co-operative Society",
    "Central England Co-operative",
    "Channel Islands Co-operative Society",
    "Chelmsford Star Co-operative Society",
    "Clydebank Co-operative",
    "Coniston Co-operative Society",
    "East of England Co-operative",
    "Heart of England Co-operative",
    "Langdale Co-operative Society",
    "Lincolnshire Co-operative",
    "Midcounties Co-operative",
    "Scottish Midland Co-operative",
    "Tamworth Co-operative Society",
    "The Co-operative Group",
    "The Radstock Co-operative Society",
    "The Southern Co-operative",
}
# Raw retailer name -> display name. Retailers without an entry keep their raw
# name; every member of COOP_RETAILERS is shown as "Co-op".
GROCERY_RETAILER_DISPLAY_NAMES: dict[str, str] = {
    "Cook": "COOK",
    "Heron": "Heron Foods",
    "Marks and Spencer": "M&S",
    "Sainsburys": "Sainsbury's",
} | dict.fromkeys(COOP_RETAILERS, "Co-op")
def normalize_grocery_retailer(retailer: str | None) -> str:
    """Translate a raw retailer name into its display name.

    ``None`` yields an empty string. A name without an entry in
    ``GROCERY_RETAILER_DISPLAY_NAMES`` is returned unchanged.
    """
    return (
        ""
        if retailer is None
        else GROCERY_RETAILER_DISPLAY_NAMES.get(retailer, retailer)
    )
def transform_grocery_retail_points(
    grocery_df: pl.DataFrame,
    boundary_path: Path | None = None,
) -> pl.DataFrame:
    """Convert GEOLYTIX Grocery Retail Points into the POI parquet schema.

    Args:
        grocery_df: Raw retail points. Must contain the columns ``id``,
            ``retailer``, ``fascia``, ``store_name``, ``long_wgs`` and
            ``lat_wgs``.
        boundary_path: Optional boundary file handed to ``in_england_mask``.
            When given (and at least one row remains), only rows for which
            the mask is true are kept.

    Returns:
        A frame with the columns ``id``, ``name``, ``category``, ``group``,
        ``lat``, ``lng`` and ``emoji``. ``id`` is prefixed with ``glx-``,
        ``category`` is the retailer's display name, ``group`` is always
        ``"Groceries"`` and ``emoji`` is always the shopping trolley.

    Raises:
        ValueError: If any required column is missing from ``grocery_df``.
    """
    required = {"id", "retailer", "fascia", "store_name", "long_wgs", "lat_wgs"}
    missing = required - set(grocery_df.columns)
    if missing:
        raise ValueError(
            f"GEOLYTIX retail points missing columns: {sorted(missing)}"
        )

    # Normalise dtypes, then discard rows that cannot become a POI: no id,
    # no retailer (null or empty string) or no coordinates.
    df = (
        grocery_df.select(
            pl.col("id").cast(pl.String),
            pl.col("retailer").cast(pl.String),
            pl.col("fascia").cast(pl.String),
            pl.col("store_name").cast(pl.String),
            pl.col("lat_wgs").cast(pl.Float64).alias("lat"),
            pl.col("long_wgs").cast(pl.Float64).alias("lng"),
        )
        .drop_nulls(["id", "retailer", "lat", "lng"])
        .filter(pl.col("retailer").str.len_chars() > 0)
    )

    # Skip the boundary lookup entirely when nothing is left to test.
    if boundary_path is not None and not df.is_empty():
        mask = in_england_mask(
            boundary_path,
            df["lat"].to_numpy(),
            df["lng"].to_numpy(),
        )
        df = df.filter(pl.Series(mask))

    return df.with_columns(
        pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
        # First non-null of store name, fascia, retailer; the source data
        # carries doubled apostrophes ("''"), which are collapsed to one.
        pl.coalesce(["store_name", "fascia", "retailer"])
        .str.replace_all("''", "'")
        .alias("name"),
        # Native mapping replace instead of a per-row Python callback:
        # unmapped retailers keep their raw name, and null retailers have
        # already been dropped above, so the result is unchanged.
        pl.col("retailer").replace(GROCERY_RETAILER_DISPLAY_NAMES).alias("category"),
        pl.lit("Groceries").alias("group"),
        pl.lit("🛒").alias("emoji"),
    ).select("id", "name", "category", "group", "lat", "lng", "emoji")
def transform(
input_path: Path,
naptan_path: Path | None = None,
boundary_path: Path | None = None,
grocery_retail_points_path: Path | None = None,
) -> pl.LazyFrame:
lf = pl.scan_parquet(input_path)
@ -1123,7 +1204,14 @@ def transform(
pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
pl.lit("Public Transport").alias("group"),
)
return pl.concat([lf, naptan], how="diagonal_relaxed")
frames = [lf, naptan]
if grocery_retail_points_path is not None:
grocery_df = pl.read_parquet(grocery_retail_points_path)
grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
frames.append(grocery_pois.lazy())
return pl.concat(frames, how="diagonal_relaxed")
def main():
@ -1142,12 +1230,22 @@ def main():
required=True,
help="England boundary GeoJSON file",
)
parser.add_argument(
"--grocery-retail-points",
type=Path,
help="GEOLYTIX Grocery Retail Points parquet",
)
parser.add_argument(
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
)
args = parser.parse_args()
df = transform(args.input, args.naptan, args.boundary).collect(engine="streaming")
df = transform(
args.input,
args.naptan,
args.boundary,
args.grocery_retail_points,
).collect(engine="streaming")
df.write_parquet(args.output)