More
This commit is contained in:
parent
cd34ee693f
commit
05a1f316e1
58 changed files with 3113 additions and 1277 deletions
|
|
@ -221,7 +221,7 @@ def main() -> None:
|
|||
deleted = _delete_files(args.travel_times, bad_files)
|
||||
print(f"Deleted {deleted}/{len(bad_files)} files.")
|
||||
else:
|
||||
print(f"\nRun with --delete to remove these files so R5 can recompute them.")
|
||||
print("\nRun with --delete to remove these files so R5 can recompute them.")
|
||||
else:
|
||||
print("\nNo corrupted files found.")
|
||||
|
||||
|
|
|
|||
98
pipeline/download/geolytix_retail_points.py
Normal file
98
pipeline/download/geolytix_retail_points.py
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
"""Download GEOLYTIX Grocery Retail Points and keep the latest CSV release."""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
from zipfile import ZipFile
|
||||
|
||||
import polars as pl
|
||||
|
||||
from pipeline.utils.download import download
|
||||
|
||||
|
||||
# Google Drive file ID of the GEOLYTIX retail points ZIP archive.
GEOLYTIX_RETAIL_POINTS_FILE_ID = "1B8M7m86rQg2sx2TsHhFa2d-x-dZ1DbSy"
# Download URL built from the file ID above.
# NOTE(review): `confirm=t` presumably skips Drive's confirmation page - verify.
GEOLYTIX_RETAIL_POINTS_URL = (
    "https://drive.usercontent.google.com/download"
    f"?id={GEOLYTIX_RETAIL_POINTS_FILE_ID}&export=download&confirm=t"
)
|
||||
|
||||
# Matches file names such as "geolytix_retailpoints_v43_202603.csv" and captures
# the numeric version plus the six-digit release stamp (presumably YYYYMM).
CSV_NAME_RE = re.compile(
    r"^geolytix_retailpoints_v(?P<version>\d+)_(?P<release>\d{6})\.csv$"
)
|
||||
|
||||
# Columns the extracted CSV must contain; read_latest_csv raises ValueError
# when any of them is absent.
REQUIRED_COLUMNS = {
    "id",
    "retailer",
    "fascia",
    "store_name",
    "postcode",
    "long_wgs",
    "lat_wgs",
}
|
||||
|
||||
|
||||
def select_latest_csv_name(names: list[str]) -> str:
    """Pick the newest top-level retail points CSV out of ZIP member names.

    Only members that sit at the archive root and whose file name matches
    ``CSV_NAME_RE`` are considered. Candidates are ordered by release stamp
    first and version number second.

    Args:
        names: Member names as returned by ``ZipFile.namelist()``.

    Returns:
        The member name of the winning CSV, exactly as it appeared in ``names``.

    Raises:
        ValueError: If no root-level member matches the expected naming scheme.
    """
    best: tuple[str, int, str] | None = None

    for member in names:
        member_path = Path(member)
        # Members inside a folder (e.g. "Previous Versions/...") never compete.
        if member_path.parent != Path("."):
            continue

        parsed = CSV_NAME_RE.match(member_path.name)
        if parsed is None:
            continue

        # Fixed-width digit strings compare correctly as text, so the release
        # stamp can stay a string while the version is compared numerically.
        key = (parsed["release"], int(parsed["version"]), member)
        if best is None or key > best:
            best = key

    if best is None:
        raise ValueError("No root-level GEOLYTIX retail points CSV found")

    return best[2]
|
||||
|
||||
|
||||
def read_latest_csv(zip_path: Path) -> pl.DataFrame:
    """Load the newest root-level retail points CSV from a GEOLYTIX ZIP.

    Args:
        zip_path: Location of the downloaded ZIP archive.

    Returns:
        The CSV contents as a polars DataFrame.

    Raises:
        ValueError: If the archive holds no matching CSV, or the CSV lacks any
            of ``REQUIRED_COLUMNS``.
    """
    with ZipFile(zip_path) as archive:
        member = select_latest_csv_name(archive.namelist())
        with archive.open(member) as handle:
            frame = pl.read_csv(handle, infer_schema_length=10_000)

    absent = REQUIRED_COLUMNS.difference(frame.columns)
    if not absent:
        return frame

    raise ValueError(
        f"GEOLYTIX retail points CSV is missing columns: {sorted(absent)}"
    )
|
||||
|
||||
|
||||
def download_geolytix_retail_points(output_path: Path) -> None:
    """Fetch the GEOLYTIX ZIP, read its newest CSV and store it as parquet.

    The archive is downloaded into a temporary directory that is removed once
    the CSV has been loaded. A one-line summary is printed after writing.

    Args:
        output_path: Destination parquet file; parent directories are created
            if they do not exist.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with TemporaryDirectory(prefix="geolytix_retail_points_") as scratch_dir:
        archive_path = Path(scratch_dir, "geolytix_retail_points.zip")
        download(GEOLYTIX_RETAIL_POINTS_URL, archive_path, timeout=300)
        stores = read_latest_csv(archive_path)

    stores.write_parquet(output_path)

    written_mb = output_path.stat().st_size / (1024 * 1024)
    print(f"Wrote {output_path} ({written_mb:.1f} MB, {len(stores):,} stores)")
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: parse ``--output`` and run the download."""
    cli = argparse.ArgumentParser(
        description="Download GEOLYTIX Grocery Retail Points"
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    options = cli.parse_args()

    download_geolytix_retail_points(options.output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
41
pipeline/download/test_geolytix_retail_points.py
Normal file
41
pipeline/download/test_geolytix_retail_points.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
from zipfile import ZipFile
|
||||
|
||||
import polars as pl
|
||||
|
||||
from pipeline.download.geolytix_retail_points import (
|
||||
read_latest_csv,
|
||||
select_latest_csv_name,
|
||||
)
|
||||
|
||||
|
||||
def test_select_latest_csv_ignores_previous_versions():
    """Nested CSVs are ignored even when their version and release are higher."""
    newest_root_csv = "geolytix_retailpoints_v43_202603.csv"
    members = [
        "README.txt",
        "geolytix_retailpoints_v41_202602.csv",
        newest_root_csv,
        "Previous Versions/geolytix_retailpoints_v99_209901.csv",
    ]

    chosen = select_latest_csv_name(members)

    assert chosen == newest_root_csv
|
||||
|
||||
|
||||
def test_read_latest_csv_validates_required_columns(tmp_path):
    """The newer CSV is read; the older one lacks the required columns."""
    expected = pl.DataFrame(
        {
            "id": [1],
            "retailer": ["Waitrose"],
            "fascia": ["Waitrose"],
            "store_name": ["Waitrose Test"],
            "postcode": ["SW1A 1AA"],
            "long_wgs": [-0.1],
            "lat_wgs": [51.5],
        }
    )
    archive_path = tmp_path / "retail_points.zip"

    with ZipFile(archive_path, "w") as archive:
        # Older release with the wrong header: reading it would raise.
        archive.writestr("geolytix_retailpoints_v1_202401.csv", "not,the,latest\n")
        with archive.open("geolytix_retailpoints_v2_202402.csv", "w") as member:
            expected.write_csv(member)

    loaded = read_latest_csv(archive_path)

    assert loaded.to_dicts() == expected.to_dicts()
|
||||
59
pipeline/transform/test_transform_poi.py
Normal file
59
pipeline/transform/test_transform_poi.py
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.transform_poi import transform_grocery_retail_points
|
||||
|
||||
|
||||
def test_transform_grocery_retail_points_outputs_chain_categories():
    """Retailers map to display names and ids gain the ``glx-`` prefix."""
    raw = pl.DataFrame(
        {
            "id": [101, 102, 103],
            "retailer": ["Waitrose", "Sainsburys", "The Co-operative Group"],
            "fascia": ["Waitrose", "Sainsbury's Local", "Co-op Food"],
            "store_name": ["Waitrose Test", "Sainsbury''s Test", "Co-op Test"],
            "long_wgs": [-0.141, -0.142, -0.143],
            "lat_wgs": [51.515, 51.516, 51.517],
        }
    )
    # (id, name, category) per expected output row, in input order.
    expected_rows = [
        ("glx-101", "Waitrose Test", "Waitrose"),
        ("glx-102", "Sainsbury's Test", "Sainsbury's"),
        ("glx-103", "Co-op Test", "Co-op"),
    ]
    expected = [
        {
            "id": poi_id,
            "name": name,
            "category": category,
            "group": "Groceries",
            "emoji": "🛒",
        }
        for poi_id, name, category in expected_rows
    ]

    result = transform_grocery_retail_points(raw)

    actual = result.select("id", "name", "category", "group", "emoji").to_dicts()
    assert actual == expected
|
||||
|
||||
|
||||
def test_transform_grocery_retail_points_drops_invalid_rows():
    """A row whose retailer is an empty string is removed from the output."""
    raw = pl.DataFrame(
        {
            "id": [101, 102],
            "retailer": ["Waitrose", ""],
            "fascia": ["Waitrose", "Tesco"],
            "store_name": ["Waitrose Test", "Tesco Test"],
            "long_wgs": [-0.141, -0.142],
            "lat_wgs": [51.515, 51.516],
        }
    )

    categories = transform_grocery_retail_points(raw)["category"].to_list()

    assert categories == ["Waitrose"]
|
||||
|
|
@ -1058,10 +1058,91 @@ NAPTAN_EMOJIS: dict[str, str] = {
|
|||
}
|
||||
|
||||
|
||||
# Retailer values that are all displayed under the single name "Co-op"
# (see GROCERY_RETAILER_DISPLAY_NAMES).
COOP_RETAILERS = {
    "Allendale Co-operative Society",
    "Central England Co-operative",
    "Channel Islands Co-operative Society",
    "Chelmsford Star Co-operative Society",
    "Clydebank Co-operative",
    "Coniston Co-operative Society",
    "East of England Co-operative",
    "Heart of England Co-operative",
    "Langdale Co-operative Society",
    "Lincolnshire Co-operative",
    "Midcounties Co-operative",
    "Scottish Midland Co-operative",
    "Tamworth Co-operative Society",
    "The Co-operative Group",
    "The Radstock Co-operative Society",
    "The Southern Co-operative",
}
|
||||
|
||||
# Raw retailer value -> display name. Retailers without an entry keep their
# raw value (normalize_grocery_retailer falls back to the input).
GROCERY_RETAILER_DISPLAY_NAMES: dict[str, str] = {
    "Cook": "COOK",
    "Heron": "Heron Foods",
    "Marks and Spencer": "M&S",
    "Sainsburys": "Sainsbury's",
    # Every co-operative society collapses into one "Co-op" category.
    **{retailer: "Co-op" for retailer in COOP_RETAILERS},
}
|
||||
|
||||
|
||||
def normalize_grocery_retailer(retailer: str | None) -> str:
    """Map a raw retailer value to its display name.

    ``None`` becomes an empty string; values without an entry in
    ``GROCERY_RETAILER_DISPLAY_NAMES`` are returned unchanged.
    """
    if retailer is not None:
        return GROCERY_RETAILER_DISPLAY_NAMES.get(retailer, retailer)
    return ""
|
||||
|
||||
|
||||
def transform_grocery_retail_points(
    grocery_df: pl.DataFrame,
    boundary_path: Path | None = None,
) -> pl.DataFrame:
    """Convert GEOLYTIX Grocery Retail Points into the POI parquet schema.

    Args:
        grocery_df: Raw retail points. Must contain the columns ``id``,
            ``retailer``, ``fascia``, ``store_name``, ``long_wgs`` and
            ``lat_wgs``.
        boundary_path: Optional boundary file handed to ``in_england_mask``.
            When given, only rows for which the mask is true are kept.

    Returns:
        A frame with the columns ``id`` (prefixed with ``glx-``), ``name``,
        ``category``, ``group``, ``lat``, ``lng`` and ``emoji``.

    Raises:
        ValueError: If any required column is missing from ``grocery_df``.
    """
    required = {"id", "retailer", "fascia", "store_name", "long_wgs", "lat_wgs"}
    missing = required - set(grocery_df.columns)
    if missing:
        raise ValueError(
            f"GEOLYTIX retail points missing columns: {sorted(missing)}"
        )

    df = (
        grocery_df.select(
            pl.col("id").cast(pl.String),
            pl.col("retailer").cast(pl.String),
            pl.col("fascia").cast(pl.String),
            pl.col("store_name").cast(pl.String),
            pl.col("lat_wgs").cast(pl.Float64).alias("lat"),
            pl.col("long_wgs").cast(pl.Float64).alias("lng"),
        )
        # Rows without an id, retailer or coordinates cannot become POIs.
        .drop_nulls(["id", "retailer", "lat", "lng"])
        .filter(pl.col("retailer").str.len_chars() > 0)
    )

    # The boundary check is skipped entirely for an empty frame.
    if boundary_path is not None and len(df) > 0:
        mask = in_england_mask(
            boundary_path,
            df["lat"].to_numpy(),
            df["lng"].to_numpy(),
        )
        df = df.filter(pl.Series(mask))

    return df.with_columns(
        pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
        # First non-null of store name, fascia, retailer; doubled apostrophes
        # in the source text are collapsed to a single one.
        pl.coalesce(["store_name", "fascia", "retailer"])
        .str.replace_all("''", "'")
        .alias("name"),
        # Native dictionary lookup instead of a per-row Python callback.
        # Retailers absent from the mapping keep their original value, and
        # null retailers were already dropped above.
        pl.col("retailer")
        .replace(GROCERY_RETAILER_DISPLAY_NAMES)
        .alias("category"),
        pl.lit("Groceries").alias("group"),
        pl.lit("🛒").alias("emoji"),
    ).select("id", "name", "category", "group", "lat", "lng", "emoji")
|
||||
|
||||
|
||||
def transform(
|
||||
input_path: Path,
|
||||
naptan_path: Path | None = None,
|
||||
boundary_path: Path | None = None,
|
||||
grocery_retail_points_path: Path | None = None,
|
||||
) -> pl.LazyFrame:
|
||||
lf = pl.scan_parquet(input_path)
|
||||
|
||||
|
|
@ -1123,7 +1204,14 @@ def transform(
|
|||
pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
|
||||
pl.lit("Public Transport").alias("group"),
|
||||
)
|
||||
return pl.concat([lf, naptan], how="diagonal_relaxed")
|
||||
|
||||
frames = [lf, naptan]
|
||||
if grocery_retail_points_path is not None:
|
||||
grocery_df = pl.read_parquet(grocery_retail_points_path)
|
||||
grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
|
||||
frames.append(grocery_pois.lazy())
|
||||
|
||||
return pl.concat(frames, how="diagonal_relaxed")
|
||||
|
||||
|
||||
def main():
|
||||
|
|
@ -1142,12 +1230,22 @@ def main():
|
|||
required=True,
|
||||
help="England boundary GeoJSON file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--grocery-retail-points",
|
||||
type=Path,
|
||||
help="GEOLYTIX Grocery Retail Points parquet",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
df = transform(args.input, args.naptan, args.boundary).collect(engine="streaming")
|
||||
df = transform(
|
||||
args.input,
|
||||
args.naptan,
|
||||
args.boundary,
|
||||
args.grocery_retail_points,
|
||||
).collect(engine="streaming")
|
||||
|
||||
df.write_parquet(args.output)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue