all is well
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 7m0s
CI / Check (push) Failing after 7m9s

This commit is contained in:
Andras Schmelczer 2026-05-17 17:20:19 +01:00
parent eac1bd0d13
commit 2f149503bb
53 changed files with 1543 additions and 354 deletions

View file

@ -5,7 +5,6 @@ import polars as pl
from pipeline.utils.england_geometry import in_england_mask
DROP_CATEGORIES = {
# Street furniture & infrastructure
"amenity/advice",
@ -1165,49 +1164,44 @@ COOP_RETAILERS = {
"The Southern Co-operative",
}
GROCERY_RETAILER_DISPLAY_NAMES: dict[str, str] = {
"Aldi": "Aldi",
"Asda": "Asda",
"Booths": "Booths",
"Budgens": "Budgens",
"Centra": "Centra",
MIN_GROCERY_CHAIN_LOCATIONS = 5
GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES: dict[str, str] = {
"Cook": "COOK",
"Costco": "Costco",
"Dunnes Stores": "Dunnes Stores",
"Farmfoods": "Farmfoods",
"Heron": "Heron Foods",
"Iceland": "Iceland",
"Lidl": "Lidl",
"Makro": "Makro",
"Marks and Spencer": "M&S",
"Morrisons": "Morrisons",
"Planet Organic": "Planet Organic",
"Sainsburys": "Sainsbury's",
"Spar": "Spar",
"Tesco": "Tesco",
"Waitrose": "Waitrose",
"Whole Foods Market": "Whole Foods Market",
**{retailer: "Co-op" for retailer in COOP_RETAILERS},
"The Co-operative Group": "Co-op",
}
GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
**GROCERY_RETAILER_DISPLAY_NAMES,
"Aldi": "Aldi",
"Aldi Local": "Aldi",
"Asda": "Asda",
"Asda Express": "Asda Express",
"Asda Living": "Asda Living",
"Asda PFS": "Asda PFS",
"Asda PFS": "Asda",
"Asda Supercentre": "Asda Supercentre",
"Asda Supermarket": "Asda Supermarket",
"Asda Superstore": "Asda Superstore",
"Booths": "Booths",
"Budgens": "Budgens",
"Centra": "Centra",
"Cooltrader": "Heron Foods",
"Co-op Food": "Co-op",
"Cook": "COOK",
"Costco": "Costco",
"Dunnes Stores": "Dunnes Stores",
"Eurospar": "Spar",
"Eurospar PFS": "Spar",
"Farmfoods": "Farmfoods",
"Heron": "Heron Foods",
"Iceland": "Iceland",
"Lidl": "Lidl",
"Little Waitrose": "Little Waitrose",
"Little Waitrose Shell": "Little Waitrose",
"Makro": "Makro",
"Marks and Spencer": "M&S",
"Marks and Spencer BP": "M&S Food",
"Marks and Spencer Clothing": "M&S Clothing",
@ -1221,41 +1215,44 @@ GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
"Marks and Spencer Travel SF": "M&S Food",
"Morrisons Daily": "Morrisons Daily",
"Morrisons Select": "Morrisons",
"Planet Organic": "Planet Organic",
"Sainsbury's Local": "Sainsbury's Local",
"Sainsburys": "Sainsbury's",
"Sainsburys Local": "Sainsbury's Local",
"Spar": "Spar",
"Spar PFS": "Spar",
"Tesco": "Tesco",
"Tesco Express": "Tesco Express",
"Tesco Express Esso": "Tesco Express",
"Tesco Extra": "Tesco Extra",
"The Co-operative Food": "Co-op",
"The Co-operative Food PFS": "Co-op",
"The Food Warehouse": "The Food Warehouse",
"Waitrose": "Waitrose",
"Waitrose MSA": "Waitrose",
"Whole Foods Market": "Whole Foods Market",
}
def normalize_grocery_retailer(retailer: str | None) -> str:
if retailer is None:
return ""
display_name = GROCERY_RETAILER_DISPLAY_NAMES.get(retailer)
if display_name is None:
raise ValueError(f"Missing grocery retailer display name for {retailer!r}")
return display_name
retailer = retailer.strip()
return GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES.get(retailer, retailer)
def normalize_grocery_icon_category(fascia: str | None, retailer: str | None) -> str:
    """Return the icon category name for a grocery store.

    The fascia takes precedence: when it is non-empty and, after stripping
    whitespace, has an entry in ``GROCERY_FASCIA_ICON_NAMES``, that entry is
    returned. A missing, empty or unmapped fascia falls back to the
    normalized retailer name instead of raising.

    Args:
        fascia: Raw fascia name, or ``None``/empty when it is missing.
        retailer: Raw retailer name used as the fallback, or ``None``.

    Returns:
        The mapped fascia icon name, or the result of
        ``normalize_grocery_retailer(retailer)`` when no mapping applies.
    """
    if fascia:
        icon_name = GROCERY_FASCIA_ICON_NAMES.get(fascia.strip())
        if icon_name is not None:
            return icon_name
    # No usable fascia mapping: fall back to the retailer-level name.
    return normalize_grocery_retailer(retailer)
def transform_grocery_retail_points(
grocery_df: pl.DataFrame,
boundary_path: Path | None = None,
min_chain_locations: int = MIN_GROCERY_CHAIN_LOCATIONS,
) -> pl.DataFrame:
"""Convert GEOLYTIX Grocery Retail Points into the POI parquet schema."""
required = {"id", "retailer", "fascia", "store_name", "long_wgs", "lat_wgs"}
@ -1272,6 +1269,11 @@ def transform_grocery_retail_points(
pl.col("lat_wgs").cast(pl.Float64).alias("lat"),
pl.col("long_wgs").cast(pl.Float64).alias("lng"),
)
.with_columns(
pl.col("retailer").str.strip_chars(),
pl.col("fascia").str.strip_chars(),
pl.col("store_name").str.strip_chars(),
)
.drop_nulls(["id", "retailer", "lat", "lng"])
.filter(pl.col("retailer").str.len_chars() > 0)
)
@ -1284,6 +1286,14 @@ def transform_grocery_retail_points(
)
df = df.filter(pl.Series(mask))
eligible_retailers = (
df.group_by("retailer")
.len()
.filter(pl.col("len") >= min_chain_locations)
.select("retailer")
)
df = df.join(eligible_retailers, on="retailer", how="semi")
return df.with_columns(
pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
pl.coalesce(["store_name", "fascia", "retailer"])