More
Some checks failed
CI / Check (push) Failing after 2m14s
Build and publish Docker image / build-and-push (push) Failing after 2m38s

This commit is contained in:
Andras Schmelczer 2026-05-04 17:21:26 +01:00
parent cd34ee693f
commit 05a1f316e1
58 changed files with 3113 additions and 1277 deletions

View file

@ -1058,10 +1058,91 @@ NAPTAN_EMOJIS: dict[str, str] = {
}
# Retailer names that are all presented under the single "Co-op" display name.
COOP_RETAILERS: set[str] = {
    "Allendale Co-operative Society",
    "Central England Co-operative",
    "Channel Islands Co-operative Society",
    "Chelmsford Star Co-operative Society",
    "Clydebank Co-operative",
    "Coniston Co-operative Society",
    "East of England Co-operative",
    "Heart of England Co-operative",
    "Langdale Co-operative Society",
    "Lincolnshire Co-operative",
    "Midcounties Co-operative",
    "Scottish Midland Co-operative",
    "Tamworth Co-operative Society",
    "The Co-operative Group",
    "The Radstock Co-operative Society",
    "The Southern Co-operative",
}

# Maps a source retailer name to the name shown to users. Retailers that are
# absent from this mapping keep their source name (see the `.get` fallback in
# `normalize_grocery_retailer`).
GROCERY_RETAILER_DISPLAY_NAMES: dict[str, str] = {
    "Cook": "COOK",
    "Heron": "Heron Foods",
    "Marks and Spencer": "M&S",
    "Sainsburys": "Sainsbury's",
} | dict.fromkeys(COOP_RETAILERS, "Co-op")
def normalize_grocery_retailer(retailer: str | None) -> str:
if retailer is None:
return ""
return GROCERY_RETAILER_DISPLAY_NAMES.get(retailer, retailer)
def transform_grocery_retail_points(
    grocery_df: pl.DataFrame,
    boundary_path: Path | None = None,
) -> pl.DataFrame:
    """Convert GEOLYTIX Grocery Retail Points into the POI parquet schema.

    Args:
        grocery_df: Raw retail points. Must contain the columns ``id``,
            ``retailer``, ``fascia``, ``store_name``, ``long_wgs`` and
            ``lat_wgs``.
        boundary_path: Optional boundary file handed to ``in_england_mask``.
            When given (and there are rows left to test), only the rows
            selected by that mask are kept.

    Returns:
        A frame with exactly the columns ``id``, ``name``, ``category``,
        ``group``, ``lat``, ``lng`` and ``emoji``.

    Raises:
        ValueError: If any required column is missing from ``grocery_df``.
    """
    required = {"id", "retailer", "fascia", "store_name", "long_wgs", "lat_wgs"}
    missing = required - set(grocery_df.columns)
    if missing:
        raise ValueError(
            f"GEOLYTIX retail points missing columns: {sorted(missing)}"
        )

    df = (
        grocery_df.select(
            pl.col("id").cast(pl.String),
            pl.col("retailer").cast(pl.String),
            pl.col("fascia").cast(pl.String),
            pl.col("store_name").cast(pl.String),
            pl.col("lat_wgs").cast(pl.Float64).alias("lat"),
            pl.col("long_wgs").cast(pl.Float64).alias("lng"),
        )
        # A row needs an id, a retailer and both coordinates to become a POI.
        .drop_nulls(["id", "retailer", "lat", "lng"])
        .filter(pl.col("retailer").str.len_chars() > 0)
    )

    # Skip the boundary test entirely when there is nothing left to test.
    if boundary_path is not None and len(df) > 0:
        mask = in_england_mask(
            boundary_path,
            df["lat"].to_numpy(),
            df["lng"].to_numpy(),
        )
        df = df.filter(pl.Series(mask))

    return df.with_columns(
        # Prefix the source id with "glx-".
        pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
        # First non-null of store name, fascia, retailer; doubled apostrophes
        # are collapsed. `literal=True` avoids compiling the pattern as a regex.
        pl.coalesce(["store_name", "fascia", "retailer"])
        .str.replace_all("''", "'", literal=True)
        .alias("name"),
        # Native dictionary lookup instead of a per-row Python callback:
        # `replace` leaves values absent from the mapping unchanged, which is
        # the same fallback `normalize_grocery_retailer` applies. Null
        # retailers were already dropped above.
        pl.col("retailer").replace(GROCERY_RETAILER_DISPLAY_NAMES).alias("category"),
        pl.lit("Groceries").alias("group"),
        pl.lit("🛒").alias("emoji"),
    ).select("id", "name", "category", "group", "lat", "lng", "emoji")
def transform(
input_path: Path,
naptan_path: Path | None = None,
boundary_path: Path | None = None,
grocery_retail_points_path: Path | None = None,
) -> pl.LazyFrame:
lf = pl.scan_parquet(input_path)
@ -1123,7 +1204,14 @@ def transform(
pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
pl.lit("Public Transport").alias("group"),
)
return pl.concat([lf, naptan], how="diagonal_relaxed")
frames = [lf, naptan]
if grocery_retail_points_path is not None:
grocery_df = pl.read_parquet(grocery_retail_points_path)
grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
frames.append(grocery_pois.lazy())
return pl.concat(frames, how="diagonal_relaxed")
def main():
@ -1142,12 +1230,22 @@ def main():
required=True,
help="England boundary GeoJSON file",
)
parser.add_argument(
"--grocery-retail-points",
type=Path,
help="GEOLYTIX Grocery Retail Points parquet",
)
parser.add_argument(
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
)
args = parser.parse_args()
df = transform(args.input, args.naptan, args.boundary).collect(engine="streaming")
df = transform(
args.input,
args.naptan,
args.boundary,
args.grocery_retail_points,
).collect(engine="streaming")
df.write_parquet(args.output)