More
This commit is contained in:
parent
cd34ee693f
commit
05a1f316e1
58 changed files with 3113 additions and 1277 deletions
59
pipeline/transform/test_transform_poi.py
Normal file
59
pipeline/transform/test_transform_poi.py
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.transform_poi import transform_grocery_retail_points
|
||||
|
||||
|
||||
def test_transform_grocery_retail_points_outputs_chain_categories():
    """Retailers are emitted as POIs whose category is the chain display name."""
    source_columns = {
        "id": [101, 102, 103],
        "retailer": ["Waitrose", "Sainsburys", "The Co-operative Group"],
        "fascia": ["Waitrose", "Sainsbury's Local", "Co-op Food"],
        "store_name": ["Waitrose Test", "Sainsbury''s Test", "Co-op Test"],
        "long_wgs": [-0.141, -0.142, -0.143],
        "lat_wgs": [51.515, 51.516, 51.517],
    }

    result = transform_grocery_retail_points(pl.DataFrame(source_columns))
    observed = result.select("id", "name", "category", "group", "emoji").to_dicts()

    # (id, name, category) per input row; group and emoji are constant.
    expected_rows = [
        ("glx-101", "Waitrose Test", "Waitrose"),
        ("glx-102", "Sainsbury's Test", "Sainsbury's"),
        ("glx-103", "Co-op Test", "Co-op"),
    ]
    expected = [
        {
            "id": poi_id,
            "name": poi_name,
            "category": chain,
            "group": "Groceries",
            "emoji": "🛒",
        }
        for poi_id, poi_name, chain in expected_rows
    ]

    assert observed == expected
|
||||
|
||||
|
||||
def test_transform_grocery_retail_points_drops_invalid_rows():
    """A row whose retailer is an empty string does not appear in the output."""
    source_columns = {
        "id": [101, 102],
        "retailer": ["Waitrose", ""],
        "fascia": ["Waitrose", "Tesco"],
        "store_name": ["Waitrose Test", "Tesco Test"],
        "long_wgs": [-0.141, -0.142],
        "lat_wgs": [51.515, 51.516],
    }

    result = transform_grocery_retail_points(pl.DataFrame(source_columns))
    surviving_categories = result["category"].to_list()

    assert surviving_categories == ["Waitrose"]
|
||||
|
|
@ -1058,10 +1058,91 @@ NAPTAN_EMOJIS: dict[str, str] = {
|
|||
}
|
||||
|
||||
|
||||
# Retailer names that are all presented under the single "Co-op" display name.
COOP_RETAILERS = {
    "Allendale Co-operative Society",
    "Central England Co-operative",
    "Channel Islands Co-operative Society",
    "Chelmsford Star Co-operative Society",
    "Clydebank Co-operative",
    "Coniston Co-operative Society",
    "East of England Co-operative",
    "Heart of England Co-operative",
    "Langdale Co-operative Society",
    "Lincolnshire Co-operative",
    "Midcounties Co-operative",
    "Scottish Midland Co-operative",
    "Tamworth Co-operative Society",
    "The Co-operative Group",
    "The Radstock Co-operative Society",
    "The Southern Co-operative",
}

# Maps a raw retailer name to the name shown as the POI category.
# Retailers absent from this mapping keep their raw name (see the lookup's
# fallback in normalize_grocery_retailer).
GROCERY_RETAILER_DISPLAY_NAMES: dict[str, str] = {
    "Cook": "COOK",
    "Heron": "Heron Foods",
    "Marks and Spencer": "M&S",
    "Sainsburys": "Sainsbury's",
} | dict.fromkeys(COOP_RETAILERS, "Co-op")
|
||||
|
||||
|
||||
def normalize_grocery_retailer(retailer: str | None) -> str:
    """Return the display name for a raw retailer name.

    Looks the name up in ``GROCERY_RETAILER_DISPLAY_NAMES``; a name without
    an entry is returned unchanged, and ``None`` yields an empty string.
    """
    if retailer is not None:
        return GROCERY_RETAILER_DISPLAY_NAMES.get(retailer, retailer)
    return ""
|
||||
|
||||
|
||||
def transform_grocery_retail_points(
    grocery_df: pl.DataFrame,
    boundary_path: Path | None = None,
) -> pl.DataFrame:
    """Convert GEOLYTIX Grocery Retail Points into the POI parquet schema.

    Args:
        grocery_df: Raw retail points. Must contain the columns ``id``,
            ``retailer``, ``fascia``, ``store_name``, ``long_wgs`` and
            ``lat_wgs``.
        boundary_path: Optional boundary file passed to ``in_england_mask``.
            When given, only rows for which that mask is true are kept.

    Returns:
        A frame with the columns ``id`` (prefixed with ``glx-``), ``name``,
        ``category``, ``group``, ``lat``, ``lng`` and ``emoji``.

    Raises:
        ValueError: If any of the required columns is missing.
    """
    required = {"id", "retailer", "fascia", "store_name", "long_wgs", "lat_wgs"}
    missing = required - set(grocery_df.columns)
    if missing:
        raise ValueError(
            f"GEOLYTIX retail points missing columns: {sorted(missing)}"
        )

    df = (
        grocery_df.select(
            pl.col("id").cast(pl.String),
            pl.col("retailer").cast(pl.String),
            pl.col("fascia").cast(pl.String),
            pl.col("store_name").cast(pl.String),
            pl.col("lat_wgs").cast(pl.Float64).alias("lat"),
            pl.col("long_wgs").cast(pl.Float64).alias("lng"),
        )
        # A POI needs an identifier, a retailer and a position.
        .drop_nulls(["id", "retailer", "lat", "lng"])
        # Empty-string retailers are treated like missing ones.
        .filter(pl.col("retailer").str.len_chars() > 0)
    )

    # Skip the mask call entirely for an empty frame.
    if boundary_path is not None and len(df) > 0:
        mask = in_england_mask(
            boundary_path,
            df["lat"].to_numpy(),
            df["lng"].to_numpy(),
        )
        df = df.filter(pl.Series(mask))

    return df.with_columns(
        pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
        # Prefer the store name, falling back to fascia, then retailer.
        # Doubled single quotes in the source data are collapsed to one.
        pl.coalesce(["store_name", "fascia", "retailer"])
        .str.replace_all("''", "'", literal=True)
        .alias("name"),
        # Native mapping lookup instead of a per-row Python callback:
        # retailers without an entry keep their raw name, which is the same
        # result as GROCERY_RETAILER_DISPLAY_NAMES.get(retailer, retailer).
        pl.col("retailer")
        .replace(GROCERY_RETAILER_DISPLAY_NAMES)
        .alias("category"),
        pl.lit("Groceries").alias("group"),
        pl.lit("🛒").alias("emoji"),
    ).select("id", "name", "category", "group", "lat", "lng", "emoji")
|
||||
|
||||
|
||||
def transform(
|
||||
input_path: Path,
|
||||
naptan_path: Path | None = None,
|
||||
boundary_path: Path | None = None,
|
||||
grocery_retail_points_path: Path | None = None,
|
||||
) -> pl.LazyFrame:
|
||||
lf = pl.scan_parquet(input_path)
|
||||
|
||||
|
|
@ -1123,7 +1204,14 @@ def transform(
|
|||
pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
|
||||
pl.lit("Public Transport").alias("group"),
|
||||
)
|
||||
return pl.concat([lf, naptan], how="diagonal_relaxed")
|
||||
|
||||
frames = [lf, naptan]
|
||||
if grocery_retail_points_path is not None:
|
||||
grocery_df = pl.read_parquet(grocery_retail_points_path)
|
||||
grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
|
||||
frames.append(grocery_pois.lazy())
|
||||
|
||||
return pl.concat(frames, how="diagonal_relaxed")
|
||||
|
||||
|
||||
def main():
|
||||
|
|
@ -1142,12 +1230,22 @@ def main():
|
|||
required=True,
|
||||
help="England boundary GeoJSON file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--grocery-retail-points",
|
||||
type=Path,
|
||||
help="GEOLYTIX Grocery Retail Points parquet",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
df = transform(args.input, args.naptan, args.boundary).collect(engine="streaming")
|
||||
df = transform(
|
||||
args.input,
|
||||
args.naptan,
|
||||
args.boundary,
|
||||
args.grocery_retail_points,
|
||||
).collect(engine="streaming")
|
||||
|
||||
df.write_parquet(args.output)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue