This commit is contained in:
Andras Schmelczer 2026-06-02 20:14:32 +01:00
parent fbfebc651c
commit aab85fe32e
33 changed files with 2016 additions and 283 deletions

View file

@@ -6,6 +6,10 @@ import polars as pl
from pipeline.utils.england_geometry import in_england_mask
DROP_CATEGORIES = {
# GEOLYTIX Grocery Retail Points is the authoritative supermarket source
# (transform_grocery_retail_points), so drop OSM supermarkets to avoid
# double-counting each store as both a GEOLYTIX brand and an OSM "Supermarket".
"shop/supermarket",
# Street furniture & infrastructure
"amenity/advice",
"amenity/atm",
@@ -364,14 +368,6 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"leisure/yes",
],
),
(
"Groceries",
"Supermarket",
"🛒",
[
"shop/supermarket",
],
),
(
"Groceries",
"Convenience Store",
@@ -1534,6 +1530,14 @@ def transform(
pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
)
# A single OSM object can carry several tag keys that map to the same
# friendly category (e.g. amenity/pharmacy + shop/chemist -> "Pharmacy"),
# which pois.py emits as multiple raw rows sharing one id. Collapse those
# duplicates so they don't inflate downstream proximity counts; rows sharing
# an id with DIFFERENT categories are preserved. Other sources are
# pre-deduplicated.
lf = lf.unique(subset=["id", "category"], keep="first", maintain_order=True)
naptan_df = pl.scan_parquet(naptan_path).collect()
mask = in_england_mask(
boundary_path,