try

2026-06-04 22:34:26 +01:00 · 2026-06-04 22:34:26 +01:00 · c938b71904
commit c938b71904
parent 843d14b7ba
13 changed files with 698 additions and 109 deletions
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -1,6 +1,7 @@
 import argparse
 from pathlib import Path

+import numpy as np
 import polars as pl

 from pipeline.utils.england_geometry import in_england_mask
@ -955,7 +956,6 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
    # Note: schools come from the GIAS register (see transform_gias_schools).
    # Niche/tertiary education amenities that GIAS does not cover are dropped
    # rather than mixed in with state-funded schools.
-
    (
        "Local Businesses",
        "Hotel",
@ -1441,38 +1441,128 @@ def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
    # category mirrors icon_category so the dashboard renders one toggle per
    # school type (Nursery / Primary / Secondary / Sixth form / University /…)
    # instead of bundling every GIAS row under a single "School" pill.
-    return pl.scan_parquet(gias_path).join(ofsted, on="urn", how="left").select(
-        pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
-        pl.col("name"),
-        icon_category_expr.alias("category"),
-        icon_category_expr.alias("icon_category"),
-        pl.lit("Education").alias("group"),
-        pl.col("lat").cast(pl.Float64),
-        pl.col("lng").cast(pl.Float64),
-        emoji_expr.alias("emoji"),
-        pl.col("phase").alias("school_phase"),
-        pl.col("type").alias("school_type"),
-        pl.col("type_group").alias("school_type_group"),
-        pl.col("age_range").alias("school_age_range"),
-        pl.col("gender").alias("school_gender"),
-        pl.col("religious_character").alias("school_religious_character"),
-        pl.col("admissions_policy").alias("school_admissions_policy"),
-        pl.col("nursery_provision").alias("school_nursery_provision"),
-        pl.col("sixth_form").alias("school_sixth_form"),
-        pl.col("capacity").cast(pl.Int32, strict=False).alias("school_capacity"),
-        pl.col("pupils").cast(pl.Int32, strict=False).alias("school_pupils"),
-        pl.col("fsm_percent").cast(pl.Float32, strict=False).alias("school_fsm_percent"),
-        pl.col("trust").alias("school_trust"),
-        pl.col("address").alias("school_address"),
-        pl.col("postcode").alias("school_postcode"),
-        pl.col("local_authority").alias("school_local_authority"),
-        pl.col("website").alias("school_website"),
-        pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
-        pl.col("head_name").alias("school_head_name"),
-        pl.col("ofsted_rating").alias("school_ofsted_rating"),
+    return (
+        pl.scan_parquet(gias_path)
+        .join(ofsted, on="urn", how="left")
+        .select(
+            pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
+            pl.col("name"),
+            icon_category_expr.alias("category"),
+            icon_category_expr.alias("icon_category"),
+            pl.lit("Education").alias("group"),
+            pl.col("lat").cast(pl.Float64),
+            pl.col("lng").cast(pl.Float64),
+            emoji_expr.alias("emoji"),
+            pl.col("phase").alias("school_phase"),
+            pl.col("type").alias("school_type"),
+            pl.col("type_group").alias("school_type_group"),
+            pl.col("age_range").alias("school_age_range"),
+            pl.col("gender").alias("school_gender"),
+            pl.col("religious_character").alias("school_religious_character"),
+            pl.col("admissions_policy").alias("school_admissions_policy"),
+            pl.col("nursery_provision").alias("school_nursery_provision"),
+            pl.col("sixth_form").alias("school_sixth_form"),
+            pl.col("capacity").cast(pl.Int32, strict=False).alias("school_capacity"),
+            pl.col("pupils").cast(pl.Int32, strict=False).alias("school_pupils"),
+            pl.col("fsm_percent")
+            .cast(pl.Float32, strict=False)
+            .alias("school_fsm_percent"),
+            pl.col("trust").alias("school_trust"),
+            pl.col("address").alias("school_address"),
+            pl.col("postcode").alias("school_postcode"),
+            pl.col("local_authority").alias("school_local_authority"),
+            pl.col("website").alias("school_website"),
+            pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
+            pl.col("head_name").alias("school_head_name"),
+            pl.col("ofsted_rating").alias("school_ofsted_rating"),
+        )
    )


+# OSM convenience-format stores that GEOLYTIX also covers (Tesco Express,
+# Sainsbury's Local, Co-op Food, Morrisons Daily, Spar, ...) would otherwise be
+# counted twice: once as a GEOLYTIX brand row and once as an OSM "Convenience
+# Store". GEOLYTIX is authoritative for its chains, so an OSM grocery row that
+# sits on top of a GEOLYTIX point AND carries that point's brand name is the
+# same physical store and is dropped. Independent corner shops never carry a
+# chain brand, so they are kept.
+#
+# This is the default match radius, in metres, for
+# osm_groceries_colocated_with_geolytix().
+GROCERY_DEDUP_RADIUS_M: float = 50.0
+
+# Brand-token aliases so an OSM name spelt differently from the GEOLYTIX brand
+# still matches. GEOLYTIX's "Co-op" tokenises to "coop", but OSM frequently
+# spells it "The Co-operative Food" -> "cooperative"; without this, ~300+ genuine
+# Co-op duplicates would survive. Keys/values are post-strip (alnum-only) tokens.
+# Maps the variant token (key) to the canonical token (value); applied by
+# _significant_tokens() to every token it emits.
+_GROCERY_TOKEN_ALIASES: dict[str, str] = {
+    "cooperative": "coop",
+    "cooperatives": "coop",
+}
+
+
+def _significant_tokens(name: str | None) -> set[str]:
+    """Return the aliased, lower-cased brand tokens of a POI name.
+
+    Each whitespace-separated word is reduced to its alphanumeric characters;
+    words left with fewer than 3 characters are discarded and the rest are
+    mapped through ``_GROCERY_TOKEN_ALIASES``. A missing or empty name gives
+    an empty set.
+    """
+    if not name:
+        return set()
+    words = str(name).lower().split()
+    # Punctuation is removed rather than split on, so "Co-op" becomes "coop".
+    cleaned = ("".join(filter(str.isalnum, word)) for word in words)
+    return {
+        _GROCERY_TOKEN_ALIASES.get(word, word) for word in cleaned if len(word) >= 3
+    }
+
+
+def osm_groceries_colocated_with_geolytix(
+    osm_groceries: pl.DataFrame,
+    geolytix: pl.DataFrame,
+    radius_m: float = GROCERY_DEDUP_RADIUS_M,
+) -> list[str]:
+    """Find the OSM grocery rows that are the same store as a GEOLYTIX point.
+
+    A row is reported when some GEOLYTIX point within ``radius_m`` metres has
+    a non-empty set of brand tokens (taken from its ``category``, e.g. "Tesco"
+    or "Co-op") that is wholly contained in the tokens of the OSM ``name``.
+    A brand that yields no token of 3+ characters (e.g. "M&S") can therefore
+    never match, and an OSM row whose name yields no tokens is always kept.
+
+    Args:
+        osm_groceries: Frame with columns ``id``, ``name``, ``lat``, ``lng``.
+        geolytix: Frame with columns ``category`` (the brand), ``lat``, ``lng``.
+        radius_m: Maximum planar distance, in metres, for two points to count
+            as colocated.
+
+    Returns:
+        The ``id`` of every duplicate OSM row, in input row order; an empty
+        list when either frame has no rows.
+    """
+    if osm_groceries.is_empty() or geolytix.is_empty():
+        return []
+
+    # Imported lazily so scipy is only needed when there is work to do.
+    from scipy.spatial import cKDTree
+
+    def _lat_lng(frame: pl.DataFrame) -> tuple[np.ndarray, np.ndarray]:
+        """Pull the ``lat`` and ``lng`` columns out as float arrays."""
+        return (
+            frame["lat"].to_numpy().astype(float),
+            frame["lng"].to_numpy().astype(float),
+        )
+
+    store_lat, store_lng = _lat_lng(geolytix)
+    store_brands = [_significant_tokens(b) for b in geolytix["category"].to_list()]
+
+    shop_lat, shop_lng = _lat_lng(osm_groceries)
+    shop_ids = osm_groceries["id"].to_list()
+    shop_names = [_significant_tokens(n) for n in osm_groceries["name"].to_list()]
+
+    # Equirectangular projection to metres: one cos(latitude) factor, taken at
+    # the mean latitude of both point sets, scales every longitude.
+    mean_lat = float(np.mean(np.concatenate([store_lat, shop_lat])))
+    cos_lat = float(np.cos(np.radians(mean_lat)))
+
+    def _to_metres(lat: np.ndarray, lng: np.ndarray) -> np.ndarray:
+        """Project degrees to an (n, 2) array of planar x/y metres."""
+        return np.column_stack([lng * cos_lat * 111_320.0, lat * 110_540.0])
+
+    store_xy = _to_metres(store_lat, store_lng)
+    shop_xy = _to_metres(shop_lat, shop_lng)
+
+    # For each OSM row: indices of the GEOLYTIX points within the radius.
+    nearby_stores = cKDTree(store_xy).query_ball_point(shop_xy, r=radius_m)
+
+    def _is_duplicate(row: int, candidates: list[int]) -> bool:
+        """True when any nearby store's brand tokens all occur in the name."""
+        name_tokens = shop_names[row]
+        if not name_tokens:
+            return False
+        return any(
+            store_brands[c] and store_brands[c] <= name_tokens for c in candidates
+        )
+
+    return [
+        shop_ids[row]
+        for row, candidates in enumerate(nearby_stores)
+        if _is_duplicate(row, candidates)
+    ]
+
+
 def transform(
    input_path: Path,
    naptan_path: Path,
@ -1553,6 +1643,27 @@ def transform(

    grocery_df = pl.read_parquet(grocery_retail_points_path)
    grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
+
+    # Drop OSM grocery rows that duplicate a GEOLYTIX store (same brand,
+    # colocated) so a Tesco Express / Co-op / Spar isn't counted twice.
+    osm_groceries = (
+        lf.filter(pl.col("group") == "Groceries")
+        .select("id", "name", "lat", "lng")
+        .collect(engine="streaming")
+    )
+    duplicate_ids = osm_groceries_colocated_with_geolytix(osm_groceries, grocery_pois)
+    if duplicate_ids:
+        print(
+            f"Dropping {len(duplicate_ids):,} OSM grocery POIs that duplicate a "
+            "GEOLYTIX store"
+        )
+        # Scope the drop to the Groceries group: a single OSM object can also
+        # carry a non-grocery aspect (e.g. a convenience store that is also a
+        # Post Office), which must survive — only its duplicate grocery row goes.
+        lf = lf.filter(
+            ~((pl.col("group") == "Groceries") & pl.col("id").is_in(duplicate_ids))
+        )
+
    frames = [
        lf,
        naptan,