try

2026-06-04 22:34:26 +01:00 · 2026-06-04 22:34:26 +01:00 · c938b71904
commit c938b71904
parent 843d14b7ba
13 changed files with 698 additions and 109 deletions
--- a/pipeline/transform/test_transform_poi.py
+++ b/pipeline/transform/test_transform_poi.py
@@ -5,11 +5,72 @@ import polars as pl
 from pipeline.transform.transform_poi import (
    _load_ofsted_ratings,
    _school_icon_category_expr,
+    osm_groceries_colocated_with_geolytix,
    transform,
    transform_grocery_retail_points,
 )


+def test_osm_groceries_colocated_with_geolytix_drops_only_brand_matched_duplicates():
+    # GEOLYTIX is authoritative for its chains. An OSM grocery that sits on top
+    # of a GEOLYTIX store AND carries its brand is the same physical store and
+    # must be dropped; an independent shop at the same spot, and a same-brand
+    # store far from any GEOLYTIX point, must be kept.
+    geolytix = pl.DataFrame(
+        {
+            "category": ["Tesco"],
+            "lat": [51.5000],
+            "lng": [-0.1000],
+        }
+    )
+    osm = pl.DataFrame(
+        {
+            "id": ["dup-brand", "independent", "far-brand"],
+            "name": ["Tesco Express", "Bob's Corner Shop", "Tesco Express"],
+            # Roughly 1 m, 2 m, and 83 km (great-circle) from the GEOLYTIX Tesco.
+            "lat": [51.50001, 51.50002, 52.0],
+            "lng": [-0.10001, -0.1000, -1.0],
+        }
+    )
+
+    drop_ids = osm_groceries_colocated_with_geolytix(osm, geolytix, radius_m=50.0)
+
+    assert drop_ids == ["dup-brand"]  # only the nearby, brand-matched row
+
+
+def test_osm_groceries_colocated_with_geolytix_dedupes_cooperative_spelling():
+    # GEOLYTIX brand "Co-op" tokenises to "coop"; OSM commonly spells it
+    # "The Co-operative Food" -> "cooperative". The alias folds them so the
+    # genuine duplicate is still dropped.
+    geolytix = pl.DataFrame({"category": ["Co-op"], "lat": [53.0], "lng": [-1.5]})
+    osm = pl.DataFrame(
+        {
+            "id": ["coop-dup"],
+            "name": ["The Co-operative Food"],
+            "lat": [53.00001],  # about 1 m north of the GEOLYTIX point
+            "lng": [-1.5],
+        }
+    )
+    assert osm_groceries_colocated_with_geolytix(osm, geolytix, radius_m=50.0) == [
+        "coop-dup"  # dropped despite the differing brand spelling
+    ]
+
+
+def test_osm_groceries_colocated_with_geolytix_handles_empty_inputs():  # either side empty -> []
+    geolytix = pl.DataFrame({"category": ["Tesco"], "lat": [51.5], "lng": [-0.1]})
+    empty = pl.DataFrame(  # zero-row OSM frame that still declares the OSM columns
+        schema={"id": pl.Utf8, "name": pl.Utf8, "lat": pl.Float64, "lng": pl.Float64}
+    )
+    assert osm_groceries_colocated_with_geolytix(empty, geolytix) == []  # no OSM rows, nothing to drop
+    osm = pl.DataFrame(  # would be a brand-matched duplicate if any GEOLYTIX point existed
+        {"id": ["x"], "name": ["Tesco Express"], "lat": [51.5], "lng": [-0.1]}
+    )
+    empty_glx = pl.DataFrame(  # zero-row GEOLYTIX frame that still declares its columns
+        schema={"category": pl.Utf8, "lat": pl.Float64, "lng": pl.Float64}
+    )
+    assert osm_groceries_colocated_with_geolytix(osm, empty_glx) == []  # radius_m omitted in both calls
+
+
 def _write_boundary(tmp_path):
    """A FeatureCollection whose single feature covers the London-area test
    coords used by the transform() fixtures, so in_england_mask keeps them."""
@@ -345,12 +406,7 @@ def test_load_ofsted_ratings_falls_back_to_ungraded_outcome(tmp_path):
        }
    ).write_parquet(ofsted_path)

-    ratings = (
-        _load_ofsted_ratings(ofsted_path)
-        .collect()
-        .sort("urn")
-        .to_dicts()
-    )
+    ratings = _load_ofsted_ratings(ofsted_path).collect().sort("urn").to_dicts()

    assert ratings == [
        {"urn": 1, "ofsted_rating": "Outstanding"},
@@ -384,9 +440,9 @@ def test_school_icon_category_handles_one_sided_age_ranges():
        },
    )

-    categories = df.select(
-        _school_icon_category_expr().alias("category")
-    )["category"].to_list()
+    categories = df.select(_school_icon_category_expr().alias("category"))[
+        "category"
+    ].to_list()

    assert categories == [
        "Nursery school",
@@ -449,6 +505,45 @@ def test_osm_supermarkets_dropped(tmp_path):
    assert convenience.height == 1


+def test_transform_grocery_dedup_drops_only_grocery_aspect(tmp_path):
+    # The _write_transform_inputs fixture seeds 5 GEOLYTIX "Tesco" points at
+    # (51.52, -0.14). An OSM object colocated there carrying "Tesco" in its name
+    # is the same physical store, so its Convenience Store (Groceries) row is a
+    # duplicate and must be dropped — but its NON-grocery aspect (a Post Office
+    # sharing the same OSM id) must survive. An independent shop away from the
+    # GEOLYTIX point keeps its grocery row.
+    raw = pl.DataFrame(
+        {
+            "id": ["n1", "n1", "n2"],  # n1 appears twice: one row per category it carries
+            "name": ["Tesco Express", "Tesco Express", "Corner Shop"],
+            "category": [  # raw tags; the asserts below filter on transform()'s output labels
+                "shop/convenience",
+                "amenity/post_office",
+                "shop/convenience",
+            ],
+            "lat": [51.52, 51.52, 51.40],  # n1 sits exactly on the seeded GEOLYTIX coords
+            "lng": [-0.14, -0.14, -0.05],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+
+    out = transform(**inputs).collect()
+
+    # The colocated, brand-matched grocery row is dropped.
+    n1_grocery = out.filter((pl.col("id") == "n1") & (pl.col("group") == "Groceries"))
+    assert n1_grocery.height == 0
+    # Its non-grocery aspect (Post Office) survives.
+    n1_post_office = out.filter(
+        (pl.col("id") == "n1") & (pl.col("category") == "Post Office")
+    )
+    assert n1_post_office.height == 1
+    # The independent corner shop (no brand, far away) keeps its grocery row.
+    n2_grocery = out.filter(
+        (pl.col("id") == "n2") & (pl.col("category") == "Convenience Store")
+    )
+    assert n2_grocery.height == 1
+
+
 def test_transform_output_unique_per_id_category(tmp_path):
    # Soundness: the full transform() output has at most one row per
    # (id, category) overall, across every source.