idgf

2026-06-02 20:14:32 +01:00 · 2026-06-02 20:14:32 +01:00 · aab85fe32e
commit aab85fe32e
parent fbfebc651c
33 changed files with 2016 additions and 283 deletions
--- a/pipeline/transform/test_transform_poi.py
+++ b/pipeline/transform/test_transform_poi.py
@@ -1,12 +1,115 @@
+import json
+
 import polars as pl

 from pipeline.transform.transform_poi import (
    _load_ofsted_ratings,
    _school_icon_category_expr,
+    transform,
    transform_grocery_retail_points,
 )


+def _write_boundary(tmp_path):
+    """A FeatureCollection whose single feature covers the London-area test
+    coords used by the transform() fixtures, so in_england_mask keeps them."""
+    boundary_path = tmp_path / "england.geojson"  # created under tmp_path below
+    coords = [[-1.0, 51.0], [1.0, 51.0], [1.0, 52.0], [-1.0, 52.0], [-1.0, 51.0]]
+    boundary_path.write_text(  # coords above: closed ring (first point == last)
+        json.dumps(
+            {
+                "type": "FeatureCollection",
+                "features": [
+                    {
+                        "type": "Feature",  # the only feature in the collection
+                        "properties": {},  # no attributes are set on the feature
+                        "geometry": {"type": "Polygon", "coordinates": [coords]},
+                    }
+                ],
+            }
+        )
+    )
+    return boundary_path  # path of the GeoJSON file that was just written
+
+
+def _write_transform_inputs(tmp_path, raw_pois: pl.DataFrame):
+    """Materialise the parquet inputs transform() requires around a given raw
+    OSM POIs frame. NaPTAN / grocery / GIAS / Ofsted are minimal but valid."""
+    input_path = tmp_path / "pois.parquet"  # raw_pois is written here as given
+    raw_pois.write_parquet(input_path)
+
+    naptan_path = tmp_path / "naptan.parquet"  # a single "Rail station" row
+    pl.DataFrame(
+        {
+            "id": ["naptan-1"],
+            "name": ["Test Rail Station"],
+            "category": ["Rail station"],
+            "lat": [51.51],
+            "lng": [-0.13],
+        }
+    ).write_parquet(naptan_path)
+
+    grocery_path = tmp_path / "grocery.parquet"  # five Tesco rows, ids 1..5
+    pl.DataFrame(
+        {
+            "id": list(range(1, 6)),
+            "retailer": ["Tesco"] * 5,
+            "fascia": ["Tesco"] * 5,
+            "store_name": [f"Tesco Test {i}" for i in range(1, 6)],
+            "long_wgs": [-0.14] * 5,  # all five rows share one coordinate
+            "lat_wgs": [51.52] * 5,
+        }
+    ).write_parquet(grocery_path)
+
+    gias_path = tmp_path / "gias.parquet"  # a single school row
+    pl.DataFrame(
+        {
+            "urn": [1001],  # same value as "URN" in the Ofsted frame below
+            "name": ["Test Primary School"],
+            "phase": ["Primary"],
+            "type": ["Community school"],
+            "type_group": ["Local authority maintained schools"],
+            "age_range": ["4–11"],
+            "gender": ["Mixed"],
+            "religious_character": [None],
+            "admissions_policy": ["Comprehensive"],
+            "nursery_provision": ["No"],
+            "sixth_form": ["No"],
+            "capacity": [200],
+            "pupils": [180],
+            "fsm_percent": [12.5],
+            "trust": [None],
+            "address": ["1 Test Street"],
+            "postcode": ["E1 1AA"],
+            "local_authority": ["Test LA"],
+            "website": [None],
+            "telephone": ["02012345678"],
+            "head_name": ["Jane Doe"],
+            "lat": [51.53],
+            "lng": [-0.12],
+        }
+    ).write_parquet(gias_path)
+
+    ofsted_path = tmp_path / "ofsted.parquet"  # a single rating row
+    pl.DataFrame(
+        {
+            "URN": [1001],  # same value as "urn" in the GIAS frame above
+            "Latest OEIF overall effectiveness": ["2"],  # a string, not an int
+            "Ungraded inspection overall outcome": [None],
+        }
+    ).write_parquet(ofsted_path)
+
+    boundary_path = _write_boundary(tmp_path)  # england.geojson, same directory
+    return {  # the tests in this file unpack it as transform(**inputs)
+        "input_path": input_path,
+        "naptan_path": naptan_path,
+        "boundary_path": boundary_path,
+        "grocery_retail_points_path": grocery_path,
+        "gias_path": gias_path,
+        "ofsted_path": ofsted_path,
+    }
+
+
 def test_transform_grocery_retail_points_outputs_chain_categories():
    raw = pl.DataFrame(
        {
@@ -292,3 +395,79 @@ def test_school_icon_category_handles_one_sided_age_ranges():
        "Primary school",
        "School",
    ]
+
+
+def test_transform_dedupes_multi_tag_pois(tmp_path):
+    # One OSM object can carry several tag keys that map to the SAME friendly
+    # category, so pois.py emits one raw row per tag key, all with the SAME id.
+    # Here "amenity/pharmacy" and "shop/chemist" both map to "Pharmacy".
+    raw = pl.DataFrame(
+        {
+            "id": ["n42", "n42"],  # the same id on both raw rows
+            "name": ["Boots", "Boots"],
+            "category": ["amenity/pharmacy", "shop/chemist"],  # rows differ only here
+            "lat": [51.50, 51.50],
+            "lng": [-0.10, -0.10],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)  # dict of input file paths
+
+    out = transform(**inputs).collect()  # presumably a LazyFrame — TODO confirm
+
+    # The largest (id, category) group holds exactly one row: no pair repeats.
+    assert out.group_by("id", "category").len()["len"].max() == 1
+    # The two raw rows for n42 collapse into exactly one "Pharmacy" row.
+    pharmacies = out.filter(
+        (pl.col("id") == "n42") & (pl.col("category") == "Pharmacy")
+    )
+    assert pharmacies.height == 1
+
+
+def test_osm_supermarkets_dropped(tmp_path):
+    # GEOLYTIX is authoritative for supermarkets; an OSM "shop/supermarket" row
+    # must not flow through as a second Groceries/Supermarket pin. A
+    # complementary grocery category (Convenience Store) must still survive.
+    raw = pl.DataFrame(
+        {
+            "id": ["n1", "n2"],
+            "name": ["Some Supermarket", "Corner Shop"],
+            "category": ["shop/supermarket", "shop/convenience"],  # drop / keep
+            "lat": [51.50, 51.51],
+            "lng": [-0.10, -0.11],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)  # dict of input file paths
+
+    out = transform(**inputs).collect()  # presumably a LazyFrame — TODO confirm
+
+    osm_supermarkets = out.filter(  # matches on group + category only, any source
+        (pl.col("group") == "Groceries") & (pl.col("category") == "Supermarket")
+    )
+    assert osm_supermarkets.height == 0
+    # The "shop/convenience" row is expected to surface as "Convenience Store".
+    convenience = out.filter(pl.col("category") == "Convenience Store")
+    assert convenience.height == 1
+
+
+def test_transform_output_unique_per_id_category(tmp_path):
+    # Soundness: the full transform() output has at most one row per
+    # (id, category) pair, counted across every source at once.
+    raw = pl.DataFrame(
+        {
+            "id": ["n42", "n42", "n7", "n8"],  # NOTE(review): only n42 repeats
+            "name": ["Boots", "Boots", "St Mary's", "St Mary's"],
+            "category": [
+                "amenity/pharmacy",
+                "shop/chemist",
+                "amenity/place_of_worship",  # id n7
+                "building/church",  # id n8 — differs from n7; confirm intended
+            ],
+            "lat": [51.50, 51.50, 51.55, 51.55],
+            "lng": [-0.10, -0.10, -0.15, -0.15],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)  # dict of input file paths
+
+    out = transform(**inputs).collect()  # presumably a LazyFrame — TODO confirm
+
+    assert out.group_by("id", "category").len()["len"].max() == 1  # no repeats