Fix data pipelines once and for all

2026-06-10 21:27:32 +01:00 · 2026-06-10 21:27:32 +01:00 · 4012e4e047
commit 4012e4e047
parent 08560476c5
46 changed files with 4508 additions and 855 deletions
--- a/pipeline/transform/test_transform_poi.py
+++ b/pipeline/transform/test_transform_poi.py
@@ -544,6 +544,142 @@ def test_transform_grocery_dedup_drops_only_grocery_aspect(tmp_path):
     assert n2_grocery.height == 1


+def test_transform_drops_miscategorised_tags(tmp_path):
+    # Audit 2026-06-10: these tags polluted Entertainment (cycle-hire docks,
+    # slipways, marinas), Gallery (public artwork), Pharmacy (herbalists,
+    # alternative medicine), Hospital & Clinic (untyped healthcare/yes),
+    # Tourist Attraction (fountains, courthouses) and Gym & Fitness (outdoor
+    # apparatus). They must be dropped entirely.
+    dropped = [
+        "amenity/bicycle_rental",
+        "amenity/boat_rental",
+        "leisure/marina",
+        "leisure/slipway",
+        "tourism/artwork",
+        "healthcare/yes",
+        "healthcare/alternative",
+        "shop/herbalist",
+        "shop/health",
+        "amenity/fountain",
+        "amenity/courthouse",
+        "leisure/fitness_station",
+    ]
+    raw = pl.DataFrame(
+        {
+            "id": [f"n{i}" for i in range(len(dropped))],
+            "name": [f"POI {i}" for i in range(len(dropped))],
+            "category": dropped,
+            "lat": [51.50] * len(dropped),
+            "lng": [-0.10] * len(dropped),
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+
+    out = transform(**inputs).collect()
+
+    assert out.filter(pl.col("id").is_in(raw["id"].to_list())).height == 0
+
+
+def test_transform_splits_hospital_and_clinic(tmp_path):
+    raw = pl.DataFrame(
+        {
+            "id": ["n1", "n2", "n3"],
+            "name": ["St Thomas'", "Vale Surgery Annexe", "Drop-in Centre"],
+            "category": [
+                "amenity/hospital",
+                "amenity/clinic",
+                "healthcare/clinic",
+            ],
+            "lat": [51.50, 51.51, 51.52],
+            "lng": [-0.10, -0.11, -0.12],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+
+    out = transform(**inputs).collect()
+
+    assert out.filter(pl.col("id") == "n1")["category"].to_list() == ["Hospital"]
+    assert out.filter(pl.col("id") == "n2")["category"].to_list() == ["Clinic"]
+    assert out.filter(pl.col("id") == "n3")["category"].to_list() == ["Clinic"]
+    assert "Hospital & Clinic" not in out["category"].to_list()
+
+
+def test_transform_maps_chalet_to_hotel(tmp_path):
+    # Holiday-let chalets are accommodation, not Tourist Attractions.
+    raw = pl.DataFrame(
+        {
+            "id": ["n1"],
+            "name": ["Seaview Chalet"],
+            "category": ["tourism/chalet"],
+            "lat": [51.50],
+            "lng": [-0.10],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+
+    out = transform(**inputs).collect()
+
+    assert out.filter(pl.col("id") == "n1")["category"].to_list() == ["Hotel"]
+
+
+def test_transform_name_gates_track_horse_riding_fishing(tmp_path):
+    # leisure/track, leisure/horse_riding and leisure/fishing are 83-84%
+    # unnamed (anonymous tracks/gallops/fishing spots); only named public
+    # facilities survive as a Sports Centre.
+    raw = pl.DataFrame(
+        {
+            "id": ["n1", "n2", "n3", "n4"],
+            "name": [None, "", "Herne Hill Velodrome", "Royal Mews Riding School"],
+            "category": [
+                "leisure/track",
+                "leisure/fishing",
+                "leisure/track",
+                "leisure/horse_riding",
+            ],
+            "lat": [51.50, 51.51, 51.52, 51.53],
+            "lng": [-0.10, -0.11, -0.12, -0.13],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+
+    out = transform(**inputs).collect()
+
+    assert out.filter(pl.col("id").is_in(["n1", "n2"])).height == 0
+    named = out.filter(pl.col("id").is_in(["n3", "n4"]))
+    assert named["category"].to_list() == ["Sports Centre", "Sports Centre"]
+
+
+def test_transform_passes_through_tram_metro_naptan_category(tmp_path):
+    # NaPTAN now emits "Tram & Metro stop" (non-LU TMU/MET networks); it must
+    # flow through with the Public Transport group and its own emoji.
+    raw = pl.DataFrame(
+        {
+            "id": ["n1"],
+            "name": ["A Cafe"],
+            "category": ["amenity/cafe"],
+            "lat": [51.50],
+            "lng": [-0.10],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+    pl.DataFrame(
+        {
+            "id": ["naptan-1", "naptan-2"],
+            "name": ["Test Rail Station", "Weaste"],
+            "category": ["Rail station", "Tram & Metro stop"],
+            "lat": [51.51, 51.52],
+            "lng": [-0.13, -0.14],
+        }
+    ).write_parquet(inputs["naptan_path"])
+
+    out = transform(**inputs).collect()
+
+    tram = out.filter(pl.col("category") == "Tram & Metro stop")
+    assert tram.height == 1
+    assert tram["group"].to_list() == ["Public Transport"]
+    assert tram["emoji"].to_list() == ["🚊"]
+
+
 def test_transform_output_unique_per_id_category(tmp_path):
     # Soundness: the full transform() output has at most one row per
     # (id, category) overall, across every source.