import polars as pl
import pytest

from pipeline.download.naptan import (
    canonical_station_name,
    canonical_station_name_expr,
    deduplicate_naptan,
)


def test_canonical_station_name_expr_normalizes_transport_suffixes():
    """Check the expression and scalar canonicalisers agree on station keys.

    The expected keys below are lowercase, without the "Underground Station",
    "DLR Station" and "Tube Station" suffixes, without the parenthesised
    tramway qualifier, and without the apostrophe in "Earl's".
    """
    raw_names = [
        "Bank",
        "Bank Underground Station",
        "Bank DLR Station",
        "Pleasure Beach (Blackpool Tramway)",
        "Earl's Court Tube Station",
    ]
    expected_keys = [
        "bank",
        "bank",
        "bank",
        "pleasure beach",
        "earls court",
    ]

    frame = pl.DataFrame({"name": raw_names})
    key_column = frame.select(canonical_station_name_expr().alias("key"))
    expr_keys = key_column.get_column("key").to_list()

    assert expr_keys == expected_keys
    # The plain-Python helper must produce the same keys as the expression.
    scalar_keys = list(map(canonical_station_name, raw_names))
    assert scalar_keys == expr_keys


def test_deduplicate_naptan_merges_tube_station_variants_by_area():
    """Check which tube-station rows are merged and which are kept apart.

    Fixture: three "Bank" name variants around lat 51.513, a fourth
    "Bank Underground Station" row at lat 55.014, and two identically named
    "Central Tube Station" rows (no locality) at lat 51.5 and 53.0.
    Expected: four rows survive, and the row named "Bank" carries the mean
    latitude of the three nearby variants.
    """
    stations = pl.DataFrame(
        {
            "id": [
                "bank",
                "bank-lu",
                "bank-dlr",
                "other-bank",
                "central-a",
                "central-b",
            ],
            "name": [
                "Bank",
                "Bank Underground Station",
                "Bank DLR Station",
                "Bank Underground Station",
                "Central Tube Station",
                "Central Tube Station",
            ],
            "category": ["Tube station"] * 6,
            "lat": [51.5129, 51.5134, 51.5132, 55.0140, 51.5, 53.0],
            "lng": [-0.0889, -0.0890, -0.0885, -1.6781, -0.1, -2.0],
            "locality": ["LOC1", "LOC1", "LOC2", "LOC1", None, None],
        }
    )

    # Sort by latitude so the order of the surviving rows is deterministic.
    deduped = deduplicate_naptan(stations).sort("lat")

    assert len(deduped) == 4

    surviving_names = deduped["name"].to_list()
    assert surviving_names == [
        "Central Tube Station",
        "Bank",
        "Central Tube Station",
        "Bank Underground Station",
    ]

    bank_rows = deduped.filter(pl.col("name") == "Bank")
    mean_bank_lat = (51.5129 + 51.5134 + 51.5132) / 3
    assert bank_rows["lat"][0] == pytest.approx(mean_bank_lat)


def test_deduplicate_naptan_does_not_merge_missing_locality_bus_stops():
    """Check two same-named bus stops with no locality both survive."""
    bus_stops = pl.DataFrame(
        {
            "id": ["a", "b"],
            "name": ["High Street", "High Street"],
            "category": ["Bus stop", "Bus stop"],
            "lat": [51.5, 52.5],
            "lng": [-0.1, -1.1],
            "locality": [None, None],
        }
    )

    deduped = deduplicate_naptan(bus_stops)

    assert len(deduped) == 2