"""Tests for NaPTAN name canonicalisation, status filtering and deduplication."""

import polars as pl
import pytest

from pipeline.download.naptan import (
    TRAM_METRO_CATEGORY,
    TUBE_STATION_CATEGORY,
    canonical_station_name,
    canonical_station_name_expr,
    deduplicate_naptan,
    filter_active_stops,
)


def test_canonical_station_name_expr_normalizes_transport_suffixes():
    """The expression and the scalar helper produce the same suffix-free keys."""
    names = [
        "Bank",
        "Bank Underground Station",
        "Bank DLR Station",
        "Pleasure Beach (Blackpool Tramway)",
        "Earl's Court Tube Station",
    ]
    expected_keys = [
        "bank",
        "bank",
        "bank",
        "pleasure beach",
        "earls court",
    ]

    frame = pl.DataFrame({"name": names})
    keyed = frame.select(canonical_station_name_expr().alias("key"))
    expr_keys = keyed["key"].to_list()

    assert expr_keys == expected_keys
    # The scalar helper must agree with the expression row for row.
    assert [canonical_station_name(name) for name in names] == expr_keys


def test_canonical_station_name_strips_entrance_suffixes():
    """Entrance and bay/stand suffixes collapse onto the station's own key.

    The entrance names are real shipped NaPTAN names that previously failed to
    merge with their station node (79 stray entrance POIs).
    """
    expected_by_name = {
        "Weaste Metrolink Station North East Entrance": "weaste",
        "Weaste Metrolink Station North Entrance No 2": "weaste",
        "Whitefield Metrolink Station Main Entrance": "whitefield",
        "Radcliffe Metrolink Station Entrance": "radcliffe",
        "Stretford Metrolink Station Wt Platform Entrance": "stretford",
        "Salford Quays Metrolink Station SW entrance": "salford quays",
        "Bank Station Ent 2": "bank",
        "Hainault": "hainault",
        # The Metrolink MET node names collapse to the same key.
        "Weaste (Manchester Metrolink)": "weaste",
        # No entrance word: direction/filler words must NOT be stripped.
        "Maze Hill North": "maze hill north",
        "Bus Station Entrance": "bus",
        # Bus-station bay/stand designators collapse to the station name…
        "Tonypandy Bus Station Stand A3": "tonypandy bus",
        "Caerphilly Interchange Stand 5": "caerphilly interchange",
        "Stanley Bus Station Stand G": "stanley bus",
        # …but a bare trailing "Bay" (place names) is untouched.
        "Colwyn Bay": "colwyn bay",
    }

    # Scalar helper first, one name at a time, so a failure names the offender.
    for raw_name, expected_key in expected_by_name.items():
        assert canonical_station_name(raw_name) == expected_key, raw_name

    # The expression form must yield the same keys in the same order.
    frame = pl.DataFrame({"name": list(expected_by_name)})
    keyed = frame.select(canonical_station_name_expr().alias("key"))
    assert keyed["key"].to_list() == list(expected_by_name.values())


def test_filter_active_stops_drops_non_active():
    """Only "active" and null statuses pass the filter."""
    stops = pl.DataFrame(
        {
            "ATCOCode": ["a", "b", "c", "d"],
            "Status": ["active", "inactive", None, "Pending"],
        }
    )

    kept = filter_active_stops(stops)

    # Active and unknown (null) statuses survive; inactive/pending are dropped.
    assert kept["ATCOCode"].to_list() == ["a", "c"]


def test_filter_active_stops_tolerates_missing_status_column():
    """A frame with no Status column keeps its single row."""
    stops = pl.DataFrame({"ATCOCode": ["a"]})

    kept = filter_active_stops(stops)

    assert kept["ATCOCode"].to_list() == ["a"]


def test_deduplicate_naptan_splits_london_underground_from_tram_metro():
    """A group holding a London Underground node becomes a "Tube station".

    MET station nodes plus TMU entrances, pre-categorised as the tram/metro
    family. The Hainault group contains a 940GZZLU station node, so the merged
    POI is a genuine "Tube station" even though its entrance carries a
    non-ZZLU ATCO code; the Metrolink group stays "Tram & Metro stop".
    """
    stops = pl.DataFrame(
        {
            "id": [
                "940GZZLUHLT",
                "490000095003",
                "9400ZZMAWST",
                "1800NFR2691",
            ],
            "name": [
                "Hainault Underground Station",
                "Hainault",
                "Weaste (Manchester Metrolink)",
                "Weaste Metrolink Station North West Entrance",
            ],
            "category": [TRAM_METRO_CATEGORY] * 4,
            "lat": [51.6034, 51.6037, 53.4826, 53.4826],
            "lng": [0.0933, 0.0931, -2.3087, -2.3086],
            "locality": [None, None, None, None],
            "entrance": [False, True, False, True],
            "is_lu": [True, False, False, False],
        }
    )

    merged = deduplicate_naptan(stops).sort("category")

    assert len(merged) == 2
    assert merged["category"].to_list() == [
        TRAM_METRO_CATEGORY,
        TUBE_STATION_CATEGORY,
    ]

    # The station node (not the entrance) represents the merged POI.
    tube_rows = merged.filter(pl.col("category") == TUBE_STATION_CATEGORY)
    assert tube_rows["id"][0] == "940GZZLUHLT"

    tram_rows = merged.filter(pl.col("category") == TRAM_METRO_CATEGORY)
    assert tram_rows["id"][0] == "9400ZZMAWST"


def test_deduplicate_naptan_merges_bus_station_bays_and_entrances():
    """Bays and an entrance of one bus station merge into a single POI.

    BCS bays and a BCE entrance of one bus station collapse to a single POI
    represented by a non-entrance node; a different bus station in another
    area survives separately.
    """
    stops = pl.DataFrame(
        {
            "id": ["bay-1", "bay-2", "ent-1", "other"],
            "name": [
                "Bury Interchange",
                "Bury Interchange",
                "Bury Interchange East Entrance",
                "Rochdale Interchange",
            ],
            "category": ["Bus station"] * 4,
            "lat": [53.5907, 53.5908, 53.5909, 53.6160],
            "lng": [-2.2958, -2.2957, -2.2956, -2.1561],
            "locality": ["BURY", "BURY", "BURY", "ROCHDALE"],
            "entrance": [False, False, True, False],
        }
    )

    merged = deduplicate_naptan(stops).sort("name")

    assert merged["name"].to_list() == ["Bury Interchange", "Rochdale Interchange"]
    bury_rows = merged.filter(pl.col("name") == "Bury Interchange")
    assert bury_rows["id"][0] == "bay-1"


def test_deduplicate_naptan_merges_tube_station_variants_by_area():
    """Nearby Bank variants merge into one averaged POI; far-apart namesakes stay."""
    stops = pl.DataFrame(
        {
            "id": [
                "bank",
                "bank-lu",
                "bank-dlr",
                "other-bank",
                "central-a",
                "central-b",
            ],
            "name": [
                "Bank",
                "Bank Underground Station",
                "Bank DLR Station",
                "Bank Underground Station",
                "Central Tube Station",
                "Central Tube Station",
            ],
            "category": ["Tube station"] * 6,
            "lat": [51.5129, 51.5134, 51.5132, 55.0140, 51.5, 53.0],
            "lng": [-0.0889, -0.0890, -0.0885, -1.6781, -0.1, -2.0],
            "locality": ["LOC1", "LOC1", "LOC2", "LOC1", None, None],
        }
    )

    merged = deduplicate_naptan(stops).sort("lat")

    assert len(merged) == 4
    assert merged["name"].to_list() == [
        "Central Tube Station",
        "Bank",
        "Central Tube Station",
        "Bank Underground Station",
    ]

    # The merged Bank POI sits at the mean latitude of its three members.
    mean_bank_lat = (51.5129 + 51.5134 + 51.5132) / 3
    bank_rows = merged.filter(pl.col("name") == "Bank")
    assert bank_rows["lat"][0] == pytest.approx(mean_bank_lat)


def test_deduplicate_naptan_keeps_distinct_stations_with_conflicting_qualifiers():
    """Conflicting parenthetical qualifiers keep nearby stations apart.

    The two Edgware Road stations are ~150m apart and differ only by the
    parenthetical line name, which the canonical key strips. Conflicting
    parentheticals must block the area merge; an unqualified entrance row can
    still join either group.
    """
    stops = pl.DataFrame(
        {
            "id": ["bakerloo", "circle", "entrance"],
            "name": [
                "Edgware Road (Bakerloo) Underground Station",
                "Edgware Road (Circle/District) Underground Station",
                "Edgware Road Underground Station",
            ],
            "category": ["Tube station"] * 3,
            "lat": [51.5204, 51.5199, 51.5203],
            "lng": [-0.1700, -0.1679, -0.1701],
            "locality": ["LOC1"] * 3,
        }
    )

    merged = deduplicate_naptan(stops).sort("lng")

    assert len(merged) == 2
    assert merged["name"].to_list() == [
        "Edgware Road (Bakerloo) Underground Station",
        "Edgware Road (Circle/District) Underground Station",
    ]
    # The unqualified entrance merged into the Bakerloo group (averaged lat).
    bakerloo_lat = merged["lat"][0]
    assert bakerloo_lat == pytest.approx((51.5204 + 51.5203) / 2)


def test_deduplicate_naptan_does_not_merge_missing_locality_bus_stops():
    """Same-named bus stops with no locality, a degree apart, stay separate."""
    stops = pl.DataFrame(
        {
            "id": ["a", "b"],
            "name": ["High Street", "High Street"],
            "category": ["Bus stop", "Bus stop"],
            "lat": [51.5, 52.5],
            "lng": [-0.1, -1.1],
            "locality": [None, None],
        }
    )

    merged = deduplicate_naptan(stops)

    assert len(merged) == 2


def test_deduplicate_naptan_merges_colocated_missing_locality_bus_stations():
    """Co-located bus-station records without a locality merge into one POI.

    Two NaPTAN records for the same bus station with no locality, co-located
    within the merge area, are a true duplicate and collapse to one POI.
    """
    stops = pl.DataFrame(
        {
            "id": ["a", "b"],
            "name": ["Victoria Bus Station", "Victoria Bus Station"],
            "category": ["Bus station", "Bus station"],
            "lat": [51.4952, 51.4953],
            "lng": [-0.1441, -0.1440],
            "locality": [None, None],
        }
    )

    merged = deduplicate_naptan(stops)

    assert len(merged) == 1
    assert merged["name"][0] == "Victoria Bus Station"
    assert merged["category"][0] == "Bus station"
    assert merged["lat"][0] == pytest.approx((51.4952 + 51.4953) / 2)


def test_deduplicate_naptan_keeps_rail_station_with_only_station_node():
    """A rail station whose only record is its station node is kept as-is."""
    # Aberdare's only NaPTAN record is an RLY station node (StopType "RLY").
    stops = pl.DataFrame(
        {
            "id": ["aberdare-rly"],
            "name": ["Aberdare Rail Station"],
            "category": ["Rail station"],
            "lat": [51.7155],
            "lng": [-3.4438],
            "locality": ["ABERDARE"],
            "entrance": [False],
        }
    )

    merged = deduplicate_naptan(stops)

    assert len(merged) == 1
    assert merged["name"][0] == "Aberdare Rail Station"
    assert merged["category"][0] == "Rail station"


def test_deduplicate_naptan_merges_rail_entrances_into_station_node():
    """Rail entrances fold into their station node.

    A station node (RLY) and its two entrance nodes (RSE) collapse to a single
    "Rail station" POI represented by the station node, not an entrance.
    """
    stops = pl.DataFrame(
        {
            "id": ["clapham-rly", "clapham-rse-a", "clapham-rse-b"],
            "name": [
                "Clapham Junction Rail Station",
                "Clapham Junction Rail Station",
                "Clapham Junction Rail Station",
            ],
            "category": ["Rail station", "Rail station", "Rail station"],
            "lat": [51.4642, 51.4644, 51.4640],
            "lng": [-0.1705, -0.1702, -0.1708],
            "locality": ["CLAPHAM", "CLAPHAM", "CLAPHAM"],
            "entrance": [False, True, True],
        }
    )

    merged = deduplicate_naptan(stops)

    assert len(merged) == 1
    assert merged["id"][0] == "clapham-rly"
    assert merged["category"][0] == "Rail station"


def test_deduplicate_naptan_does_not_merge_rail_and_ferry_in_same_area():
    """Different transport modes sharing a name/area stay as separate POIs."""
    stops = pl.DataFrame(
        {
            "id": ["harbour-rail", "harbour-ferry"],
            "name": ["Harbour Station", "Harbour Station"],
            "category": ["Rail station", "Ferry"],
            "lat": [51.5, 51.5001],
            "lng": [-0.1, -0.1001],
            "locality": ["HARBOUR", "HARBOUR"],
            "entrance": [False, False],
        }
    )

    merged = deduplicate_naptan(stops).sort("category")

    assert len(merged) == 2
    assert merged["category"].to_list() == ["Ferry", "Rail station"]