304 lines
10 KiB
Python
304 lines
10 KiB
Python
import polars as pl
|
|
import pytest
|
|
|
|
from pipeline.download.naptan import (
|
|
TRAM_METRO_CATEGORY,
|
|
TUBE_STATION_CATEGORY,
|
|
canonical_station_name,
|
|
canonical_station_name_expr,
|
|
deduplicate_naptan,
|
|
filter_active_stops,
|
|
)
|
|
|
|
|
|
def test_canonical_station_name_expr_normalizes_transport_suffixes():
    """Transport-mode suffixes are dropped and both name APIs agree.

    Each raw name is paired with the key expected from
    ``canonical_station_name_expr``; the scalar ``canonical_station_name``
    must then yield exactly the same keys for the same names.
    """
    raw_and_key = [
        ("Bank", "bank"),
        ("Bank Underground Station", "bank"),
        ("Bank DLR Station", "bank"),
        ("Pleasure Beach (Blackpool Tramway)", "pleasure beach"),
        ("Earl's Court Tube Station", "earls court"),
    ]
    names = [raw for raw, _ in raw_and_key]
    expected_keys = [key for _, key in raw_and_key]

    frame = pl.DataFrame({"name": names})
    key_column = frame.select(canonical_station_name_expr().alias("key"))["key"]
    expr_keys = key_column.to_list()

    assert expr_keys == expected_keys
    # The scalar helper must stay in lock-step with the expression version.
    scalar_keys = [canonical_station_name(name) for name in names]
    assert scalar_keys == expr_keys
|
|
|
|
|
|
def test_canonical_station_name_strips_entrance_suffixes():
    """Entrance and stand designators canonicalise to the station's own key.

    The same table of raw name -> expected key is checked twice: once per
    name through ``canonical_station_name`` and once in bulk through
    ``canonical_station_name_expr``.
    """
    # Real shipped NaPTAN entrance names that previously failed to merge with
    # their station node (79 stray entrance POIs).
    expected_by_name = {
        "Weaste Metrolink Station North East Entrance": "weaste",
        "Weaste Metrolink Station North Entrance No 2": "weaste",
        "Whitefield Metrolink Station Main Entrance": "whitefield",
        "Radcliffe Metrolink Station Entrance": "radcliffe",
        "Stretford Metrolink Station Wt Platform Entrance": "stretford",
        "Salford Quays Metrolink Station SW entrance": "salford quays",
        "Bank Station Ent 2": "bank",
        "Hainault": "hainault",
        # The Metrolink MET node names collapse to the same key.
        "Weaste (Manchester Metrolink)": "weaste",
        # No entrance word: direction/filler words must NOT be stripped.
        "Maze Hill North": "maze hill north",
        "Bus Station Entrance": "bus",
        # Bus-station bay/stand designators collapse to the station name...
        "Tonypandy Bus Station Stand A3": "tonypandy bus",
        "Caerphilly Interchange Stand 5": "caerphilly interchange",
        "Stanley Bus Station Stand G": "stanley bus",
        # ...but a bare trailing "Bay" (place names) is untouched.
        "Colwyn Bay": "colwyn bay",
    }

    # Scalar path: the raw name is the assertion message so a failure
    # identifies the offending case directly.
    for raw_name, key in expected_by_name.items():
        assert canonical_station_name(raw_name) == key, raw_name

    # Expression path: one pass over the same names, compared in table order.
    frame = pl.DataFrame({"name": list(expected_by_name)})
    key_column = frame.select(canonical_station_name_expr().alias("key"))["key"]
    assert key_column.to_list() == list(expected_by_name.values())
|
|
|
|
|
|
def test_filter_active_stops_drops_non_active():
    """Rows with an "inactive" or "Pending" status are removed from the frame."""
    status_by_code = {
        "a": "active",
        "b": "inactive",
        "c": None,
        "d": "Pending",
    }
    stops = pl.DataFrame(
        {
            "ATCOCode": list(status_by_code),
            "Status": list(status_by_code.values()),
        }
    )

    kept = filter_active_stops(stops)

    # Active and unknown (null) statuses survive; inactive/pending are dropped.
    assert kept["ATCOCode"].to_list() == ["a", "c"]
|
|
|
|
|
|
def test_filter_active_stops_tolerates_missing_status_column():
    """A frame with no ``Status`` column comes back with its single row intact."""
    stops = pl.DataFrame({"ATCOCode": ["a"]})

    surviving_codes = filter_active_stops(stops)["ATCOCode"].to_list()

    assert surviving_codes == ["a"]
|
|
|
|
|
|
def test_deduplicate_naptan_splits_london_underground_from_tram_metro():
    """An LU-flagged group is recategorised as a Tube station; the other is not.

    Four rows (two station nodes, two entrances) all start in the tram/metro
    category and must collapse to two POIs, each represented by its station
    node's id.
    """
    # MET station nodes plus TMU entrances, pre-categorised as the tram/metro
    # family. The Hainault group contains a 940GZZLU station node, so the
    # merged POI is a genuine "Tube station" even though its entrance carries a
    # non-ZZLU ATCO code; the Metrolink group stays "Tram & Metro stop".
    hainault_node, hainault_entrance = "940GZZLUHLT", "490000095003"
    weaste_node, weaste_entrance = "9400ZZMAWST", "1800NFR2691"
    stops = pl.DataFrame(
        {
            "id": [
                hainault_node,
                hainault_entrance,
                weaste_node,
                weaste_entrance,
            ],
            "name": [
                "Hainault Underground Station",
                "Hainault",
                "Weaste (Manchester Metrolink)",
                "Weaste Metrolink Station North West Entrance",
            ],
            "category": [TRAM_METRO_CATEGORY for _ in range(4)],
            "lat": [51.6034, 51.6037, 53.4826, 53.4826],
            "lng": [0.0933, 0.0931, -2.3087, -2.3086],
            "locality": [None] * 4,
            "entrance": [False, True, False, True],
            "is_lu": [True, False, False, False],
        }
    )

    merged = deduplicate_naptan(stops).sort("category")

    assert len(merged) == 2
    expected_categories = [TRAM_METRO_CATEGORY, TUBE_STATION_CATEGORY]
    assert merged["category"].to_list() == expected_categories

    # The station node (not the entrance) represents the merged POI.
    tube_rows = merged.filter(pl.col("category") == TUBE_STATION_CATEGORY)
    assert tube_rows["id"][0] == hainault_node
    tram_rows = merged.filter(pl.col("category") == TRAM_METRO_CATEGORY)
    assert tram_rows["id"][0] == weaste_node
|
|
|
|
|
|
def test_deduplicate_naptan_merges_bus_station_bays_and_entrances():
    """Three Bury Interchange rows merge into one; Rochdale stays separate."""
    # BCS bays and a BCE entrance of one bus station collapse to a single POI
    # represented by a non-entrance node; a different bus station in another
    # area survives separately.
    bury = "Bury Interchange"
    rochdale = "Rochdale Interchange"
    stops = pl.DataFrame(
        {
            "id": ["bay-1", "bay-2", "ent-1", "other"],
            "name": [bury, bury, "Bury Interchange East Entrance", rochdale],
            "category": ["Bus station" for _ in range(4)],
            "lat": [53.5907, 53.5908, 53.5909, 53.6160],
            "lng": [-2.2958, -2.2957, -2.2956, -2.1561],
            "locality": ["BURY", "BURY", "BURY", "ROCHDALE"],
            "entrance": [False, False, True, False],
        }
    )

    merged = deduplicate_naptan(stops).sort("name")

    assert merged["name"].to_list() == [bury, rochdale]
    bury_rows = merged.filter(pl.col("name") == bury)
    assert bury_rows["id"][0] == "bay-1"
|
|
|
|
|
|
def test_deduplicate_naptan_merges_tube_station_variants_by_area():
    """Nearby Bank variants merge into one POI; far-apart namesakes do not.

    Six "Tube station" rows are expected to reduce to four, and the merged
    "Bank" row's latitude must equal the mean of its three source rows.
    """
    # Latitudes of the three co-located Bank variants (plain, LU, DLR).
    bank_lats = [51.5129, 51.5134, 51.5132]
    stops = pl.DataFrame(
        {
            "id": [
                "bank",
                "bank-lu",
                "bank-dlr",
                "other-bank",
                "central-a",
                "central-b",
            ],
            "name": [
                "Bank",
                "Bank Underground Station",
                "Bank DLR Station",
                "Bank Underground Station",
                "Central Tube Station",
                "Central Tube Station",
            ],
            "category": ["Tube station" for _ in range(6)],
            "lat": bank_lats + [55.0140, 51.5, 53.0],
            "lng": [-0.0889, -0.0890, -0.0885, -1.6781, -0.1, -2.0],
            "locality": ["LOC1", "LOC1", "LOC2", "LOC1", None, None],
        }
    )

    merged = deduplicate_naptan(stops).sort("lat")

    assert len(merged) == 4
    # Ascending latitude order of the surviving POIs.
    names_by_latitude = [
        "Central Tube Station",
        "Bank",
        "Central Tube Station",
        "Bank Underground Station",
    ]
    assert merged["name"].to_list() == names_by_latitude

    bank_rows = merged.filter(pl.col("name") == "Bank")
    mean_bank_lat = sum(bank_lats) / len(bank_lats)
    assert bank_rows["lat"][0] == pytest.approx(mean_bank_lat)
|
|
|
|
|
|
def test_deduplicate_naptan_does_not_merge_missing_locality_bus_stops():
    """Same-named bus stops with null locality, a degree apart, stay separate."""
    shared_name = "High Street"
    stops = pl.DataFrame(
        {
            "id": ["a", "b"],
            "name": [shared_name, shared_name],
            "category": ["Bus stop"] * 2,
            "lat": [51.5, 52.5],
            "lng": [-0.1, -1.1],
            "locality": [None] * 2,
        }
    )

    merged = deduplicate_naptan(stops)

    assert len(merged) == 2
|
|
|
|
|
|
def test_deduplicate_naptan_merges_colocated_missing_locality_bus_stations():
    """Two co-located, locality-less bus-station rows reduce to a single POI.

    The surviving row keeps the shared name and category, and its latitude is
    the mean of the two input latitudes.
    """
    # Two NaPTAN records for the same bus station with no locality, co-located
    # within the merge area, are a true duplicate and collapse to one POI.
    station_name = "Victoria Bus Station"
    lats = [51.4952, 51.4953]
    stops = pl.DataFrame(
        {
            "id": ["a", "b"],
            "name": [station_name] * 2,
            "category": ["Bus station"] * 2,
            "lat": lats,
            "lng": [-0.1441, -0.1440],
            "locality": [None] * 2,
        }
    )

    merged = deduplicate_naptan(stops)

    assert len(merged) == 1
    assert merged["name"][0] == station_name
    assert merged["category"][0] == "Bus station"
    assert merged["lat"][0] == pytest.approx(sum(lats) / len(lats))
|
|
|
|
|
|
def test_deduplicate_naptan_keeps_rail_station_with_only_station_node():
    """A lone rail station row passes through with name and category unchanged."""
    # Aberdare's only NaPTAN record is an RLY station node (StopType "RLY").
    station_name = "Aberdare Rail Station"
    station_category = "Rail station"
    stops = pl.DataFrame(
        {
            "id": ["aberdare-rly"],
            "name": [station_name],
            "category": [station_category],
            "lat": [51.7155],
            "lng": [-3.4438],
            "locality": ["ABERDARE"],
            "entrance": [False],
        }
    )

    merged = deduplicate_naptan(stops)

    assert len(merged) == 1
    assert merged["name"][0] == station_name
    assert merged["category"][0] == station_category
|
|
|
|
|
|
def test_deduplicate_naptan_merges_rail_entrances_into_station_node():
    """A station row and two entrance rows merge, keeping the station row's id."""
    # A station node (RLY) and its two entrance nodes (RSE) collapse to a single
    # "Rail station" POI represented by the station node, not an entrance.
    station_name = "Clapham Junction Rail Station"
    station_category = "Rail station"
    stops = pl.DataFrame(
        {
            "id": ["clapham-rly", "clapham-rse-a", "clapham-rse-b"],
            "name": [station_name] * 3,
            "category": [station_category] * 3,
            "lat": [51.4642, 51.4644, 51.4640],
            "lng": [-0.1705, -0.1702, -0.1708],
            "locality": ["CLAPHAM"] * 3,
            "entrance": [False, True, True],
        }
    )

    merged = deduplicate_naptan(stops)

    assert len(merged) == 1
    assert merged["id"][0] == "clapham-rly"
    assert merged["category"][0] == station_category
|
|
|
|
|
|
def test_deduplicate_naptan_does_not_merge_rail_and_ferry_in_same_area():
    """Rows sharing name and locality but not category are both retained."""
    # Different transport modes sharing a name/area stay as separate POIs.
    shared_name = "Harbour Station"
    shared_locality = "HARBOUR"
    stops = pl.DataFrame(
        {
            "id": ["harbour-rail", "harbour-ferry"],
            "name": [shared_name, shared_name],
            "category": ["Rail station", "Ferry"],
            "lat": [51.5, 51.5001],
            "lng": [-0.1, -0.1001],
            "locality": [shared_locality, shared_locality],
            "entrance": [False, False],
        }
    )

    merged = deduplicate_naptan(stops).sort("category")

    assert len(merged) == 2
    assert merged["category"].to_list() == ["Ferry", "Rail station"]
|