Expose word_tokenizer
This commit is contained in:
parent
744decb92f
commit
1038f9cee0
4 changed files with 6 additions and 5 deletions
|
|
@ -7,4 +7,4 @@ pub use operation_transformation::{
|
|||
CursorPosition, EditedText, TextWithCursors, reconcile, reconcile_with_cursors,
|
||||
reconcile_with_tokenizer,
|
||||
};
|
||||
pub use tokenizer::{Tokenizer, token::Token};
|
||||
pub use tokenizer::{Tokenizer, token::Token, word_tokenizer::word_tokenizer};
|
||||
|
|
|
|||
|
|
@ -3,4 +3,5 @@ use token::Token;
|
|||
pub mod token;
|
||||
pub mod word_tokenizer;
|
||||
|
||||
/// A type alias for tokenizer functions that take a string and return a list of tokens.
|
||||
pub type Tokenizer<T> = dyn Fn(&str) -> Vec<Token<T>>;
|
||||
|
|
|
|||
|
|
@ -3,11 +3,11 @@ use serde::{Deserialize, Serialize};
|
|||
|
||||
/// A token is a string that has been normalised in some way.
|
||||
///
|
||||
/// It's UTF-8 compatible.
|
||||
///
|
||||
/// A token consists of the normalised form, which is used for comparison, and the
|
||||
/// original form used for subsequently applying `Operation`-s to a text
|
||||
/// document.
|
||||
///
|
||||
/// It's UTF-8 compatible.
|
||||
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Token<T>
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
use super::token::Token;
|
||||
|
||||
/// Splits on word boundaries creating alternating words and whitespaces with
|
||||
/// the whitespaces getting unique IDs.
|
||||
/// Splits text on word boundaries creating tokens of alternating words and
|
||||
/// whitespaces with the whitespaces getting unique IDs.
|
||||
///
|
||||
/// ## Example
|
||||
///
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue