Expose word_tokenizer

This commit is contained in:
Andras Schmelczer 2025-06-14 11:44:06 +01:00
parent 744decb92f
commit 1038f9cee0
No known key found for this signature in database
GPG key ID: FC8F2C3D3D1A718C
4 changed files with 6 additions and 5 deletions

View file

@ -7,4 +7,4 @@ pub use operation_transformation::{
CursorPosition, EditedText, TextWithCursors, reconcile, reconcile_with_cursors,
reconcile_with_tokenizer,
};
pub use tokenizer::{Tokenizer, token::Token};
pub use tokenizer::{Tokenizer, token::Token, word_tokenizer::word_tokenizer};

View file

@ -3,4 +3,5 @@ use token::Token;
pub mod token;
pub mod word_tokenizer;
/// A type alias for tokenizer functions: any callable that takes a string
/// slice and returns a list of tokens.
///
/// Note that this aliases a `dyn Fn` trait-object type rather than declaring
/// a new trait, so it is unsized and has to be used behind a pointer such as
/// `&Tokenizer<T>` or `Box<Tokenizer<T>>`.
pub type Tokenizer<T> = dyn Fn(&str) -> Vec<Token<T>>;

View file

@ -3,11 +3,11 @@ use serde::{Deserialize, Serialize};
/// A token is a string that has been normalised in some way.
///
/// It's UTF-8 compatible.
///
/// A token consists of the normalised form, which is used for comparison,
/// and the original form, which is used for subsequently applying
/// `Operation`-s to a text document.
///
/// It's UTF-8 compatible.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone)]
pub struct Token<T>

View file

@ -1,7 +1,7 @@
use super::token::Token;
/// Splits on word boundaries creating alternating words and whitespaces with
/// the whitespaces getting unique IDs.
/// Splits text on word boundaries creating tokens of alternating words and
/// whitespaces with the whitespaces getting unique IDs.
///
/// ## Example
///