From 1038f9cee08d3afba5dbd2569f6036ab617e62c3 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Sat, 14 Jun 2025 11:44:06 +0100 Subject: [PATCH] Expose word_tokenizer --- backend/reconcile/src/lib.rs | 2 +- backend/reconcile/src/tokenizer.rs | 1 + backend/reconcile/src/tokenizer/token.rs | 4 ++-- backend/reconcile/src/tokenizer/word_tokenizer.rs | 4 ++-- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/backend/reconcile/src/lib.rs b/backend/reconcile/src/lib.rs index a04ae85..c621ffb 100644 --- a/backend/reconcile/src/lib.rs +++ b/backend/reconcile/src/lib.rs @@ -7,4 +7,4 @@ pub use operation_transformation::{ CursorPosition, EditedText, TextWithCursors, reconcile, reconcile_with_cursors, reconcile_with_tokenizer, }; -pub use tokenizer::{Tokenizer, token::Token}; +pub use tokenizer::{Tokenizer, token::Token, word_tokenizer::word_tokenizer}; diff --git a/backend/reconcile/src/tokenizer.rs b/backend/reconcile/src/tokenizer.rs index 7ce6463..608fe93 100644 --- a/backend/reconcile/src/tokenizer.rs +++ b/backend/reconcile/src/tokenizer.rs @@ -3,4 +3,5 @@ use token::Token; pub mod token; pub mod word_tokenizer; +/// A type alias for tokenizers that take a string and return a list of tokens. pub type Tokenizer = dyn Fn(&str) -> Vec>; diff --git a/backend/reconcile/src/tokenizer/token.rs b/backend/reconcile/src/tokenizer/token.rs index 23504e7..0c12770 100644 --- a/backend/reconcile/src/tokenizer/token.rs +++ b/backend/reconcile/src/tokenizer/token.rs @@ -3,11 +3,11 @@ use serde::{Deserialize, Serialize}; /// A token is a string that has been normalised in some way. /// -/// It's UTF-8 compatible. -/// /// A token consists of the normalised form is used for comparison, and the /// original form used for subsequently applying `Operation`-s to a text /// document. +/// +/// It's UTF-8 compatible.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug, Clone)] pub struct Token diff --git a/backend/reconcile/src/tokenizer/word_tokenizer.rs b/backend/reconcile/src/tokenizer/word_tokenizer.rs index 46faa42..61c3fa3 100644 --- a/backend/reconcile/src/tokenizer/word_tokenizer.rs +++ b/backend/reconcile/src/tokenizer/word_tokenizer.rs @@ -1,7 +1,7 @@ use super::token::Token; -/// Splits on word boundaries creating alternating words and whitespaces with -/// the whitespaces getting unique IDs. +/// Splits text on word boundaries creating tokens of alternating words and +/// whitespaces with the whitespaces getting unique IDs. /// /// ## Example ///