Expose word_tokenizer

2025-06-14 11:44:06 +01:00 · 2025-06-14 11:44:06 +01:00 · 1038f9cee0
commit 1038f9cee0
parent 744decb92f
4 changed files with 6 additions and 5 deletions
--- a/backend/reconcile/src/lib.rs
+++ b/backend/reconcile/src/lib.rs
@ -7,4 +7,4 @@ pub use operation_transformation::{
    CursorPosition, EditedText, TextWithCursors, reconcile, reconcile_with_cursors,
    reconcile_with_tokenizer,
 };
-pub use tokenizer::{Tokenizer, token::Token};
+pub use tokenizer::{Tokenizer, token::Token, word_tokenizer::word_tokenizer};
--- a/backend/reconcile/src/tokenizer.rs
+++ b/backend/reconcile/src/tokenizer.rs
@ -3,4 +3,5 @@ use token::Token;
 pub mod token;
 pub mod word_tokenizer;

+/// A type alias for tokenizer functions that take a string and return a list of tokens.
 pub type Tokenizer<T> = dyn Fn(&str) -> Vec<Token<T>>;
--- a/backend/reconcile/src/tokenizer/token.rs
+++ b/backend/reconcile/src/tokenizer/token.rs
@ -3,11 +3,11 @@ use serde::{Deserialize, Serialize};

 /// A token is a string that has been normalised in some way.
 ///
-/// It's UTF-8 compatible.
-///
 /// A token consists of the normalised form, used for comparison, and the
 /// original form used for subsequently applying `Operation`-s to a text
 /// document.
+///
+/// It's UTF-8 compatible.
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[derive(Debug, Clone)]
 pub struct Token<T>
--- a/backend/reconcile/src/tokenizer/word_tokenizer.rs
+++ b/backend/reconcile/src/tokenizer/word_tokenizer.rs
@ -1,7 +1,7 @@
 use super::token::Token;

-/// Splits on word boundaries creating alternating words and whitespaces with
-/// the whitespaces getting unique IDs.
+/// Splits text on word boundaries creating tokens of alternating words and
+/// whitespaces with the whitespaces getting unique IDs.
 ///
 /// ## Example
 ///