From 1038f9cee08d3afba5dbd2569f6036ab617e62c3 Mon Sep 17 00:00:00 2001
From: Andras Schmelczer <andras@schmelczer.dev>
Date: Sat, 14 Jun 2025 11:44:06 +0100
Subject: [PATCH] Expose word_tokenizer

---
 backend/reconcile/src/lib.rs                      | 2 +-
 backend/reconcile/src/tokenizer.rs                | 1 +
 backend/reconcile/src/tokenizer/token.rs          | 4 ++--
 backend/reconcile/src/tokenizer/word_tokenizer.rs | 4 ++--
 4 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/backend/reconcile/src/lib.rs b/backend/reconcile/src/lib.rs
index a04ae85..c621ffb 100644
--- a/backend/reconcile/src/lib.rs
+++ b/backend/reconcile/src/lib.rs
@@ -7,4 +7,4 @@ pub use operation_transformation::{
     CursorPosition, EditedText, TextWithCursors, reconcile, reconcile_with_cursors,
     reconcile_with_tokenizer,
 };
-pub use tokenizer::{Tokenizer, token::Token};
+pub use tokenizer::{Tokenizer, token::Token, word_tokenizer::word_tokenizer};
diff --git a/backend/reconcile/src/tokenizer.rs b/backend/reconcile/src/tokenizer.rs
index 7ce6463..608fe93 100644
--- a/backend/reconcile/src/tokenizer.rs
+++ b/backend/reconcile/src/tokenizer.rs
@@ -3,4 +3,5 @@ use token::Token;
 pub mod token;
 pub mod word_tokenizer;
 
+/// Type alias for tokenizer functions mapping a string to a list of tokens.
 pub type Tokenizer<T> = dyn Fn(&str) -> Vec<Token<T>>;
diff --git a/backend/reconcile/src/tokenizer/token.rs b/backend/reconcile/src/tokenizer/token.rs
index 23504e7..0c12770 100644
--- a/backend/reconcile/src/tokenizer/token.rs
+++ b/backend/reconcile/src/tokenizer/token.rs
@@ -3,11 +3,11 @@ use serde::{Deserialize, Serialize};
 
 /// A token is a string that has been normalised in some way.
 ///
-/// It's UTF-8 compatible.
-///
 /// A token consists of the normalised form is used for comparison, and the
 /// original form used for subsequently applying `Operation`-s to a text
 /// document.
+///
+/// It's UTF-8 compatible.
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[derive(Debug, Clone)]
 pub struct Token<T>
diff --git a/backend/reconcile/src/tokenizer/word_tokenizer.rs b/backend/reconcile/src/tokenizer/word_tokenizer.rs
index 46faa42..61c3fa3 100644
--- a/backend/reconcile/src/tokenizer/word_tokenizer.rs
+++ b/backend/reconcile/src/tokenizer/word_tokenizer.rs
@@ -1,7 +1,7 @@
 use super::token::Token;
 
-/// Splits on word boundaries creating alternating words and whitespaces with
-/// the whitespaces getting unique IDs.
+/// Splits text on word boundaries creating tokens of alternating words and
+/// whitespaces with the whitespaces getting unique IDs.
 ///
 /// ## Example
 ///