From 9cb73680f8188149191003724486c94370a8730a Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Sun, 29 Jun 2025 19:35:21 +0100 Subject: [PATCH] Add character tokenizer --- src/tokenizer.rs | 3 +- src/tokenizer/character_tokenizer.rs | 26 ++++ ...er_tokenizer__tests__with_snapshots-2.snap | 144 ++++++++++++++++++ ...cter_tokenizer__tests__with_snapshots.snap | 5 + 4 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 src/tokenizer/character_tokenizer.rs create mode 100644 src/tokenizer/snapshots/reconcile__tokenizer__character_tokenizer__tests__with_snapshots-2.snap create mode 100644 src/tokenizer/snapshots/reconcile__tokenizer__character_tokenizer__tests__with_snapshots.snap diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 87de8b5..b2b9065 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1,4 +1,5 @@ mod word_tokenizer; +mod character_tokenizer; use std::ops::Deref; @@ -35,7 +36,7 @@ impl Deref for BuiltinTokenizer { fn deref(&self) -> &Self::Target { match self { - BuiltinTokenizer::Character => todo!(), + BuiltinTokenizer::Character =>&character_tokenizer::character_tokenizer, BuiltinTokenizer::Word => &word_tokenizer::word_tokenizer, #[cfg(feature = "wasm")] BuiltinTokenizer::__Invalid => panic!("Unexpected tokenizer type"), diff --git a/src/tokenizer/character_tokenizer.rs b/src/tokenizer/character_tokenizer.rs new file mode 100644 index 0000000..ed6170c --- /dev/null +++ b/src/tokenizer/character_tokenizer.rs @@ -0,0 +1,26 @@ +use super::token::Token; + +/// Splits text into one token per Unicode scalar value (Rust `char`), not per grapheme cluster. +/// +/// ```not_rust +/// "Hey!" 
-> ["H", "e", "y", "!"] +/// ``` +pub fn character_tokenizer(text: &str) -> Vec> { + text.chars() + .map(|char| Token::new(char.to_string(), char.to_string(), true, true)) + .collect() +} + +#[cfg(test)] +mod tests { + use insta::assert_debug_snapshot; + + use super::*; + + #[test] + fn test_with_snapshots() { + assert_debug_snapshot!(character_tokenizer("")); + + assert_debug_snapshot!(character_tokenizer(" hello, \nwhere are you?")); + } +} diff --git a/src/tokenizer/snapshots/reconcile__tokenizer__character_tokenizer__tests__with_snapshots-2.snap b/src/tokenizer/snapshots/reconcile__tokenizer__character_tokenizer__tests__with_snapshots-2.snap new file mode 100644 index 0000000..b61d12a --- /dev/null +++ b/src/tokenizer/snapshots/reconcile__tokenizer__character_tokenizer__tests__with_snapshots-2.snap @@ -0,0 +1,144 @@ +--- +source: src/tokenizer/character_tokenizer.rs +expression: "character_tokenizer(\" hello, \\nwhere are you?\")" +--- +[ + Token { + normalised: " ", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "h", + original: "h", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "e", + original: "e", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "l", + original: "l", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "l", + original: "l", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "o", + original: "o", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: ",", + original: ",", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: " ", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "\n", + original: "\n", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "w", + original: "w", + is_left_joinable: true, + is_right_joinable: true, + }, 
+ Token { + normalised: "h", + original: "h", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "e", + original: "e", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "r", + original: "r", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "e", + original: "e", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: " ", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "a", + original: "a", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "r", + original: "r", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "e", + original: "e", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: " ", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "y", + original: "y", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "o", + original: "o", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "u", + original: "u", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "?", + original: "?", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile__tokenizer__character_tokenizer__tests__with_snapshots.snap b/src/tokenizer/snapshots/reconcile__tokenizer__character_tokenizer__tests__with_snapshots.snap new file mode 100644 index 0000000..9aa9b44 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile__tokenizer__character_tokenizer__tests__with_snapshots.snap @@ -0,0 +1,5 @@ +--- +source: src/tokenizer/character_tokenizer.rs +expression: "character_tokenizer(\"\")" +--- +[]