Add character tokenizer
This commit is contained in:
parent
4fda83fe17
commit
9cb73680f8
4 changed files with 177 additions and 1 deletions
|
|
@ -1,4 +1,5 @@
|
|||
mod word_tokenizer;
|
||||
mod character_tokenizer;
|
||||
|
||||
use std::ops::Deref;
|
||||
|
||||
|
|
@ -35,7 +36,7 @@ impl Deref for BuiltinTokenizer {
|
|||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
match self {
|
||||
BuiltinTokenizer::Character => todo!(),
|
||||
BuiltinTokenizer::Character =>&character_tokenizer::character_tokenizer,
|
||||
BuiltinTokenizer::Word => &word_tokenizer::word_tokenizer,
|
||||
#[cfg(feature = "wasm")]
|
||||
BuiltinTokenizer::__Invalid => panic!("Unexpected tokenizer type"),
|
||||
|
|
|
|||
26
src/tokenizer/character_tokenizer.rs
Normal file
26
src/tokenizer/character_tokenizer.rs
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
use super::token::Token;
|
||||
|
||||
/// Splits text into UTF-8 characters.
|
||||
///
|
||||
/// ```not_rust
|
||||
/// "Hey!" -> ["H", "e", "y", "!"]
|
||||
/// ```
|
||||
pub fn character_tokenizer(text: &str) -> Vec<Token<String>> {
|
||||
text.chars()
|
||||
.map(|char| Token::new(char.to_string(), char.to_string(), true, true))
|
||||
.collect()
|
||||
}
|
||||
|
||||
// Snapshot tests for `character_tokenizer`; only compiled for test builds.
#[cfg(test)]
|
||||
mod tests {
|
||||
use insta::assert_debug_snapshot;
|
||||
|
||||
use super::*;
|
||||
|
||||
// NOTE(review): the snapshots are unnamed, so they are presumably matched to
// their stored files by test name and call order. Keep the name and order
// of the assertions stable, or regenerate the snapshots; confirm against insta.
#[test]
|
||||
fn test_with_snapshots() {
|
||||
// Empty input.
assert_debug_snapshot!(character_tokenizer(""));
|
||||
|
||||
// Input with a leading space, punctuation and an embedded newline.
assert_debug_snapshot!(character_tokenizer(" hello, \nwhere are you?"));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,144 @@
|
|||
---
|
||||
source: src/tokenizer/character_tokenizer.rs
|
||||
expression: "character_tokenizer(\" hello, \\nwhere are you?\")"
|
||||
---
|
||||
[
|
||||
Token {
|
||||
normalised: " ",
|
||||
original: " ",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "h",
|
||||
original: "h",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "e",
|
||||
original: "e",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "l",
|
||||
original: "l",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "l",
|
||||
original: "l",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "o",
|
||||
original: "o",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: ",",
|
||||
original: ",",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: " ",
|
||||
original: " ",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "\n",
|
||||
original: "\n",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "w",
|
||||
original: "w",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "h",
|
||||
original: "h",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "e",
|
||||
original: "e",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "r",
|
||||
original: "r",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "e",
|
||||
original: "e",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: " ",
|
||||
original: " ",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "a",
|
||||
original: "a",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "r",
|
||||
original: "r",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "e",
|
||||
original: "e",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: " ",
|
||||
original: " ",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "y",
|
||||
original: "y",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "o",
|
||||
original: "o",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "u",
|
||||
original: "u",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "?",
|
||||
original: "?",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
]
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
source: src/tokenizer/character_tokenizer.rs
|
||||
expression: "character_tokenizer(\"\")"
|
||||
---
|
||||
[]
|
||||
Loading…
Add table
Add a link
Reference in a new issue