Add character tokenizer

This commit is contained in:
Andras Schmelczer 2025-06-29 19:35:21 +01:00
parent 4fda83fe17
commit 9cb73680f8
4 changed files with 177 additions and 1 deletions

View file

@ -1,4 +1,5 @@
mod word_tokenizer;
mod character_tokenizer;
use std::ops::Deref;
@ -35,7 +36,7 @@ impl Deref for BuiltinTokenizer {
fn deref(&self) -> &Self::Target {
match self {
BuiltinTokenizer::Character => todo!(),
BuiltinTokenizer::Character =>&character_tokenizer::character_tokenizer,
BuiltinTokenizer::Word => &word_tokenizer::word_tokenizer,
#[cfg(feature = "wasm")]
BuiltinTokenizer::__Invalid => panic!("Unexpected tokenizer type"),

View file

@ -0,0 +1,26 @@
use super::token::Token;
/// Tokenises `text` one `char` at a time.
///
/// Each item yielded by [`str::chars`] (a Unicode scalar value, which is not
/// necessarily a full user-perceived character) is turned into its own
/// token. The same one-character string is passed as both textual arguments
/// of `Token::new`, and both joinable flags are set to `true`. Whitespace and
/// punctuation are not treated specially; an empty input yields an empty
/// vector.
///
/// ```not_rust
/// "Hey!" -> ["H", "e", "y", "!"]
/// ```
pub fn character_tokenizer(text: &str) -> Vec<Token<String>> {
    let mut tokens = Vec::new();
    for symbol in text.chars() {
        // Two owned strings are needed per token, so build one and clone it.
        let as_string = symbol.to_string();
        tokens.push(Token::new(as_string.clone(), as_string, true, true));
    }
    tokens
}
// Snapshot tests for `character_tokenizer`: the debug output of each call is
// compared against a stored `insta` snapshot.
#[cfg(test)]
mod tests {
use insta::assert_debug_snapshot;
use super::*;
// NOTE(review): the stored snapshots appear to be matched by test name and
// assertion order, so renaming this function or reordering the assertions
// would presumably require regenerating them; confirm against insta's docs.
#[test]
fn test_with_snapshots() {
// Empty input: the stored snapshot for this expression is an empty list.
assert_debug_snapshot!(character_tokenizer(""));
// Leading space, punctuation and an embedded newline: the stored snapshot
// shows each of them emitted as its own token, just like letters.
assert_debug_snapshot!(character_tokenizer(" hello, \nwhere are you?"));
}
}

View file

@ -0,0 +1,144 @@
---
source: src/tokenizer/character_tokenizer.rs
expression: "character_tokenizer(\" hello, \\nwhere are you?\")"
---
[
Token {
normalised: " ",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "h",
original: "h",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "e",
original: "e",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "l",
original: "l",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "l",
original: "l",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "o",
original: "o",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: ",",
original: ",",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " ",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "\n",
original: "\n",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "w",
original: "w",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "h",
original: "h",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "e",
original: "e",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "r",
original: "r",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "e",
original: "e",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " ",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "a",
original: "a",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "r",
original: "r",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "e",
original: "e",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " ",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "y",
original: "y",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "o",
original: "o",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "u",
original: "u",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "?",
original: "?",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,5 @@
---
source: src/tokenizer/character_tokenizer.rs
expression: "character_tokenizer(\"\")"
---
[]