Merge crates

2025-06-15 11:30:07 +01:00 · 2025-06-15 11:30:07 +01:00 · bcbac03228
commit bcbac03228
parent 82e77eec89
60 changed files with 73 additions and 248 deletions
--- a/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots-2.snap
+++ b/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots-2.snap
@ -0,0 +1,6 @@
+---
+source: reconcile/src/tokenizer/word_tokenizer.rs
+expression: "word_tokenizer(\"\")"
+snapshot_kind: text
+---
+[]
--- a/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots-3.snap
+++ b/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots-3.snap
@ -0,0 +1,25 @@
+---
+source: reconcile/src/tokenizer/word_tokenizer.rs
+expression: "word_tokenizer(\" what? \")"
+snapshot_kind: text
+---
+[
+    Token {
+        normalised: " what?",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "what?",
+        original: "what?",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: " ",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots-4.snap
+++ b/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots-4.snap
@ -0,0 +1,55 @@
+---
+source: reconcile/src/tokenizer/word_tokenizer.rs
+expression: "word_tokenizer(\" hello, \\nwhere are you?\")"
+snapshot_kind: text
+---
+[
+    Token {
+        normalised: " hello,",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "hello,",
+        original: "hello,",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: " \nwhere",
+        original: " \n",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "where",
+        original: "where",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: " are",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "are",
+        original: "are",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: " you?",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "you?",
+        original: "you?",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots-5.snap
+++ b/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots-5.snap
@ -0,0 +1,39 @@
+---
+source: reconcile/src/tokenizer/word_tokenizer.rs
+expression: "word_tokenizer(\" hello, \\nwhere are you?\")"
+snapshot_kind: text
+---
+[
+    Token {
+        normalised: " ",
+        original: " ",
+    },
+    Token {
+        normalised: "hello,",
+        original: "hello,",
+    },
+    Token {
+        normalised: " \n",
+        original: " \n",
+    },
+    Token {
+        normalised: "where",
+        original: "where",
+    },
+    Token {
+        normalised: " ",
+        original: " ",
+    },
+    Token {
+        normalised: "are",
+        original: "are",
+    },
+    Token {
+        normalised: " ",
+        original: " ",
+    },
+    Token {
+        normalised: "you?",
+        original: "you?",
+    },
+]
--- a/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots.snap
+++ b/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots.snap
@ -0,0 +1,25 @@
+---
+source: reconcile/src/tokenizer/word_tokenizer.rs
+expression: "word_tokenizer(\"Hi there!\")"
+snapshot_kind: text
+---
+[
+    Token {
+        normalised: "Hi",
+        original: "Hi",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: " there!",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "there!",
+        original: "there!",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/token.rs
+++ b/src/tokenizer/token.rs
@ -0,0 +1,68 @@
+#[cfg(feature = "serde")]
+use serde::{Deserialize, Serialize};
+
+/// A token is a string that has been normalised in some way.
+///
+/// A token consists of the normalised form, which is used for comparison, and
+/// the original form, which is used for subsequently applying `Operation`-s to
+/// a text document.
+///
+/// It's UTF-8 compatible.
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[derive(Debug, Clone)]
+pub struct Token<T>
+where
+    T: PartialEq + Clone + std::fmt::Debug,
+{
+    /// The normalised form of the token, used for deriving the diff.
+    normalised: T,
+
+    /// The original string that should be inserted or deleted in the document.
+    original: String,
+
+    /// Whether the token is semantically joinable with the previous token.
+    pub is_left_joinable: bool,
+
+    /// Whether the token is semantically joinable with the next token.
+    pub is_right_joinable: bool,
+}
+
+/// Trivial implementation of Token when the normalised form is the same as the
+/// original string.
+impl From<&str> for Token<String> {
+    fn from(text: &str) -> Self { Token::new(text.to_owned(), text.to_owned(), true, true) }
+}
+
+impl<T> Token<T>
+where
+    T: PartialEq + Clone + std::fmt::Debug,
+{
+    pub fn new(
+        normalised: T,
+        original: String,
+        is_left_joinable: bool,
+        is_right_joinable: bool,
+    ) -> Self {
+        Token {
+            normalised,
+            original,
+            is_left_joinable,
+            is_right_joinable,
+        }
+    }
+
+    pub fn original(&self) -> &str { &self.original }
+
+    pub fn set_normalised(&mut self, normalised: T) { self.normalised = normalised; }
+
+    pub fn normalised(&self) -> &T { &self.normalised }
+
+    pub fn get_original_length(&self) -> usize { self.original.chars().count() }
+}
+
+impl<T> PartialEq for Token<T>
+where
+    T: PartialEq + Clone + std::fmt::Debug,
+{
+    fn eq(&self, other: &Self) -> bool { self.normalised == other.normalised }
+}
--- a/src/tokenizer/word_tokenizer.rs
+++ b/src/tokenizer/word_tokenizer.rs
@ -0,0 +1,61 @@
+use super::token::Token;
+
+/// Splits text on word boundaries creating tokens of alternating words and
+/// whitespaces, with each whitespace token's normalised form extended by the
+/// original text of the token that follows it (if any).
+///
+/// ## Example
+///
+/// ```not_rust
+/// "Hi there!" -> ["Hi", " ", "there!"]
+/// ```
+pub fn word_tokenizer(text: &str) -> Vec<Token<String>> {
+    let mut result: Vec<Token<String>> = Vec::new();
+
+    let mut previous_boundary_index = 0;
+    let mut previous_char_is_whitespace = text.chars().next().is_none_or(char::is_whitespace);
+
+    for (i, c) in text.char_indices() {
+        let is_current_char_whitespace = c.is_whitespace();
+        if previous_char_is_whitespace != is_current_char_whitespace {
+            result.push(text[previous_boundary_index..i].into());
+            previous_boundary_index = i;
+        }
+
+        previous_char_is_whitespace = is_current_char_whitespace;
+    }
+
+    if previous_boundary_index < text.len() {
+        result.push(text[previous_boundary_index..].into());
+    }
+
+    if result.is_empty() {
+        return result;
+    }
+
+    for i in 0..result.len() - 1 {
+        if result[i].original().chars().all(char::is_whitespace) {
+            let normalised = result[i].normalised().to_owned() + result[i + 1].original();
+            result[i].set_normalised(normalised);
+        }
+    }
+
+    result
+}
+
+#[cfg(test)]
+mod tests {
+    use insta::assert_debug_snapshot;
+
+    use super::*;
+
+    #[test]
+    fn test_with_snapshots() {
+        assert_debug_snapshot!(word_tokenizer("Hi there!"));
+
+        assert_debug_snapshot!(word_tokenizer(""));
+
+        assert_debug_snapshot!(word_tokenizer(" what? "));
+
+        assert_debug_snapshot!(word_tokenizer(" hello, \nwhere are you?"));
+    }
+}