From bf8d00c5e24a5381cfb8246d8f179d78a7e91ee8 Mon Sep 17 00:00:00 2001
From: Andras Schmelczer <andras@schmelczer.dev>
Date: Sun, 2 Mar 2025 17:53:21 +0000
Subject: [PATCH] Fix whitespace handling in the word tokenizer

---
 .../reconcile/src/operation_transformation.rs |  5 +-
 .../operation_transformation/edited_text.rs   |  5 +-
 .../src/operation_transformation/operation.rs | 11 +----
 backend/reconcile/src/tokenizer/token.rs      |  7 +--
 .../reconcile/src/tokenizer/word_tokenizer.rs | 47 +++++++++++++++++--
 5 files changed, 53 insertions(+), 22 deletions(-)
diff --git a/backend/reconcile/src/operation_transformation.rs b/backend/reconcile/src/operation_transformation.rs
index 3f83197a..aa891d72 100644
--- a/backend/reconcile/src/operation_transformation.rs
+++ b/backend/reconcile/src/operation_transformation.rs
@@ -73,7 +73,8 @@ mod test {
             "original_1 edit_1 original_3",
         );
 
-        // One deleted a large range, the other deleted subranges and inserted as well
+        // One side deleted a large range, while the other deleted subranges
+        // and inserted text as well
         test_merge_both_ways(
             "original_1 original_2 original_3 original_4 original_5",
             "original_1 original_5",
@@ -161,6 +162,8 @@ mod test {
             "hi there my friend ",
             "hi there you my friend ",
         );
+
+        test_merge_both_ways("a", "a b c", "a b c d", "a b c d");
     }
 
     #[test_matrix( [
diff --git a/backend/reconcile/src/operation_transformation/edited_text.rs b/backend/reconcile/src/operation_transformation/edited_text.rs
index 4052485c..87a5df40 100644
--- a/backend/reconcile/src/operation_transformation/edited_text.rs
+++ b/backend/reconcile/src/operation_transformation/edited_text.rs
@@ -65,7 +65,6 @@ where
 
         Self::new(
             original,
-            // Self::cook_operations(diff),
             Self::cook_operations(Self::elongate_operations(diff)).collect(),
         )
     }
@@ -191,7 +190,7 @@ where
     pub fn merge(self, other: Self) -> Self {
         debug_assert_eq!(
             self.text, other.text,
-            "EditedText-s must be derived from the same text to be mergable"
+            "`EditedText`-s must be derived from the same text to be mergable"
         );
 
         let mut left_merge_context = MergeContext::default();
@@ -285,7 +284,7 @@ mod tests {
         let original = "hello world! ...";
         let left = "Hello world! I'm Andras.";
         let right = "Hello world! How are you?";
-        let expected = "Hello world! How are you?I'm Andras.";
+        let expected = "Hello world! I'm Andras. How are you?";
 
         let operations_1 = EditedText::from_strings(original, left);
         let operations_2 = EditedText::from_strings(original, right);
diff --git a/backend/reconcile/src/operation_transformation/operation.rs b/backend/reconcile/src/operation_transformation/operation.rs
index a985ad7b..ffc4f7d6 100644
--- a/backend/reconcile/src/operation_transformation/operation.rs
+++ b/backend/reconcile/src/operation_transformation/operation.rs
@@ -107,15 +107,8 @@ where
         })
     }
 
-    /// Tries to apply the operation to the given `ropey::Rope` text, returning
-    /// the modified text.
-    ///
-    /// # Errors
-    ///
-    /// Returns a `SyncLibError::OperationApplicationError` if the operation
-    /// cannot be applied.
-    ///
-    /// # Panics
+    /// Applies the operation to the given `StringBuilder`, returning the
+    /// modified `StringBuilder`.
     ///
     /// When compiled in debug mode, panics if a delete operation is attempted
     /// on a range of text that does not match the text to be deleted.
diff --git a/backend/reconcile/src/tokenizer/token.rs b/backend/reconcile/src/tokenizer/token.rs
index b867bb20..ab521a71 100644
--- a/backend/reconcile/src/tokenizer/token.rs
+++ b/backend/reconcile/src/tokenizer/token.rs
@@ -15,12 +15,7 @@ where
 }
 
 impl From<&str> for Token<String> {
-    fn from(s: &str) -> Self {
-        Token {
-            normalised: s.to_owned(),
-            original: s.to_owned(),
-        }
-    }
+    fn from(s: &str) -> Self { Token::new(s.trim().to_owned(), s.to_owned()) }
 }
 
 impl<T> Token<T>
diff --git a/backend/reconcile/src/tokenizer/word_tokenizer.rs b/backend/reconcile/src/tokenizer/word_tokenizer.rs
index 3449cba2..37d748b3 100644
--- a/backend/reconcile/src/tokenizer/word_tokenizer.rs
+++ b/backend/reconcile/src/tokenizer/word_tokenizer.rs
@@ -1,7 +1,48 @@
 use super::token::Token;
 
+/// Splits `text` wherever a run of whitespace follows a word, so that every
+/// token keeps its leading whitespace. Trailing whitespace is its own token.
+///
+/// ## Example
+///
+/// "Hi there!" -> ["Hi", " there!"]
 pub fn word_tokenizer(text: &str) -> Vec<Token<String>> {
-    text.split_inclusive(char::is_whitespace)
-        .map(|s| Token::new(s.to_owned(), s.to_owned()))
-        .collect()
+    let mut result: Vec<Token<String>> = Vec::new();
+
+    let mut last_whitespace = 0;
+    let mut previous_char_is_whitespace = true;
+
+    for (i, c) in text.char_indices() {
+        let is_current_char_whitespace = c.is_whitespace();
+        if !previous_char_is_whitespace && is_current_char_whitespace {
+            result.push(text[last_whitespace..i].into());
+            last_whitespace = i;
+        }
+
+        previous_char_is_whitespace = is_current_char_whitespace;
+    }
+
+    if last_whitespace < text.len() {
+        result.push(text[last_whitespace..].into());
+    }
+
+    result
+}
+
+#[cfg(test)]
+mod tests {
+    use insta::assert_debug_snapshot;
+
+    use super::*;
+
+    #[test]
+    fn test_with_snapshots() {
+        assert_debug_snapshot!(word_tokenizer("Hi there!"));
+
+        assert_debug_snapshot!(word_tokenizer(""));
+
+        assert_debug_snapshot!(word_tokenizer(" what? "));
+
+        assert_debug_snapshot!(word_tokenizer(" hello, \nwhere are you?"));
+    }
 }