use std::{fmt::Debug, vec}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::{ BuiltinTokenizer, CursorPosition, TextWithCursors, operation_transformation::{ DiffError, Operation, utils::{cook_operations::cook_operations, elongate_operations::elongate_operations}, }, raw_operation::RawOperation, tokenizer::Tokenizer, types::{ history::History, number_or_text::NumberOrText, side::Side, span_with_history::SpanWithHistory, }, utils::string_builder::StringBuilder, }; /// A text document with a sequence of operations derived from diffing it /// against an updated version. Supports merging two `EditedText` instances /// (from the same original) via Operational Transformation. /// /// Created via `from_strings`, `from_strings_with_tokenizer`, or `from_diff`, /// then merged with another `EditedText` and applied to get the reconciled /// text. /// /// Also tracks cursor positions from the updated text, repositioning them /// when operations are applied. #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug, Clone, PartialEq, Default)] pub struct EditedText<'a, T> where T: PartialEq + Clone + Debug, { text: &'a str, operations: Vec>, operation_sides: Vec, cursors: Vec, } impl<'a> EditedText<'a, String> { /// Create an `EditedText` from the given original and updated strings. /// Uses the default word tokenizer (splits on word boundaries). #[must_use] pub fn from_strings(original: &'a str, updated: &TextWithCursors) -> Self { Self::from_strings_with_tokenizer(original, updated, &*BuiltinTokenizer::Word) } } impl<'a, T> EditedText<'a, T> where T: PartialEq + Clone + Debug, { /// Create an `EditedText` from the given original and updated strings /// using the provided tokenizer. 
pub fn from_strings_with_tokenizer( original: &'a str, updated: &TextWithCursors, tokenizer: &Tokenizer, ) -> Self { let original_tokens = (tokenizer)(original); let updated_tokens = (tokenizer)(&updated.text()); let diff: Vec> = RawOperation::vec_from(&original_tokens, &updated_tokens); let operations: Vec> = cook_operations(elongate_operations(diff)).collect(); let operation_count = operations.len(); Self::new( original, operations, vec![Side::Left; operation_count], updated.cursors(), ) } /// Create a new `EditedText` with the given operations. /// The operations must be in the order in which they are meant to be /// applied. The operations must not overlap. fn new( text: &'a str, operations: Vec>, operation_sides: Vec, mut cursors: Vec, ) -> Self { cursors.sort_by_key(|cursor| cursor.char_index); Self { text, operations, operation_sides, cursors, } } /// Merge two `EditedText` instances. The two instances must be derived /// from the same original text. The operations are merged using the /// principles of Operational Transformation. The cursors are updated /// accordingly to reflect the changes made by the merged operations. /// /// # Panics /// /// Panics if there's an integer overflow (in isize) when calculating new /// cursor positions. 
#[must_use] #[allow(clippy::too_many_lines)] pub fn merge(self, other: Self) -> Self { debug_assert_eq!( self.text, other.text, "`EditedText`-s must be derived from the same text to be mergable" ); let mut merged_cursors = Vec::with_capacity(self.cursors.len() + other.cursors.len()); let mut left_cursors = self.cursors.into_iter().peekable(); let mut right_cursors = other.cursors.into_iter().peekable(); let mut merged_operations: Vec> = Vec::with_capacity(self.operations.len() + other.operations.len()); let mut merged_operation_sides: Vec = Vec::with_capacity(self.operations.len() + other.operations.len()); let mut left_iter = self.operations.into_iter(); let mut right_iter = other.operations.into_iter(); let mut maybe_left_op = left_iter.next(); let mut maybe_right_op = right_iter.next(); let mut seen_left_length: usize = 0; let mut seen_right_length: usize = 0; let mut merged_length: usize = 0; let mut last_left_op = None; let mut last_right_op = None; loop { let (side, operation, mut last_other_op) = match (maybe_left_op.clone(), maybe_right_op.clone()) { (Some(left_op), Some(right_op)) => { if left_op .get_sort_key(seen_left_length) .partial_cmp(&right_op.get_sort_key(seen_right_length)) == Some(std::cmp::Ordering::Less) { (Side::Left, left_op, last_right_op.clone()) } else { (Side::Right, right_op, last_left_op.clone()) } } (Some(left_op), None) => (Side::Left, left_op, last_right_op.clone()), (None, Some(right_op)) => (Side::Right, right_op, last_left_op.clone()), (None, None) => break, }; let is_advancing_operation = matches!( operation, Operation::Insert { .. } | Operation::Equal { .. } ); let original_length = operation.len(); let (side, result) = match side { Side::Left => { let result = operation.merge_operations(&mut last_other_op); if let ref op @ (Operation::Insert { .. } | Operation::Equal { .. 
}) = result { let merged_length_signed = isize::try_from(merged_length) .expect("merged_length must fit in isize"); let seen_left_length_signed = isize::try_from(seen_left_length) .expect("seen_left_length must fit in isize"); let op_len_signed = isize::try_from(op.len()).expect("op.len() must fit in isize"); let original_length_signed = isize::try_from(original_length) .expect("original_length must fit in isize"); let shift = merged_length_signed - seen_left_length_signed + op_len_signed - original_length_signed; while let Some(cursor) = left_cursors.next_if(|cursor| { cursor.char_index <= seen_left_length + original_length }) { merged_cursors.push( cursor.with_index(cursor.char_index.saturating_add_signed(shift)), ); } } if is_advancing_operation { seen_left_length += original_length; } maybe_left_op = left_iter.next(); last_left_op = Some(result.clone()); (Side::Left, result) } Side::Right => { let result = operation.merge_operations(&mut last_other_op); if let ref op @ (Operation::Insert { .. } | Operation::Equal { .. 
}) = result { let merged_length_signed = isize::try_from(merged_length) .expect("merged_length must fit in isize"); let seen_right_length_signed = isize::try_from(seen_right_length) .expect("seen_right_length must fit in isize"); let op_len_signed = isize::try_from(op.len()).expect("op.len() must fit in isize"); let original_length_signed = isize::try_from(original_length) .expect("original_length must fit in isize"); let shift = merged_length_signed - seen_right_length_signed + op_len_signed - original_length_signed; while let Some(cursor) = right_cursors.next_if(|cursor| { cursor.char_index <= seen_right_length + original_length }) { merged_cursors.push( cursor.with_index(cursor.char_index.saturating_add_signed(shift)), ); } } if is_advancing_operation { seen_right_length += original_length; } maybe_right_op = right_iter.next(); last_right_op = Some(result.clone()); (Side::Right, result) } }; if result.len() == 0 { continue; } if is_advancing_operation { merged_length += result.len(); } merged_operations.push(result); merged_operation_sides.push(side); } for cursor in left_cursors.chain(right_cursors) { merged_cursors.push(cursor.with_index(merged_length)); } debug_assert_eq!(merged_operations.len(), merged_operation_sides.len()); Self::new( self.text, merged_operations, merged_operation_sides, merged_cursors, ) } /// Apply the operations to the text and return the resulting text. #[must_use] pub fn apply(&self) -> TextWithCursors { let mut builder: StringBuilder<'_> = StringBuilder::new(self.text); for operation in &self.operations { builder = operation.apply(builder); } TextWithCursors::new(builder.take(), self.cursors.clone()) } /// Apply the operations to the text and return the resulting text in chunks /// together with the provenance describing where each chunk came from. /// /// Returns all spans including deletions (not present in the merged text). 
///
    /// ```
    /// use reconcile_text::{History, SpanWithHistory, BuiltinTokenizer, reconcile};
    ///
    /// let parent = "Merging text is hard!";
    /// let left = "Merging text is easy!"; // Changed "hard" to "easy"
    /// let right = "With reconcile, merging documents is hard!"; // Added prefix and changed word
    ///
    /// let result = reconcile(
    ///     parent,
    ///     &left.into(),
    ///     &right.into(),
    ///     &*BuiltinTokenizer::Word,
    /// );
    ///
    /// assert_eq!(
    ///     result.apply_with_history(),
    ///     vec![
    ///         SpanWithHistory::new("Merging text".to_string(), History::RemovedFromRight,),
    ///         SpanWithHistory::new(
    ///             "With reconcile, merging documents".to_string(),
    ///             History::AddedFromRight,
    ///         ),
    ///         SpanWithHistory::new(" ".to_string(), History::Unchanged,),
    ///         SpanWithHistory::new("is".to_string(), History::Unchanged,),
    ///         SpanWithHistory::new(" hard!".to_string(), History::RemovedFromLeft,),
    ///         SpanWithHistory::new(" easy!".to_string(), History::AddedFromLeft,),
    ///     ]
    /// );
    /// ```
    #[must_use]
    pub fn apply_with_history(&self) -> Vec<SpanWithHistory> {
        let mut builder: StringBuilder<'_> = StringBuilder::new(self.text);
        let mut history = Vec::with_capacity(self.operations.len());

        for (operation, side) in self.operations.iter().zip(self.operation_sides.iter()) {
            builder = operation.apply(builder);

            match operation {
                Operation::Equal { .. } => {
                    history.push(SpanWithHistory::new(builder.take(), History::Unchanged));
                }
                Operation::Insert { .. } => match side {
                    Side::Left => {
                        history.push(SpanWithHistory::new(builder.take(), History::AddedFromLeft));
                    }
                    Side::Right => history.push(SpanWithHistory::new(
                        builder.take(),
                        History::AddedFromRight,
                    )),
                },
                Operation::Delete {
                    deleted_character_count,
                    order,
                    ..
                } => {
                    // Deleted text is read back from the original text;
                    // offsets are counted in characters, not bytes.
                    let deleted: String = self
                        .text
                        .chars()
                        .skip(*order)
                        .take(*deleted_character_count)
                        .collect();

                    match side {
                        Side::Left => {
                            history.push(SpanWithHistory::new(deleted, History::RemovedFromLeft));
                        }
                        Side::Right => {
                            history.push(SpanWithHistory::new(deleted, History::RemovedFromRight));
                        }
                    }
                }
            }
        }

        history
    }

    /// Convert the `EditedText` into a terse representation ready for
    /// serialization. The result omits cursor positions and the original text.
    /// This is useful for sending text diffs over the network if there's a
    /// clear consensus on the original text.
    ///
    /// Inserts are represented as strings, deletes as negative integers,
    /// and equal spans as positive integers. Consecutive equal spans are
    /// collapsed into a single number.
    ///
    /// # Panics
    ///
    /// Panics if there's an integer overflow in i64.
    #[must_use]
    pub fn to_diff(&self) -> Vec<NumberOrText> {
        /// Emit the accumulated run of equal spans (if any) as one number.
        fn flush_equal(pending: &mut Option<usize>, out: &mut Vec<NumberOrText>) {
            if let Some(prev_length) = pending.take() {
                out.push(NumberOrText::Number(
                    i64::try_from(prev_length).expect("prev_length must fit in i64"),
                ));
            }
        }

        let mut result: Vec<NumberOrText> = Vec::with_capacity(self.operations.len());
        let mut previous_equal: Option<usize> = None;

        for operation in &self.operations {
            match operation {
                Operation::Equal { length, .. } => {
                    // Keep accumulating; the run is emitted once it ends.
                    previous_equal = Some(previous_equal.unwrap_or(0) + *length);
                }
                Operation::Insert { text, .. } => {
                    flush_equal(&mut previous_equal, &mut result);

                    let text: String = text
                        .iter()
                        .map(super::super::tokenizer::token::Token::original)
                        .collect();
                    result.push(NumberOrText::Text(text));
                }
                Operation::Delete {
                    deleted_character_count,
                    ..
                } => {
                    flush_equal(&mut previous_equal, &mut result);

                    let count = i64::try_from(*deleted_character_count)
                        .expect("deleted_character_count must fit in i64");
                    result.push(NumberOrText::Number(-count));
                }
            }
        }

        flush_equal(&mut previous_equal, &mut result);

        result
    }

    /// Reconstruct an `EditedText` from a diff and the original text.
    ///
    /// Non-negative numbers are equal spans (re-tokenized with `tokenizer`),
    /// negative numbers are deletions, and strings are insertions. All
    /// operations are attributed to `Side::Left` and no cursors are set.
    ///
    /// # Errors
    ///
    /// Returns `DiffError::LengthExceedsOriginal` if the diff references a
    /// range that exceeds the original text length. Lengths too large to be
    /// represented as `usize` (including `i64::MIN`) are reported through the
    /// same error, with `requested` saturated to `usize::MAX`, instead of
    /// panicking.
    pub fn from_diff(
        original_text: &'a str,
        diff: Vec<NumberOrText>,
        tokenizer: &Tokenizer<T>,
    ) -> Result<EditedText<'a, T>, DiffError> {
        let mut operations: Vec<Operation<T>> = Vec::with_capacity(diff.len());
        let mut order = 0;
        let text_length = original_text.chars().count();

        for item in diff {
            match item {
                NumberOrText::Number(length) => {
                    // `unsigned_abs` cannot overflow (unlike `-length` for
                    // `i64::MIN`); an unrepresentable magnitude saturates and
                    // is then rejected by the bounds check below.
                    let span = usize::try_from(length.unsigned_abs()).unwrap_or(usize::MAX);

                    // Validate that the range doesn't exceed the original
                    // text. Comparing against the remaining length avoids
                    // overflowing `order + span`.
                    let available = text_length.saturating_sub(order);
                    if span > available {
                        return Err(DiffError::LengthExceedsOriginal {
                            position: order,
                            requested: span,
                            available,
                        });
                    }

                    if length >= 0 {
                        let original_characters: String =
                            original_text.chars().skip(order).take(span).collect();
                        let original_tokens = tokenizer(&original_characters);

                        // One equal operation per token of the retained span.
                        for token in original_tokens {
                            operations
                                .push(Operation::create_equal(order, token.get_original_length()));
                            order += token.get_original_length();
                        }
                    } else {
                        operations.push(Operation::create_delete(order, span));
                        order += span;
                    }
                }
                NumberOrText::Text(text) => {
                    // Insertions do not consume any of the original text.
                    let tokens = tokenizer(&text);
                    operations.push(Operation::create_insert(order, tokens));
                }
            }
        }

        let operation_count = operations.len();

        Ok(EditedText::new(
            original_text,
            operations,
            vec![Side::Left; operation_count],
            vec![],
        ))
    }
}

#[cfg(test)]
mod tests {
    use insta::assert_debug_snapshot;
    use pretty_assertions::assert_eq;

    use super::*;

    #[test]
    fn test_calculate_operations() {
        let left = "hello world! How are you? Adam";
        let right = "Hello, my friend! How are you doing? Albert";

        let operations = EditedText::from_strings(left, &right.into());

        insta::assert_debug_snapshot!(operations);

        let new_right = operations.apply();
        assert_eq!(new_right.text(), right);
    }

    #[test]
    fn test_calculate_operations_with_no_diff() {
        let text = "hello world!";

        let operations = EditedText::from_strings(text, &text.into());

        assert_debug_snapshot!(operations);

        let new_right = operations.apply();
        assert_eq!(new_right.text(), text);
    }

    #[test]
    fn test_calculate_operations_with_insert() {
        let original = "hello world! ...";
        let left = "Hello world! I'm Andras.";
        let right = "Hello world! How are you?";
        let expected = "Hello world! How are you? I'm Andras.";

        let operations_1 = EditedText::from_strings(original, &left.into());
        let operations_2 = EditedText::from_strings(original, &right.into());

        let operations = operations_1.merge(operations_2);
        assert_eq!(operations.apply().text(), expected);
    }

    #[test]
    fn test_from_diff_length_exceeds_original() {
        let result = EditedText::from_diff(
            "hello",
            vec![
                10.into(), // too large equal span - should error
                " world".into(),
            ],
            &*BuiltinTokenizer::Word,
        );

        assert!(result.is_err());
        match result {
            Err(DiffError::LengthExceedsOriginal {
                position,
                requested,
                available,
            }) => {
                assert_eq!(position, 0);
                assert_eq!(requested, 10);
                assert_eq!(available, 5);
            }
            _ => panic!("Expected LengthExceedsOriginal error"),
        }
    }

    #[test]
    fn test_from_diff_valid() {
        let edited_text = EditedText::from_diff(
            "hello",
            vec![
                5.into(), // exact length
                " world".into(),
            ],
            &*BuiltinTokenizer::Word,
        )
        .unwrap();

        let content = edited_text.apply().text();
        assert_eq!(content, "hello world");
    }

    #[cfg(feature = "serde")]
    #[test]
    fn test_changes_deserialisation() {
        let original = "Merging text is hard!";
        let changes = "Merging text is easy with reconcile!";
        let result = EditedText::from_strings(original, &changes.into());
        let serialized = serde_yaml::to_string(&result.to_diff()).unwrap();
        let expected = concat!("- 15\n", "- -6\n", "- ' easy with reconcile!'\n",);
        assert_eq!(serialized, expected);
    }

    #[test]
    fn test_apply_with_history_utf8() {
        let parent = "こんにちは世界"; // "Hello World" in Japanese (7 chars, 21 bytes)
        let left = "こんにちは宇宙"; // Changed 世界 to 宇宙
        let right = parent;

        let result = crate::reconcile(
            parent,
            &left.into(),
            &right.into(),
            &*BuiltinTokenizer::Word,
        );

        let history = result.apply_with_history();
        assert!(!history.is_empty());
        assert_eq!(result.apply().text(), "こんにちは宇宙");
    }

    #[cfg(feature = "serde")]
    #[test]
    fn test_changes_serialization() {
        let original = "The quick brown fox jumps over the lazy dog.";
        let updated = "The quick red fox jumped over the very lazy dog!";
        let edited_text = EditedText::from_strings(original, &updated.into());
        let changes = edited_text.to_diff();
        let deserialized_edited_text =
            EditedText::from_diff(original, changes, &*BuiltinTokenizer::Word).unwrap();
        assert_eq!(deserialized_edited_text.apply().text(), updated);
    }
}