631 lines
23 KiB
Rust
631 lines
23 KiB
Rust
use std::{fmt::Debug, vec};
|
|
|
|
#[cfg(feature = "serde")]
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
use crate::{
|
|
BuiltinTokenizer, CursorPosition, TextWithCursors,
|
|
operation_transformation::{
|
|
DiffError, Operation,
|
|
utils::{cook_operations::cook_operations, elongate_operations::elongate_operations},
|
|
},
|
|
raw_operation::RawOperation,
|
|
tokenizer::Tokenizer,
|
|
types::{
|
|
history::History, number_or_text::NumberOrText, side::Side,
|
|
span_with_history::SpanWithHistory,
|
|
},
|
|
utils::string_builder::StringBuilder,
|
|
};
|
|
|
|
/// A text document with a sequence of operations derived from diffing it
|
|
/// against an updated version. Supports merging two `EditedText` instances
|
|
/// (from the same original) via Operational Transformation.
|
|
///
|
|
/// Created via `from_strings`, `from_strings_with_tokenizer`, or `from_diff`,
|
|
/// then merged with another `EditedText` and applied to get the reconciled
|
|
/// text.
|
|
///
|
|
/// Also tracks cursor positions from the updated text, repositioning them
|
|
/// when operations are applied.
|
|
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
|
#[derive(Debug, Clone, PartialEq, Default)]
|
|
pub struct EditedText<'a, T>
|
|
where
|
|
T: PartialEq + Clone + Debug,
|
|
{
|
|
text: &'a str,
|
|
operations: Vec<Operation<T>>,
|
|
operation_sides: Vec<Side>,
|
|
cursors: Vec<CursorPosition>,
|
|
}
|
|
|
|
impl<'a> EditedText<'a, String> {
|
|
/// Create an `EditedText` from the given original and updated strings.
|
|
/// Uses the default word tokenizer (splits on word boundaries).
|
|
#[must_use]
|
|
pub fn from_strings(original: &'a str, updated: &TextWithCursors) -> Self {
|
|
Self::from_strings_with_tokenizer(original, updated, &*BuiltinTokenizer::Word)
|
|
}
|
|
}
|
|
|
|
impl<'a, T> EditedText<'a, T>
|
|
where
|
|
T: PartialEq + Clone + Debug,
|
|
{
|
|
/// Create an `EditedText` from the given original and updated strings
|
|
/// using the provided tokenizer.
|
|
pub fn from_strings_with_tokenizer(
|
|
original: &'a str,
|
|
updated: &TextWithCursors,
|
|
tokenizer: &Tokenizer<T>,
|
|
) -> Self {
|
|
let original_tokens = (tokenizer)(original);
|
|
let updated_tokens = (tokenizer)(&updated.text());
|
|
|
|
let diff: Vec<RawOperation<T>> = RawOperation::vec_from(&original_tokens, &updated_tokens);
|
|
let operations: Vec<Operation<T>> = cook_operations(elongate_operations(diff)).collect();
|
|
let operation_count = operations.len();
|
|
|
|
Self::new(
|
|
original,
|
|
operations,
|
|
vec![Side::Left; operation_count],
|
|
updated.cursors(),
|
|
)
|
|
}
|
|
|
|
/// Create a new `EditedText` with the given operations.
|
|
/// The operations must be in the order in which they are meant to be
|
|
/// applied. The operations must not overlap.
|
|
fn new(
|
|
text: &'a str,
|
|
operations: Vec<Operation<T>>,
|
|
operation_sides: Vec<Side>,
|
|
mut cursors: Vec<CursorPosition>,
|
|
) -> Self {
|
|
cursors.sort_by_key(|cursor| cursor.char_index);
|
|
|
|
Self {
|
|
text,
|
|
operations,
|
|
operation_sides,
|
|
cursors,
|
|
}
|
|
}
|
|
|
|
/// Merge two `EditedText` instances. The two instances must be derived
|
|
/// from the same original text. The operations are merged using the
|
|
/// principles of Operational Transformation. The cursors are updated
|
|
/// accordingly to reflect the changes made by the merged operations.
|
|
///
|
|
/// # Panics
|
|
///
|
|
/// Panics if there's an integer overflow (in isize) when calculating new
|
|
/// cursor positions.
|
|
#[must_use]
|
|
#[allow(clippy::too_many_lines)]
|
|
pub fn merge(self, other: Self) -> Self {
|
|
debug_assert_eq!(
|
|
self.text, other.text,
|
|
"`EditedText`-s must be derived from the same text to be mergable"
|
|
);
|
|
|
|
let mut merged_cursors = Vec::with_capacity(self.cursors.len() + other.cursors.len());
|
|
let mut left_cursors = self.cursors.into_iter().peekable();
|
|
let mut right_cursors = other.cursors.into_iter().peekable();
|
|
|
|
let mut merged_operations: Vec<Operation<T>> =
|
|
Vec::with_capacity(self.operations.len() + other.operations.len());
|
|
let mut merged_operation_sides: Vec<Side> =
|
|
Vec::with_capacity(self.operations.len() + other.operations.len());
|
|
|
|
let mut left_iter = self.operations.into_iter();
|
|
let mut right_iter = other.operations.into_iter();
|
|
|
|
let mut maybe_left_op = left_iter.next();
|
|
let mut maybe_right_op = right_iter.next();
|
|
|
|
let mut seen_left_length: usize = 0;
|
|
let mut seen_right_length: usize = 0;
|
|
let mut merged_length: usize = 0;
|
|
|
|
let mut last_left_op = None;
|
|
let mut last_right_op = None;
|
|
|
|
loop {
|
|
let (side, operation, mut last_other_op) =
|
|
match (maybe_left_op.clone(), maybe_right_op.clone()) {
|
|
(Some(left_op), Some(right_op)) => {
|
|
if left_op
|
|
.get_sort_key(seen_left_length)
|
|
.partial_cmp(&right_op.get_sort_key(seen_right_length))
|
|
== Some(std::cmp::Ordering::Less)
|
|
{
|
|
(Side::Left, left_op, last_right_op.clone())
|
|
} else {
|
|
(Side::Right, right_op, last_left_op.clone())
|
|
}
|
|
}
|
|
|
|
(Some(left_op), None) => (Side::Left, left_op, last_right_op.clone()),
|
|
(None, Some(right_op)) => (Side::Right, right_op, last_left_op.clone()),
|
|
(None, None) => break,
|
|
};
|
|
|
|
let is_advancing_operation = matches!(
|
|
operation,
|
|
Operation::Insert { .. } | Operation::Equal { .. }
|
|
);
|
|
|
|
let original_length = operation.len();
|
|
let (side, result) = match side {
|
|
Side::Left => {
|
|
let result = operation.merge_operations(&mut last_other_op);
|
|
|
|
if let ref op @ (Operation::Insert { .. } | Operation::Equal { .. }) = result {
|
|
let merged_length_signed = isize::try_from(merged_length)
|
|
.expect("merged_length must fit in isize");
|
|
let seen_left_length_signed = isize::try_from(seen_left_length)
|
|
.expect("seen_left_length must fit in isize");
|
|
let op_len_signed =
|
|
isize::try_from(op.len()).expect("op.len() must fit in isize");
|
|
let original_length_signed = isize::try_from(original_length)
|
|
.expect("original_length must fit in isize");
|
|
|
|
let shift = merged_length_signed - seen_left_length_signed + op_len_signed
|
|
- original_length_signed;
|
|
|
|
while let Some(cursor) = left_cursors.next_if(|cursor| {
|
|
cursor.char_index <= seen_left_length + original_length
|
|
}) {
|
|
merged_cursors.push(
|
|
cursor.with_index(cursor.char_index.saturating_add_signed(shift)),
|
|
);
|
|
}
|
|
}
|
|
|
|
if is_advancing_operation {
|
|
seen_left_length += original_length;
|
|
}
|
|
|
|
maybe_left_op = left_iter.next();
|
|
last_left_op = Some(result.clone());
|
|
|
|
(Side::Left, result)
|
|
}
|
|
Side::Right => {
|
|
let result = operation.merge_operations(&mut last_other_op);
|
|
|
|
if let ref op @ (Operation::Insert { .. } | Operation::Equal { .. }) = result {
|
|
let merged_length_signed = isize::try_from(merged_length)
|
|
.expect("merged_length must fit in isize");
|
|
let seen_right_length_signed = isize::try_from(seen_right_length)
|
|
.expect("seen_right_length must fit in isize");
|
|
let op_len_signed =
|
|
isize::try_from(op.len()).expect("op.len() must fit in isize");
|
|
let original_length_signed = isize::try_from(original_length)
|
|
.expect("original_length must fit in isize");
|
|
|
|
let shift = merged_length_signed - seen_right_length_signed + op_len_signed
|
|
- original_length_signed;
|
|
|
|
while let Some(cursor) = right_cursors.next_if(|cursor| {
|
|
cursor.char_index <= seen_right_length + original_length
|
|
}) {
|
|
merged_cursors.push(
|
|
cursor.with_index(cursor.char_index.saturating_add_signed(shift)),
|
|
);
|
|
}
|
|
}
|
|
|
|
if is_advancing_operation {
|
|
seen_right_length += original_length;
|
|
}
|
|
|
|
maybe_right_op = right_iter.next();
|
|
last_right_op = Some(result.clone());
|
|
|
|
(Side::Right, result)
|
|
}
|
|
};
|
|
|
|
if result.len() == 0 {
|
|
continue;
|
|
}
|
|
|
|
if is_advancing_operation {
|
|
merged_length += result.len();
|
|
}
|
|
|
|
merged_operations.push(result);
|
|
merged_operation_sides.push(side);
|
|
}
|
|
|
|
for cursor in left_cursors.chain(right_cursors) {
|
|
merged_cursors.push(cursor.with_index(merged_length));
|
|
}
|
|
|
|
debug_assert_eq!(merged_operations.len(), merged_operation_sides.len());
|
|
|
|
Self::new(
|
|
self.text,
|
|
merged_operations,
|
|
merged_operation_sides,
|
|
merged_cursors,
|
|
)
|
|
}
|
|
|
|
/// Apply the operations to the text and return the resulting text.
|
|
#[must_use]
|
|
pub fn apply(&self) -> TextWithCursors {
|
|
let mut builder: StringBuilder<'_> = StringBuilder::new(self.text);
|
|
|
|
for operation in &self.operations {
|
|
builder = operation.apply(builder);
|
|
}
|
|
|
|
TextWithCursors::new(builder.take(), self.cursors.clone())
|
|
}
|
|
|
|
/// Apply the operations to the text and return the resulting text in chunks
|
|
/// together with the provenance describing where each chunk came from.
|
|
///
|
|
/// Returns all spans including deletions (not present in the merged text).
|
|
///
|
|
/// ```
|
|
/// use reconcile_text::{History, SpanWithHistory, BuiltinTokenizer, reconcile};
|
|
///
|
|
/// let parent = "Merging text is hard!";
|
|
/// let left = "Merging text is easy!"; // Changed "hard" to "easy"
|
|
/// let right = "With reconcile, merging documents is hard!"; // Added prefix and changed word
|
|
///
|
|
/// let result = reconcile(
|
|
/// parent,
|
|
/// &left.into(),
|
|
/// &right.into(),
|
|
/// &*BuiltinTokenizer::Word,
|
|
/// );
|
|
///
|
|
/// assert_eq!(
|
|
/// result.apply_with_history(),
|
|
/// vec![
|
|
/// SpanWithHistory::new("Merging text".to_string(), History::RemovedFromRight,),
|
|
/// SpanWithHistory::new(
|
|
/// "With reconcile, merging documents".to_string(),
|
|
/// History::AddedFromRight,
|
|
/// ),
|
|
/// SpanWithHistory::new(" ".to_string(), History::Unchanged,),
|
|
/// SpanWithHistory::new("is".to_string(), History::Unchanged,),
|
|
/// SpanWithHistory::new(" hard!".to_string(), History::RemovedFromLeft,),
|
|
/// SpanWithHistory::new(" easy!".to_string(), History::AddedFromLeft,),
|
|
/// ]
|
|
/// );
|
|
/// ```
|
|
#[must_use]
|
|
pub fn apply_with_history(&self) -> Vec<SpanWithHistory> {
|
|
let mut builder: StringBuilder<'_> = StringBuilder::new(self.text);
|
|
|
|
let mut history = Vec::with_capacity(self.operations.len());
|
|
|
|
for (operation, side) in self.operations.iter().zip(self.operation_sides.iter()) {
|
|
builder = operation.apply(builder);
|
|
|
|
match operation {
|
|
Operation::Equal { .. } => {
|
|
history.push(SpanWithHistory::new(builder.take(), History::Unchanged));
|
|
}
|
|
Operation::Insert { .. } => match side {
|
|
Side::Left => {
|
|
history.push(SpanWithHistory::new(builder.take(), History::AddedFromLeft));
|
|
}
|
|
Side::Right => history.push(SpanWithHistory::new(
|
|
builder.take(),
|
|
History::AddedFromRight,
|
|
)),
|
|
},
|
|
Operation::Delete {
|
|
deleted_character_count,
|
|
order,
|
|
..
|
|
} => {
|
|
let deleted: String = self
|
|
.text
|
|
.chars()
|
|
.skip(*order)
|
|
.take(*deleted_character_count)
|
|
.collect();
|
|
match side {
|
|
Side::Left => {
|
|
history.push(SpanWithHistory::new(deleted, History::RemovedFromLeft));
|
|
}
|
|
Side::Right => {
|
|
history.push(SpanWithHistory::new(deleted, History::RemovedFromRight));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
history
|
|
}
|
|
|
|
/// Convert the `EditedText` into a terse representation ready for
|
|
/// serialization. The result omits cursor positions and the original text.
|
|
/// This is useful for sending text diffs over the network if there's a
|
|
/// clear consensus on the original text.
|
|
///
|
|
/// Inserts are represented as strings, deletes as negative integers,
|
|
/// and equal spans as positive integers.
|
|
///
|
|
/// # Panics
|
|
///
|
|
/// Panics if there's an integer overflow in i64.
|
|
#[must_use]
|
|
pub fn to_diff(&self) -> Vec<NumberOrText> {
|
|
let mut result: Vec<NumberOrText> = Vec::with_capacity(self.operations.len());
|
|
let mut previous_equal: Option<usize> = None;
|
|
|
|
for operation in &self.operations {
|
|
match operation {
|
|
Operation::Equal { length, .. } => {
|
|
if let Some(prev_length) = previous_equal {
|
|
previous_equal = Some(prev_length + *length);
|
|
} else {
|
|
previous_equal = Some(*length);
|
|
}
|
|
}
|
|
|
|
Operation::Insert { text, .. } => {
|
|
if let Some(prev_length) = previous_equal {
|
|
result.push(NumberOrText::Number(
|
|
i64::try_from(prev_length).expect("prev_length must fit in i64"),
|
|
));
|
|
previous_equal = None;
|
|
}
|
|
|
|
let text: String = text
|
|
.iter()
|
|
.map(super::super::tokenizer::token::Token::original)
|
|
.collect();
|
|
result.push(NumberOrText::Text(text));
|
|
}
|
|
|
|
Operation::Delete {
|
|
deleted_character_count,
|
|
..
|
|
} => {
|
|
if let Some(prev_length) = previous_equal {
|
|
result.push(NumberOrText::Number(
|
|
i64::try_from(prev_length).expect("prev_length must fit in i64"),
|
|
));
|
|
previous_equal = None;
|
|
}
|
|
|
|
let count = i64::try_from(*deleted_character_count)
|
|
.expect("deleted_character_count must fit in i64");
|
|
result.push(NumberOrText::Number(-count));
|
|
}
|
|
}
|
|
}
|
|
|
|
if let Some(prev_length) = previous_equal {
|
|
result.push(NumberOrText::Number(
|
|
i64::try_from(prev_length).expect("prev_length must fit in i64"),
|
|
));
|
|
}
|
|
|
|
result
|
|
}
|
|
|
|
/// Reconstruct an `EditedText` from a diff and the original text.
|
|
///
|
|
/// # Errors
|
|
///
|
|
/// Returns `DiffError::LengthExceedsOriginal` if the diff references a
|
|
/// range that exceeds the original text length.
|
|
///
|
|
/// # Panics
|
|
///
|
|
/// Panics if there's an integer overflow in i64.
|
|
pub fn from_diff(
|
|
original_text: &'a str,
|
|
diff: Vec<NumberOrText>,
|
|
tokenizer: &Tokenizer<T>,
|
|
) -> Result<EditedText<'a, T>, DiffError> {
|
|
let mut operations: Vec<Operation<T>> = Vec::with_capacity(diff.len());
|
|
let mut order = 0;
|
|
let text_length = original_text.chars().count();
|
|
|
|
for item in diff {
|
|
match item {
|
|
NumberOrText::Number(length) => {
|
|
if length >= 0 {
|
|
let length = usize::try_from(length).expect("length must fit in usize");
|
|
|
|
// Validate that the range doesn't exceed the original text
|
|
if order + length > text_length {
|
|
return Err(DiffError::LengthExceedsOriginal {
|
|
position: order,
|
|
requested: length,
|
|
available: text_length.saturating_sub(order),
|
|
});
|
|
}
|
|
|
|
let original_characters: String =
|
|
original_text.chars().skip(order).take(length).collect();
|
|
|
|
let original_tokens = tokenizer(&original_characters);
|
|
for token in original_tokens {
|
|
operations
|
|
.push(Operation::create_equal(order, token.get_original_length()));
|
|
order += token.get_original_length();
|
|
}
|
|
} else {
|
|
let length =
|
|
usize::try_from(-length).expect("negative length must fit in usize");
|
|
|
|
// Validate that the delete range doesn't exceed the original text
|
|
if order + length > text_length {
|
|
return Err(DiffError::LengthExceedsOriginal {
|
|
position: order,
|
|
requested: length,
|
|
available: text_length.saturating_sub(order),
|
|
});
|
|
}
|
|
|
|
operations.push(Operation::create_delete(order, length));
|
|
order += length;
|
|
}
|
|
}
|
|
NumberOrText::Text(text) => {
|
|
let tokens = tokenizer(&text);
|
|
operations.push(Operation::create_insert(order, tokens));
|
|
}
|
|
}
|
|
}
|
|
|
|
let operation_count = operations.len();
|
|
Ok(EditedText::new(
|
|
original_text,
|
|
operations,
|
|
vec![Side::Left; operation_count],
|
|
vec![],
|
|
))
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use insta::assert_debug_snapshot;
    use pretty_assertions::assert_eq;

    use super::*;

    /// Diffing two different texts yields operations that reproduce the
    /// updated text when applied.
    #[test]
    fn test_calculate_operations() {
        let original = "hello world! How are you? Adam";
        let updated = "Hello, my friend! How are you doing? Albert";

        let edited = EditedText::from_strings(original, &updated.into());
        assert_debug_snapshot!(edited);

        let applied = edited.apply();
        assert_eq!(applied.text(), updated);
    }

    /// Diffing a text against itself must round-trip to the same text.
    #[test]
    fn test_calculate_operations_with_no_diff() {
        let text = "hello world!";

        let edited = EditedText::from_strings(text, &text.into());
        assert_debug_snapshot!(edited);

        let applied = edited.apply();
        assert_eq!(applied.text(), text);
    }

    /// Two edits of the same original that both insert text are merged so
    /// that both insertions survive.
    #[test]
    fn test_calculate_operations_with_insert() {
        let original = "hello world! ...";
        let left = "Hello world! I'm Andras.";
        let right = "Hello world! How are you?";
        let expected = "Hello world! How are you? I'm Andras.";

        let left_edit = EditedText::from_strings(original, &left.into());
        let right_edit = EditedText::from_strings(original, &right.into());

        let merged = left_edit.merge(right_edit);
        assert_eq!(merged.apply().text(), expected);
    }

    /// An equal span longer than the original text is rejected with the
    /// offending position and lengths.
    #[test]
    fn test_from_diff_length_exceeds_original() {
        let diff = vec![
            10.into(), // too large equal span - should error
            " world".into(),
        ];
        let result = EditedText::from_diff("hello", diff, &*BuiltinTokenizer::Word);

        assert!(result.is_err());
        if let Err(DiffError::LengthExceedsOriginal {
            position,
            requested,
            available,
        }) = result
        {
            assert_eq!(position, 0);
            assert_eq!(requested, 10);
            assert_eq!(available, 5);
        } else {
            panic!("Expected LengthExceedsOriginal error");
        }
    }

    /// An equal span that covers exactly the original text is accepted.
    #[test]
    fn test_from_diff_valid() {
        let diff = vec![
            5.into(), // exact length
            " world".into(),
        ];
        let edited_text =
            EditedText::from_diff("hello", diff, &*BuiltinTokenizer::Word).unwrap();

        assert_eq!(edited_text.apply().text(), "hello world");
    }

    /// The terse diff serializes to the expected YAML list.
    #[cfg(feature = "serde")]
    #[test]
    fn test_changes_deserialisation() {
        let original = "Merging text is hard!";
        let changes = "Merging text is easy with reconcile!";

        let diff = EditedText::from_strings(original, &changes.into()).to_diff();
        let serialized = serde_yaml::to_string(&diff).unwrap();

        assert_eq!(serialized, "- 15\n- -6\n- ' easy with reconcile!'\n");
    }

    /// Multi-byte characters are handled when reconciling and when
    /// producing the history.
    #[test]
    fn test_apply_with_history_utf8() {
        let parent = "こんにちは世界"; // "Hello World" in Japanese (7 chars, 21 bytes)
        let left = "こんにちは宇宙"; // Changed 世界 to 宇宙
        let right = parent;

        let reconciled = crate::reconcile(
            parent,
            &left.into(),
            &right.into(),
            &*BuiltinTokenizer::Word,
        );

        assert!(!reconciled.apply_with_history().is_empty());
        assert_eq!(reconciled.apply().text(), "こんにちは宇宙");
    }

    /// `to_diff` followed by `from_diff` reproduces the updated text.
    #[cfg(feature = "serde")]
    #[test]
    fn test_changes_serialization() {
        let original = "The quick brown fox jumps over the lazy dog.";
        let updated = "The quick red fox jumped over the very lazy dog!";

        let diff = EditedText::from_strings(original, &updated.into()).to_diff();
        let rebuilt = EditedText::from_diff(original, diff, &*BuiltinTokenizer::Word).unwrap();

        assert_eq!(rebuilt.apply().text(), updated);
    }
}
|