From b230d34b884d68ce3e5e23f742aad184f03fee5d Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Sat, 5 Apr 2025 13:48:02 +0100 Subject: [PATCH] Add left/right joinability for tokens --- backend/reconcile/src/diffs/raw_operation.rs | 24 ++- ...le__diffs__myers__tests__complex_diff.snap | 12 ++ ...ile__diffs__myers__tests__delete_only.snap | 4 + ...iffs__myers__tests__identical_content.snap | 6 + ...ile__diffs__myers__tests__insert_only.snap | 4 + ...iffs__myers__tests__prefix_and_suffix.snap | 10 ++ .../operation_transformation/edited_text.rs | 142 +++++++++++------- ...rd_tokenizer__tests__with_snapshots-3.snap | 16 +- ...rd_tokenizer__tests__with_snapshots-4.snap | 40 ++++- ...rd_tokenizer__tests__with_snapshots-5.snap | 39 +++++ ...word_tokenizer__tests__with_snapshots.snap | 12 +- backend/reconcile/src/tokenizer/token.rs | 28 +++- .../reconcile/tests/examples/multiline.yml | 51 ++++++- 13 files changed, 313 insertions(+), 75 deletions(-) create mode 100644 backend/reconcile/src/tokenizer/snapshots/reconcile__tokenizer__word_tokenizer__tests__with_snapshots-5.snap diff --git a/backend/reconcile/src/diffs/raw_operation.rs b/backend/reconcile/src/diffs/raw_operation.rs index 0df48f5d..f95a0349 100644 --- a/backend/reconcile/src/diffs/raw_operation.rs +++ b/backend/reconcile/src/diffs/raw_operation.rs @@ -28,10 +28,26 @@ where pub fn get_original_text(self) -> String { self.tokens().iter().map(Token::original).collect() } - /// Extends the operation with another operation if returning the new - /// operation. Only operations of the same type can be used to extend. - /// If the operations are of different types, returns None. 
+ pub fn is_left_joinable(&self) -> bool { + let first_token = self.tokens().first(); + first_token.map_or(true, |t| t.get_is_left_joinable()) + } + + pub fn is_right_joinable(&self) -> bool { + let last_token = self.tokens().last(); + last_token.map_or(true, |t| t.get_is_right_joinable()) + } + + /// Extends this operation with the tokens of another operation of the same + /// type and returns the combined operation. Callers must only pass an + /// operation of the same type as self; mixing types is a logic error (panics). pub fn extend(self, other: RawOperation) -> Option> { + debug_assert!( + std::mem::discriminant(&self) == std::mem::discriminant(&other), + "Cannot extend operations of different types. This should have been handled before \ + calling this function." + ); + match (self, other) { (RawOperation::Insert(tokens1), RawOperation::Insert(tokens2)) => Some( RawOperation::Insert(tokens1.into_iter().chain(tokens2).collect()), @@ -42,7 +58,7 @@ where (RawOperation::Equal(tokens1), RawOperation::Equal(tokens2)) => Some( RawOperation::Equal(tokens1.into_iter().chain(tokens2).collect()), ), - _ => None, + _ => unreachable!("Only operations of the same type can be extended"), } } } diff --git a/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__complex_diff.snap b/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__complex_diff.snap index 8c89ed35..57ee0865 100644 --- a/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__complex_diff.snap +++ b/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__complex_diff.snap @@ -9,6 +9,8 @@ snapshot_kind: text Token { normalised: "a", original: "a", + is_left_joinable: true, + is_right_joinable: true, }, ], ), @@ -17,6 +19,8 @@ snapshot_kind: text Token { normalised: "x", original: "x", + is_left_joinable: true, + is_right_joinable: true, }, ], ), @@ -25,6 +29,8 @@ snapshot_kind: text Token { normalised: "b", original: "b", + is_left_joinable: true, + 
is_right_joinable: true, }, ], ), @@ -33,6 +39,8 @@ snapshot_kind: text Token { normalised: "c", original: "c", + is_left_joinable: true, + is_right_joinable: true, }, ], ), @@ -41,6 +49,8 @@ snapshot_kind: text Token { normalised: "y", original: "y", + is_left_joinable: true, + is_right_joinable: true, }, ], ), @@ -49,6 +59,8 @@ snapshot_kind: text Token { normalised: "d", original: "d", + is_left_joinable: true, + is_right_joinable: true, }, ], ), diff --git a/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__delete_only.snap b/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__delete_only.snap index f07eb3df..a4598d0e 100644 --- a/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__delete_only.snap +++ b/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__delete_only.snap @@ -9,10 +9,14 @@ snapshot_kind: text Token { normalised: "a", original: "a", + is_left_joinable: true, + is_right_joinable: true, }, Token { normalised: "b", original: "b", + is_left_joinable: true, + is_right_joinable: true, }, ], ), diff --git a/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__identical_content.snap b/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__identical_content.snap index a99e2764..2fc3317a 100644 --- a/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__identical_content.snap +++ b/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__identical_content.snap @@ -9,14 +9,20 @@ snapshot_kind: text Token { normalised: "a", original: "a", + is_left_joinable: true, + is_right_joinable: true, }, Token { normalised: "b", original: "b", + is_left_joinable: true, + is_right_joinable: true, }, Token { normalised: "c", original: "c", + is_left_joinable: true, + is_right_joinable: true, }, ], ), diff --git a/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__insert_only.snap 
b/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__insert_only.snap index b32c8ce3..e07d8440 100644 --- a/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__insert_only.snap +++ b/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__insert_only.snap @@ -9,10 +9,14 @@ snapshot_kind: text Token { normalised: "a", original: "a", + is_left_joinable: true, + is_right_joinable: true, }, Token { normalised: "b", original: "b", + is_left_joinable: true, + is_right_joinable: true, }, ], ), diff --git a/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__prefix_and_suffix.snap b/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__prefix_and_suffix.snap index 03c8fee2..6b86600d 100644 --- a/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__prefix_and_suffix.snap +++ b/backend/reconcile/src/diffs/snapshots/reconcile__diffs__myers__tests__prefix_and_suffix.snap @@ -9,6 +9,8 @@ snapshot_kind: text Token { normalised: "a", original: "a", + is_left_joinable: true, + is_right_joinable: true, }, ], ), @@ -17,10 +19,14 @@ snapshot_kind: text Token { normalised: "b", original: "b", + is_left_joinable: true, + is_right_joinable: true, }, Token { normalised: "c", original: "c", + is_left_joinable: true, + is_right_joinable: true, }, ], ), @@ -29,6 +35,8 @@ snapshot_kind: text Token { normalised: "x", original: "x", + is_left_joinable: true, + is_right_joinable: true, }, ], ), @@ -37,6 +45,8 @@ snapshot_kind: text Token { normalised: "d", original: "d", + is_left_joinable: true, + is_right_joinable: true, }, ], ), diff --git a/backend/reconcile/src/operation_transformation/edited_text.rs b/backend/reconcile/src/operation_transformation/edited_text.rs index 8fc2ed96..fdaa87fc 100644 --- a/backend/reconcile/src/operation_transformation/edited_text.rs +++ b/backend/reconcile/src/operation_transformation/edited_text.rs @@ -3,15 +3,12 @@ use core::iter; #[cfg(feature = "serde")] 
use serde::{Deserialize, Serialize}; -use super::{CursorPosition, Operation, TextWithCursors}; +use super::{CursorPosition, Operation, TextWithCursors, ordered_operation::OrderedOperation}; use crate::{ diffs::{myers::diff, raw_operation::RawOperation}, operation_transformation::merge_context::MergeContext, tokenizer::{Tokenizer, word_tokenizer::word_tokenizer}, - utils::{ - merge_iters::MergeSorted as _, ordered_operation::OrderedOperation, side::Side, - string_builder::StringBuilder, - }, + utils::{merge_iters::MergeSorted as _, side::Side, string_builder::StringBuilder}, }; /// A sequence of operations that can be applied to a text document. @@ -66,11 +63,93 @@ where Self::new( original, - Self::cook_operations(Self::elongate_operations(diff)).collect(), + Self::cook_operations(Self::elongate_operations(Self::break_up_raw_operations( + diff, + ))) + .collect(), updated.cursors, ) } + fn break_up_raw_operations(raw_operations: I) -> impl Iterator> + where + I: IntoIterator>, + { + raw_operations.into_iter().flat_map(|raw_operation| { + let mut result: Vec> = Vec::new(); + match raw_operation { + RawOperation::Insert(tokens) => { + for token in tokens { + result.push(RawOperation::Insert(vec![token])); + } + } + RawOperation::Delete(tokens) => { + for token in tokens { + result.push(RawOperation::Delete(vec![token])); + } + } + RawOperation::Equal(tokens) => { + for token in tokens { + result.push(RawOperation::Equal(vec![token])); + } + } + } + result.into_iter() + }) + } + + fn elongate_operations(raw_operations: I) -> Vec> + where + I: IntoIterator>, + { + let mut maybe_previous_insert: Option> = None; + let mut maybe_previous_delete: Option> = None; + + let mut result: Vec> = raw_operations + .into_iter() + .flat_map(|next| match next { + RawOperation::Insert(..) 
=> match maybe_previous_insert.take() { + Some(prev) if prev.is_right_joinable() && next.is_left_joinable() => { + maybe_previous_insert = prev.extend(next); + Box::new(iter::empty()) as Box>> + } + prev => { + maybe_previous_insert = Some(next); + Box::new(prev.into_iter()) + } + }, + RawOperation::Delete(..) => match maybe_previous_delete.take() { + Some(prev) if prev.is_right_joinable() && next.is_left_joinable() => { + maybe_previous_delete = prev.extend(next); + Box::new(iter::empty()) as Box>> + } + prev => { + maybe_previous_delete = Some(next); + Box::new(prev.into_iter()) + } + }, + RawOperation::Equal(..) => Box::new( + maybe_previous_insert + .take() + .into_iter() + .chain(maybe_previous_delete.take()) + .chain(iter::once(next)), + ) + as Box>>, + }) + .collect(); + + if let Some(prev) = maybe_previous_insert { + result.push(prev); + } + + if let Some(prev) = maybe_previous_delete { + result.push(prev); + } + + result + } + // Turn raw operations into ordered operations while keeping track of old & new // indexes. fn cook_operations(raw_operations: I) -> impl Iterator> @@ -119,56 +198,6 @@ where }) } - fn elongate_operations(raw_operations: I) -> Vec> - where - I: IntoIterator>, - { - let mut maybe_previous_insert: Option> = None; - let mut maybe_previous_delete: Option> = None; - - let mut result: Vec> = raw_operations - .into_iter() - .flat_map(|next| match next { - RawOperation::Insert(..) => { - if let Some(prev) = maybe_previous_insert.take() { - maybe_previous_insert = prev.extend(next); - } else { - maybe_previous_insert = Some(next); - } - - Box::new(iter::empty()) as Box>> - } - RawOperation::Delete(..) => { - if let Some(prev) = maybe_previous_delete.take() { - maybe_previous_delete = prev.extend(next); - } else { - maybe_previous_delete = Some(next); - } - - Box::new(iter::empty()) as Box>> - } - RawOperation::Equal(..) 
=> Box::new( - maybe_previous_insert - .take() - .into_iter() - .chain(maybe_previous_delete.take()) - .chain(iter::once(next)), - ) - as Box>>, - }) - .collect(); - - if let Some(prev) = maybe_previous_insert { - result.push(prev); - } - - if let Some(prev) = maybe_previous_delete { - result.push(prev); - } - - result - } - /// Create a new `EditedText` with the given operations. /// The operations must be in the order in which they are meant to be /// applied. The operations must not overlap. @@ -225,6 +254,7 @@ where // Operations on the left and right must come in the same order so that // inserts can be merged with other inserts and deletes with deletes. usize::from(matches!(operation.operation, Operation::Delete { .. })), + operation.operation.start_index(), // Make sure that the ordering is deterministic regardless which text // is left or right. match &operation.operation { diff --git a/backend/reconcile/src/tokenizer/snapshots/reconcile__tokenizer__word_tokenizer__tests__with_snapshots-3.snap b/backend/reconcile/src/tokenizer/snapshots/reconcile__tokenizer__word_tokenizer__tests__with_snapshots-3.snap index 58d749ef..d1c94e1e 100644 --- a/backend/reconcile/src/tokenizer/snapshots/reconcile__tokenizer__word_tokenizer__tests__with_snapshots-3.snap +++ b/backend/reconcile/src/tokenizer/snapshots/reconcile__tokenizer__word_tokenizer__tests__with_snapshots-3.snap @@ -5,11 +5,21 @@ snapshot_kind: text --- [ Token { - normalised: "what?", - original: " what?", + normalised: " what?", + original: " ", + is_left_joinable: true, + is_right_joinable: true, }, Token { - normalised: "", + normalised: "what?", + original: "what?", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: " ", original: " ", + is_left_joinable: true, + is_right_joinable: true, }, ] diff --git a/backend/reconcile/src/tokenizer/snapshots/reconcile__tokenizer__word_tokenizer__tests__with_snapshots-4.snap 
b/backend/reconcile/src/tokenizer/snapshots/reconcile__tokenizer__word_tokenizer__tests__with_snapshots-4.snap index 4c28a7f3..6740dbc0 100644 --- a/backend/reconcile/src/tokenizer/snapshots/reconcile__tokenizer__word_tokenizer__tests__with_snapshots-4.snap +++ b/backend/reconcile/src/tokenizer/snapshots/reconcile__tokenizer__word_tokenizer__tests__with_snapshots-4.snap @@ -4,20 +4,52 @@ expression: "word_tokenizer(\" hello, \\nwhere are you?\")" snapshot_kind: text --- [ + Token { + normalised: " hello,", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, Token { normalised: "hello,", - original: " hello,", + original: "hello,", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: " \nwhere", + original: " \n", + is_left_joinable: true, + is_right_joinable: true, }, Token { normalised: "where", - original: " \nwhere", + original: "where", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: " are", + original: " ", + is_left_joinable: true, + is_right_joinable: true, }, Token { normalised: "are", - original: " are", + original: "are", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: " you?", + original: " ", + is_left_joinable: true, + is_right_joinable: true, }, Token { normalised: "you?", - original: " you?", + original: "you?", + is_left_joinable: true, + is_right_joinable: true, }, ] diff --git a/backend/reconcile/src/tokenizer/snapshots/reconcile__tokenizer__word_tokenizer__tests__with_snapshots-5.snap b/backend/reconcile/src/tokenizer/snapshots/reconcile__tokenizer__word_tokenizer__tests__with_snapshots-5.snap new file mode 100644 index 00000000..832147ec --- /dev/null +++ b/backend/reconcile/src/tokenizer/snapshots/reconcile__tokenizer__word_tokenizer__tests__with_snapshots-5.snap @@ -0,0 +1,39 @@ +--- +source: reconcile/src/tokenizer/word_tokenizer.rs +expression: "word_tokenizer(\" hello, \\nwhere are you?\")" +snapshot_kind: text +--- +[ + 
Token { + normalised: " ", + original: " ", + }, + Token { + normalised: "hello,", + original: "hello,", + }, + Token { + normalised: " \n", + original: " \n", + }, + Token { + normalised: "where", + original: "where", + }, + Token { + normalised: " ", + original: " ", + }, + Token { + normalised: "are", + original: "are", + }, + Token { + normalised: " ", + original: " ", + }, + Token { + normalised: "you?", + original: "you?", + }, +] diff --git a/backend/reconcile/src/tokenizer/snapshots/reconcile__tokenizer__word_tokenizer__tests__with_snapshots.snap b/backend/reconcile/src/tokenizer/snapshots/reconcile__tokenizer__word_tokenizer__tests__with_snapshots.snap index 206c7fee..95c8db5f 100644 --- a/backend/reconcile/src/tokenizer/snapshots/reconcile__tokenizer__word_tokenizer__tests__with_snapshots.snap +++ b/backend/reconcile/src/tokenizer/snapshots/reconcile__tokenizer__word_tokenizer__tests__with_snapshots.snap @@ -7,9 +7,19 @@ snapshot_kind: text Token { normalised: "Hi", original: "Hi", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: " there!", + original: " ", + is_left_joinable: true, + is_right_joinable: true, }, Token { normalised: "there!", - original: " there!", + original: "there!", + is_left_joinable: true, + is_right_joinable: true, }, ] diff --git a/backend/reconcile/src/tokenizer/token.rs b/backend/reconcile/src/tokenizer/token.rs index ab521a71..86cbb92f 100644 --- a/backend/reconcile/src/tokenizer/token.rs +++ b/backend/reconcile/src/tokenizer/token.rs @@ -3,29 +3,45 @@ use serde::{Deserialize, Serialize}; /// A token is a string that has been normalised in some way. /// The normalised form is used for comparison, while the original form is used -/// for applying Operations. +/// for applying `Operation`-s. 
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug, Clone)] pub struct Token where T: PartialEq + Clone + std::fmt::Debug, { - normalised: T, + /// The normalised form of the token used for deriving the diff. + pub normalised: T, + + /// The original string that should be inserted or deleted in the document. original: String, + + /// Whether the token is joinable with the previous token. + is_left_joinable: bool, + + /// Whether the token is joinable with the next token. + is_right_joinable: bool, } impl From<&str> for Token { - fn from(s: &str) -> Self { Token::new(s.trim().to_owned(), s.to_owned()) } + fn from(text: &str) -> Self { Token::new(text.to_owned(), text.to_owned(), true, true) } } impl Token where T: PartialEq + Clone + std::fmt::Debug, { - pub fn new(normalised: T, original: String) -> Self { + pub fn new( + normalised: T, + original: String, + is_left_joinable: bool, + is_right_joinable: bool, + ) -> Self { Token { normalised, original, + is_left_joinable, + is_right_joinable, } } @@ -34,6 +50,10 @@ where pub fn normalised(&self) -> &T { &self.normalised } pub fn get_original_length(&self) -> usize { self.original.chars().count() } + + pub fn get_is_left_joinable(&self) -> bool { self.is_left_joinable } + + pub fn get_is_right_joinable(&self) -> bool { self.is_right_joinable } } impl PartialEq for Token diff --git a/backend/reconcile/tests/examples/multiline.yml b/backend/reconcile/tests/examples/multiline.yml index c751feb9..00de7cd9 100644 --- a/backend/reconcile/tests/examples/multiline.yml +++ b/backend/reconcile/tests/examples/multiline.yml @@ -7,14 +7,59 @@ left: | right: | Hello there! - Best, Andras expected: | Hello there! - How are you? - Best, Andras + + + How are you? 
+ +--- + +parent: | + - my list + - 2nd item + - 3rd item + +left: | + - my list + - 2nd item + - nested list + - very nested list + - 3rd item + +right: | + - my list + - nested list + - 2nd item + - 3rd item + - another nested list + +expected: | + - my list + - nested list + - 2nd item + - nested list + - very nested list + - 3rd item + - another nested list + +--- + +parent: | + a + a +left: | + a + a +right: | + a + a +expected: | + a + a