diff --git a/src/lib.rs b/src/lib.rs index f745c17..cbe354f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -151,6 +151,48 @@ //! ] //! ); //! ``` +//! ## Efficiently serialize changes +//! +//! The edits can be serialized into a compact representation without the full +//! original text, so that its size depends only on the changes made. +//! +//! ```rust +//! use reconcile_text::{EditedText, BuiltinTokenizer}; +//! use serde_yaml; +//! use pretty_assertions::assert_eq; +//! +//! +//! let original = "Merging text is hard!"; +//! let changes = "Merging text is easy with reconcile!"; +//! +//! let result = EditedText::from_strings( +//! original, +//! &changes.into() +//! ); +//! +//! let serialized = serde_yaml::to_string(&result.serialise_as_change_set()).unwrap(); +//! assert_eq!( +//! serialized, +//! concat!( +//! "operations:\n", +//! "- 15\n", +//! "- -6\n", +//! "- ' easy with reconcile!'\n", +//! "cursors: []\n" +//! ) +//! ); +//! +//! let deserialized = serde_yaml::from_str(&serialized).unwrap(); +//! let reconstructed = EditedText::from_change_set( +//! original, +//! deserialized, +//! &*BuiltinTokenizer::Word +//! ); +//! assert_eq!( +//! reconstructed.apply().text(), +//! "Merging text is easy with reconcile!" +//! ); +//! ``` //! //! ## Error handling //! 
@@ -169,7 +211,7 @@ mod tokenizer; mod types; mod utils; -pub use operation_transformation::{EditedText, reconcile}; +pub use operation_transformation::{ChangeSet, EditedText, reconcile}; pub use tokenizer::{BuiltinTokenizer, Tokenizer, token::Token}; pub use types::{ cursor_position::CursorPosition, history::History, side::Side, diff --git a/src/operation_transformation.rs b/src/operation_transformation.rs index a2ac1c5..0d99ca4 100644 --- a/src/operation_transformation.rs +++ b/src/operation_transformation.rs @@ -1,10 +1,13 @@ mod edited_text; mod operation; mod utils; +mod transport; use std::fmt::Debug; -pub use edited_text::{ChangeSet, EditedText}; + +pub use transport::{ChangeSet}; pub use operation::Operation; +pub use edited_text::{EditedText}; use crate::{Tokenizer, types::text_with_cursors::TextWithCursors}; diff --git a/src/operation_transformation/edited_text.rs b/src/operation_transformation/edited_text.rs index 3894aae..ed60515 100644 --- a/src/operation_transformation/edited_text.rs +++ b/src/operation_transformation/edited_text.rs @@ -4,9 +4,10 @@ use std::{fmt::Debug, vec}; use serde::{Deserialize, Serialize}; use crate::{ - BuiltinTokenizer, CursorPosition, TextWithCursors, + BuiltinTokenizer, ChangeSet, CursorPosition, TextWithCursors, operation_transformation::{ Operation, + transport::SimpleOperation, utils::{cook_operations::cook_operations, elongate_operations::elongate_operations}, }, raw_operation::RawOperation, @@ -39,31 +40,6 @@ where cursors: Vec, } -/// A serializable representation of the changes made to a text document -/// without the original text. 
-#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Debug, Clone, PartialEq, Default)] -pub struct ChangeSet -where - T: PartialEq + Clone + Debug, -{ - operations: Vec>, - cursors: Vec, -} - -impl<'a, T> ChangeSet -where - T: PartialEq + Clone + Debug, -{ - #[must_use] - pub fn new(operations: Vec>, cursors: Vec) -> Self { - Self { - operations, - cursors, - } - } -} - impl<'a> EditedText<'a, String> { /// Create an `EditedText` from the given original (old) and updated (new) /// strings. The returned `EditedText` represents the changes from the @@ -370,23 +346,31 @@ where } /// Serialize the `EditedText` as a `ChangeSet`, which contains only - /// the operations and cursor positions, without the original text. + /// the operations and cursor positions, but without the original text. /// This is useful for sending changes over the network if there's /// a clear consensus on the original text. #[must_use] - pub fn serialise_as_change_set(&self) -> ChangeSet { - ChangeSet::new(self.operations.clone(), self.cursors.clone()) + pub fn serialise_as_change_set(&self) -> ChangeSet { + ChangeSet::new( + SimpleOperation::from_operations(&self.operations), + self.cursors.clone(), + ) } /// Deserialize an `EditedText` from a `ChangeSet` and the original text. /// This is useful for reconstructing the `EditedText` on the receiving /// end after sending only the `ChangeSet` over the network. 
#[must_use] - pub fn from_change_set(text: &'a str, change_set: ChangeSet) -> EditedText<'a, T> { - let operation_count = change_set.operations.len(); + pub fn from_change_set( + text: &'a str, + change_set: ChangeSet, + tokenizer: &Tokenizer, + ) -> EditedText<'a, T> { + let operations = SimpleOperation::to_operations(change_set.operations, text, tokenizer); + let operation_count = operations.len(); EditedText::new( text, - change_set.operations, + operations, vec![Side::Left; operation_count], change_set.cursors, ) @@ -397,6 +381,7 @@ where mod tests { use insta::assert_debug_snapshot; use pretty_assertions::assert_eq; + use serde_yaml; use super::*; @@ -438,4 +423,36 @@ mod tests { let operations = operations_1.merge(operations_2); assert_eq!(operations.apply().text(), expected); } + + #[test] + fn test_change_set_deserialisation() { + let original = "Merging text is hard!"; + let changes = "Merging text is easy with reconcile!"; + let result = EditedText::from_strings(original, &changes.into()); + let serialized = serde_yaml::to_string(&result.serialise_as_change_set()).unwrap(); + + let expected = concat!( + "operations:\n", + "- 15\n", + "- -6\n", + "- ' easy with reconcile!'\n", + "cursors: []\n" + ); + + assert_eq!(serialized, expected); + } + + #[test] + fn test_change_set_serialization() { + let original = "The quick brown fox jumps over the lazy dog."; + let updated = "The quick red fox jumped over the very lazy dog!"; + + let edited_text = EditedText::from_strings(original, &updated.into()); + + let change_set = edited_text.serialise_as_change_set(); + let deserialized_edited_text = + EditedText::from_change_set(original, change_set, &*BuiltinTokenizer::Word); + + assert_eq!(deserialized_edited_text.apply().text(), updated); + } } diff --git a/src/operation_transformation/transport.rs b/src/operation_transformation/transport.rs new file mode 100644 index 0000000..4f8fee4 --- /dev/null +++ b/src/operation_transformation/transport.rs @@ -0,0 +1,198 @@ 
+use std::fmt::Debug;
+
+#[cfg(feature = "serde")]
+use serde::{
+    Deserialize, Serialize,
+    de::{self, Deserializer, SeqAccess, Visitor},
+    ser::{SerializeSeq, Serializer},
+};
+
+use crate::{CursorPosition, Tokenizer, operation_transformation::Operation};
+
+/// A compact, token-free form of an [`Operation`] used for transport.
+///
+/// With the `serde` feature enabled each variant is written as a bare scalar:
+/// `Equal` as an unsigned integer, `Delete` as a negated integer and `Insert`
+/// as a string.
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub enum SimpleOperation {
+    /// Keep the next `length` characters of the original text.
+    Equal { length: usize },
+
+    /// Insert `text` at the current position.
+    Insert { text: String },
+
+    /// Drop the next `length` characters of the original text.
+    Delete { length: usize },
+}
+
+impl SimpleOperation {
+    /// Converts full operations into their compact form.
+    ///
+    /// Consecutive `Equal` operations are merged into a single `Equal` holding
+    /// the sum of their lengths. An `Insert` becomes the concatenation of the
+    /// original text of its tokens, and a `Delete` keeps its deleted character
+    /// count.
+    pub fn from_operations<T>(operation: &[Operation<T>]) -> Vec<SimpleOperation>
+    where
+        T: PartialEq + Clone + Debug,
+    {
+        let mut result: Vec<SimpleOperation> = Vec::with_capacity(operation.len());
+        // Length of the run of `Equal` operations that has not been emitted yet.
+        let mut previous_equal: Option<usize> = None;
+
+        for current in operation {
+            match current {
+                Operation::Equal { length, .. } => {
+                    previous_equal = Some(previous_equal.unwrap_or(0) + *length);
+                }
+                Operation::Insert { text, .. } => {
+                    // A non-equal operation ends the pending run, so emit it first.
+                    if let Some(length) = previous_equal.take() {
+                        result.push(SimpleOperation::Equal { length });
+                    }
+
+                    let text: String = text
+                        .iter()
+                        .map(super::super::tokenizer::token::Token::original)
+                        .collect();
+                    result.push(SimpleOperation::Insert { text });
+                }
+                Operation::Delete {
+                    deleted_character_count,
+                    ..
+                } => {
+                    if let Some(length) = previous_equal.take() {
+                        result.push(SimpleOperation::Equal { length });
+                    }
+
+                    result.push(SimpleOperation::Delete {
+                        length: *deleted_character_count,
+                    });
+                }
+            }
+        }
+
+        // Emit the trailing run of equal operations, if any.
+        if let Some(length) = previous_equal {
+            result.push(SimpleOperation::Equal { length });
+        }
+
+        result
+    }
+
+    /// Expands compact operations back into full operations.
+    ///
+    /// `Equal` runs are cut out of `original_text` and passed through
+    /// `tokenizer`, producing one `Equal` operation per token. `Insert` text is
+    /// tokenized as well, and `Delete` is recreated from its length. `order`
+    /// tracks the position in the original text and is advanced by `Equal` and
+    /// `Delete`, but not by `Insert`.
+    pub fn to_operations<T>(
+        simple_operations: Vec<SimpleOperation>,
+        original_text: &str,
+        tokenizer: &Tokenizer<T>,
+    ) -> Vec<Operation<T>>
+    where
+        T: PartialEq + Clone + Debug,
+    {
+        let mut operations: Vec<Operation<T>> = Vec::with_capacity(simple_operations.len());
+
+        let mut order = 0;
+
+        for simple_operation in simple_operations {
+            match simple_operation {
+                SimpleOperation::Equal { length } => {
+                    // NOTE(review): `skip(order)` rescans the text from the start for
+                    // every `Equal`; a single forward iterator would avoid that if
+                    // `order` is confirmed to always count `char`s.
+                    let original_characters: String =
+                        original_text.chars().skip(order).take(length).collect();
+
+                    let original_tokens = tokenizer(&original_characters);
+                    for token in original_tokens {
+                        let token_length = token.get_original_length();
+                        operations.push(Operation::create_equal(order, token_length));
+                        order += token_length;
+                    }
+                }
+                SimpleOperation::Insert { text } => {
+                    let tokens = tokenizer(&text);
+                    operations.push(Operation::create_insert(order, tokens));
+                }
+                SimpleOperation::Delete { length } => {
+                    operations.push(Operation::create_delete(order, length));
+                    order += length;
+                }
+            }
+        }
+
+        operations
+    }
+}
+
+#[cfg(feature = "serde")]
+impl Serialize for SimpleOperation {
+    /// Writes the operation as a single scalar: an unsigned integer for
+    /// `Equal`, a string for `Insert` and the negated length for `Delete`.
+    ///
+    /// Fails if a `Delete` length does not fit in an `i64`.
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        // neat idea from https://github.com/spebern/operational-transform-rs/blob/9faa17f0a2b282ac2e09dbb2d29fdaf2ae0bbb4a/operational-transform/src/serde.rs#L14
+        match self {
+            SimpleOperation::Equal { length } => serializer.serialize_u64(*length as u64),
+            SimpleOperation::Insert { text } => serializer.serialize_str(text),
+            SimpleOperation::Delete { length } => {
+                // A checked conversion keeps an oversized length from wrapping
+                // around into a positive number.
+                let magnitude = i64::try_from(*length).map_err(|_| {
+                    <S::Error as serde::ser::Error>::custom("delete length does not fit in an i64")
+                })?;
+                serializer.serialize_i64(-magnitude)
+            }
+        }
+    }
+}
+
+#[cfg(feature = "serde")]
+impl<'de> Deserialize<'de> for SimpleOperation {
+    /// Reads an operation from a single scalar: a non-negative integer is an
+    /// `Equal`, a negative integer is a `Delete` of its magnitude and a string
+    /// is an `Insert`.
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        struct OperationVisitor;
+
+        impl<'de> Visitor<'de> for OperationVisitor {
+            type Value = SimpleOperation;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                formatter.write_str("an integer between -2^63 and 2^64-1 or a string")
+            }
+
+            fn visit_u64<E>(self, value: u64) -> Result<Self::Value, E>
+            where
+                E: de::Error,
+            {
+                let length = usize::try_from(value)
+                    .map_err(|_| E::invalid_value(de::Unexpected::Unsigned(value), &self))?;
+                Ok(SimpleOperation::Equal { length })
+            }
+
+            fn visit_i64<E>(self, value: i64) -> Result<Self::Value, E>
+            where
+                E: de::Error,
+            {
+                // `unsigned_abs` cannot overflow, unlike negating `i64::MIN`.
+                let length = usize::try_from(value.unsigned_abs())
+                    .map_err(|_| E::invalid_value(de::Unexpected::Signed(value), &self))?;
+
+                // Some formats deliver every integer through `visit_i64`, so a
+                // non-negative value here is still an `Equal`.
+                Ok(if value < 0 {
+                    SimpleOperation::Delete { length }
+                } else {
+                    SimpleOperation::Equal { length }
+                })
+            }
+
+            fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
+            where
+                E: de::Error,
+            {
+                Ok(SimpleOperation::Insert {
+                    text: value.to_owned(),
+                })
+            }
+        }
+
+        deserializer.deserialize_any(OperationVisitor)
+    }
+}
+
+/// A serializable representation of the changes made to a text document
+/// without the original text.
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[derive(Debug, Clone, PartialEq, Default)]
+pub struct ChangeSet {
+    /// The edits in compact form, in document order.
+    pub operations: Vec<SimpleOperation>,
+    /// The cursor positions carried along with the edits.
+    pub cursors: Vec<CursorPosition>,
+}
+
+impl ChangeSet {
+    /// Creates a change set from compact operations and cursor positions.
+    #[must_use]
+    pub fn new(operations: Vec<SimpleOperation>, cursors: Vec<CursorPosition>) -> Self {
+        Self {
+            operations,
+            cursors,
+        }
+    }
+}
diff --git a/tests/test.rs b/tests/test.rs
index e9968b9..00f5163 100644
--- a/tests/test.rs
+++ b/tests/test.rs
@@ -3,7 +3,7 @@ mod example_document;
 use std::{fs, path::Path};
 
 use example_document::ExampleDocument;
-use reconcile_text::{BuiltinTokenizer, reconcile};
+use reconcile_text::{BuiltinTokenizer, EditedText, reconcile};
 use serde::Deserialize;
 
 #[test]
@@ -34,6 +34,36 @@ fn test_document_one_way_with_cursors() {
     }
 }
 
+#[test]
+fn test_document_one_way_with_cursors_and_serialisation() {
+    for doc in &get_all_documents() {
+        let parent = doc.parent();
+        let left_operations =
+            EditedText::from_strings_with_tokenizer(&parent, &doc.left(), &*BuiltinTokenizer::Word);
+        let right_operations = EditedText::from_strings_with_tokenizer(
+            &parent,
+            &doc.right(),
+            &*BuiltinTokenizer::Word,
+        );
+
+        let serialised_left = serde_yaml::from_str(
+            &serde_yaml::to_string(&left_operations.serialise_as_change_set()).unwrap(),
+        )
+        .unwrap();
+        let serialised_right = serde_yaml::from_str(
+            &serde_yaml::to_string(&right_operations.serialise_as_change_set()).unwrap(),
+        )
+        .unwrap();
+
+        let restored_left_operations =
+            EditedText::from_change_set(&parent, serialised_left, &*BuiltinTokenizer::Word);
+        let restored_right_operations =
+            EditedText::from_change_set(&parent, serialised_right, &*BuiltinTokenizer::Word);
+
+        doc.assert_eq(&restored_left_operations.merge(restored_right_operations));
+    }
+}
+
 #[test]
 fn test_document_inverse_way_without_cursors() {
     for doc in &get_all_documents() {