Add optimal representation

This commit is contained in:
Andras Schmelczer 2025-10-26 21:19:56 +00:00
parent e052aa46c4
commit 3da0673af6
5 changed files with 325 additions and 35 deletions

View file

@ -151,6 +151,48 @@
//! ]
//! );
//! ```
//! ## Efficiently serialize changes
//!
//! The edits can be serialized into a compact representation without the full
//! original text, so its size depends only on the changes made.
//!
//! ```rust
//! use reconcile_text::{EditedText, BuiltinTokenizer};
//! use serde_yaml;
//! use pretty_assertions::assert_eq;
//!
//!
//! let original = "Merging text is hard!";
//! let changes = "Merging text is easy with reconcile!";
//!
//! let result = EditedText::from_strings(
//! original,
//! &changes.into()
//! );
//!
//! let serialized = serde_yaml::to_string(&result.serialise_as_change_set()).unwrap();
//! assert_eq!(
//! serialized,
//! concat!(
//! "operations:\n",
//! "- 15\n",
//! "- -6\n",
//! "- ' easy with reconcile!'\n",
//! "cursors: []\n"
//! )
//! );
//!
//! let deserialized = serde_yaml::from_str(&serialized).unwrap();
//! let reconstructed = EditedText::from_change_set(
//! original,
//! deserialized,
//! &*BuiltinTokenizer::Word
//! );
//! assert_eq!(
//! reconstructed.apply().text(),
//! "Merging text is easy with reconcile!"
//! );
//! ```
//!
//! ## Error handling
//!
@ -169,7 +211,7 @@ mod tokenizer;
mod types;
mod utils;
pub use operation_transformation::{EditedText, reconcile};
pub use operation_transformation::{ChangeSet, EditedText, reconcile};
pub use tokenizer::{BuiltinTokenizer, Tokenizer, token::Token};
pub use types::{
cursor_position::CursorPosition, history::History, side::Side,

View file

@ -1,10 +1,13 @@
mod edited_text;
mod operation;
mod utils;
mod transport;
use std::fmt::Debug;
pub use edited_text::{ChangeSet, EditedText};
pub use transport::{ChangeSet};
pub use operation::Operation;
pub use edited_text::{EditedText};
use crate::{Tokenizer, types::text_with_cursors::TextWithCursors};

View file

@ -4,9 +4,10 @@ use std::{fmt::Debug, vec};
use serde::{Deserialize, Serialize};
use crate::{
BuiltinTokenizer, CursorPosition, TextWithCursors,
BuiltinTokenizer, ChangeSet, CursorPosition, TextWithCursors,
operation_transformation::{
Operation,
transport::SimpleOperation,
utils::{cook_operations::cook_operations, elongate_operations::elongate_operations},
},
raw_operation::RawOperation,
@ -39,31 +40,6 @@ where
cursors: Vec<CursorPosition>,
}
/// A serializable representation of the changes made to a text document
/// without the original text.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, PartialEq, Default)]
pub struct ChangeSet<T>
where
T: PartialEq + Clone + Debug,
{
operations: Vec<Operation<T>>,
cursors: Vec<CursorPosition>,
}
impl<'a, T> ChangeSet<T>
where
T: PartialEq + Clone + Debug,
{
#[must_use]
pub fn new(operations: Vec<Operation<T>>, cursors: Vec<CursorPosition>) -> Self {
Self {
operations,
cursors,
}
}
}
impl<'a> EditedText<'a, String> {
/// Create an `EditedText` from the given original (old) and updated (new)
/// strings. The returned `EditedText` represents the changes from the
@ -370,23 +346,31 @@ where
}
/// Serialize the `EditedText` as a `ChangeSet`, which contains only
/// the operations and cursor positions, without the original text.
/// the operations and cursor positions, but without the original text.
/// This is useful for sending changes over the network if there's
/// a clear consensus on the original text.
#[must_use]
pub fn serialise_as_change_set(&self) -> ChangeSet<T> {
ChangeSet::new(self.operations.clone(), self.cursors.clone())
pub fn serialise_as_change_set(&self) -> ChangeSet {
ChangeSet::new(
SimpleOperation::from_operations(&self.operations),
self.cursors.clone(),
)
}
/// Deserialize an `EditedText` from a `ChangeSet` and the original text.
/// This is useful for reconstructing the `EditedText` on the receiving
/// end after sending only the `ChangeSet` over the network.
#[must_use]
pub fn from_change_set(text: &'a str, change_set: ChangeSet<T>) -> EditedText<'a, T> {
let operation_count = change_set.operations.len();
pub fn from_change_set(
text: &'a str,
change_set: ChangeSet,
tokenizer: &Tokenizer<T>,
) -> EditedText<'a, T> {
let operations = SimpleOperation::to_operations(change_set.operations, text, tokenizer);
let operation_count = operations.len();
EditedText::new(
text,
change_set.operations,
operations,
vec![Side::Left; operation_count],
change_set.cursors,
)
@ -397,6 +381,7 @@ where
mod tests {
use insta::assert_debug_snapshot;
use pretty_assertions::assert_eq;
use serde_yaml;
use super::*;
@ -438,4 +423,36 @@ mod tests {
let operations = operations_1.merge(operations_2);
assert_eq!(operations.apply().text(), expected);
}
/// Pins the YAML layout of a change set: a retained length, a negative
/// deleted length, the inserted string, and an empty cursor list.
#[test]
fn test_change_set_deserialisation() {
    let before = "Merging text is hard!";
    let after = "Merging text is easy with reconcile!";

    let edited = EditedText::from_strings(before, &after.into());
    let change_set = edited.serialise_as_change_set();
    let yaml = serde_yaml::to_string(&change_set).unwrap();

    assert_eq!(
        yaml,
        concat!(
            "operations:\n",
            "- 15\n",
            "- -6\n",
            "- ' easy with reconcile!'\n",
            "cursors: []\n"
        )
    );
}
/// Converting an `EditedText` to a change set and back (with the word
/// tokenizer) must still produce the updated text when applied.
#[test]
fn test_change_set_serialization() {
    let before = "The quick brown fox jumps over the lazy dog.";
    let after = "The quick red fox jumped over the very lazy dog!";

    let change_set = EditedText::from_strings(before, &after.into()).serialise_as_change_set();
    let restored = EditedText::from_change_set(before, change_set, &*BuiltinTokenizer::Word);

    assert_eq!(restored.apply().text(), after);
}
}

View file

@ -0,0 +1,198 @@
use std::fmt::Debug;
#[cfg(feature = "serde")]
use serde::{
Deserialize, Serialize,
de::{self, Deserializer, SeqAccess, Visitor},
ser::{SerializeSeq, Serializer},
};
use crate::{CursorPosition, Tokenizer, operation_transformation::Operation};
/// Compact edit operation used as the transport form of a change set.
///
/// Unlike `Operation`, it carries no tokens: lengths are plain numbers and
/// inserted content is a plain string. `to_operations` interprets the lengths
/// as `char` counts into the original text.
#[derive(Clone, PartialEq, Eq, Debug)]
pub enum SimpleOperation {
/// Keep the next `length` characters of the original text
/// (serialized as a non-negative integer).
Equal { length: usize },
/// Insert `text` at the current position (serialized as a string).
Insert { text: String },
/// Drop the next `length` characters of the original text
/// (serialized as a negative integer).
Delete { length: usize },
}
impl SimpleOperation {
    /// Converts token-level operations into the compact transport form.
    ///
    /// Every run of consecutive `Operation::Equal` entries is merged into a
    /// single `SimpleOperation::Equal` carrying the summed length. Inserts are
    /// flattened to the concatenation of their tokens' original text, and
    /// deletes keep only their deleted character count.
    pub fn from_operations<T>(operations: &[Operation<T>]) -> Vec<Self>
    where
        T: PartialEq + Clone + Debug,
    {
        let mut result: Vec<Self> = Vec::with_capacity(operations.len());
        // Summed length of the `Equal` run seen since the last insert/delete;
        // `None` while no run is pending.
        let mut pending_equal: Option<usize> = None;

        for operation in operations {
            match operation {
                Operation::Equal { length, .. } => {
                    pending_equal = Some(pending_equal.unwrap_or(0) + *length);
                }
                Operation::Insert { text, .. } => {
                    // Flush the pending run before emitting the insert.
                    if let Some(length) = pending_equal.take() {
                        result.push(SimpleOperation::Equal { length });
                    }

                    let text: String = text
                        .iter()
                        .map(super::super::tokenizer::token::Token::original)
                        .collect();
                    result.push(SimpleOperation::Insert { text });
                }
                Operation::Delete {
                    deleted_character_count,
                    ..
                } => {
                    // Flush the pending run before emitting the delete.
                    if let Some(length) = pending_equal.take() {
                        result.push(SimpleOperation::Equal { length });
                    }

                    result.push(SimpleOperation::Delete {
                        length: *deleted_character_count,
                    });
                }
            }
        }

        // A trailing run of equal operations still has to be emitted.
        if let Some(length) = pending_equal {
            result.push(SimpleOperation::Equal { length });
        }

        result
    }

    /// Expands compact operations back into token-level `Operation`s.
    ///
    /// `original_text` must be the text the change set was created from and
    /// `tokenizer` is used to re-split retained and inserted text into tokens.
    ///
    /// * `Equal` — the next `length` characters of `original_text` are
    ///   re-tokenized and one `Operation::Equal` is emitted per token.
    /// * `Insert` — the inserted string is tokenized into a single
    ///   `Operation::Insert` at the current position.
    /// * `Delete` — a single `Operation::Delete` of `length` is emitted.
    pub fn to_operations<T>(
        simple_operations: Vec<Self>,
        original_text: &str,
        tokenizer: &Tokenizer<T>,
    ) -> Vec<Operation<T>>
    where
        T: PartialEq + Clone + Debug,
    {
        let mut operations: Vec<Operation<T>> = Vec::with_capacity(simple_operations.len());
        // Current position in `original_text`, counted in `char`s. Retained
        // and deleted spans advance it; inserts leave it untouched.
        let mut order = 0;

        for simple_operation in simple_operations {
            match simple_operation {
                SimpleOperation::Equal { length } => {
                    let original_characters: String =
                        original_text.chars().skip(order).take(length).collect();
                    let original_tokens = tokenizer(&original_characters);

                    for token in original_tokens {
                        operations
                            .push(Operation::create_equal(order, token.get_original_length()));
                        order += token.get_original_length();
                    }
                }
                SimpleOperation::Insert { text } => {
                    let tokens = tokenizer(&text);
                    operations.push(Operation::create_insert(order, tokens));
                }
                SimpleOperation::Delete { length } => {
                    operations.push(Operation::create_delete(order, length));
                    order += length;
                }
            }
        }

        operations
    }
}
/// Serializes an operation as a bare scalar so that a change set becomes a
/// flat list: a non-negative integer retains characters, a negative integer
/// deletes characters, and a string inserts text.
#[cfg(feature = "serde")]
impl Serialize for SimpleOperation {
    /// # Errors
    ///
    /// Returns a serializer error when a `Delete` length cannot be represented
    /// as a negative `i64`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        // neat idea from https://github.com/spebern/operational-transform-rs/blob/9faa17f0a2b282ac2e09dbb2d29fdaf2ae0bbb4a/operational-transform/src/serde.rs#L14
        match self {
            // `usize` is at most 64 bits wide on supported targets, so this
            // widening cast cannot lose information.
            SimpleOperation::Equal { length } => serializer.serialize_u64(*length as u64),
            SimpleOperation::Insert { text } => serializer.serialize_str(text),
            SimpleOperation::Delete { length } => {
                // A plain `as i64` cast would wrap for huge lengths and the
                // negation could overflow; report an error instead.
                let length = i64::try_from(*length).map_err(|_| {
                    <S::Error as serde::ser::Error>::custom(
                        "deleted length is too large to be serialized as a negative integer",
                    )
                })?;
                serializer.serialize_i64(-length)
            }
        }
    }
}
/// Deserializes an operation from the scalar produced by the `Serialize`
/// implementation: non-negative integers become `Equal`, negative integers
/// become `Delete`, and strings become `Insert`.
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for SimpleOperation {
    /// # Errors
    ///
    /// Returns a deserializer error when the value is neither an integer nor
    /// a string, or when an integer's magnitude does not fit into `usize`.
    fn deserialize<D>(deserializer: D) -> Result<SimpleOperation, D::Error>
    where
        D: Deserializer<'de>,
    {
        struct OperationVisitor;

        impl<'de> Visitor<'de> for OperationVisitor {
            type Value = SimpleOperation;

            // Fully qualified: only `std::fmt::Debug` is imported by this file.
            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                formatter.write_str("an integer between -2^63 and 2^64 or a string")
            }

            fn visit_u64<E>(self, value: u64) -> Result<Self::Value, E>
            where
                E: de::Error,
            {
                let length = usize::try_from(value)
                    .map_err(|_| E::custom("retained length does not fit into usize"))?;
                Ok(SimpleOperation::Equal { length })
            }

            fn visit_i64<E>(self, value: i64) -> Result<Self::Value, E>
            where
                E: de::Error,
            {
                match u64::try_from(value) {
                    // Some formats report non-negative integers as `i64`;
                    // those are retained lengths, not deletions.
                    Ok(non_negative) => self.visit_u64(non_negative),
                    Err(_) => {
                        // `unsigned_abs` cannot overflow, even for `i64::MIN`.
                        let length = usize::try_from(value.unsigned_abs())
                            .map_err(|_| E::custom("deleted length does not fit into usize"))?;
                        Ok(SimpleOperation::Delete { length })
                    }
                }
            }

            fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
            where
                E: de::Error,
            {
                Ok(SimpleOperation::Insert {
                    text: value.to_owned(),
                })
            }
        }

        // The scalar's type decides the variant, so the format has to be
        // self-describing.
        deserializer.deserialize_any(OperationVisitor)
    }
}
/// A serializable representation of the changes made to a text document
/// without the original text.
///
/// The serde implementations are derived only when the `serde` feature is
/// enabled.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, PartialEq, Default)]
pub struct ChangeSet {
/// Compact edit operations, applied in order against the original text.
pub operations: Vec<SimpleOperation>,
/// Cursor positions transported together with the operations.
pub cursors: Vec<CursorPosition>,
}
impl ChangeSet {
#[must_use]
pub fn new(operations: Vec<SimpleOperation>, cursors: Vec<CursorPosition>) -> Self {
Self {
operations,
cursors,
}
}
}

View file

@ -3,7 +3,7 @@ mod example_document;
use std::{fs, path::Path};
use example_document::ExampleDocument;
use reconcile_text::{BuiltinTokenizer, reconcile};
use reconcile_text::{BuiltinTokenizer, EditedText, reconcile};
use serde::Deserialize;
#[test]
@ -34,6 +34,36 @@ fn test_document_one_way_with_cursors() {
}
}
/// Merging must give the expected document even when both sides' edits are
/// first round-tripped through their YAML change-set representation.
#[test]
fn test_document_one_way_with_cursors_and_serialisation() {
    for doc in &get_all_documents() {
        let parent = doc.parent();

        let left = EditedText::from_strings_with_tokenizer(
            &parent,
            &doc.left(),
            &*BuiltinTokenizer::Word,
        );
        let right = EditedText::from_strings_with_tokenizer(
            &parent,
            &doc.right(),
            &*BuiltinTokenizer::Word,
        );

        // Round-trip each side: change set -> YAML -> change set.
        let left_yaml = serde_yaml::to_string(&left.serialise_as_change_set()).unwrap();
        let left_change_set = serde_yaml::from_str(&left_yaml).unwrap();

        let right_yaml = serde_yaml::to_string(&right.serialise_as_change_set()).unwrap();
        let right_change_set = serde_yaml::from_str(&right_yaml).unwrap();

        let restored_left =
            EditedText::from_change_set(&parent, left_change_set, &*BuiltinTokenizer::Word);
        let restored_right =
            EditedText::from_change_set(&parent, right_change_set, &*BuiltinTokenizer::Word);

        doc.assert_eq(&restored_left.merge(restored_right));
    }
}
#[test]
fn test_document_inverse_way_without_cursors() {
for doc in &get_all_documents() {