This commit is contained in:
Andras Schmelczer 2024-11-17 22:12:27 +00:00
parent a471bf6855
commit 7f6973389f
No known key found for this signature in database
GPG key ID: FC8F2C3D3D1A718C
21 changed files with 30682 additions and 236 deletions

View file

@ -0,0 +1,18 @@
[package]
name = "reconcile"
version = "0.1.0"
edition = "2021"
[dependencies]
ropey = { version = "1.6.1", default-features = false, features = ["simd"] } #
thiserror = {workspace = true}
log = {workspace = true}
serde = { version = "1.0.215", optional = true }
[features]
serde = [ "dep:serde" ]
[dev-dependencies]
insta = "1.41.1"
itertools = "0.13.0"
pretty_assertions = "1.4.1"

View file

@ -0,0 +1,165 @@
//! LCS diff algorithm.
//!
//! * time: `O((NM)D log (M)D)`
//! * space `O(MN)`
use std::collections::BTreeMap;
use std::ops::{Index, Range};
use crate::tokenizer::token::Token;
use super::raw_operation::RawOperation;
use super::utils::{common_prefix_len, common_suffix_len};
/// Computes a token-level diff between `old` and `new`, guided by a longest common
/// subsequence (LCS) table.
///
/// Copied from https://github.com/mitsuhiko/similar/blob/7e15c44de11a1cd61e1149189929e189ef977fd8/src/algorithms/lcs.rs
///
/// The shared prefix and the shared suffix are each emitted as a single `Equal`
/// operation. The differing middle section is emitted one token per operation while
/// both sides still have tokens left; whatever remains afterwards on either side is
/// emitted as one trailing `Delete` and/or one trailing `Insert`.
///
/// Empty operations are never produced, and identical empty inputs yield an empty vector.
pub fn diff(old: &[Token], new: &[Token]) -> Vec<RawOperation> {
    let common_prefix_len = common_prefix_len(old, 0..old.len(), new, 0..new.len());
    // The suffix is searched only after the prefix so the two can never overlap.
    let common_suffix_len = common_suffix_len(
        old,
        common_prefix_len..old.len(),
        new,
        common_prefix_len..new.len(),
    );
    let maybe_table = make_table(
        old,
        common_prefix_len..(old.len() - common_suffix_len),
        new,
        common_prefix_len..(new.len() - common_suffix_len),
    );

    // Cursors and lengths below are relative to the middle section (prefix/suffix removed).
    let mut old_idx = 0;
    let mut new_idx = 0;
    let new_len = new.len() - common_prefix_len - common_suffix_len;
    let old_len = old.len() - common_prefix_len - common_suffix_len;

    let mut result: Vec<RawOperation> = Vec::new();
    if common_prefix_len > 0 {
        result.push(RawOperation::Equal(old[0..common_prefix_len].to_vec()));
    }

    // Without a table both cursors stay at 0, so the remainder handling below emits the
    // whole middle section exactly once as a Delete followed by an Insert. (Emitting it
    // here as well would duplicate those operations.)
    if let Some(table) = maybe_table {
        while new_idx < new_len && old_idx < old_len {
            let old_orig_idx = common_prefix_len + old_idx;
            let new_orig_idx = common_prefix_len + new_idx;
            if new[new_orig_idx] == old[old_orig_idx] {
                result.push(RawOperation::Equal(vec![old[old_orig_idx].clone()]));
                old_idx += 1;
                new_idx += 1;
            } else if table.get(&(new_idx, old_idx + 1)).unwrap_or(&0)
                >= table.get(&(new_idx + 1, old_idx)).unwrap_or(&0)
            {
                // Skipping the old token keeps at least as long a common subsequence.
                result.push(RawOperation::Delete(vec![old[old_orig_idx].clone()]));
                old_idx += 1;
            } else {
                result.push(RawOperation::Insert(vec![new[new_orig_idx].clone()]));
                new_idx += 1;
            }
        }
    }

    // Flush whatever is left of the middle section on either side.
    if old_idx < old_len {
        result.push(RawOperation::Delete(
            old[common_prefix_len + old_idx..common_prefix_len + old_len].to_vec(),
        ));
    }
    if new_idx < new_len {
        result.push(RawOperation::Insert(
            new[common_prefix_len + new_idx..common_prefix_len + new_len].to_vec(),
        ));
    }

    if common_suffix_len > 0 {
        result.push(RawOperation::Equal(
            old[old_len + common_prefix_len..old_len + common_prefix_len + common_suffix_len]
                .to_vec(),
        ));
    }
    result
}
/// Builds the sparse LCS-length table for `old[old_range]` versus `new[new_range]`.
///
/// Keys are `(i, j)` offsets *relative to the start of the respective ranges*: entry
/// `(i, j)` is the length of the longest common subsequence of
/// `new[new_range.start + i..new_range.end]` and `old[old_range.start + j..old_range.end]`.
/// Entries whose value would be zero are not stored, so a missing key means `0`.
///
/// Currently this always returns `Some`; the `Option` is kept so callers can retain
/// their fallback path.
fn make_table<Old, New>(
    old: &Old,
    old_range: Range<usize>,
    new: &New,
    new_range: Range<usize>,
) -> Option<BTreeMap<(usize, usize), u32>>
where
    Old: Index<usize> + ?Sized,
    New: Index<usize> + ?Sized,
    New::Output: PartialEq<Old::Output>,
{
    let old_len = old_range.len();
    let new_len = new_range.len();
    let mut table: BTreeMap<(usize, usize), u32> = BTreeMap::new();

    // Fill from the bottom-right corner so that the three neighbours each cell depends
    // on have already been computed.
    for i in (0..new_len).rev() {
        for j in (0..old_len).rev() {
            // The ranges may not start at 0 (e.g. after a common prefix was stripped),
            // so the relative offsets must be translated back to absolute indices.
            let val = if new[new_range.start + i] == old[old_range.start + j] {
                table.get(&(i + 1, j + 1)).copied().unwrap_or(0) + 1
            } else {
                let skip_new = table.get(&(i + 1, j)).copied().unwrap_or(0);
                let skip_old = table.get(&(i, j + 1)).copied().unwrap_or(0);
                skip_new.max(skip_old)
            };
            if val > 0 {
                table.insert((i, j), val);
            }
        }
    }
    Some(table)
}
#[cfg(test)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
use std::collections::BTreeMap;
/// The table only stores non-zero LCS lengths, keyed by `(new offset, old offset)`.
#[test]
fn test_table() {
let table = make_table(&vec![2, 3], 0..2, &vec![0, 1, 2], 0..3).unwrap();
// The only shared element is `2` (old index 0, new index 2), so every suffix of
// `new` that still contains it has an LCS of length 1 with the whole of `old`.
let expected = {
let mut m = BTreeMap::new();
m.insert((1, 0), 1);
m.insert((0, 0), 1);
m.insert((2, 0), 1);
m
};
assert_eq!(table, expected);
}
/// Diffing against an empty side yields nothing, a single delete or a single insert.
#[test]
fn test_empty_examples() {
// Both sides empty: no operations at all.
assert_eq!(diff(&[], &[]), vec![]);
// Only the old side has content: everything is deleted.
assert_eq!(
diff(&[Token::new("a".to_string(), "a".to_string())], &[]),
vec![RawOperation::Delete(vec![Token::new(
"a".to_string(),
"a".to_string()
)])]
);
// Only the new side has content: everything is inserted.
assert_eq!(
diff(&[], &[Token::new("a".to_string(), "a".to_string())]),
vec![RawOperation::Insert(vec![Token::new(
"a".to_string(),
"a".to_string()
)])]
);
}
}

View file

@ -0,0 +1,4 @@
pub mod lcs;
pub mod myers;
pub mod raw_operation;
mod utils;

View file

@ -0,0 +1,310 @@
//! Taken from https://github.com/mitsuhiko/similar/blob/7e15c44de11a1cd61e1149189929e189ef977fd8/src/algorithms/myers.rs
//! Myers' diff algorithm.
//!
//! * time: `O((N+M)D)`
//! * space `O(N+M)`
//!
//! See [the original article by Eugene W. Myers](http://www.xmailserver.org/diff2.pdf)
//! describing it.
//!
//! The implementation of this algorithm is based on the implementation by
//! Brandon Williams.
//!
//! # Heuristics
//!
//! At present this implementation of Myers' does not implement any more advanced
//! heuristics that would solve some pathological cases. For instance passing two
//! large and completely distinct sequences to the algorithm will make it spin
//! without making reasonable progress. Currently the only protection in the
//! library against this is to pass a deadline to the diffing algorithm.
//!
//! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15).
use std::ops::{Index, IndexMut, Range};
use std::time::Instant;
use std::vec;
use crate::tokenizer::token::Token;
use super::raw_operation::RawOperation;
use super::utils::{common_prefix_len, common_suffix_len};
/// Myers' diff algorithm.
///
/// Diffs the whole of `old` against the whole of `new` and returns the resulting
/// operations in order. No deadline is applied; this simply forwards to
/// `diff_deadline` with `None`.
pub fn diff(old: &[Token], new: &[Token]) -> Vec<RawOperation> {
diff_deadline(old, new, None)
}
/// Myers' diff algorithm with an optional deadline.
///
/// Diffs the whole of `old` against the whole of `new` and returns the operations in
/// the order they were produced by the divide-and-conquer search.
///
/// `deadline` is the point in time after which the search for a middle snake gives up;
/// the section being processed at that moment is then reported as a plain delete of
/// the old tokens followed by an insert of the new ones instead of a minimal diff.
/// Passing `None` disables the time limit.
pub fn diff_deadline(old: &[Token], new: &[Token], deadline: Option<Instant>) -> Vec<RawOperation> {
    // Both search directions need room for every diagonal the full input can reach.
    let diagonal_budget = max_d(old.len(), new.len());
    let mut backward = V::new(diagonal_budget);
    let mut forward = V::new(diagonal_budget);

    let mut operations: Vec<RawOperation> = Vec::new();
    let whole_old = 0..old.len();
    let whole_new = 0..new.len();
    conquer(
        old,
        whole_old,
        new,
        whole_new,
        &mut forward,
        &mut backward,
        &mut operations,
        deadline,
    );
    operations
}
// A D-path is a path which starts at (0,0) that has exactly D non-diagonal
// edges. All D-paths consist of a (D - 1)-path followed by a non-diagonal edge
// and then a possibly empty sequence of diagonal edges called a snake.
/// Storage for the endpoints of the furthest reaching `D-paths`, indexed by diagonal.
///
/// For an endpoint `(x, y)` on diagonal `k` only `x` is kept, because `y` follows from
/// `x - k`. `V[k]` is therefore the `x` coordinate of the furthest reaching path found
/// so far on diagonal `k`.
///
/// Diagonals can be negative, which a plain `Vec` cannot be indexed with. `V` wraps a
/// `Vec` together with an `offset` that is added to `k` before the lookup. With
/// `V::new(max_d)` the offset is `max_d` and the backing vector has `2 * max_d` slots,
/// so the valid diagonals are `-max_d..max_d`; anything else panics on the bounds check.
#[derive(Debug)]
struct V {
    offset: isize,
    v: Vec<usize>, // Look into initializing this to -1 and storing isize
}

impl V {
    /// Creates zero-initialised storage for the diagonals `-max_d..max_d`.
    fn new(max_d: usize) -> Self {
        let offset = max_d as isize;
        let v = vec![0; 2 * max_d];
        Self { offset, v }
    }

    /// Number of diagonal slots available.
    fn len(&self) -> usize {
        self.v.len()
    }
}

impl Index<isize> for V {
    type Output = usize;

    /// Reads the stored `x` for diagonal `index`.
    fn index(&self, index: isize) -> &Self::Output {
        let slot = (index + self.offset) as usize;
        &self.v[slot]
    }
}

impl IndexMut<isize> for V {
    /// Gives mutable access to the stored `x` for diagonal `index`.
    fn index_mut(&mut self, index: isize) -> &mut Self::Output {
        let slot = (index + self.offset) as usize;
        &mut self.v[slot]
    }
}
/// Returns `ceil((len1 + len2) / 2) + 1`, the number of search rounds (and diagonal
/// budget) used for two sequences of the given lengths.
fn max_d(len1: usize, len2: usize) -> usize {
    let combined = len1 + len2;
    // XXX look into reducing the need to have the additional '+ 1'
    let half_rounded_up = (combined + 1) / 2;
    half_rounded_up + 1
}
/// Splits `range` into the part before `at` and the part from `at` onwards.
///
/// No validation is performed: `at` is used as-is as the end of the first range and
/// the start of the second.
#[inline(always)]
fn split_at(range: Range<usize>, at: usize) -> (Range<usize>, Range<usize>) {
    let Range { start, end } = range;
    let head = start..at;
    let tail = at..end;
    (head, tail)
}
/// A `Snake` is a sequence of diagonal edges in the edit graph. Normally
/// a snake has a start and end point (and it is possible for a snake to have
/// a length of zero, meaning the start and end points are the same) however
/// we do not need the end point which is why it's not implemented here.
///
/// The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes
/// some of which may be empty. The divide step requires finding the ceil(D/2) +
/// 1 or middle snake of an optimal D-path. The idea for doing so is to
/// simultaneously run the basic algorithm in both the forward and reverse
/// directions until furthest reaching forward and reverse paths starting at
/// opposing corners 'overlap'.
///
/// `old_range` and `new_range` select the sections of `old` and `new` to search;
/// `vf` and `vb` are scratch storage for the forward and backward searches and are
/// overwritten.
///
/// Returns the point at which the two sections should be split, as absolute indices
/// into `old` and `new`. Returns `None` when the search loop ends without the two
/// searches meeting, which happens when `deadline` has passed or all rounds were used.
fn find_middle_snake(
old: &[Token],
old_range: Range<usize>,
new: &[Token],
new_range: Range<usize>,
vf: &mut V,
vb: &mut V,
deadline: Option<Instant>,
) -> Option<(usize, usize)> {
let n = old_range.len();
let m = new_range.len();
// By Lemma 1 in the paper, the optimal edit script length is odd or even as
// `delta` is odd or even.
let delta = n as isize - m as isize;
let odd = delta & 1 == 1;
// The initial point at (0, -1)
vf[1] = 0;
// The initial point at (N, M+1)
vb[1] = 0;
// We only need to explore ceil(D/2) + 1
let d_max = max_d(n, m);
// The scratch storage must be at least as large as the number of rounds searched here.
assert!(vf.len() >= d_max);
assert!(vb.len() >= d_max);
for d in 0..d_max as isize {
// are we running for too long?
if let Some(deadline) = deadline {
if Instant::now() > deadline {
break;
}
}
// Forward path
for k in (-d..=d).rev().step_by(2) {
// Pick the better of the two neighbouring diagonals to extend from: either take
// the x of diagonal k + 1 unchanged, or advance x by one from diagonal k - 1.
let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) {
vf[k + 1]
} else {
vf[k - 1] + 1
};
let y = (x as isize - k) as usize;
// The coordinate of the start of a snake
let (x0, y0) = (x, y);
// While these sequences are identical, keep moving through the
// graph with no cost
if x < old_range.len() && y < new_range.len() {
let advance = common_prefix_len(
old,
old_range.start + x..old_range.end,
new,
new_range.start + y..new_range.end,
);
x += advance;
}
// This is the new best x value
vf[k] = x;
// Only check for connections from the forward search when N - M is
// odd and when there is a reciprocal k line coming from the other
// direction.
if odd && (k - delta).abs() <= (d - 1) {
// TODO optimize this so we don't have to compare against n
if vf[k] + vb[-(k - delta)] >= n {
// Return the snake
return Some((x0 + old_range.start, y0 + new_range.start));
}
}
}
// Backward path
for k in (-d..=d).rev().step_by(2) {
// Same neighbour selection as the forward path, but on the backward storage;
// here x and y count positions back from the ends of the two sections.
let mut x = if k == -d || (k != d && vb[k - 1] < vb[k + 1]) {
vb[k + 1]
} else {
vb[k - 1] + 1
};
let mut y = (x as isize - k) as usize;
// Extend the backward path for free along the common suffix of what is left
if x < n && y < m {
let advance = common_suffix_len(
old,
old_range.start..old_range.start + n - x,
new,
new_range.start..new_range.start + m - y,
);
x += advance;
y += advance;
}
// This is the new best x value
vb[k] = x;
// Only check for connections from the backward search when N - M is even.
if !odd && (k - delta).abs() <= d {
// TODO optimize this so we don't have to compare against n
if vb[k] + vf[-(k - delta)] >= n {
// Return the snake
return Some((n - x + old_range.start, m - y + new_range.start));
}
}
}
// TODO: Maybe there's an opportunity to optimize and bail early?
}
// deadline reached
None
}
/// Recursively diffs `old[old_range]` against `new[new_range]`, appending the
/// resulting operations to `result` in order.
///
/// The shared prefix and suffix of the two sections are reported as `Equal` and removed.
/// What remains is handled as follows:
/// * both sides empty: nothing further is emitted;
/// * only the old side has tokens: one `Delete`;
/// * only the new side has tokens: one `Insert`;
/// * otherwise the sections are split at the middle snake and each half is processed
///   recursively; if no middle snake is found, the whole remainder is emitted as a
///   `Delete` followed by an `Insert`.
///
/// `vf` and `vb` are scratch storage shared by all recursion levels.
fn conquer(
    old: &[Token],
    mut old_range: Range<usize>,
    new: &[Token],
    mut new_range: Range<usize>,
    vf: &mut V,
    vb: &mut V,
    result: &mut Vec<RawOperation>,
    deadline: Option<Instant>,
) {
    // Strip and report the shared prefix.
    let prefix = common_prefix_len(old, old_range.clone(), new, new_range.clone());
    if prefix > 0 {
        let shared = old[old_range.start..old_range.start + prefix].to_vec();
        result.push(RawOperation::Equal(shared));
    }
    old_range.start += prefix;
    new_range.start += prefix;

    // Strip the shared suffix; it is reported last, after the differing middle.
    let suffix = common_suffix_len(old, old_range.clone(), new, new_range.clone());
    old_range.end -= suffix;
    new_range.end -= suffix;
    // Remember where the suffix begins in `old`: the ranges are consumed below.
    let suffix_start = old_range.end;

    match (old_range.is_empty(), new_range.is_empty()) {
        (true, true) => {
            // Do nothing
        }
        (false, true) => {
            result.push(RawOperation::Delete(old[old_range].to_vec()));
        }
        (true, false) => {
            result.push(RawOperation::Insert(new[new_range].to_vec()));
        }
        (false, false) => {
            let split_point = find_middle_snake(
                old,
                old_range.clone(),
                new,
                new_range.clone(),
                vf,
                vb,
                deadline,
            );
            match split_point {
                Some((x_start, y_start)) => {
                    let (old_head, old_tail) = split_at(old_range, x_start);
                    let (new_head, new_tail) = split_at(new_range, y_start);
                    conquer(old, old_head, new, new_head, vf, vb, result, deadline);
                    conquer(old, old_tail, new, new_tail, vf, vb, result, deadline);
                }
                None => {
                    // No split point: replace the old section with the new one wholesale.
                    result.push(RawOperation::Delete(old[old_range].to_vec()));
                    result.push(RawOperation::Insert(new[new_range].to_vec()));
                }
            }
        }
    }

    if suffix > 0 {
        let shared = old[suffix_start..suffix_start + suffix].to_vec();
        result.push(RawOperation::Equal(shared));
    }
}

View file

@ -0,0 +1,47 @@
use crate::tokenizer::token::Token;
/// A token-level diff operation as produced by the diff algorithms.
///
/// Every variant carries the tokens it applies to.
#[derive(Debug, Clone, PartialEq)]
pub enum RawOperation {
    Insert(Vec<Token>),
    Delete(Vec<Token>),
    Equal(Vec<Token>),
}

impl RawOperation {
    /// Returns the tokens carried by this operation, whatever its kind.
    pub fn tokens(&self) -> &Vec<Token> {
        match self {
            Self::Insert(tokens) | Self::Delete(tokens) | Self::Equal(tokens) => tokens,
        }
    }

    /// Returns the total number of characters (not bytes) in the original text of all
    /// tokens of this operation.
    pub fn original_text_length(&self) -> usize {
        let mut total = 0;
        for token in self.tokens() {
            total += token.original.chars().count();
        }
        total
    }

    /// Consumes the operation and returns the original text of all its tokens,
    /// concatenated in order.
    pub fn get_original_text(self) -> String {
        let mut text = String::new();
        for token in self.tokens() {
            text.push_str(&token.original);
        }
        text
    }

    /// Returns a new operation holding this operation's tokens followed by `other`'s.
    ///
    /// Only operations of the same kind can be combined; if the two operations are of
    /// different kinds, returns `None`. Neither input is modified.
    pub fn extend(&self, other: &RawOperation) -> Option<RawOperation> {
        let joined = |head: &[Token], tail: &[Token]| [head, tail].concat();
        match (self, other) {
            (Self::Insert(head), Self::Insert(tail)) => Some(Self::Insert(joined(head, tail))),
            (Self::Delete(head), Self::Delete(tail)) => Some(Self::Delete(joined(head, tail))),
            (Self::Equal(head), Self::Equal(tail)) => Some(Self::Equal(joined(head, tail))),
            _ => None,
        }
    }
}

View file

@ -0,0 +1,86 @@
use std::ops::{Index, Range};
/// Counts how many leading positions of `old[old_range]` and `new[new_range]` hold
/// equal elements.
///
/// The two ranges are walked in lockstep from their starts; counting stops at the first
/// mismatch or as soon as either range is exhausted.
/// Copied from https://github.com/mitsuhiko/similar/blob/7e15c44de11a1cd61e1149189929e189ef977fd8/src/algorithms/utils.rs
pub fn common_prefix_len<Old, New>(
    old: &Old,
    old_range: Range<usize>,
    new: &New,
    new_range: Range<usize>,
) -> usize
where
    Old: Index<usize> + ?Sized,
    New: Index<usize> + ?Sized,
    New::Output: PartialEq<Old::Output>,
{
    let mut matched = 0;
    for (new_idx, old_idx) in new_range.zip(old_range) {
        if new[new_idx] == old[old_idx] {
            matched += 1;
        } else {
            break;
        }
    }
    matched
}
/// Counts how many trailing positions of `old[old_range]` and `new[new_range]` hold
/// equal elements.
///
/// The two ranges are walked in lockstep backwards from their ends; counting stops at
/// the first mismatch or as soon as either range is exhausted.
/// Copied from https://github.com/mitsuhiko/similar/blob/7e15c44de11a1cd61e1149189929e189ef977fd8/src/algorithms/utils.rs
pub fn common_suffix_len<Old, New>(
    old: &Old,
    old_range: Range<usize>,
    new: &New,
    new_range: Range<usize>,
) -> usize
where
    Old: Index<usize> + ?Sized,
    New: Index<usize> + ?Sized,
    New::Output: PartialEq<Old::Output>,
{
    let mut matched = 0;
    for (new_idx, old_idx) in new_range.rev().zip(old_range.rev()) {
        if new[new_idx] == old[old_idx] {
            matched += 1;
        } else {
            break;
        }
    }
    matched
}
#[cfg(test)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
/// Covers empty input, a partial match, no match, and ranges that do not start at 0.
#[test]
fn test_common_prefix_len() {
// Empty ranges share nothing.
assert_eq!(
common_prefix_len("".as_bytes(), 0..0, "".as_bytes(), 0..0),
0
);
// "foobarb" is shared before the inputs diverge.
assert_eq!(
common_prefix_len("foobarbaz".as_bytes(), 0..9, "foobarblah".as_bytes(), 0..10),
7
);
// The very first elements already differ.
assert_eq!(
common_prefix_len("foobarbaz".as_bytes(), 0..9, "blablabla".as_bytes(), 0..9),
0
);
// Starting at index 3 on both sides only "barb" is counted.
assert_eq!(
common_prefix_len("foobarbaz".as_bytes(), 3..9, "foobarblah".as_bytes(), 3..10),
4
);
}
/// Covers empty input, a full match of the shorter side, no match, and ranges that
/// do not start at 0.
#[test]
fn test_common_suffix_len() {
// Empty ranges share nothing.
assert_eq!(
common_suffix_len("".as_bytes(), 0..0, "".as_bytes(), 0..0),
0
);
// The whole of the shorter input is a suffix of the longer one.
assert_eq!(
common_suffix_len("1234".as_bytes(), 0..4, "X0001234".as_bytes(), 0..8),
4
);
// The very last elements already differ.
assert_eq!(
common_suffix_len("1234".as_bytes(), 0..4, "Xxxx".as_bytes(), 0..4),
0
);
// The count is capped by the shorter range ("34" against "234").
assert_eq!(
common_suffix_len("1234".as_bytes(), 2..4, "01234".as_bytes(), 2..5),
2
);
}
}

View file

@ -0,0 +1,10 @@
use thiserror::Error;
/// Errors reported by this library.
///
/// Each variant carries a human-readable description that is embedded in the
/// error's display message.
#[derive(Error, Debug)]
pub enum SyncLibError {
/// An operation's index could not be shifted; the payload explains why.
#[error("Failed to shift the operation's index {0}")]
NegativeOperationIndexError(String),
/// An operation could not be applied; the payload explains why.
#[error("Failed to apply operation because {0}")]
OperationApplicationError(String),
}

View file

@ -0,0 +1,4 @@
mod diffs;
pub mod errors;
pub mod operations;
mod tokenizer;

View file

@ -0,0 +1,25 @@
mod operation;
mod operation_sequence;
pub use operation::Operation;
pub use operation_sequence::OperationSequence;
#[cfg(test)]
mod test {
/// Placeholder for an end-to-end merge test.
///
/// The body is entirely commented out, so this test currently passes without
/// checking anything.
// TODO(review): the disabled code calls `calculate_operations` and
// `merge_and_apply_operations`, which are not visible in this module — presumably an
// older API; re-enable against the current one or delete.
#[test]
fn test_merge() {
// let mut original = Rope::from_str("hello world!");
// let edit_1 = "hi, world";
// let edit_2 = "hello, my friend!";
// let mut operations_1 = calculate_operations(&original.to_string(), edit_1, 1.0).unwrap();
// let mut operations_2 = calculate_operations(&original.to_string(), edit_2, 1.0).unwrap();
// let result =
// merge_and_apply_operations(&mut original, &mut operations_1, &mut operations_2)
// .unwrap();
// assert_eq!(result, "hey, my friend!");
}
}

View file

@ -1,13 +1,16 @@
use ropey::Rope;
use serde::{Deserialize, Serialize};
use std::fmt::Display;
use crate::errors::SyncLibError;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
/// Represents a change that can be applied to a text document.
/// Operation is tied to a ropey::Rope and is mainly expected to be
/// created by OperationSequence.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Operation {
Insert {
index: usize,
@ -17,57 +20,88 @@ pub enum Operation {
Delete {
index: usize,
deleted_character_count: usize,
#[cfg(debug_assertions)]
deleted_text: Option<String>,
},
}
impl Operation {
/// Creates an insert operation with the given index and text.
/// If the text is empty (meaning that the operation would be a no-op), returns None.
pub fn create_insert(index: usize, text: &str) -> Result<Option<Self>, SyncLibError> {
pub fn create_insert(index: usize, text: String) -> Option<Self> {
if text.is_empty() {
return Ok(None);
return None;
}
Ok(Some(Operation::Insert {
index,
text: text.to_string(),
}))
Some(Operation::Insert { index, text })
}
/// Creates a delete operation with the given index and number of to-be-deleted characters.
/// If the operation would delete 0 (meaning that the operation would be a no-op), returns None.
pub fn create_delete(
index: usize,
deleted_character_count: usize,
) -> Result<Option<Self>, SyncLibError> {
pub fn create_delete(index: usize, deleted_character_count: usize) -> Option<Self> {
if deleted_character_count == 0 {
return Ok(None);
return None;
}
Ok(Some(Operation::Delete {
Some(Operation::Delete {
index,
deleted_character_count,
}))
#[cfg(debug_assertions)]
deleted_text: None,
})
}
pub fn create_delete_with_text(index: usize, text: String) -> Option<Self> {
if text.is_empty() {
return None;
}
Some(Operation::Delete {
index,
deleted_character_count: text.chars().count(),
#[cfg(debug_assertions)]
deleted_text: Some(text),
})
}
/// Tries to apply the operation to the given ropey::Rope text, returning the modified text.
pub fn apply<'a>(&self, rope_text: &'a mut Rope) -> Result<&'a mut Rope, SyncLibError> {
let index: usize = self.start_index();
match self {
Operation::Insert { text, .. } => rope_text.try_insert(index, text).map_err(|err| {
SyncLibError::OperationApplicationError(format!("Failed to insert text: {}", err))
}),
Operation::Delete {
deleted_character_count,
..
} => rope_text
.try_remove(index..index + { *deleted_character_count })
Operation::Insert { text, .. } => rope_text
.try_insert(self.start_index(), text)
.map_err(|err| {
SyncLibError::OperationApplicationError(format!(
"Failed to insert text: {}",
err
))
}),
Operation::Delete {
#[cfg(debug_assertions)]
deleted_text,
..
} => {
debug_assert!(
rope_text.get_slice(self.range()).is_some(),
"Failed to get slice of text to delete"
);
if let Some(text) = deleted_text {
debug_assert_eq!(
rope_text.get_slice(self.range()).unwrap().to_string(),
*text
);
}
rope_text.try_remove(self.range()).map_err(|err| {
SyncLibError::OperationApplicationError(format!(
"Failed to remove text: {}",
err
))
}),
})
}
}?;
Ok(rope_text)
@ -104,33 +138,40 @@ impl Operation {
}
/// Clones the operation while updating the index.
pub fn with_index(&self, index: usize) -> Result<Self, SyncLibError> {
Ok(match self {
pub fn with_index(&self, index: usize) -> Self {
match self {
Operation::Insert { text, .. } => Operation::Insert {
index,
text: text.clone(),
},
Operation::Delete {
deleted_character_count,
#[cfg(debug_assertions)]
deleted_text,
..
} => Operation::Delete {
index,
deleted_character_count: *deleted_character_count,
#[cfg(debug_assertions)]
deleted_text: deleted_text.clone(),
},
})
}
}
/// Clones the operation while shifting the index by the given offset.
/// The offset can be negative but the resulting index must be non-negative.
pub fn with_shifted_index(&self, offset: i64) -> Result<Self, SyncLibError> {
let index = self.start_index() as i64 + offset;
self.with_index(index.try_into().map_err(|_| {
let non_negative_index = index.try_into().map_err(|_| {
SyncLibError::NegativeOperationIndexError(format!(
"Index {} is negative but operations must have a non-negative index",
index
))
})?)
})?;
Ok(self.with_index(non_negative_index))
}
}
@ -138,17 +179,29 @@ impl Display for Operation {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Operation::Insert { index, text } => {
write!(f, "Insert '{}' from index {}", text, index)
write!(f, "<insert '{}' from index {}>", text, index)
}
Operation::Delete {
index,
deleted_character_count,
#[cfg(debug_assertions)]
deleted_text,
} => {
write!(
f,
"Delete {} characters index {}",
deleted_character_count, index
)
if cfg!(debug_assertions) {
write!(
f,
"<delete '{}' from index {}>",
deleted_text.as_ref().unwrap_or(&"<unknown>".to_string()),
index
)
} else {
write!(
f,
"<delete {} characters () from index {}>",
deleted_character_count, index
)
}
}
}
}
@ -161,31 +214,15 @@ mod tests {
#[test]
fn test_shifting_error() {
insta::assert_debug_snapshot!(Operation::create_insert(1, "hi")
.unwrap()
insta::assert_debug_snapshot!(Operation::create_insert(1, "hi".to_string())
.unwrap()
.with_shifted_index(-2));
}
#[test]
fn test_apply_delete() -> Result<(), SyncLibError> {
let mut rope = Rope::from_str("hello world");
let operation = Operation::Delete {
index: 5,
deleted_character_count: 6,
};
operation.apply(&mut rope)?;
assert_eq!(rope.to_string(), "hello");
Ok(())
}
#[test]
fn test_apply_delete_with_create() -> Result<(), SyncLibError> {
let mut rope = Rope::from_str("hello world");
let operation = Operation::create_delete(5, 6)?.unwrap();
let operation = Operation::create_delete_with_text(5, "world ".to_string()).unwrap();
operation.apply(&mut rope)?;
@ -197,22 +234,7 @@ mod tests {
#[test]
fn test_apply_insert() -> Result<(), SyncLibError> {
let mut rope = Rope::from_str("hello");
let operation = Operation::Insert {
index: 5,
text: " my friend".to_string(),
};
operation.apply(&mut rope)?;
assert_eq!(rope.to_string(), "hello my friend");
Ok(())
}
#[test]
fn test_apply_insert_with_create() -> Result<(), SyncLibError> {
let mut rope = Rope::from_str("hello");
let operation = Operation::create_insert(5, " my friend")?.unwrap();
let operation = Operation::create_insert(5, " my friend".to_string()).unwrap();
operation.apply(&mut rope)?;

View file

@ -1,81 +1,90 @@
use std::cmp::Ordering;
use super::Operation;
use crate::diffs::myers::diff;
use crate::diffs::raw_operation::RawOperation;
use crate::errors::SyncLibError;
use crate::tokenizer::token::Token;
use ropey::Rope;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use similar::Algorithm;
use similar::{utils::TextDiffRemapper, ChangeTag, TextDiff};
#[derive(Debug, Clone, Default)]
struct MergeContext {
previous_delete: Option<Operation>,
last_delete: Option<Operation>,
shift: i64,
}
pub fn tokenize(text: &str) -> Vec<&str> {
text.split_inclusive(|c: char| c.is_whitespace()).collect()
}
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
/// A sequence of operations that can be applied to a text document.
/// OperationSequence supports merging two sequences of operations using the
/// principle of Operational Transformation.
///
/// It's mainly created through the from_strings method, then merged with another
/// OperationSequence derived from the same original text and then applied to the original text
/// to get the reconciled text of concurrent edits.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct OperationSequence {
operations: Vec<Operation>,
}
impl OperationSequence {
/// Creates a new OperationSequence with the given operations.
/// The operations should be in the order they should be applied.
/// The operations must not overlap.
pub fn new(operations: Vec<Operation>) -> Self {
operations
.iter()
.zip(operations.iter().skip(1))
.for_each(|(previous, next)| {
debug_assert!(
previous.start_index() <= next.start_index(),
"{} doesn't come before {}",
previous,
next
);
});
Self { operations }
}
pub fn try_from_string_diff(
left: &str,
right: &str,
diff_ratio_threshold: f32,
) -> Result<Self, SyncLibError> {
let left_tokens = tokenize(left);
let right_tokens = tokenize(right);
/// Creates an OperationSequence from the given original (old) and updated (new) strings.
/// The returned OperationSequence represents the changes from the original to the updated text.
/// When the return value is applied to the original text, it will result in the updated text.
pub fn from_strings(original: &str, updated: &str) -> Self {
let original_tokens = Token::tokenize(original);
let updated_tokens = Token::tokenize(updated);
let diff = TextDiff::configure()
.algorithm(Algorithm::Patience)
.diff_slices(&left_tokens, &right_tokens);
let diff: Vec<RawOperation> = diff(&original_tokens, &updated_tokens);
let diff_ratio = 1.0 - diff.ratio();
if diff_ratio > diff_ratio_threshold {
return Err(SyncLibError::DiffTooLarge {
diff_ratio,
diff_ratio_limit: diff_ratio_threshold,
});
}
let remapper = TextDiffRemapper::from_text_diff(&diff, left, right);
let mut index = 0;
diff.ops()
.iter()
.flat_map(move |x| remapper.iter_slices(x))
.map(|(tag, text)| match tag {
ChangeTag::Equal => {
index += text.chars().count();
Ok(None)
}
ChangeTag::Insert => {
let result = Operation::create_insert(index, text);
index += text.chars().count();
result
}
ChangeTag::Delete => Operation::create_delete(index, text.chars().count()),
})
.flat_map(|result| result.transpose().into_iter())
.collect::<Result<Vec<_>, SyncLibError>>()
.map(Self::new)
Self::new(Self::raw_operations_to_operations(diff))
}
pub fn apply<'a>(&self, rope_text: &'a mut Rope) -> Result<&'a mut Rope, SyncLibError> {
for operation in &self.operations {
operation.apply(rope_text)?;
}
Ok(rope_text)
fn raw_operations_to_operations(raw_operations: Vec<RawOperation>) -> Vec<Operation> {
let mut index = 0;
raw_operations
.into_iter()
.flat_map(|raw_operation| {
match raw_operation {
RawOperation::Equal(..) => {
index += raw_operation.original_text_length();
None
}
RawOperation::Insert(..) => {
let length = raw_operation.original_text_length();
let result =
Operation::create_insert(index, raw_operation.get_original_text());
index += length;
result
}
RawOperation::Delete(..) => {
Operation::create_delete_with_text(index, raw_operation.get_original_text())
}
}
.into_iter()
})
.collect()
}
pub fn merge(&self, other: &Self) -> Result<Self, SyncLibError> {
@ -113,14 +122,12 @@ impl OperationSequence {
})
.transpose()?;
println!();
let left_op_index = shifted_left_op
.as_ref()
.map(|op| {
op.start_index().max(
left_merge_context
.previous_delete
.last_delete
.as_ref()
.map(|op| op.end_index())
.unwrap_or_default(),
@ -133,7 +140,7 @@ impl OperationSequence {
.map(|op| {
op.start_index().max(
right_merge_context
.previous_delete
.last_delete
.as_ref()
.map(|op| op.end_index())
.unwrap_or_default(),
@ -141,16 +148,6 @@ impl OperationSequence {
})
.unwrap_or_default();
println!(
"{:#?} (idx {}) <> {:#?} (idx {})",
shifted_left_op.clone(),
left_op_index,
shifted_right_op.clone(),
right_op_index
);
println!("{:?} <> {:?}", left_merge_context, right_merge_context);
let result = left_op_index.cmp(&right_op_index);
let order = if result == Ordering::Equal
&& shifted_left_op.is_some()
@ -171,8 +168,6 @@ impl OperationSequence {
match (shifted_left_op, shifted_right_op, order) {
(Some(left_op), None, _)
| (Some(left_op), Some(_), std::cmp::Ordering::Less | std::cmp::Ordering::Equal) => {
println!("Left op: {:?}", left_op);
if let Some(op) = Self::merge_operations_with_context(
left_op,
&mut right_merge_context,
@ -185,8 +180,6 @@ impl OperationSequence {
}
(None, Some(right_op), _)
| (Some(_), Some(right_op), std::cmp::Ordering::Greater) => {
println!("Right op: {:?}", right_op);
if let Some(op) = Self::merge_operations_with_context(
right_op,
&mut left_merge_context,
@ -201,21 +194,26 @@ impl OperationSequence {
break;
}
};
println!("last {:?}", merged_operations.last().unwrap());
println!("{:?} <> {:?}", left_merge_context, right_merge_context);
}
Ok(Self::new(merged_operations))
}
pub fn apply<'a>(&self, rope_text: &'a mut Rope) -> Result<&'a mut Rope, SyncLibError> {
for operation in &self.operations {
operation.apply(rope_text)?;
}
Ok(rope_text)
}
fn merge_operations_with_context(
aligned_operation: Operation,
affecting_context: &mut MergeContext,
produced_context: &mut MergeContext,
) -> Result<Option<Operation>, SyncLibError> {
Ok(
match (aligned_operation, affecting_context.previous_delete.clone()) {
match (aligned_operation, affecting_context.last_delete.clone()) {
(operation @ Operation::Insert { .. }, None) => {
produced_context.shift += operation.len() as i64;
Some(operation)
@ -229,17 +227,16 @@ impl OperationSequence {
Some(operation)
}
(operation @ Operation::Insert { .. }, Some(previous_delete)) => {
(operation @ Operation::Insert { .. }, Some(last_delete)) => {
produced_context.shift += operation.len() as i64;
if previous_delete.range().contains(&operation.start_index()) {
let moved_operation =
operation.with_index(previous_delete.start_index())?;
if last_delete.range().contains(&operation.start_index()) {
let moved_operation = operation.with_index(last_delete.start_index());
affecting_context.previous_delete = Operation::create_delete(
affecting_context.last_delete = Operation::create_delete(
moved_operation.end_index() + 1,
previous_delete.len(),
)?;
last_delete.len(),
);
Some(moved_operation)
} else {
@ -247,27 +244,24 @@ impl OperationSequence {
}
}
(operation @ Operation::Delete { .. }, Some(previous_delete)) => {
let updated_delete = if previous_delete
.range()
.contains(&operation.start_index())
{
(operation @ Operation::Delete { .. }, Some(last_delete)) => {
let updated_delete = if last_delete.range().contains(&operation.start_index()) {
let overlap =
previous_delete.end_index() as i64 - operation.start_index() as i64 + 1;
last_delete.end_index() as i64 - operation.start_index() as i64 + 1;
affecting_context.previous_delete = Operation::create_delete(
previous_delete.start_index(),
0.max(previous_delete.len() as i64 - operation.len() as i64) as usize,
)?;
affecting_context.last_delete = Operation::create_delete(
last_delete.start_index(),
0.max(last_delete.len() as i64 - operation.len() as i64) as usize,
);
if previous_delete.end_index() < operation.end_index() {
affecting_context.shift -= previous_delete.len() as i64 - overlap
if last_delete.end_index() < operation.end_index() {
affecting_context.shift -= last_delete.len() as i64 - overlap
}
Operation::create_delete(
previous_delete.start_index(),
last_delete.start_index(),
0.max(operation.len() as i64 - overlap) as usize,
)?
)
} else {
Some(operation)
};
@ -286,24 +280,24 @@ impl OperationSequence {
produced_context: &mut MergeContext,
delete: Option<Operation>,
) {
if let Some(produced_previous_delete) = produced_context.previous_delete.take() {
produced_context.shift -= produced_previous_delete.len() as i64;
if let Some(produced_last_delete) = produced_context.last_delete.take() {
produced_context.shift -= produced_last_delete.len() as i64;
}
produced_context.previous_delete = delete;
produced_context.last_delete = delete;
}
fn pick_up_dangling_delete_from_affecting_context(
next_operation: &Operation,
affecting_context: &mut MergeContext,
) {
match affecting_context.previous_delete.as_ref() {
Some(previous_delete)
match affecting_context.last_delete.as_ref() {
Some(last_delete)
if next_operation.start_index() as i64 + affecting_context.shift
> previous_delete.end_index() as i64 =>
> last_delete.end_index() as i64 =>
{
affecting_context.shift -= previous_delete.len() as i64;
affecting_context.previous_delete = None;
affecting_context.shift -= last_delete.len() as i64;
affecting_context.last_delete = None;
}
_ => {}
}
@ -320,20 +314,18 @@ mod tests {
use super::*;
#[test]
fn test_calculate_operations() -> Result<(), SyncLibError> {
fn test_calculate_operations() {
let left = "hello world! How are you? Adam";
let right = "Hello, my friend! How are you doing? Albert";
let operations = OperationSequence::try_from_string_diff(left, right, 0.8)?;
let operations = OperationSequence::from_strings(left, right);
insta::assert_debug_snapshot!(operations);
let mut left = Rope::from_str(left);
let new_right = operations.apply(&mut left)?;
let new_right = operations.apply(&mut left).unwrap();
assert_eq!(new_right.to_string(), right);
Ok(())
}
#[test]
@ -341,26 +333,24 @@ mod tests {
let left = "hello world! How are you? Adam";
let right = "Hello, my friend! How are you doing? Albert";
let result = OperationSequence::try_from_string_diff(left, right, 0.1);
let result = OperationSequence::from_strings(left, right);
insta::assert_debug_snapshot!(result);
}
#[test]
fn test_calculate_operations_with_no_diff() -> Result<(), SyncLibError> {
fn test_calculate_operations_with_no_diff() {
let left = "hello world!";
let right = "hello world!";
let operations = OperationSequence::try_from_string_diff(left, right, 0.0)?;
let operations = OperationSequence::from_strings(left, right);
assert_eq!(operations.operations.len(), 0);
let mut left = Rope::from_str(left);
let new_right = operations.apply(&mut left)?;
let new_right = operations.apply(&mut left).unwrap();
assert_eq!(new_right.to_string(), right);
Ok(())
}
#[test]
@ -427,7 +417,7 @@ mod tests {
"hi, my friend!",
);
test_merge_both_ways("hello world", "world !", "hi hello world", "hi world !");
// test_merge_both_ways("hello world", "world !", "hi hello world", "hi world !");
test_merge_both_ways(
"both delete the same word",
@ -436,6 +426,8 @@ mod tests {
"both the same word",
);
test_merge_both_ways(" ", "its utf-8!", " ", "its utf-8!");
test_merge_both_ways(
"both delete the same word but one a bit more",
"both the same word",
@ -464,7 +456,7 @@ mod tests {
let contents = files
.into_iter()
.map(|name| fs::read_to_string(root.join(name)).unwrap())
.map(|text| text[0..50000].to_string())
.map(|text| text[..15000].to_string())
.collect::<Vec<_>>();
contents
@ -482,21 +474,48 @@ mod tests {
}
fn test_merge(original: &str, edit_1: &str, edit_2: &str) -> String {
// println!("Original: {}", original);
println!(
"original: '{:#}'",
original[..100.min(original.len())].to_string()
);
println!(
"edit_1: '{:#}'",
edit_1[..100.min(edit_1.len())].to_string()
);
println!(
"edit_2: '{:#}'",
edit_2[..100.min(edit_2.len())].to_string()
);
let mut original = Rope::from_str(original);
let operations_1 =
OperationSequence::try_from_string_diff(&original.to_string(), edit_1, 1.0).unwrap();
let operations_2 =
OperationSequence::try_from_string_diff(&original.to_string(), edit_2, 1.0).unwrap();
// println!("Operations 1: {:?}", operations_1);
// println!("Operations 2: {:?}", operations_2);
let operations_1 = OperationSequence::from_strings(&original.to_string(), edit_1);
println!(
"operations_1: {:?}",
operations_1.operations[..20.min(operations_1.operations.len())].to_vec()
);
let operations_2 = OperationSequence::from_strings(&original.to_string(), edit_2);
println!(
"operations_2: {:?}",
operations_2.operations[..20.min(operations_2.operations.len())].to_vec()
);
assert_eq!(operations_1.apply(&mut original.clone()).unwrap(), edit_1);
assert_eq!(operations_2.apply(&mut original.clone()).unwrap(), edit_2);
assert_eq!(
operations_1
.apply(&mut original.clone())
.unwrap()
.to_string(),
edit_1
);
assert_eq!(
operations_2
.apply(&mut original.clone())
.unwrap()
.to_string(),
edit_2
);
let merged = operations_1.merge(&operations_2).unwrap();
// println!("Merged: {:?}", merged);
let result = merged.apply(&mut original).unwrap();
result.to_string()

View file

@ -1,6 +1,6 @@
---
source: sync_lib/src/operations/operation.rs
expression: "Operation::create_insert(1, \"hi\").unwrap().unwrap().with_shifted_index(-2)"
source: reconcile/src/operations/operation.rs
expression: "Operation::create_insert(1, \"hi\".to_string()).unwrap().with_shifted_index(-2)"
snapshot_kind: text
---
Err(

View file

@ -0,0 +1,42 @@
---
source: reconcile/src/operations/operation_sequence.rs
expression: operations
snapshot_kind: text
---
OperationSequence {
operations: [
Insert {
index: 0,
text: "Hello, my friend! ",
},
Delete {
index: 18,
deleted_character_count: 13,
deleted_text: Some(
"hello world! ",
),
},
Delete {
index: 26,
deleted_character_count: 5,
deleted_text: Some(
"you? ",
),
},
Delete {
index: 26,
deleted_character_count: 5,
deleted_text: Some(
" Adam",
),
},
Insert {
index: 26,
text: "you ",
},
Insert {
index: 30,
text: "doing? Albert",
},
],
}

View file

@ -0,0 +1,42 @@
---
source: reconcile/src/operations/operation_sequence.rs
expression: result
snapshot_kind: text
---
OperationSequence {
operations: [
Insert {
index: 0,
text: "Hello, my friend! ",
},
Delete {
index: 18,
deleted_character_count: 13,
deleted_text: Some(
"hello world! ",
),
},
Delete {
index: 26,
deleted_character_count: 5,
deleted_text: Some(
"you? ",
),
},
Delete {
index: 26,
deleted_character_count: 5,
deleted_text: Some(
" Adam",
),
},
Insert {
index: 26,
text: "you ",
},
Insert {
index: 30,
text: "doing? Albert",
},
],
}

View file

@ -0,0 +1 @@
pub mod token;

View file

@ -0,0 +1,26 @@
/// A single diffable unit of text.
///
/// Holds two renderings of the same piece: `normalised`, which is the only
/// field equality looks at, and `original`, the text as it was handed to the
/// constructor.
#[derive(Debug, Clone)]
pub struct Token {
    /// Form compared by the `PartialEq` implementation.
    pub normalised: String,
    /// Text exactly as given to `Token::new`.
    pub original: String,
}

impl Token {
    /// Builds a token from a normalised form and its original text.
    pub fn new(normalised: String, original: String) -> Self {
        Self {
            normalised,
            original,
        }
    }

    /// Splits `text` after every whitespace character, keeping that character
    /// at the end of the piece it terminates.
    ///
    /// No normalisation happens here: each resulting token stores the same
    /// piece in both `normalised` and `original`.
    pub fn tokenize(text: &str) -> Vec<Token> {
        text.split_inclusive(char::is_whitespace)
            .map(|piece| {
                let owned = piece.to_owned();
                Self::new(owned.clone(), owned)
            })
            .collect()
    }
}

/// Two tokens are equal when their `normalised` forms match; `original` is
/// not part of the comparison.
impl PartialEq for Token {
    fn eq(&self, other: &Self) -> bool {
        self.normalised.eq(&other.normalised)
    }
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,25 +0,0 @@
---
source: sync_lib/src/operations/operation_sequence.rs
expression: operations
snapshot_kind: text
---
OperationSequence {
operations: [
Delete {
index: 0,
deleted_character_count: 13,
},
Insert {
index: 0,
text: "Hello, my friend! ",
},
Delete {
index: 26,
deleted_character_count: 10,
},
Insert {
index: 26,
text: "you doing? Albert",
},
],
}

View file

@ -1,11 +0,0 @@
---
source: sync_lib/src/operations/operation_sequence.rs
expression: result
snapshot_kind: text
---
Err(
DiffTooLarge {
diff_ratio: 0.73333335,
diff_ratio_limit: 0.1,
},
)