Fix syncing when network latency is present (#4)

* WIP

* Add debug

* Dedupe inserts

* Add deterministic ordering

* Fix whitespaces

* Update insta

* Add integration test script

* Rename

* Add test

* Working for non-deletes

* omg it mostly works for deletes

* Isdeleted fix

* remove created dates

* update api

* Take document id

* No max attempt

* works

* Use string uuids

* .

* working!!!! (hopefully)

* Improve bundling

* Add module

* lint

* .

* lint

* Fix CI

* use toolchain

* clean up

* Add useSlowFileEvents

* Delete fuzz

* Fix CI

* use docker

* fix script

* clean up

* Clean up

* change node version

* Build docker image on every commit

* fix ci

* 1 db per vault

* Add scripts folder

* Bump versions

* Lint

* .

* Fix tests for real

* Style

* .

* try

* Consistent ordering

* Fix tests

* hmm

* .

* Clean up diff

* Fixes

* .

* Fix version bump

* .

* .

* .
This commit is contained in:
Andras Schmelczer 2025-03-16 20:13:49 +00:00 committed by GitHub
parent bcf48c428d
commit 8b8f1d91d9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
91 changed files with 2252 additions and 1586 deletions

View file

@ -38,7 +38,7 @@ use crate::{
/// execution time permitted before it bails and falls back to an approximation.
pub fn diff<T>(old: &[Token<T>], new: &[Token<T>]) -> Vec<RawOperation<T>>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
let max_d = (old.len() + new.len()).div_ceil(2) + 1;
let mut vb = V::new(max_d);
@ -99,7 +99,6 @@ impl IndexMut<isize> for V {
}
}
#[inline(always)]
/// Cuts `range` in two at `at`: the first half ends at `at` and the second
/// half starts there.
///
/// `at` is used verbatim for both new endpoints; no check is made that it
/// lies inside `range`.
fn split_at(range: Range<usize>, at: usize) -> (Range<usize>, Range<usize>) {
    let Range { start, end } = range;
    (start..at, at..end)
}
@ -124,7 +123,7 @@ fn find_middle_snake<T>(
vb: &mut V,
) -> Option<(usize, usize)>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
let n = old_range.len();
let m = new_range.len();
@ -230,7 +229,7 @@ fn conquer<T>(
vb: &mut V,
result: &mut Vec<RawOperation<T>>,
) where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
// Check for common prefix
let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone());

View file

@ -3,7 +3,7 @@ use crate::tokenizer::token::Token;
#[derive(Debug, Clone, PartialEq)]
pub enum RawOperation<T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
Insert(Vec<Token<T>>),
Delete(Vec<Token<T>>),
@ -12,13 +12,13 @@ where
impl<T> RawOperation<T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
/// Returns the tokens carried by this operation.
///
/// Every variant wraps a token list, so a single or-pattern covers them all
/// and no arm is left unreachable.
pub fn tokens(&self) -> &Vec<Token<T>> {
    match self {
        RawOperation::Insert(tokens)
        | RawOperation::Delete(tokens)
        | RawOperation::Equal(tokens) => tokens,
    }
}

View file

@ -37,7 +37,7 @@ pub fn reconcile_with_tokenizer<F, T>(
tokenizer: &Tokenizer<T>,
) -> String
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
let left_operations = EditedText::from_strings_with_tokenizer(original, left, tokenizer);
let right_operations = EditedText::from_strings_with_tokenizer(original, right, tokenizer);
@ -73,7 +73,8 @@ mod test {
"original_1 edit_1 original_3",
);
// One deleted a large range, the other deleted subranges and inserted as well
// One deleted a large range, the other deleted subranges and inserted as
// well
test_merge_both_ways(
"original_1 original_2 original_3 original_4 original_5",
"original_1 original_5",
@ -120,9 +121,6 @@ mod test {
"hi, my friend!",
);
// test_merge_both_ways("hello world", "world !", "hi hello world", "hi world
// !");
test_merge_both_ways(
"both delete the same word",
"both the same word",
@ -147,7 +145,33 @@ mod test {
);
}
#[ignore = "it's too slow"]
#[test]
fn test_reconcile_idempotent_inserts() {
    // Each row holds the four arguments of `test_merge_both_ways`, in call
    // order (presumably original, left, right, expected — confirm against the
    // helper's signature).
    let cases = [
        // Both sides inserted the same prefix; it must end up in the result once.
        (
            "hi ",
            "hi there ",
            "hi there my friend ",
            "hi there my friend ",
        ),
        // The start of one insert already appears in the other, so it must not
        // be duplicated.
        (
            "hi ",
            "hi there you ",
            "hi there my friend ",
            "hi there my friend you ",
        ),
        ("a", "a b c", "a b c d", "a b c d"),
        (
            " |7ca2b36d-6ee7-49eb-8eb1-d77e4cc1a001| ",
            " |7ca2b36d-6ee7-49eb-8eb1-d77e4cc1a001| |cd9195cc-103a-4f13-90c8-4fba0ba421ee| |d39156cc-cfd6-42a8-b70a-75020896069d| |fbad794c-9c47-41f2-a343-490284ecb5a0| |dup| ",
            " |7ca2b36d-6ee7-49eb-8eb1-d77e4cc1a001| |cd9195cc-103a-4f13-90c8-4fba0ba421ee| |dup| ",
            " |7ca2b36d-6ee7-49eb-8eb1-d77e4cc1a001| |cd9195cc-103a-4f13-90c8-4fba0ba421ee| |d39156cc-cfd6-42a8-b70a-75020896069d| |fbad794c-9c47-41f2-a343-490284ecb5a0| |dup| |dup| ",
        ),
    ];

    for (first, second, third, fourth) in cases {
        test_merge_both_ways(first, second, third, fourth);
    }
}
#[test_matrix( [
"pride_and_prejudice.txt",
"romeo_and_juliet.txt",

View file

@ -25,7 +25,7 @@ use crate::{
#[derive(Debug, Clone, PartialEq, Default)]
pub struct EditedText<'a, T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
text: &'a str,
operations: Vec<OrderedOperation<T>>,
@ -46,7 +46,7 @@ impl<'a> EditedText<'a, String> {
impl<'a, T> EditedText<'a, T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
/// Create an `EditedText` from the given original (old) and updated (new)
/// strings. The returned `EditedText` represents the changes from the
@ -65,7 +65,6 @@ where
Self::new(
original,
// Self::cook_operations(diff),
Self::cook_operations(Self::elongate_operations(diff)).collect(),
)
}
@ -191,7 +190,7 @@ where
pub fn merge(self, other: Self) -> Self {
debug_assert_eq!(
self.text, other.text,
"EditedText-s must be derived from the same text to be mergable"
"`EditedText`-s must be derived from the same text to be mergable"
);
let mut left_merge_context = MergeContext::default();
@ -207,9 +206,21 @@ where
|(operation, _)| {
(
operation.order,
// Operations on left and right must come in the same order so that
// Operations on the left and right must come in the same order so that
// inserts can be merged with other inserts and deletes with deletes.
usize::from(matches!(operation.operation, Operation::Delete { .. })),
// Make sure that the ordering is deterministic regardless which text
// is left or right.
match &operation.operation {
Operation::Insert { text, .. } => text
.iter()
.map(super::super::tokenizer::token::Token::original)
.collect::<String>(),
Operation::Delete {
deleted_character_count,
..
} => deleted_character_count.to_string(),
},
)
},
)
@ -232,6 +243,7 @@ where
}
/// Apply the operations to the text and return the resulting text.
#[must_use]
pub fn apply(&self) -> String {
let mut builder: StringBuilder<'_> = StringBuilder::new(self.text);
@ -282,7 +294,7 @@ mod tests {
let original = "hello world! ...";
let left = "Hello world! I'm Andras.";
let right = "Hello world! How are you?";
let expected = "Hello world! I'm Andras.How are you?";
let expected = "Hello world! How are you? I'm Andras.";
let operations_1 = EditedText::from_strings(original, left);
let operations_2 = EditedText::from_strings(original, right);

View file

@ -5,7 +5,7 @@ use crate::operation_transformation::Operation;
#[derive(Clone)]
pub struct MergeContext<T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
last_operation: Option<Operation<T>>,
pub shift: i64,
@ -13,7 +13,7 @@ where
impl<T> Default for MergeContext<T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
fn default() -> Self {
MergeContext {
@ -25,7 +25,7 @@ where
impl<T> Debug for MergeContext<T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
f.debug_struct("MergeContext")
@ -37,7 +37,7 @@ where
impl<T> MergeContext<T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
pub fn last_operation(&self) -> Option<&Operation<T>> { self.last_operation.as_ref() }

View file

@ -1,7 +1,5 @@
use core::{
fmt::{Debug, Display},
ops::Range,
};
use core::fmt::{Debug, Display};
use std::ops::Range;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
@ -9,7 +7,10 @@ use serde::{Deserialize, Serialize};
use super::merge_context::MergeContext;
use crate::{
Token,
utils::{find_common_overlap::find_common_overlap, string_builder::StringBuilder},
utils::{
find_longest_prefix_contained_within::find_longest_prefix_contained_within,
string_builder::StringBuilder,
},
};
/// Represents a change that can be applied to a text document.
@ -19,7 +20,7 @@ use crate::{
#[derive(Clone, PartialEq)]
pub enum Operation<T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
Insert {
index: usize,
@ -37,7 +38,7 @@ where
impl<T> Operation<T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
/// Creates an insert operation with the given index and text.
/// If the text is empty (meaning that the operation would be a no-op),
@ -81,15 +82,8 @@ where
})
}
/// Tries to apply the operation to the given `ropey::Rope` text, returning
/// the modified text.
///
/// # Errors
///
/// Returns a `SyncLibError::OperationApplicationError` if the operation
/// cannot be applied.
///
/// # Panics
/// Applies the operation to the given `StringBuilder`, returning the
/// modified `StringBuilder`.
///
/// When compiled in debug mode, panics if a delete operation is attempted
/// on a range of text that does not match the text to be deleted.
@ -114,7 +108,7 @@ where
builder.delete(self.range());
}
};
}
builder
}
@ -122,8 +116,7 @@ where
/// Returns the index of the first character that the operation affects.
///
/// Both variants carry an `index` field, so one or-pattern handles them and
/// no arm is left unreachable.
pub fn start_index(&self) -> usize {
    match self {
        Operation::Insert { index, .. } | Operation::Delete { index, .. } => *index,
    }
}
@ -137,6 +130,7 @@ where
}
/// Returns the half-open range of character indices that the operation
/// affects, running from `start_index()` up to and including `end_index()`.
// The signature promises `Range<usize>`, so the inclusive `..=` form that
// clippy suggests would not type-check here; hence the allow.
#[allow(clippy::range_plus_one)]
pub fn range(&self) -> Range<usize> {
    let (first, last) = (self.start_index(), self.end_index());
    first..last + 1
}
/// Returns the number of affected characters. It is always greater than 0
@ -212,17 +206,20 @@ where
..
}),
) => {
let offset_in_tokens = find_common_overlap(previous_inserted_text, &text);
let trimmed_length_in_tokens = previous_inserted_text.len() - offset_in_tokens;
let trimmed_length = previous_inserted_text
// In case the current insert's prefix appears in the previously inserted text,
// we can trim the current insert to only include the non-overlapping part.
// This way, we don't end up duplicating text.
let offset_in_tokens =
find_longest_prefix_contained_within(previous_inserted_text, &text);
let offset_in_length = text
.iter()
.skip(offset_in_tokens)
.take(offset_in_tokens)
.map(Token::get_original_length)
.sum::<usize>();
let trimmed_operation =
Operation::create_insert(index, text[trimmed_length_in_tokens..].to_vec());
Operation::create_insert(index, text[offset_in_tokens..].to_vec());
affecting_context.shift -= trimmed_length as i64;
affecting_context.shift -= offset_in_length as i64;
produced_context.shift += trimmed_operation
.as_ref()
.map(Operation::len)
@ -297,7 +294,7 @@ where
impl<T> Display for Operation<T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
@ -341,7 +338,7 @@ where
/// Debug-formats an operation by writing its `Display` output, so `{:?}`
/// prints the same text as `{}`.
impl<T> Debug for Operation<T>
where
    T: PartialEq + Clone + std::fmt::Debug,
{
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "{self}") }
}
@ -353,7 +350,7 @@ mod tests {
use super::*;
#[test]
#[should_panic]
#[should_panic(expected = "Shifted index must be non-negative")]
fn test_shifting_error() {
insta::assert_debug_snapshot!(
Operation::create_insert(1, vec!["hi".into()])

View file

@ -8,19 +8,19 @@ EditedText {
operations: [
OrderedOperation {
order: 0,
operation: <insert 'Hello, my friend! ' from index 0>,
operation: <insert 'Hello, my friend!' from index 0>,
},
OrderedOperation {
order: 0,
operation: <delete 'hello world! ' from index 18>,
operation: <delete 'hello world!' from index 17>,
},
OrderedOperation {
order: 21,
operation: <insert 'you doing? Albert' from index 26>,
order: 20,
operation: <insert ' you doing? Albert' from index 25>,
},
OrderedOperation {
order: 21,
operation: <delete 'you? Adam' from index 43>,
order: 20,
operation: <delete ' you? Adam' from index 43>,
},
],
}

View file

@ -0,0 +1,6 @@
---
source: reconcile/src/tokenizer/word_tokenizer.rs
expression: "word_tokenizer(\"\")"
snapshot_kind: text
---
[]

View file

@ -0,0 +1,15 @@
---
source: reconcile/src/tokenizer/word_tokenizer.rs
expression: "word_tokenizer(\" what? \")"
snapshot_kind: text
---
[
Token {
normalised: "what?",
original: " what?",
},
Token {
normalised: "",
original: " ",
},
]

View file

@ -0,0 +1,23 @@
---
source: reconcile/src/tokenizer/word_tokenizer.rs
expression: "word_tokenizer(\" hello, \\nwhere are you?\")"
snapshot_kind: text
---
[
Token {
normalised: "hello,",
original: " hello,",
},
Token {
normalised: "where",
original: " \nwhere",
},
Token {
normalised: "are",
original: " are",
},
Token {
normalised: "you?",
original: " you?",
},
]

View file

@ -0,0 +1,15 @@
---
source: reconcile/src/tokenizer/word_tokenizer.rs
expression: "word_tokenizer(\"Hi there!\")"
snapshot_kind: text
---
[
Token {
normalised: "Hi",
original: "Hi",
},
Token {
normalised: "there!",
original: " there!",
},
]

View file

@ -8,24 +8,19 @@ use serde::{Deserialize, Serialize};
/// A piece of text paired with the normalised value used to compare it.
#[derive(Debug, Clone)]
pub struct Token<T>
where
    T: PartialEq + Clone + std::fmt::Debug,
{
    /// Normalised form of the token; this is the only field that token
    /// equality looks at.
    normalised: T,
    /// The text the token was created from, kept alongside the normalised
    /// form.
    original: String,
}
impl From<&str> for Token<String> {
fn from(s: &str) -> Self {
Token {
normalised: s.to_owned(),
original: s.to_owned(),
}
}
fn from(s: &str) -> Self { Token::new(s.trim().to_owned(), s.to_owned()) }
}
impl<T> Token<T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
pub fn new(normalised: T, original: String) -> Self {
Token {
@ -43,7 +38,7 @@ where
/// Tokens are equal when their normalised forms are equal; the original text
/// does not take part in the comparison.
impl<T> PartialEq for Token<T>
where
    T: PartialEq + Clone + std::fmt::Debug,
{
    fn eq(&self, other: &Self) -> bool { self.normalised == other.normalised }
}

View file

@ -1,7 +1,48 @@
use super::token::Token;
/// Splits on whitespace keeping the leading whitespace.
///
///
/// ## Example
///
/// "Hi there!" -> ["Hi", " there!"]
pub fn word_tokenizer(text: &str) -> Vec<Token<String>> {
text.split_inclusive(char::is_whitespace)
.map(|s| Token::new(s.to_owned(), s.to_owned()))
.collect()
let mut result: Vec<Token<String>> = Vec::new();
let mut last_whitespace = 0;
let mut previous_char_is_whitespace = true;
for (i, c) in text.char_indices() {
let is_current_char_whitespace = c.is_whitespace();
if !previous_char_is_whitespace && is_current_char_whitespace {
result.push(text[last_whitespace..i].into());
last_whitespace = i;
}
previous_char_is_whitespace = is_current_char_whitespace;
}
if last_whitespace < text.len() {
result.push(text[last_whitespace..].into());
}
result
}
// Snapshot tests for `word_tokenizer`.
#[cfg(test)]
mod tests {
use insta::assert_debug_snapshot;
use super::*;
// Records the Debug output of the tokens produced for four inputs: two plain
// words, the empty string, a word surrounded by whitespace, and text with a
// newline between words.
// NOTE(review): these snapshots are unnamed, so insta presumably ties each
// one to its position and expression within this test — confirm before
// reordering or rewriting the calls.
#[test]
fn test_with_snapshots() {
assert_debug_snapshot!(word_tokenizer("Hi there!"));
assert_debug_snapshot!(word_tokenizer(""));
assert_debug_snapshot!(word_tokenizer(" what? "));
assert_debug_snapshot!(word_tokenizer(" hello, \nwhere are you?"));
}
}

View file

@ -1,6 +1,6 @@
pub mod common_prefix_len;
pub mod common_suffix_len;
pub mod find_common_overlap;
pub mod find_longest_prefix_contained_within;
pub mod merge_iters;
pub mod ordered_operation;
pub mod side;

View file

@ -1,71 +0,0 @@
use crate::Token;
/// Finds where the tail of `old` starts to coincide with the head of `new`.
///
/// The result is the smallest offset into `old` such that every token from
/// that offset to the end of `old` equals the token at the same position
/// counted from the start of `new`. Offsets that would leave more tokens in
/// the tail of `old` than `new` contains are never tried.
///
/// When no such offset exists, the maximum offset — the length of `old` — is
/// returned.
///
/// ## Example
///
/// ```not_rust
/// old: [0, 1, 9, 0, 2, 5]
/// new: [9, 0, 2, 5, 1]
/// ```
/// > results in an offset of 2
pub fn find_common_overlap<T>(old: &[Token<T>], new: &[Token<T>]) -> usize
where
    T: PartialEq + Clone,
{
    let earliest_start = old.len().saturating_sub(new.len());

    (earliest_start..old.len())
        .find(|&start| {
            old[start..]
                .iter()
                .zip(new)
                .all(|(kept, incoming)| kept == incoming)
        })
        .unwrap_or(old.len())
}
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use super::*;

    /// Turns plain words into the token list the function under test takes.
    fn tokens(words: &[&str]) -> Vec<Token<String>> {
        words.iter().map(|&word| Token::from(word)).collect()
    }

    #[test]
    fn test_common_overlap() {
        // Identical single-token lists overlap from the very start.
        assert_eq!(find_common_overlap(&tokens(&[""]), &tokens(&[""])), 0);

        // The tail "b c" of the old list is the head of the new list.
        assert_eq!(
            find_common_overlap(&tokens(&["a", "b", "c"]), &tokens(&["b", "c", "a"])),
            1
        );

        // Only the last "a" can be followed by the rest of the new list.
        assert_eq!(
            find_common_overlap(&tokens(&["a", "a", "a"]), &tokens(&["a", "b", "c"])),
            2
        );

        // No overlap at all: the length of the old list is returned.
        assert_eq!(
            find_common_overlap(&tokens(&["a", "b", "c"]), &tokens(&["d", "e", "a"])),
            3
        );

        // The old list is longer than the new one.
        assert_eq!(find_common_overlap(&tokens(&["a", "a"]), &tokens(&["a"])), 1);
    }
}

View file

@ -0,0 +1,103 @@
use crate::Token;
/// Returns the length of the longest prefix of `new` that occurs as a
/// contiguous run of tokens anywhere inside `old`.
///
/// Candidate lengths are tried from the longest possible (the shorter of the
/// two list lengths) downwards, and the first one found in `old` wins. The
/// result is 0 when not even the first token of `new` appears in `old`, or
/// when either list is empty.
///
/// ## Example
///
/// ```not_rust
/// old: [0, 1, 9, 0, 2, 5]
/// new: [9, 0, 2, 5, 1]
/// ```
/// > results in a length of 4
///
/// ```not_rust
/// old: [0, 1, 9, 0, 2, 5]
/// new: [0, 2]
/// ```
/// > results in a length of 2
///
/// ```not_rust
/// old: [0, 1, 9, 0, 2, 5]
/// new: [0, 4]
/// ```
/// > results in a length of 1
pub fn find_longest_prefix_contained_within<T>(old: &[Token<T>], new: &[Token<T>]) -> usize
where
    T: PartialEq + Clone + std::fmt::Debug,
{
    let longest_candidate = old.len().min(new.len());

    (1..=longest_candidate)
        .rev()
        .find(|&len| old.windows(len).any(|window| window == &new[..len]))
        .unwrap_or(0)
}
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use super::*;

    /// Turns plain words into the token list the function under test takes.
    fn tokens(words: &[&str]) -> Vec<Token<String>> {
        words.iter().map(|&word| Token::from(word)).collect()
    }

    /// Shorthand for calling the function under test on two word lists.
    fn longest_prefix(old: &[&str], new: &[&str]) -> usize {
        find_longest_prefix_contained_within(&tokens(old), &tokens(new))
    }

    #[test]
    fn test_common_overlap() {
        // Identical single-token lists share that one token.
        assert_eq!(longest_prefix(&[""], &[""]), 1);

        // Only "b c" of the new list's start is present in the old list.
        assert_eq!(longest_prefix(&["a", "b", "c"], &["b", "c", "a"]), 2);

        // The whole new list sits at the end of the old list.
        assert_eq!(longest_prefix(&["a", "b", "c"], &["b", "c"]), 2);

        // The whole new list sits in the middle of the old list.
        assert_eq!(longest_prefix(&["a", "b", "c"], &["b"]), 1);

        // The match is found even though an earlier "b" is not followed by "a".
        assert_eq!(longest_prefix(&["a", "b", "c", "b", "a"], &["b", "a"]), 2);

        // Only the first token of the new list occurs in the old list.
        assert_eq!(longest_prefix(&["a", "a", "a"], &["a", "b", "c"]), 1);

        // The new list's first token is absent from the old list.
        assert_eq!(longest_prefix(&["a", "b", "c"], &["d", "e", "a"]), 0);
    }
}

View file

@ -46,8 +46,7 @@ where
};
match order {
Some(Ordering::Less) | None => self.left.next(),
Some(Ordering::Equal) => self.left.next(),
Some(Ordering::Less | Ordering::Equal) | None => self.left.next(),
Some(Ordering::Greater) => self.right.next(),
}
}

View file

@ -7,7 +7,7 @@ use crate::operation_transformation::Operation;
#[derive(Debug, Clone, PartialEq)]
pub struct OrderedOperation<T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
pub order: usize,
pub operation: Operation<T>,