From d7ae0a781df02d5c063fef141b2e421c8f10bc75 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Sun, 2 Mar 2025 14:54:57 +0000 Subject: [PATCH] Dedupe inserts --- .../reconcile/src/operation_transformation.rs | 25 ++++- .../src/operation_transformation/operation.rs | 37 ++++--- backend/reconcile/src/utils.rs | 2 +- .../src/utils/find_common_overlap.rs | 71 ------------ .../find_longest_prefix_contained_within.rs | 103 ++++++++++++++++++ 5 files changed, 145 insertions(+), 93 deletions(-) delete mode 100644 backend/reconcile/src/utils/find_common_overlap.rs create mode 100644 backend/reconcile/src/utils/find_longest_prefix_contained_within.rs diff --git a/backend/reconcile/src/operation_transformation.rs b/backend/reconcile/src/operation_transformation.rs index ef9a5e81..1f34fa12 100644 --- a/backend/reconcile/src/operation_transformation.rs +++ b/backend/reconcile/src/operation_transformation.rs @@ -37,7 +37,7 @@ pub fn reconcile_with_tokenizer( tokenizer: &Tokenizer, ) -> String where - T: PartialEq + Clone, + T: PartialEq + Clone + std::fmt::Debug, { let left_operations = EditedText::from_strings_with_tokenizer(original, left, tokenizer); let right_operations = EditedText::from_strings_with_tokenizer(original, right, tokenizer); @@ -120,9 +120,6 @@ mod test { "hi, my friend!", ); - // test_merge_both_ways("hello world", "world !", "hi hello world", "hi world - // !"); - test_merge_both_ways( "both delete the same word", "both the same word", @@ -147,7 +144,25 @@ mod test { ); } - #[ignore = "it's too slow"] + #[test] + fn test_reconcile_idempotent_inserts() { + // Both inserted the same prefix; this should get deduped + test_merge_both_ways( + "hi ", + "hi there ", + "hi there my friend", + "hi there my friend", + ); + + // The prefix of the 2nd appears on the 1st so it shouldn't get duplicated + test_merge_both_ways( + "hi ", + "hi there you ", + "hi there my friend", + "hi there you my friend", + ); + } + #[test_matrix( [ "pride_and_prejudice.txt", "romeo_and_juliet.txt", diff --git a/backend/reconcile/src/operation_transformation/operation.rs b/backend/reconcile/src/operation_transformation/operation.rs index c19265b5..5de0141b 100644 --- a/backend/reconcile/src/operation_transformation/operation.rs +++ b/backend/reconcile/src/operation_transformation/operation.rs @@ -2,14 +2,18 @@ use core::{ fmt::{Debug, Display}, ops::Range, }; +use std::cmp::min; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use super::merge_context::MergeContext; use crate::{ + utils::{ + find_longest_prefix_contained_within::find_longest_prefix_contained_within, + string_builder::StringBuilder, + }, Token, - utils::{find_common_overlap::find_common_overlap, string_builder::StringBuilder}, }; /// Represents a change that can be applied to a text document. @@ -19,7 +23,7 @@ use crate::{ #[derive(Clone, PartialEq)] pub enum Operation where - T: PartialEq + Clone, + T: PartialEq + Clone + std::fmt::Debug, { Insert { index: usize, @@ -37,7 +41,7 @@ where impl Operation where - T: PartialEq + Clone, + T: PartialEq + Clone + std::fmt::Debug, { /// Creates an insert operation with the given index and text. /// If the text is empty (meaning that the operation would be a no-op), @@ -212,17 +216,20 @@ where .. }), ) => { - let offset_in_tokens = find_common_overlap(previous_inserted_text, &text); - let trimmed_length_in_tokens = previous_inserted_text.len() - offset_in_tokens; - let trimmed_length = previous_inserted_text + // In case the current insert's prefix appears in the previously inserted text, + // we can trim the current insert to only include the non-overlapping part. + // This way, we don't end up duplicating text. + let offset_in_tokens = + find_longest_prefix_contained_within(previous_inserted_text, &text); + let offset_in_length = text .iter() - .skip(offset_in_tokens) + .take(offset_in_tokens) .map(Token::get_original_length) .sum::(); let trimmed_operation = - Operation::create_insert(index, text[trimmed_length_in_tokens..].to_vec()); + Operation::create_insert(index, text[offset_in_tokens..].to_vec()); - affecting_context.shift -= trimmed_length as i64; + affecting_context.shift -= offset_in_length as i64; produced_context.shift += trimmed_operation .as_ref() .map(Operation::len) @@ -297,7 +304,7 @@ where impl Display for Operation where - T: PartialEq + Clone, + T: PartialEq + Clone + std::fmt::Debug, { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { @@ -341,7 +348,7 @@ where impl Debug for Operation where - T: PartialEq + Clone, + T: PartialEq + Clone + std::fmt::Debug, { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "{self}") } } @@ -355,11 +362,9 @@ mod tests { #[test] #[should_panic] fn test_shifting_error() { - insta::assert_debug_snapshot!( - Operation::create_insert(1, vec!["hi".into()]) - .unwrap() - .with_shifted_index(-2) - ); + insta::assert_debug_snapshot!(Operation::create_insert(1, vec!["hi".into()]) + .unwrap() + .with_shifted_index(-2)); } #[test] diff --git a/backend/reconcile/src/utils.rs b/backend/reconcile/src/utils.rs index c0c3c33d..8461b5ff 100644 --- a/backend/reconcile/src/utils.rs +++ b/backend/reconcile/src/utils.rs @@ -1,6 +1,6 @@ pub mod common_prefix_len; pub mod common_suffix_len; -pub mod find_common_overlap; +pub mod find_longest_prefix_contained_within; pub mod merge_iters; pub mod ordered_operation; pub mod side; diff --git a/backend/reconcile/src/utils/find_common_overlap.rs b/backend/reconcile/src/utils/find_common_overlap.rs deleted file mode 100644 index ac586b81..00000000 --- a/backend/reconcile/src/utils/find_common_overlap.rs +++ /dev/null @@ -1,71 +0,0 @@ -use crate::Token; - -/// Given two lists of tokens, returns the offset in the first (old) list from -/// which the two lists have the same tokens until the end of the first list. -/// Thus, the suffix of the old list from the offset to the end is equal to a -/// prefix of the new list. -/// -/// If there is no overlap, the function returns the maxmium offset, the length -/// of the old list. -/// -/// ## Example -/// -/// ```not_rust -/// old: [0, 1, 9, 0, 2, 5] -/// new: [9, 0, 2, 5, 1] -/// ``` -/// > results in an offset of 2 -pub fn find_common_overlap(old: &[Token], new: &[Token]) -> usize -where - T: PartialEq + Clone, -{ - let minimum_offset = old.len().saturating_sub(new.len()); - for offset in minimum_offset..old.len() { - if old.iter().skip(offset).zip(new.iter()).all(|(a, b)| a == b) { - return offset; - } - } - - old.len() -} - -#[cfg(test)] -mod tests { - use pretty_assertions::assert_eq; - - use super::*; - - #[test] - fn test_common_overlap() { - assert_eq!(find_common_overlap(&["".into()], &["".into()]), 0); - - assert_eq!( - find_common_overlap( - &["a".into(), "b".into(), "c".into()], - &["b".into(), "c".into(), "a".into()] - ), - 1 - ); - - assert_eq!( - find_common_overlap( - &["a".into(), "a".into(), "a".into()], - &["a".into(), "b".into(), "c".into()] - ), - 2 - ); - - assert_eq!( - find_common_overlap( - &["a".into(), "b".into(), "c".into()], - &["d".into(), "e".into(), "a".into()] - ), - 3 - ); - - assert_eq!( - find_common_overlap(&["a".into(), "a".into()], &["a".into()]), - 1 - ); - } -} diff --git a/backend/reconcile/src/utils/find_longest_prefix_contained_within.rs b/backend/reconcile/src/utils/find_longest_prefix_contained_within.rs new file mode 100644 index 00000000..eb4b8264 --- /dev/null +++ b/backend/reconcile/src/utils/find_longest_prefix_contained_within.rs @@ -0,0 +1,103 @@ +use crate::Token; + +/// Given two lists of tokens, returns `length` where `old` list somewhere +/// within contains the `length` prefix of the `new` list. +/// +/// ## Example +/// +/// ```not_rust +/// old: [0, 1, 9, 0, 2, 5] +/// new: [9, 0, 2, 5, 1] +/// ``` +/// > results in an length of 4 +/// +/// +/// ```not_rust +/// old: [0, 1, 9, 0, 2, 5] +/// new: [0, 2] +/// ``` +/// > results in an length of 2 +/// +/// ```not_rust +/// old: [0, 1, 9, 0, 2, 5] +/// new: [0, 4] +/// ``` +/// > results in an length of 1 +pub fn find_longest_prefix_contained_within(old: &[Token], new: &[Token]) -> usize +where + T: PartialEq + Clone + std::fmt::Debug, +{ + let max_possible = new.len().min(old.len()); + + for len in (1..=max_possible).rev() { + let prefix = &new[..len]; + if old.windows(len).any(|window| window == prefix) { + return len; + } + } + + 0 +} + +#[cfg(test)] +mod tests { + use pretty_assertions::assert_eq; + + use super::*; + + #[test] + fn test_common_overlap() { + assert_eq!( + find_longest_prefix_contained_within(&["".into()], &["".into()]), + 1 + ); + + assert_eq!( + find_longest_prefix_contained_within( + &["a".into(), "b".into(), "c".into()], + &["b".into(), "c".into(), "a".into()] + ), + 2 + ); + + assert_eq!( + find_longest_prefix_contained_within( + &["a".into(), "b".into(), "c".into()], + &["b".into(), "c".into()] + ), + 2 + ); + + assert_eq!( + find_longest_prefix_contained_within( + &["a".into(), "b".into(), "c".into()], + &["b".into()] + ), + 1 + ); + + assert_eq!( + find_longest_prefix_contained_within( + &["a".into(), "b".into(), "c".into(), "b".into(), "a".into()], + &["b".into(), "a".into()] + ), + 2 + ); + + assert_eq!( + find_longest_prefix_contained_within( + &["a".into(), "a".into(), "a".into()], + &["a".into(), "b".into(), "c".into()] + ), + 1 + ); + + assert_eq!( + find_longest_prefix_contained_within( + &["a".into(), "b".into(), "c".into()], + &["d".into(), "e".into(), "a".into()] + ), + 0 + ); + } +}