Dedupe inserts

2025-03-02 14:54:57 +00:00 · 2025-03-02 14:54:57 +00:00 · d7ae0a781d
commit d7ae0a781d
parent a93c17711c
5 changed files with 145 additions and 93 deletions
--- a/backend/reconcile/src/operation_transformation.rs
+++ b/backend/reconcile/src/operation_transformation.rs
@ -37,7 +37,7 @@ pub fn reconcile_with_tokenizer<F, T>(
    tokenizer: &Tokenizer<T>,
 ) -> String
 where
-    T: PartialEq + Clone,
+    T: PartialEq + Clone + std::fmt::Debug,
 {
    let left_operations = EditedText::from_strings_with_tokenizer(original, left, tokenizer);
    let right_operations = EditedText::from_strings_with_tokenizer(original, right, tokenizer);
@ -120,9 +120,6 @@ mod test {
            "hi, my friend!",
        );

-        // test_merge_both_ways("hello world", "world !", "hi hello world", "hi world
-        // !");
-
        test_merge_both_ways(
            "both delete the same word",
            "both the same word",
@ -147,7 +144,25 @@ mod test {
        );
    }

-    #[ignore = "it's too slow"]
+    #[test]
+    fn test_reconcile_idempotent_inserts() {
+        // Both sides inserted the same prefix, so it should be deduplicated
+        test_merge_both_ways(
+            "hi ",
+            "hi there ",
+            "hi there my friend",
+            "hi there my friend",
+        );
+
+        // The prefix of the 2nd insert already appears in the 1st, so it shouldn't get duplicated
+        test_merge_both_ways(
+            "hi ",
+            "hi there you ",
+            "hi there my friend",
+            "hi there you my friend",
+        );
+    }
+
    #[test_matrix( [
        "pride_and_prejudice.txt",
        "romeo_and_juliet.txt",
--- a/backend/reconcile/src/operation_transformation/operation.rs
+++ b/backend/reconcile/src/operation_transformation/operation.rs
@ -2,14 +2,18 @@ use core::{
    fmt::{Debug, Display},
    ops::Range,
 };
+use std::cmp::min;

 #[cfg(feature = "serde")]
 use serde::{Deserialize, Serialize};

 use super::merge_context::MergeContext;
 use crate::{
+    utils::{
+        find_longest_prefix_contained_within::find_longest_prefix_contained_within,
+        string_builder::StringBuilder,
+    },
    Token,
-    utils::{find_common_overlap::find_common_overlap, string_builder::StringBuilder},
 };

 /// Represents a change that can be applied to a text document.
@ -19,7 +23,7 @@ use crate::{
 #[derive(Clone, PartialEq)]
 pub enum Operation<T>
 where
-    T: PartialEq + Clone,
+    T: PartialEq + Clone + std::fmt::Debug,
 {
    Insert {
        index: usize,
@ -37,7 +41,7 @@ where

 impl<T> Operation<T>
 where
-    T: PartialEq + Clone,
+    T: PartialEq + Clone + std::fmt::Debug,
 {
    /// Creates an insert operation with the given index and text.
    /// If the text is empty (meaning that the operation would be a no-op),
@ -212,17 +216,20 @@ where
                    ..
                }),
            ) => {
-                let offset_in_tokens = find_common_overlap(previous_inserted_text, &text);
-                let trimmed_length_in_tokens = previous_inserted_text.len() - offset_in_tokens;
-                let trimmed_length = previous_inserted_text
+                // In case the current insert's prefix appears in the previously inserted text,
+                // we can trim the current insert to only include the non-overlapping part.
+                // This way, we don't end up duplicating text.
+                let offset_in_tokens =
+                    find_longest_prefix_contained_within(previous_inserted_text, &text);
+                let offset_in_length = text
                    .iter()
-                    .skip(offset_in_tokens)
+                    .take(offset_in_tokens)
                    .map(Token::get_original_length)
                    .sum::<usize>();
                let trimmed_operation =
-                    Operation::create_insert(index, text[trimmed_length_in_tokens..].to_vec());
+                    Operation::create_insert(index, text[offset_in_tokens..].to_vec());

-                affecting_context.shift -= trimmed_length as i64;
+                affecting_context.shift -= offset_in_length as i64;
                produced_context.shift += trimmed_operation
                    .as_ref()
                    .map(Operation::len)
@ -297,7 +304,7 @@ where

 impl<T> Display for Operation<T>
 where
-    T: PartialEq + Clone,
+    T: PartialEq + Clone + std::fmt::Debug,
 {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
@ -341,7 +348,7 @@ where

 impl<T> Debug for Operation<T>
 where
-    T: PartialEq + Clone,
+    T: PartialEq + Clone + std::fmt::Debug,
 {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "{self}") }
 }
@ -355,11 +362,9 @@ mod tests {
    #[test]
    #[should_panic]
    fn test_shifting_error() {
-        insta::assert_debug_snapshot!(
-            Operation::create_insert(1, vec!["hi".into()])
+        insta::assert_debug_snapshot!(Operation::create_insert(1, vec!["hi".into()])
            .unwrap()
-                .with_shifted_index(-2)
-        );
+            .with_shifted_index(-2));
    }

    #[test]
--- a/backend/reconcile/src/utils.rs
+++ b/backend/reconcile/src/utils.rs
@ -1,6 +1,6 @@
 pub mod common_prefix_len;
 pub mod common_suffix_len;
-pub mod find_common_overlap;
+pub mod find_longest_prefix_contained_within;
 pub mod merge_iters;
 pub mod ordered_operation;
 pub mod side;
--- a/backend/reconcile/src/utils/find_common_overlap.rs
+++ b/backend/reconcile/src/utils/find_common_overlap.rs
@ -1,71 +0,0 @@
-use crate::Token;
-
-/// Given two lists of tokens, returns the offset in the first (old) list from
-/// which the two lists have the same tokens until the end of the first list.
-/// Thus, the suffix of the old list from the offset to the end is equal to a
-/// prefix of the new list.
-///
-/// If there is no overlap, the function returns the maxmium offset, the length
-/// of the old list.
-///
-/// ## Example
-///
-/// ```not_rust
-/// old: [0, 1, 9, 0, 2, 5]
-/// new:       [9, 0, 2, 5, 1]
-/// ```
-/// > results in an offset of 2
-pub fn find_common_overlap<T>(old: &[Token<T>], new: &[Token<T>]) -> usize
-where
-    T: PartialEq + Clone,
-{
-    let minimum_offset = old.len().saturating_sub(new.len());
-    for offset in minimum_offset..old.len() {
-        if old.iter().skip(offset).zip(new.iter()).all(|(a, b)| a == b) {
-            return offset;
-        }
-    }
-
-    old.len()
-}
-
-#[cfg(test)]
-mod tests {
-    use pretty_assertions::assert_eq;
-
-    use super::*;
-
-    #[test]
-    fn test_common_overlap() {
-        assert_eq!(find_common_overlap(&["".into()], &["".into()]), 0);
-
-        assert_eq!(
-            find_common_overlap(
-                &["a".into(), "b".into(), "c".into()],
-                &["b".into(), "c".into(), "a".into()]
-            ),
-            1
-        );
-
-        assert_eq!(
-            find_common_overlap(
-                &["a".into(), "a".into(), "a".into()],
-                &["a".into(), "b".into(), "c".into()]
-            ),
-            2
-        );
-
-        assert_eq!(
-            find_common_overlap(
-                &["a".into(), "b".into(), "c".into()],
-                &["d".into(), "e".into(), "a".into()]
-            ),
-            3
-        );
-
-        assert_eq!(
-            find_common_overlap(&["a".into(), "a".into()], &["a".into()]),
-            1
-        );
-    }
-}
--- a/backend/reconcile/src/utils/find_longest_prefix_contained_within.rs
+++ b/backend/reconcile/src/utils/find_longest_prefix_contained_within.rs
@ -0,0 +1,103 @@
+use crate::Token;
+
+/// Returns the length (in tokens) of the longest prefix of `new` that occurs
+/// as a contiguous run anywhere within `old`, or 0 if no such prefix exists.
+///
+/// ## Examples
+///
+/// ```not_rust
+/// old: [0, 1, 9, 0, 2, 5]
+/// new:       [9, 0, 2, 5, 1]
+/// ```
+/// > results in a length of 4
+///
+///
+/// ```not_rust
+/// old: [0, 1, 9, 0, 2, 5]
+/// new:          [0, 2]
+/// ```
+/// > results in a length of 2
+///
+/// ```not_rust
+/// old: [0, 1, 9, 0, 2, 5]
+/// new:          [0, 4]
+/// ```
+/// > results in a length of 1
+pub fn find_longest_prefix_contained_within<T>(old: &[Token<T>], new: &[Token<T>]) -> usize
+where
+    T: PartialEq + Clone + std::fmt::Debug,
+{
+    let max_possible = new.len().min(old.len());
+    // Try the longest candidate first so that the first hit is the maximum.
+    for len in (1..=max_possible).rev() {
+        let prefix = &new[..len];
+        if old.windows(len).any(|window| window == prefix) {
+            return len;
+        }
+    }
+    // No prefix of `new` (not even its first token) occurs in `old`.
+    0
+}
+
+#[cfg(test)]
+mod tests {
+    use pretty_assertions::assert_eq;
+
+    use super::*;
+
+    #[test]
+    fn test_common_overlap() {
+        assert_eq!(
+            find_longest_prefix_contained_within(&["".into()], &["".into()]),
+            1
+        );
+        // Only `[b, c]` occurs in `old`; the full `[b, c, a]` does not.
+        assert_eq!(
+            find_longest_prefix_contained_within(
+                &["a".into(), "b".into(), "c".into()],
+                &["b".into(), "c".into(), "a".into()]
+            ),
+            2
+        );
+        // The whole of `new` occurs at the end of `old`.
+        assert_eq!(
+            find_longest_prefix_contained_within(
+                &["a".into(), "b".into(), "c".into()],
+                &["b".into(), "c".into()]
+            ),
+            2
+        );
+        // A single-token `new` is found in the middle of `old`.
+        assert_eq!(
+            find_longest_prefix_contained_within(
+                &["a".into(), "b".into(), "c".into()],
+                &["b".into()]
+            ),
+            1
+        );
+        // `[b, a]` matches at the end of `old`, even though `b` also appears earlier.
+        assert_eq!(
+            find_longest_prefix_contained_within(
+                &["a".into(), "b".into(), "c".into(), "b".into(), "a".into()],
+                &["b".into(), "a".into()]
+            ),
+            2
+        );
+        // Only the first token matches: `[a]` occurs in `old`, `[a, b]` does not.
+        assert_eq!(
+            find_longest_prefix_contained_within(
+                &["a".into(), "a".into(), "a".into()],
+                &["a".into(), "b".into(), "c".into()]
+            ),
+            1
+        );
+        // The first token of `new` never occurs in `old`, so nothing matches.
+        assert_eq!(
+            find_longest_prefix_contained_within(
+                &["a".into(), "b".into(), "c".into()],
+                &["d".into(), "e".into(), "a".into()]
+            ),
+            0
+        );
+    }
+}