From d7ae0a781df02d5c063fef141b2e421c8f10bc75 Mon Sep 17 00:00:00 2001
From: Andras Schmelczer <andras@schmelczer.dev>
Date: Sun, 2 Mar 2025 14:54:57 +0000
Subject: [PATCH] Dedupe inserts

---
 .../reconcile/src/operation_transformation.rs |  25 ++++-
 .../src/operation_transformation/operation.rs |  37 ++++---
 backend/reconcile/src/utils.rs                |   2 +-
 .../src/utils/find_common_overlap.rs          |  71 ------------
 .../find_longest_prefix_contained_within.rs   | 103 ++++++++++++++++++
 5 files changed, 145 insertions(+), 93 deletions(-)
 delete mode 100644 backend/reconcile/src/utils/find_common_overlap.rs
 create mode 100644 backend/reconcile/src/utils/find_longest_prefix_contained_within.rs
diff --git a/backend/reconcile/src/operation_transformation.rs b/backend/reconcile/src/operation_transformation.rs
index ef9a5e81..1f34fa12 100644
--- a/backend/reconcile/src/operation_transformation.rs
+++ b/backend/reconcile/src/operation_transformation.rs
@@ -37,7 +37,7 @@ pub fn reconcile_with_tokenizer<F, T>(
     tokenizer: &Tokenizer<T>,
 ) -> String
 where
-    T: PartialEq + Clone,
+    T: PartialEq + Clone + std::fmt::Debug,
 {
     let left_operations = EditedText::from_strings_with_tokenizer(original, left, tokenizer);
     let right_operations = EditedText::from_strings_with_tokenizer(original, right, tokenizer);
@@ -120,9 +120,6 @@ mod test {
             "hi, my friend!",
         );
 
-        // test_merge_both_ways("hello world", "world !", "hi hello world", "hi world
-        // !");
-
         test_merge_both_ways(
             "both delete the same word",
             "both the same word",
@@ -147,7 +144,25 @@ mod test {
         );
     }
 
-    #[ignore = "it's too slow"]
+    #[test]
+    fn test_reconcile_idempotent_inserts() {
+        // Both inserted the same prefix; this should get deduped
+        test_merge_both_ways(
+            "hi ",
+            "hi there ",
+            "hi there my friend",
+            "hi there my friend",
+        );
+
+        // The prefix of the 2nd appears on the 1st so it shouldn't get duplicated
+        test_merge_both_ways(
+            "hi ",
+            "hi there you ",
+            "hi there my friend",
+            "hi there you my friend",
+        );
+    }
+
     #[test_matrix( [
         "pride_and_prejudice.txt",
         "romeo_and_juliet.txt",
diff --git a/backend/reconcile/src/operation_transformation/operation.rs b/backend/reconcile/src/operation_transformation/operation.rs
index c19265b5..5de0141b 100644
--- a/backend/reconcile/src/operation_transformation/operation.rs
+++ b/backend/reconcile/src/operation_transformation/operation.rs
@@ -2,14 +2,18 @@ use core::{
     fmt::{Debug, Display},
     ops::Range,
 };
+use std::cmp::min;
 
 #[cfg(feature = "serde")]
 use serde::{Deserialize, Serialize};
 
 use super::merge_context::MergeContext;
 use crate::{
+    utils::{
+        find_longest_prefix_contained_within::find_longest_prefix_contained_within,
+        string_builder::StringBuilder,
+    },
     Token,
-    utils::{find_common_overlap::find_common_overlap, string_builder::StringBuilder},
 };
 
 /// Represents a change that can be applied to a text document.
@@ -19,7 +23,7 @@ use crate::{
 #[derive(Clone, PartialEq)]
 pub enum Operation<T>
 where
-    T: PartialEq + Clone,
+    T: PartialEq + Clone + std::fmt::Debug,
 {
     Insert {
         index: usize,
@@ -37,7 +41,7 @@ where
 
 impl<T> Operation<T>
 where
-    T: PartialEq + Clone,
+    T: PartialEq + Clone + std::fmt::Debug,
 {
     /// Creates an insert operation with the given index and text.
     /// If the text is empty (meaning that the operation would be a no-op),
@@ -212,17 +216,20 @@ where
                     ..
                 }),
             ) => {
-                let offset_in_tokens = find_common_overlap(previous_inserted_text, &text);
-                let trimmed_length_in_tokens = previous_inserted_text.len() - offset_in_tokens;
-                let trimmed_length = previous_inserted_text
+                // In case the current insert's prefix appears in the previously inserted text,
+                // we can trim the current insert to only include the non-overlapping part.
+                // This way, we don't end up duplicating text.
+                let offset_in_tokens =
+                    find_longest_prefix_contained_within(previous_inserted_text, &text);
+                let offset_in_length = text
                     .iter()
-                    .skip(offset_in_tokens)
+                    .take(offset_in_tokens)
                     .map(Token::get_original_length)
                     .sum::<usize>();
                 let trimmed_operation =
-                    Operation::create_insert(index, text[trimmed_length_in_tokens..].to_vec());
+                    Operation::create_insert(index, text[offset_in_tokens..].to_vec());
 
-                affecting_context.shift -= trimmed_length as i64;
+                affecting_context.shift -= offset_in_length as i64;
                 produced_context.shift += trimmed_operation
                     .as_ref()
                     .map(Operation::len)
@@ -297,7 +304,7 @@ where
 
 impl<T> Display for Operation<T>
 where
-    T: PartialEq + Clone,
+    T: PartialEq + Clone + std::fmt::Debug,
 {
     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
         match self {
@@ -341,7 +348,7 @@ where
 
 impl<T> Debug for Operation<T>
 where
-    T: PartialEq + Clone,
+    T: PartialEq + Clone + std::fmt::Debug,
 {
     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "{self}") }
 }
@@ -355,11 +362,9 @@ mod tests {
     #[test]
     #[should_panic]
     fn test_shifting_error() {
-        insta::assert_debug_snapshot!(
-            Operation::create_insert(1, vec!["hi".into()])
-                .unwrap()
-                .with_shifted_index(-2)
-        );
+        insta::assert_debug_snapshot!(Operation::create_insert(1, vec!["hi".into()])
+            .unwrap()
+            .with_shifted_index(-2));
     }
 
     #[test]
diff --git a/backend/reconcile/src/utils.rs b/backend/reconcile/src/utils.rs
index c0c3c33d..8461b5ff 100644
--- a/backend/reconcile/src/utils.rs
+++ b/backend/reconcile/src/utils.rs
@@ -1,6 +1,6 @@
 pub mod common_prefix_len;
 pub mod common_suffix_len;
-pub mod find_common_overlap;
+pub mod find_longest_prefix_contained_within;
 pub mod merge_iters;
 pub mod ordered_operation;
 pub mod side;
diff --git a/backend/reconcile/src/utils/find_common_overlap.rs b/backend/reconcile/src/utils/find_common_overlap.rs
deleted file mode 100644
index ac586b81..00000000
--- a/backend/reconcile/src/utils/find_common_overlap.rs
+++ /dev/null
@@ -1,71 +0,0 @@
-use crate::Token;
-
-/// Given two lists of tokens, returns the offset in the first (old) list from
-/// which the two lists have the same tokens until the end of the first list.
-/// Thus, the suffix of the old list from the offset to the end is equal to a
-/// prefix of the new list.
-///
-/// If there is no overlap, the function returns the maxmium offset, the length
-/// of the old list.
-///
-/// ## Example
-///
-/// ```not_rust
-/// old: [0, 1, 9, 0, 2, 5]
-/// new:       [9, 0, 2, 5, 1]
-/// ```
-/// > results in an offset of 2
-pub fn find_common_overlap<T>(old: &[Token<T>], new: &[Token<T>]) -> usize
-where
-    T: PartialEq + Clone,
-{
-    let minimum_offset = old.len().saturating_sub(new.len());
-    for offset in minimum_offset..old.len() {
-        if old.iter().skip(offset).zip(new.iter()).all(|(a, b)| a == b) {
-            return offset;
-        }
-    }
-
-    old.len()
-}
-
-#[cfg(test)]
-mod tests {
-    use pretty_assertions::assert_eq;
-
-    use super::*;
-
-    #[test]
-    fn test_common_overlap() {
-        assert_eq!(find_common_overlap(&["".into()], &["".into()]), 0);
-
-        assert_eq!(
-            find_common_overlap(
-                &["a".into(), "b".into(), "c".into()],
-                &["b".into(), "c".into(), "a".into()]
-            ),
-            1
-        );
-
-        assert_eq!(
-            find_common_overlap(
-                &["a".into(), "a".into(), "a".into()],
-                &["a".into(), "b".into(), "c".into()]
-            ),
-            2
-        );
-
-        assert_eq!(
-            find_common_overlap(
-                &["a".into(), "b".into(), "c".into()],
-                &["d".into(), "e".into(), "a".into()]
-            ),
-            3
-        );
-
-        assert_eq!(
-            find_common_overlap(&["a".into(), "a".into()], &["a".into()]),
-            1
-        );
-    }
-}
diff --git a/backend/reconcile/src/utils/find_longest_prefix_contained_within.rs b/backend/reconcile/src/utils/find_longest_prefix_contained_within.rs
new file mode 100644
index 00000000..eb4b8264
--- /dev/null
+++ b/backend/reconcile/src/utils/find_longest_prefix_contained_within.rs
@@ -0,0 +1,103 @@
+use crate::Token;
+
+/// Given two lists of tokens, returns `length` where `old` list somewhere
+/// within contains the `length` prefix of the `new` list.
+///
+/// ## Example
+///
+/// ```not_rust
+/// old: [0, 1, 9, 0, 2, 5]
+/// new:       [9, 0, 2, 5, 1]
+/// ```
+/// > results in an length of 4
+///
+///
+/// ```not_rust
+/// old: [0, 1, 9, 0, 2, 5]
+/// new:          [0, 2]
+/// ```
+/// > results in an length of 2
+///
+/// ```not_rust
+/// old: [0, 1, 9, 0, 2, 5]
+/// new:          [0, 4]
+/// ```
+/// > results in an length of 1
+pub fn find_longest_prefix_contained_within<T>(old: &[Token<T>], new: &[Token<T>]) -> usize
+where
+    T: PartialEq + Clone + std::fmt::Debug,
+{
+    let max_possible = new.len().min(old.len());
+
+    for len in (1..=max_possible).rev() {
+        let prefix = &new[..len];
+        if old.windows(len).any(|window| window == prefix) {
+            return len;
+        }
+    }
+
+    0
+}
+
+#[cfg(test)]
+mod tests {
+    use pretty_assertions::assert_eq;
+
+    use super::*;
+
+    #[test]
+    fn test_common_overlap() {
+        assert_eq!(
+            find_longest_prefix_contained_within(&["".into()], &["".into()]),
+            1
+        );
+
+        assert_eq!(
+            find_longest_prefix_contained_within(
+                &["a".into(), "b".into(), "c".into()],
+                &["b".into(), "c".into(), "a".into()]
+            ),
+            2
+        );
+
+        assert_eq!(
+            find_longest_prefix_contained_within(
+                &["a".into(), "b".into(), "c".into()],
+                &["b".into(), "c".into()]
+            ),
+            2
+        );
+
+        assert_eq!(
+            find_longest_prefix_contained_within(
+                &["a".into(), "b".into(), "c".into()],
+                &["b".into()]
+            ),
+            1
+        );
+
+        assert_eq!(
+            find_longest_prefix_contained_within(
+                &["a".into(), "b".into(), "c".into(), "b".into(), "a".into()],
+                &["b".into(), "a".into()]
+            ),
+            2
+        );
+
+        assert_eq!(
+            find_longest_prefix_contained_within(
+                &["a".into(), "a".into(), "a".into()],
+                &["a".into(), "b".into(), "c".into()]
+            ),
+            1
+        );
+
+        assert_eq!(
+            find_longest_prefix_contained_within(
+                &["a".into(), "b".into(), "c".into()],
+                &["d".into(), "e".into(), "a".into()]
+            ),
+            0
+        );
+    }
+}