diff --git a/reconcile-js/src/index.ts b/reconcile-js/src/index.ts index 9f0bbcc..f209bcf 100644 --- a/reconcile-js/src/index.ts +++ b/reconcile-js/src/index.ts @@ -39,13 +39,18 @@ export interface SpanWithHistory { history: History; } -export type Tokenizer = "word" | "character"; +export type Tokenizer = "Line" | "Word" | "Character"; +const TOKENIZERS = ["Line", "Word", "Character"]; let isInitialised = false; const UNINITIALISED_MODULE_ERROR = "Reconcile module has not been initialized. Please call init() before using any other functions."; +const UNSUPPORTED_TOKENIZER_ERROR = `Unsupported tokenizer. Only ${TOKENIZERS.join( + ", " +)} are supported.`; + /** * Initializes the WASM module for text reconciliation. * Must be called before using any other functions. @@ -84,6 +89,10 @@ export function reconcile( throw new Error(UNINITIALISED_MODULE_ERROR); } + if (!TOKENIZERS.includes(tokenizer)) { + throw new Error(UNSUPPORTED_TOKENIZER_ERROR); + } + const leftCursor = toWasmTextWithCursors(left); const rightCursor = toWasmTextWithCursors(right); @@ -119,6 +128,10 @@ export function reconcileWithHistory( throw new Error(UNINITIALISED_MODULE_ERROR); } + if (!TOKENIZERS.includes(tokenizer)) { + throw new Error(UNSUPPORTED_TOKENIZER_ERROR); + } + const leftCursor = toWasmTextWithCursors(left); const rightCursor = toWasmTextWithCursors(right); diff --git a/src/lib.rs b/src/lib.rs index 5977dbf..71d837f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,8 +1,13 @@ //! # Reconcile //! -//! A library for automatically merging two conflicting versions of a -//! document. `Reconcile` is essentially `git merge` but without any conflict -//! markers (or lost edits) in the output. +//! [`diff3`](https://www.gnu.org/software/diffutils/manual/html_node/Invoking-diff3.html) (or `git merge`) +//! but with automatic conflict resolution. +//! +//! Reconcile is a Rust and JavaScript (through WebAssembly) library for merging +//! text without user intervention. 
It automatically resolves conflicts that +//! would typically require user action in traditional 3-way merge tools. +//! +//! Try out the [interactive demo](https://schmelczer.dev/reconcile)! //! //! ``` //! use reconcile::{reconcile, BuiltinTokenizer}; @@ -22,33 +27,17 @@ //! configurable. By default, words are the atoms for merging and thus words //! can't get jumbled up at the end of reconciling. //! -//! ### Word-level tokenization (default) +//! ### Built-in tokenizers //! //! ``` //! use reconcile::{reconcile, BuiltinTokenizer}; //! -//! let parent = "The quick brown fox"; -//! let left = "The very quick brown fox"; -//! let right = "The quick red fox"; +//! let parent = "The quick brown fox\n"; +//! let left = "The very quick brown fox\n"; +//! let right = "The quick red fox\n"; //! -//! let result = reconcile(parent, &left.into(), &right.into(), &*BuiltinTokenizer::Word); -//! assert_eq!(result.apply().text(), "The very quick red fox"); -//! ``` -//! -//! ### Character-level tokenization -//! -//! If finer grained merging is required, we can make every UTF-8 character -//! become its own token: -//! -//! ``` -//! use reconcile::{reconcile, BuiltinTokenizer}; -//! -//! let parent = "Hello"; -//! let left = "Helo"; // deleted 'l' -//! let right = "Hello!"; // added '!' -//! -//! let result = reconcile(parent, &left.into(), &right.into(), &*BuiltinTokenizer::Character); -//! assert_eq!(result.apply().text(), "Helo!"); +//! let result = reconcile(parent, &left.into(), &right.into(), &*BuiltinTokenizer::Line); +//! assert_eq!(result.apply().text(), "The quick red foxThe very quick brown fox\n"); //! ``` //! //! ### Custom tokenization @@ -62,7 +51,12 @@ //! // Example with custom tokenizer - split by sentences //! let sentence_tokenizer = |text: &str| { //! text.split(". ") -//! .map(|sentence| Token::new(sentence.to_string(), sentence.to_string(), true, true)) +//! .map(|sentence| Token::new( +//! sentence.to_string(), +//! sentence.to_string(), +//! 
false, // don't allow joining token with the preceding one +//! false // don't allow joining token with the following one +//! )) //! .collect::<Vec<_>>() //! }; //! @@ -74,6 +68,8 @@ //! let result = reconcile(parent, &left.into(), &right.into(), &*BuiltinTokenizer::Word); //! assert_eq!(result.apply().text(), "Hello beautiful world. This is a great test."); //! ``` +//! > By setting the joinability to `false`, longer runs of inserts will be +//! > interleaved like LRLRLR instead of LLLRRR. //! //! ## Cursors and selection ranges //! @@ -103,29 +99,8 @@ //! //! ## The algorithm //! -//! The algorithm starts similarly to `diff3`. Its inputs are a **parent** -//! document and two conflicting versions: `left` and `right` which have -//! been created from the parent through any series of concurrent edits. -//! -//! When calling `reconcile(parent, left, right)`: -//! -//! 1. **Diff calculation**: 2-way diffs of (parent & left) and (parent & right) -//! are computed using Myers' algorithm -//! 2. **Tokenization**: The text is split into tokens at the configured -//! granularity -//! 3. **Operation transformation**: The resulting edits are weaved together -//! using operational transformation principles, ensuring no changes are lost -//! 4. **Conflict resolution**: Unlike traditional merge tools, conflicts are -//! automatically resolved without producing conflict markers -//! -//! The key insight is that both insertions and deletions are preserved: -//! - If either side inserted text, it appears in the result -//! - If either side deleted text, the deletion is applied -//! - Insertions into deleted regions are still preserved -//! -//! This approach works well for human-readable text where some "fuzziness" in -//! conflict resolution is acceptable, unlike source code where precision is -//! critical. +//! For a discussion of the algorithm and architecture, see the +//! [README](README.md#algorithm) page. 
mod operation_transformation; mod raw_operation; diff --git a/src/tokenizer.rs b/src/tokenizer.rs index b8c8e0f..62ab528 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1,4 +1,5 @@ mod character_tokenizer; +mod line_tokenizer; mod word_tokenizer; use std::ops::Deref; @@ -20,6 +21,7 @@ pub type Tokenizer = dyn Fn(&str) -> Vec>; #[cfg(feature = "wasm")] pub enum BuiltinTokenizer { Character = "Character", + Line = "Line", Word = "Word", } @@ -28,6 +30,7 @@ pub enum BuiltinTokenizer { #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum BuiltinTokenizer { Character, + Line, Word, } @@ -37,6 +40,7 @@ impl Deref for BuiltinTokenizer { fn deref(&self) -> &Self::Target { match self { BuiltinTokenizer::Character => &character_tokenizer::character_tokenizer, + BuiltinTokenizer::Line => &line_tokenizer::line_tokenizer, BuiltinTokenizer::Word => &word_tokenizer::word_tokenizer, #[cfg(feature = "wasm")] BuiltinTokenizer::__Invalid => panic!("Unexpected tokenizer type"), diff --git a/src/tokenizer/line_tokenizer.rs b/src/tokenizer/line_tokenizer.rs new file mode 100644 index 0000000..ed283c3 --- /dev/null +++ b/src/tokenizer/line_tokenizer.rs @@ -0,0 +1,70 @@ +use super::token::Token; + +/// Splits text into lines, preserving line endings as separate tokens. +/// +/// ## Example +/// +/// ```not_rust +/// "Hello\nWorld!" 
-> ["Hello", "\n", "World!"] +/// "Line 1\r\nLine 2" -> ["Line 1", "\r\n", "Line 2"] +/// ``` +pub fn line_tokenizer(text: &str) -> Vec<Token<String>> { + let mut result = Vec::new(); + let mut line_start = 0; + + let mut chars = text.char_indices().peekable(); + while let Some((i, c)) = chars.next() { + if c == '\n' { + // Add line content if any + if i > line_start { + result.push(text[line_start..i].into()); + } + // Add newline + result.push("\n".into()); + line_start = i + 1; + } else if c == '\r' && chars.peek() == Some(&(i + 1, '\n')) { + // Handle \r\n + if i > line_start { + result.push(text[line_start..i].into()); + } + chars.next(); // consume \n + result.push("\r\n".into()); + line_start = i + 2; + } + } + + // Add final line if any + if line_start < text.len() { + result.push(text[line_start..].into()); + } + + result +} + +#[cfg(test)] +mod tests { + use insta::assert_debug_snapshot; + + use super::*; + + #[test] + fn test_with_snapshots() { + assert_debug_snapshot!(line_tokenizer("")); + + assert_debug_snapshot!(line_tokenizer("Hello")); + + assert_debug_snapshot!(line_tokenizer("Hello\nWorld")); + + assert_debug_snapshot!(line_tokenizer("Hello\nWorld\n")); + + assert_debug_snapshot!(line_tokenizer("Line 1\r\nLine 2")); + + assert_debug_snapshot!(line_tokenizer("Multi\nLine\nText\nHere")); + + assert_debug_snapshot!(line_tokenizer("\n")); + + assert_debug_snapshot!(line_tokenizer("\n\n")); + + assert_debug_snapshot!(line_tokenizer("Start\n\nEnd")); + } +} diff --git a/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-2.snap b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-2.snap new file mode 100644 index 0000000..ec1c89e --- /dev/null +++ b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-2.snap @@ -0,0 +1,13 @@ +--- +source: src/tokenizer/line_tokenizer.rs +expression: "line_tokenizer(\"Hello\")" +snapshot_kind: text +--- +[ + Token { + normalised: "Hello", 
+ original: "Hello", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-3.snap b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-3.snap new file mode 100644 index 0000000..c45029a --- /dev/null +++ b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-3.snap @@ -0,0 +1,25 @@ +--- +source: src/tokenizer/line_tokenizer.rs +expression: "line_tokenizer(\"Hello\\nWorld\")" +snapshot_kind: text +--- +[ + Token { + normalised: "Hello", + original: "Hello", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "\n", + original: "\n", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "World", + original: "World", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-4.snap b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-4.snap new file mode 100644 index 0000000..ad8cf81 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-4.snap @@ -0,0 +1,31 @@ +--- +source: src/tokenizer/line_tokenizer.rs +expression: "line_tokenizer(\"Hello\\nWorld\\n\")" +snapshot_kind: text +--- +[ + Token { + normalised: "Hello", + original: "Hello", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "\n", + original: "\n", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "World", + original: "World", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "\n", + original: "\n", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-5.snap 
b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-5.snap new file mode 100644 index 0000000..ef1f9cb --- /dev/null +++ b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-5.snap @@ -0,0 +1,25 @@ +--- +source: src/tokenizer/line_tokenizer.rs +expression: "line_tokenizer(\"Line 1\\r\\nLine 2\")" +snapshot_kind: text +--- +[ + Token { + normalised: "Line 1", + original: "Line 1", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "\r\n", + original: "\r\n", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "Line 2", + original: "Line 2", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-6.snap b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-6.snap new file mode 100644 index 0000000..5edb790 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-6.snap @@ -0,0 +1,49 @@ +--- +source: src/tokenizer/line_tokenizer.rs +expression: "line_tokenizer(\"Multi\\nLine\\nText\\nHere\")" +snapshot_kind: text +--- +[ + Token { + normalised: "Multi", + original: "Multi", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "\n", + original: "\n", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "Line", + original: "Line", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "\n", + original: "\n", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "Text", + original: "Text", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "\n", + original: "\n", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "Here", + original: "Here", + is_left_joinable: true, + is_right_joinable: true, + 
}, +] diff --git a/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-7.snap b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-7.snap new file mode 100644 index 0000000..8dcdba8 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-7.snap @@ -0,0 +1,13 @@ +--- +source: src/tokenizer/line_tokenizer.rs +expression: "line_tokenizer(\"\\n\")" +snapshot_kind: text +--- +[ + Token { + normalised: "\n", + original: "\n", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-8.snap b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-8.snap new file mode 100644 index 0000000..8466643 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-8.snap @@ -0,0 +1,19 @@ +--- +source: src/tokenizer/line_tokenizer.rs +expression: "line_tokenizer(\"\\n\\n\")" +snapshot_kind: text +--- +[ + Token { + normalised: "\n", + original: "\n", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "\n", + original: "\n", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-9.snap b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-9.snap new file mode 100644 index 0000000..9c2be98 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots-9.snap @@ -0,0 +1,31 @@ +--- +source: src/tokenizer/line_tokenizer.rs +expression: "line_tokenizer(\"Start\\n\\nEnd\")" +snapshot_kind: text +--- +[ + Token { + normalised: "Start", + original: "Start", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "\n", + original: "\n", + is_left_joinable: true, + is_right_joinable: 
true, + }, + Token { + normalised: "\n", + original: "\n", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalised: "End", + original: "End", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots.snap b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots.snap new file mode 100644 index 0000000..a525ea3 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile__tokenizer__line_tokenizer__tests__with_snapshots.snap @@ -0,0 +1,6 @@ +--- +source: src/tokenizer/line_tokenizer.rs +expression: "line_tokenizer(\"\")" +snapshot_kind: text +--- +[]