Add line tokenizer

2025-07-06 13:03:25 +01:00 · 2025-07-06 13:03:25 +01:00 · 469e62106c
commit 469e62106c
parent ee5776c8e1
13 changed files with 324 additions and 50 deletions
--- a/src/lib.rs
+++ b/src/lib.rs
@ -1,8 +1,13 @@
 //! # Reconcile
 //!
-//! A library for automatically merging two conflicting versions of a
-//! document. `Reconcile` is essentially `git merge` but without any conflict
-//! markers (or lost edits) in the output.
+//! [`diff3`](https://www.gnu.org/software/diffutils/manual/html_node/Invoking-diff3.html) (or `git merge`)
+//! but with automatic conflict resolution.
+//!
+//! Reconcile is a Rust and JavaScript (through WebAssembly) library for merging
+//! text without user intervention. It automatically resolves conflicts that
+//! would typically require user action in traditional 3-way merge tools.
+//!
+//! Try out the [interactive demo](https://schmelczer.dev/reconcile)!
 //!
 //! ```
 //! use reconcile::{reconcile, BuiltinTokenizer};
@ -22,33 +27,17 @@
 //! configurable. By default, words are the atoms for merging and thus words
 //! can't get jumbled up at the end of reconciling.
 //!
-//! ### Word-level tokenization (default)
+//! ### Built-in tokenizers
 //!
 //! ```
 //! use reconcile::{reconcile, BuiltinTokenizer};
 //!
-//! let parent = "The quick brown fox";
-//! let left = "The very quick brown fox";
-//! let right = "The quick red fox";
+//! let parent = "The quick brown fox\n";
+//! let left = "The very quick brown fox\n";
+//! let right = "The quick red fox\n";
 //!
-//! let result = reconcile(parent, &left.into(), &right.into(), &*BuiltinTokenizer::Word);
-//! assert_eq!(result.apply().text(), "The very quick red fox");
-//! ```
-//!
-//! ### Character-level tokenization
-//!
-//! If finer grained merging is required, we can make every UTF-8 character
-//! become its own token:
-//!
-//! ```
-//! use reconcile::{reconcile, BuiltinTokenizer};
-//!
-//! let parent = "Hello";
-//! let left = "Helo";    // deleted 'l'
-//! let right = "Hello!"; // added '!'
-//!
-//! let result = reconcile(parent, &left.into(), &right.into(), &*BuiltinTokenizer::Character);
-//! assert_eq!(result.apply().text(), "Helo!");
+//! let result = reconcile(parent, &left.into(), &right.into(), &*BuiltinTokenizer::Line);
+//! assert_eq!(result.apply().text(), "The quick red foxThe very quick brown fox\n");
 //! ```
 //!
 //! ### Custom tokenization
@ -62,7 +51,12 @@
 //! // Example with custom tokenizer - split by sentences
 //! let sentence_tokenizer = |text: &str| {
 //!     text.split(". ")
-//!         .map(|sentence| Token::new(sentence.to_string(), sentence.to_string(), true, true))
+//!         .map(|sentence| Token::new(
+//!             sentence.to_string(),
+//!             sentence.to_string(),
+//!             false, // don't allow joining token with the preceding one
+//!             false // don't allow joining token with the following one
+//!         ))
 //!         .collect::<Vec<_>>()
 //! };
 //!
@ -74,6 +68,8 @@
 //! let result = reconcile(parent, &left.into(), &right.into(), &*BuiltinTokenizer::Word);
 //! assert_eq!(result.apply().text(), "Hello beautiful world. This is a great test.");
 //! ```
+//! > By setting the joinability to `false`, longer runs of inserts will be
+//! > interleaved like LRLRLR instead of LLLRRR.
 //!
 //! ## Cursors and selection ranges
 //!
@ -103,29 +99,8 @@
 //!
 //! ## The algorithm
 //!
-//! The algorithm starts similarly to `diff3`. Its inputs are a **parent**
-//! document and two conflicting versions: `left` and `right` which have
-//! been created from the parent through any series of concurrent edits.
-//!
-//! When calling `reconcile(parent, left, right)`:
-//!
-//! 1. **Diff calculation**: 2-way diffs of (parent & left) and (parent & right)
-//!    are computed using Myers' algorithm
-//! 2. **Tokenization**: The text is split into tokens at the configured
-//!    granularity
-//! 3. **Operation transformation**: The resulting edits are weaved together
-//!    using operational transformation principles, ensuring no changes are lost
-//! 4. **Conflict resolution**: Unlike traditional merge tools, conflicts are
-//!    automatically resolved without producing conflict markers
-//!
-//! The key insight is that both insertions and deletions are preserved:
-//! - If either side inserted text, it appears in the result
-//! - If either side deleted text, the deletion is applied
-//! - Insertions into deleted regions are still preserved
-//!
-//! This approach works well for human-readable text where some "fuzziness" in
-//! conflict resolution is acceptable, unlike source code where precision is
-//! critical.
+//! For a discussion of the algorithm and architecture, see the
+//! [README](README.md#algorithm) page.

 mod operation_transformation;
 mod raw_operation;
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@ -1,4 +1,5 @@
 mod character_tokenizer;
+mod line_tokenizer;
 mod word_tokenizer;

 use std::ops::Deref;
@ -20,6 +21,7 @@ pub type Tokenizer<T> = dyn Fn(&str) -> Vec<Token<T>>;
 #[cfg(feature = "wasm")]
 pub enum BuiltinTokenizer {
    Character = "Character",
+    Line = "Line",
    Word = "Word",
 }

@ -28,6 +30,7 @@ pub enum BuiltinTokenizer {
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 pub enum BuiltinTokenizer {
    Character,
+    Line,
    Word,
 }

@ -37,6 +40,7 @@ impl Deref for BuiltinTokenizer {
    fn deref(&self) -> &Self::Target {
        match self {
            BuiltinTokenizer::Character => &character_tokenizer::character_tokenizer,
+            BuiltinTokenizer::Line => &line_tokenizer::line_tokenizer,
            BuiltinTokenizer::Word => &word_tokenizer::word_tokenizer,
            #[cfg(feature = "wasm")]
            BuiltinTokenizer::__Invalid => panic!("Unexpected tokenizer type"),
--- a/src/tokenizer/line_tokenizer.rs
+++ b/src/tokenizer/line_tokenizer.rs
@ -0,0 +1,70 @@
+use super::token::Token;
+
+/// Splits text into lines, preserving line endings as separate tokens.
+///
+/// Both `\n` and `\r\n` are recognised as line endings; `\r\n` is kept
+/// together as a single token. A `\r` that is not immediately followed by
+/// `\n` is not treated as a line ending and stays inside the surrounding
+/// line's token.
+///
+/// Empty lines contribute only their line-ending token (no empty content
+/// token is emitted), and any text after the last line ending becomes a
+/// final token. Empty input yields an empty vector.
+///
+/// ## Example
+///
+/// ```not_rust
+/// "Hello\nWorld!" -> ["Hello", "\n", "World!"]
+/// "Line 1\r\nLine 2" -> ["Line 1", "\r\n", "Line 2"]
+/// "Start\n\nEnd" -> ["Start", "\n", "\n", "End"]
+/// ```
+pub fn line_tokenizer(text: &str) -> Vec<Token<String>> {
+    let mut result = Vec::new();
+    // Byte offset at which the current, not yet emitted, line begins.
+    let mut line_start = 0;
+    
+    // `char_indices` yields byte offsets, so they can be used directly for
+    // slicing; `peekable` lets us look one character ahead to detect `\r\n`.
+    let mut chars = text.char_indices().peekable();
+    while let Some((i, c)) = chars.next() {
+        if c == '\n' {
+            // Emit the text preceding the newline; skipped for empty lines
+            if i > line_start {
+                result.push(text[line_start..i].into());
+            }
+            // Emit the line ending as its own token
+            result.push("\n".into());
+            // '\n' is a single byte, so the next line starts right after it
+            line_start = i + 1;
+        } else if c == '\r' && chars.peek() == Some(&(i + 1, '\n')) {
+            // `\r\n`: emit the preceding text, then the pair as one token.
+            // A lone `\r` falls through and remains part of the line text.
+            if i > line_start {
+                result.push(text[line_start..i].into());
+            }
+            chars.next(); // consume the `\n` so it is not emitted a second time
+            result.push("\r\n".into());
+            // "\r\n" is two bytes long
+            line_start = i + 2;
+        }
+    }
+    
+    // Emit the trailing text that has no terminating line ending, if any
+    if line_start < text.len() {
+        result.push(text[line_start..].into());
+    }
+    
+    result
+}
+
+#[cfg(test)]
+mod tests {
+    use insta::assert_debug_snapshot;
+
+    use super::*;
+
+    // Snapshot tests covering empty input, single and multiple lines,
+    // trailing newlines, `\r\n` endings and consecutive newlines.
+    //
+    // NOTE(review): the stored snapshot files appear to be numbered by the
+    // order of the assertions below (unsuffixed, -2, -3, ...), so reordering
+    // or inserting assertions will likely require regenerating the snapshots.
+    #[test]
+    fn test_with_snapshots() {
+        assert_debug_snapshot!(line_tokenizer(""));
+
+        assert_debug_snapshot!(line_tokenizer("Hello"));
+
+        assert_debug_snapshot!(line_tokenizer("Hello\nWorld"));
+
+        assert_debug_snapshot!(line_tokenizer("Hello\nWorld\n"));
+
+        assert_debug_snapshot!(line_tokenizer("Line 1\r\nLine 2"));
+
+        assert_debug_snapshot!(line_tokenizer("Multi\nLine\nText\nHere"));
+
+        assert_debug_snapshot!(line_tokenizer("\n"));
+
+        assert_debug_snapshot!(line_tokenizer("\n\n"));
+
+        assert_debug_snapshot!(line_tokenizer("Start\n\nEnd"));
+    }
+}
--- a/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots-2.snap
+++ b/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots-2.snap
@ -0,0 +1,13 @@
+---
+source: src/tokenizer/line_tokenizer.rs
+expression: "line_tokenizer(\"Hello\")"
+snapshot_kind: text
+---
+[
+    Token {
+        normalised: "Hello",
+        original: "Hello",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots-3.snap
+++ b/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots-3.snap
@ -0,0 +1,25 @@
+---
+source: src/tokenizer/line_tokenizer.rs
+expression: "line_tokenizer(\"Hello\\nWorld\")"
+snapshot_kind: text
+---
+[
+    Token {
+        normalised: "Hello",
+        original: "Hello",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "\n",
+        original: "\n",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "World",
+        original: "World",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots-4.snap
+++ b/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots-4.snap
@ -0,0 +1,31 @@
+---
+source: src/tokenizer/line_tokenizer.rs
+expression: "line_tokenizer(\"Hello\\nWorld\\n\")"
+snapshot_kind: text
+---
+[
+    Token {
+        normalised: "Hello",
+        original: "Hello",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "\n",
+        original: "\n",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "World",
+        original: "World",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "\n",
+        original: "\n",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots-5.snap
+++ b/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots-5.snap
@ -0,0 +1,25 @@
+---
+source: src/tokenizer/line_tokenizer.rs
+expression: "line_tokenizer(\"Line 1\\r\\nLine 2\")"
+snapshot_kind: text
+---
+[
+    Token {
+        normalised: "Line 1",
+        original: "Line 1",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "\r\n",
+        original: "\r\n",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "Line 2",
+        original: "Line 2",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots-6.snap
+++ b/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots-6.snap
@ -0,0 +1,49 @@
+---
+source: src/tokenizer/line_tokenizer.rs
+expression: "line_tokenizer(\"Multi\\nLine\\nText\\nHere\")"
+snapshot_kind: text
+---
+[
+    Token {
+        normalised: "Multi",
+        original: "Multi",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "\n",
+        original: "\n",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "Line",
+        original: "Line",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "\n",
+        original: "\n",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "Text",
+        original: "Text",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "\n",
+        original: "\n",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "Here",
+        original: "Here",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots-7.snap
+++ b/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots-7.snap
@ -0,0 +1,13 @@
+---
+source: src/tokenizer/line_tokenizer.rs
+expression: "line_tokenizer(\"\\n\")"
+snapshot_kind: text
+---
+[
+    Token {
+        normalised: "\n",
+        original: "\n",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots-8.snap
+++ b/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots-8.snap
@ -0,0 +1,19 @@
+---
+source: src/tokenizer/line_tokenizer.rs
+expression: "line_tokenizer(\"\\n\\n\")"
+snapshot_kind: text
+---
+[
+    Token {
+        normalised: "\n",
+        original: "\n",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "\n",
+        original: "\n",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots-9.snap
+++ b/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots-9.snap
@ -0,0 +1,31 @@
+---
+source: src/tokenizer/line_tokenizer.rs
+expression: "line_tokenizer(\"Start\\n\\nEnd\")"
+snapshot_kind: text
+---
+[
+    Token {
+        normalised: "Start",
+        original: "Start",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "\n",
+        original: "\n",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "\n",
+        original: "\n",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalised: "End",
+        original: "End",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots.snap
+++ b/src/tokenizer/snapshots/reconciletokenizerline_tokenizertestswith_snapshots.snap
@ -0,0 +1,6 @@
+---
+source: src/tokenizer/line_tokenizer.rs
+expression: "line_tokenizer(\"\")"
+snapshot_kind: text
+---
+[]