Add line tokenizer

This commit is contained in:
Andras Schmelczer 2025-07-06 13:03:25 +01:00
parent ee5776c8e1
commit 469e62106c
No known key found for this signature in database
GPG key ID: FC8F2C3D3D1A718C
13 changed files with 324 additions and 50 deletions

View file

@ -1,8 +1,13 @@
//! # Reconcile
//!
//! A library for automatically merging two conflicting versions of a
//! document. `Reconcile` is essentially `git merge` but without any conflict
//! markers (or lost edits) in the output.
//! [`diff3`](https://www.gnu.org/software/diffutils/manual/html_node/Invoking-diff3.html) (or `git merge`)
//! but with automatic conflict resolution.
//!
//! Reconcile is a Rust and JavaScript (through WebAssembly) library for merging
//! text without user intervention. It automatically resolves conflicts that
//! would typically require user action in traditional 3-way merge tools.
//!
//! Try out the [interactive demo](https://schmelczer.dev/reconcile)!
//!
//! ```
//! use reconcile::{reconcile, BuiltinTokenizer};
@ -22,33 +27,17 @@
//! configurable. By default, words are the atoms for merging and thus words
//! can't get jumbled up at the end of reconciling.
//!
//! ### Word-level tokenization (default)
//! ### Built-in tokenizers
//!
//! ```
//! use reconcile::{reconcile, BuiltinTokenizer};
//!
//! let parent = "The quick brown fox";
//! let left = "The very quick brown fox";
//! let right = "The quick red fox";
//! let parent = "The quick brown fox\n";
//! let left = "The very quick brown fox\n";
//! let right = "The quick red fox\n";
//!
//! let result = reconcile(parent, &left.into(), &right.into(), &*BuiltinTokenizer::Word);
//! assert_eq!(result.apply().text(), "The very quick red fox");
//! ```
//!
//! ### Character-level tokenization
//!
//! If finer grained merging is required, we can make every UTF-8 character
//! become its own token:
//!
//! ```
//! use reconcile::{reconcile, BuiltinTokenizer};
//!
//! let parent = "Hello";
//! let left = "Helo"; // deleted 'l'
//! let right = "Hello!"; // added '!'
//!
//! let result = reconcile(parent, &left.into(), &right.into(), &*BuiltinTokenizer::Character);
//! assert_eq!(result.apply().text(), "Helo!");
//! let result = reconcile(parent, &left.into(), &right.into(), &*BuiltinTokenizer::Line);
//! assert_eq!(result.apply().text(), "The quick red foxThe very quick brown fox\n");
//! ```
//!
//! ### Custom tokenization
@ -62,7 +51,12 @@
//! // Example with custom tokenizer - split by sentences
//! let sentence_tokenizer = |text: &str| {
//! text.split(". ")
//! .map(|sentence| Token::new(sentence.to_string(), sentence.to_string(), true, true))
//! .map(|sentence| Token::new(
//! sentence.to_string(),
//! sentence.to_string(),
//! false, // don't allow joining token with the preceding one
//! false // don't allow joining token with the following one
//! ))
//! .collect::<Vec<_>>()
//! };
//!
@ -74,6 +68,8 @@
//! let result = reconcile(parent, &left.into(), &right.into(), &*BuiltinTokenizer::Word);
//! assert_eq!(result.apply().text(), "Hello beautiful world. This is a great test.");
//! ```
//! > By setting the joinability to `false`, longer runs of inserts will be
//! > interleaved like LRLRLR instead of LLLRRR.
//!
//! ## Cursors and selection ranges
//!
@ -103,29 +99,8 @@
//!
//! ## The algorithm
//!
//! The algorithm starts similarly to `diff3`. Its inputs are a **parent**
//! document and two conflicting versions: `left` and `right` which have
//! been created from the parent through any series of concurrent edits.
//!
//! When calling `reconcile(parent, left, right)`:
//!
//! 1. **Diff calculation**: 2-way diffs of (parent & left) and (parent & right)
//! are computed using Myers' algorithm
//! 2. **Tokenization**: The text is split into tokens at the configured
//! granularity
//! 3. **Operation transformation**: The resulting edits are weaved together
//! using operational transformation principles, ensuring no changes are lost
//! 4. **Conflict resolution**: Unlike traditional merge tools, conflicts are
//! automatically resolved without producing conflict markers
//!
//! The key insight is that both insertions and deletions are preserved:
//! - If either side inserted text, it appears in the result
//! - If either side deleted text, the deletion is applied
//! - Insertions into deleted regions are still preserved
//!
//! This approach works well for human-readable text where some "fuzziness" in
//! conflict resolution is acceptable, unlike source code where precision is
//! critical.
//! For a discussion of the algorithm and architecture, see the
//! [README](README.md#algorithm) page.
mod operation_transformation;
mod raw_operation;

View file

@ -1,4 +1,5 @@
mod character_tokenizer;
mod line_tokenizer;
mod word_tokenizer;
use std::ops::Deref;
@ -20,6 +21,7 @@ pub type Tokenizer<T> = dyn Fn(&str) -> Vec<Token<T>>;
// Declaration of the built-in tokenizer choices that is compiled only when
// the `wasm` feature is enabled. Every variant carries an explicit string
// value identical to its name.
#[cfg(feature = "wasm")]
pub enum BuiltinTokenizer {
// Resolves to `character_tokenizer::character_tokenizer` when dereferenced.
Character = "Character",
// Resolves to `line_tokenizer::line_tokenizer` when dereferenced.
Line = "Line",
// Resolves to `word_tokenizer::word_tokenizer` when dereferenced.
Word = "Word",
}
@ -28,6 +30,7 @@ pub enum BuiltinTokenizer {
// Declaration of the built-in tokenizer choices; derives `Serialize` and
// `Deserialize` when the `serde` feature is enabled.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum BuiltinTokenizer {
/// Resolves to `character_tokenizer::character_tokenizer` when dereferenced.
Character,
/// Resolves to `line_tokenizer::line_tokenizer` when dereferenced.
Line,
/// Resolves to `word_tokenizer::word_tokenizer` when dereferenced.
Word,
}
@ -37,6 +40,7 @@ impl Deref for BuiltinTokenizer {
fn deref(&self) -> &Self::Target {
match self {
BuiltinTokenizer::Character => &character_tokenizer::character_tokenizer,
BuiltinTokenizer::Line => &line_tokenizer::line_tokenizer,
BuiltinTokenizer::Word => &word_tokenizer::word_tokenizer,
#[cfg(feature = "wasm")]
BuiltinTokenizer::__Invalid => panic!("Unexpected tokenizer type"),

View file

@ -0,0 +1,70 @@
use super::token::Token;
/// Tokenizes `text` line by line, emitting every line terminator as its own
/// token.
///
/// Both `\n` and `\r\n` are recognised as terminators. A `\r` that is not
/// immediately followed by `\n` stays part of the line content. An empty line
/// contributes only its terminator token, and empty input yields no tokens.
///
/// ## Example
///
/// ```text
/// "Hello\nWorld!" -> ["Hello", "\n", "World!"]
/// "Line 1\r\nLine 2" -> ["Line 1", "\r\n", "Line 2"]
/// ```
pub fn line_tokenizer(text: &str) -> Vec<Token<String>> {
    let mut tokens = Vec::new();

    // `split_inclusive` leaves the trailing `\n` attached to each piece and
    // never yields an empty piece, so only the last piece can lack a newline.
    for piece in text.split_inclusive('\n') {
        // Check for `\r\n` first so the carriage return ends up in the
        // terminator token rather than in the line content.
        let (content, terminator) = if let Some(body) = piece.strip_suffix("\r\n") {
            (body, "\r\n")
        } else if let Some(body) = piece.strip_suffix('\n') {
            (body, "\n")
        } else {
            (piece, "")
        };

        if !content.is_empty() {
            tokens.push(content.into());
        }
        if !terminator.is_empty() {
            tokens.push(terminator.into());
        }
    }

    tokens
}
// Snapshot tests for `line_tokenizer`: the `Debug` output of each call is
// checked with insta's `assert_debug_snapshot!`.
#[cfg(test)]
mod tests {
use insta::assert_debug_snapshot;
use super::*;
// NOTE(review): the stored snapshots record the asserted expression text, and
// insta presumably identifies unnamed snapshots by their position within the
// test; confirm before reordering, inserting, or rewording assertions.
#[test]
fn test_with_snapshots() {
// Empty input.
assert_debug_snapshot!(line_tokenizer(""));
// Single line without a terminator.
assert_debug_snapshot!(line_tokenizer("Hello"));
// Two lines separated by `\n`, no trailing newline.
assert_debug_snapshot!(line_tokenizer("Hello\nWorld"));
// Two lines with a trailing newline.
assert_debug_snapshot!(line_tokenizer("Hello\nWorld\n"));
// `\r\n` line ending.
assert_debug_snapshot!(line_tokenizer("Line 1\r\nLine 2"));
// Several `\n`-separated lines.
assert_debug_snapshot!(line_tokenizer("Multi\nLine\nText\nHere"));
// Input consisting of a single newline.
assert_debug_snapshot!(line_tokenizer("\n"));
// Two consecutive newlines and nothing else.
assert_debug_snapshot!(line_tokenizer("\n\n"));
// Blank line between two non-empty lines.
assert_debug_snapshot!(line_tokenizer("Start\n\nEnd"));
}
}

View file

@ -0,0 +1,13 @@
---
source: src/tokenizer/line_tokenizer.rs
expression: "line_tokenizer(\"Hello\")"
snapshot_kind: text
---
[
Token {
normalised: "Hello",
original: "Hello",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,25 @@
---
source: src/tokenizer/line_tokenizer.rs
expression: "line_tokenizer(\"Hello\\nWorld\")"
snapshot_kind: text
---
[
Token {
normalised: "Hello",
original: "Hello",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "\n",
original: "\n",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "World",
original: "World",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,31 @@
---
source: src/tokenizer/line_tokenizer.rs
expression: "line_tokenizer(\"Hello\\nWorld\\n\")"
snapshot_kind: text
---
[
Token {
normalised: "Hello",
original: "Hello",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "\n",
original: "\n",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "World",
original: "World",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "\n",
original: "\n",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,25 @@
---
source: src/tokenizer/line_tokenizer.rs
expression: "line_tokenizer(\"Line 1\\r\\nLine 2\")"
snapshot_kind: text
---
[
Token {
normalised: "Line 1",
original: "Line 1",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "\r\n",
original: "\r\n",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "Line 2",
original: "Line 2",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,49 @@
---
source: src/tokenizer/line_tokenizer.rs
expression: "line_tokenizer(\"Multi\\nLine\\nText\\nHere\")"
snapshot_kind: text
---
[
Token {
normalised: "Multi",
original: "Multi",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "\n",
original: "\n",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "Line",
original: "Line",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "\n",
original: "\n",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "Text",
original: "Text",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "\n",
original: "\n",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "Here",
original: "Here",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,13 @@
---
source: src/tokenizer/line_tokenizer.rs
expression: "line_tokenizer(\"\\n\")"
snapshot_kind: text
---
[
Token {
normalised: "\n",
original: "\n",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,19 @@
---
source: src/tokenizer/line_tokenizer.rs
expression: "line_tokenizer(\"\\n\\n\")"
snapshot_kind: text
---
[
Token {
normalised: "\n",
original: "\n",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "\n",
original: "\n",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,31 @@
---
source: src/tokenizer/line_tokenizer.rs
expression: "line_tokenizer(\"Start\\n\\nEnd\")"
snapshot_kind: text
---
[
Token {
normalised: "Start",
original: "Start",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "\n",
original: "\n",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "\n",
original: "\n",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "End",
original: "End",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,6 @@
---
source: src/tokenizer/line_tokenizer.rs
expression: "line_tokenizer(\"\")"
snapshot_kind: text
---
[]