Add line tokenizer
This commit is contained in:
parent
ee5776c8e1
commit
469e62106c
13 changed files with 324 additions and 50 deletions
73
src/lib.rs
73
src/lib.rs
|
|
@ -1,8 +1,13 @@
|
|||
//! # Reconcile
|
||||
//!
|
||||
//! A library for automatically merging two conflicting versions of a
|
||||
//! document. `Reconcile` is essentially `git merge` but without any conflict
|
||||
//! markers (or lost edits) in the output.
|
||||
//! [`diff3`](https://www.gnu.org/software/diffutils/manual/html_node/Invoking-diff3.html) (or `git merge`)
|
||||
//! but with automatic conflict resolution.
|
||||
//!
|
||||
//! Reconcile is a Rust and JavaScript (through WebAssembly) library for merging
|
||||
//! text without user intervention. It automatically resolves conflicts that
|
||||
//! would typically require user action in traditional 3-way merge tools.
|
||||
//!
|
||||
//! Try out the [interactive demo](https://schmelczer.dev/reconcile)!
|
||||
//!
|
||||
//! ```
|
||||
//! use reconcile::{reconcile, BuiltinTokenizer};
|
||||
|
|
@ -22,33 +27,17 @@
|
|||
//! configurable. By default, words are the atoms for merging and thus words
|
||||
//! can't get jumbled up at the end of reconciling.
|
||||
//!
|
||||
//! ### Word-level tokenization (default)
|
||||
//! ### Built-in tokenizers
|
||||
//!
|
||||
//! ```
|
||||
//! use reconcile::{reconcile, BuiltinTokenizer};
|
||||
//!
|
||||
//! let parent = "The quick brown fox";
|
||||
//! let left = "The very quick brown fox";
|
||||
//! let right = "The quick red fox";
|
||||
//! let parent = "The quick brown fox\n";
|
||||
//! let left = "The very quick brown fox\n";
|
||||
//! let right = "The quick red fox\n";
|
||||
//!
|
||||
//! let result = reconcile(parent, &left.into(), &right.into(), &*BuiltinTokenizer::Word);
|
||||
//! assert_eq!(result.apply().text(), "The very quick red fox");
|
||||
//! ```
|
||||
//!
|
||||
//! ### Character-level tokenization
|
||||
//!
|
||||
//! If finer grained merging is required, we can make every UTF-8 character
|
||||
//! become its own token:
|
||||
//!
|
||||
//! ```
|
||||
//! use reconcile::{reconcile, BuiltinTokenizer};
|
||||
//!
|
||||
//! let parent = "Hello";
|
||||
//! let left = "Helo"; // deleted 'l'
|
||||
//! let right = "Hello!"; // added '!'
|
||||
//!
|
||||
//! let result = reconcile(parent, &left.into(), &right.into(), &*BuiltinTokenizer::Character);
|
||||
//! assert_eq!(result.apply().text(), "Helo!");
|
||||
//! let result = reconcile(parent, &left.into(), &right.into(), &*BuiltinTokenizer::Line);
|
||||
//! assert_eq!(result.apply().text(), "The quick red foxThe very quick brown fox\n");
|
||||
//! ```
|
||||
//!
|
||||
//! ### Custom tokenization
|
||||
|
|
@ -62,7 +51,12 @@
|
|||
//! // Example with custom tokenizer - split by sentences
|
||||
//! let sentence_tokenizer = |text: &str| {
|
||||
//! text.split(". ")
|
||||
//! .map(|sentence| Token::new(sentence.to_string(), sentence.to_string(), true, true))
|
||||
//! .map(|sentence| Token::new(
|
||||
//! sentence.to_string(),
|
||||
//! sentence.to_string(),
|
||||
//! false, // don't allow joining token with the preceding one
|
||||
//! false // don't allow joining token with the following one
|
||||
//! ))
|
||||
//! .collect::<Vec<_>>()
|
||||
//! };
|
||||
//!
|
||||
|
|
@ -74,6 +68,8 @@
|
|||
//! let result = reconcile(parent, &left.into(), &right.into(), &*BuiltinTokenizer::Word);
|
||||
//! assert_eq!(result.apply().text(), "Hello beautiful world. This is a great test.");
|
||||
//! ```
|
||||
//! > By setting the joinability to `false`, longer runs of inserts will be
|
||||
//! > interleaved like LRLRLR instead of LLLRRR.
|
||||
//!
|
||||
//! ## Cursors and selection ranges
|
||||
//!
|
||||
|
|
@ -103,29 +99,8 @@
|
|||
//!
|
||||
//! ## The algorithm
|
||||
//!
|
||||
//! The algorithm starts similarly to `diff3`. Its inputs are a **parent**
|
||||
//! document and two conflicting versions: `left` and `right` which have
|
||||
//! been created from the parent through any series of concurrent edits.
|
||||
//!
|
||||
//! When calling `reconcile(parent, left, right)`:
|
||||
//!
|
||||
//! 1. **Diff calculation**: 2-way diffs of (parent & left) and (parent & right)
|
||||
//! are computed using Myers' algorithm
|
||||
//! 2. **Tokenization**: The text is split into tokens at the configured
|
||||
//! granularity
|
||||
//! 3. **Operation transformation**: The resulting edits are weaved together
|
||||
//! using operational transformation principles, ensuring no changes are lost
|
||||
//! 4. **Conflict resolution**: Unlike traditional merge tools, conflicts are
|
||||
//! automatically resolved without producing conflict markers
|
||||
//!
|
||||
//! The key insight is that both insertions and deletions are preserved:
|
||||
//! - If either side inserted text, it appears in the result
|
||||
//! - If either side deleted text, the deletion is applied
|
||||
//! - Insertions into deleted regions are still preserved
|
||||
//!
|
||||
//! This approach works well for human-readable text where some "fuzziness" in
|
||||
//! conflict resolution is acceptable, unlike source code where precision is
|
||||
//! critical.
|
||||
//! For a discussion of the algorithm and architecture, see the
|
||||
//! [README](README.md#algorithm) page.
|
||||
|
||||
mod operation_transformation;
|
||||
mod raw_operation;
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
mod character_tokenizer;
|
||||
mod line_tokenizer;
|
||||
mod word_tokenizer;
|
||||
|
||||
use std::ops::Deref;
|
||||
|
|
@ -20,6 +21,7 @@ pub type Tokenizer<T> = dyn Fn(&str) -> Vec<Token<T>>;
|
|||
#[cfg(feature = "wasm")]
|
||||
pub enum BuiltinTokenizer {
|
||||
Character = "Character",
|
||||
Line = "Line",
|
||||
Word = "Word",
|
||||
}
|
||||
|
||||
|
|
@ -28,6 +30,7 @@ pub enum BuiltinTokenizer {
|
|||
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
||||
pub enum BuiltinTokenizer {
|
||||
Character,
|
||||
Line,
|
||||
Word,
|
||||
}
|
||||
|
||||
|
|
@ -37,6 +40,7 @@ impl Deref for BuiltinTokenizer {
|
|||
fn deref(&self) -> &Self::Target {
|
||||
match self {
|
||||
BuiltinTokenizer::Character => &character_tokenizer::character_tokenizer,
|
||||
BuiltinTokenizer::Line => &line_tokenizer::line_tokenizer,
|
||||
BuiltinTokenizer::Word => &word_tokenizer::word_tokenizer,
|
||||
#[cfg(feature = "wasm")]
|
||||
BuiltinTokenizer::__Invalid => panic!("Unexpected tokenizer type"),
|
||||
|
|
|
|||
70
src/tokenizer/line_tokenizer.rs
Normal file
70
src/tokenizer/line_tokenizer.rs
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
use super::token::Token;
|
||||
|
||||
/// Splits text into lines, preserving line endings as separate tokens.
|
||||
///
|
||||
/// ## Example
|
||||
///
|
||||
/// ```not_rust
|
||||
/// "Hello\nWorld!" -> ["Hello", "\n", "World!"]
|
||||
/// "Line 1\r\nLine 2" -> ["Line 1", "\r\n", "Line 2"]
|
||||
/// ```
|
||||
pub fn line_tokenizer(text: &str) -> Vec<Token<String>> {
|
||||
let mut result = Vec::new();
|
||||
let mut line_start = 0;
|
||||
|
||||
let mut chars = text.char_indices().peekable();
|
||||
while let Some((i, c)) = chars.next() {
|
||||
if c == '\n' {
|
||||
// Add line content if any
|
||||
if i > line_start {
|
||||
result.push(text[line_start..i].into());
|
||||
}
|
||||
// Add newline
|
||||
result.push("\n".into());
|
||||
line_start = i + 1;
|
||||
} else if c == '\r' && chars.peek() == Some(&(i + 1, '\n')) {
|
||||
// Handle \r\n
|
||||
if i > line_start {
|
||||
result.push(text[line_start..i].into());
|
||||
}
|
||||
chars.next(); // consume \n
|
||||
result.push("\r\n".into());
|
||||
line_start = i + 2;
|
||||
}
|
||||
}
|
||||
|
||||
// Add final line if any
|
||||
if line_start < text.len() {
|
||||
result.push(text[line_start..].into());
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
// Snapshot tests for `line_tokenizer`: every call below is compared against
// a stored `insta` debug snapshot of the returned token list.
#[cfg(test)]
|
||||
mod tests {
|
||||
use insta::assert_debug_snapshot;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_with_snapshots() {
|
||||
// Empty input.
assert_debug_snapshot!(line_tokenizer(""));
|
||||
|
||||
// Single line without a terminator.
assert_debug_snapshot!(line_tokenizer("Hello"));
|
||||
|
||||
// Two lines separated by `\n`, no trailing newline.
assert_debug_snapshot!(line_tokenizer("Hello\nWorld"));
|
||||
|
||||
// Same, but with a trailing newline.
assert_debug_snapshot!(line_tokenizer("Hello\nWorld\n"));
|
||||
|
||||
// `\r\n` terminator.
assert_debug_snapshot!(line_tokenizer("Line 1\r\nLine 2"));
|
||||
|
||||
// Several lines, last one unterminated.
assert_debug_snapshot!(line_tokenizer("Multi\nLine\nText\nHere"));
|
||||
|
||||
// A lone newline.
assert_debug_snapshot!(line_tokenizer("\n"));
|
||||
|
||||
// Consecutive newlines only.
assert_debug_snapshot!(line_tokenizer("\n\n"));
|
||||
|
||||
// Blank line between two content lines.
assert_debug_snapshot!(line_tokenizer("Start\n\nEnd"));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
---
|
||||
source: src/tokenizer/line_tokenizer.rs
|
||||
expression: "line_tokenizer(\"Hello\")"
|
||||
snapshot_kind: text
|
||||
---
|
||||
[
|
||||
Token {
|
||||
normalised: "Hello",
|
||||
original: "Hello",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
]
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
---
|
||||
source: src/tokenizer/line_tokenizer.rs
|
||||
expression: "line_tokenizer(\"Hello\\nWorld\")"
|
||||
snapshot_kind: text
|
||||
---
|
||||
[
|
||||
Token {
|
||||
normalised: "Hello",
|
||||
original: "Hello",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "\n",
|
||||
original: "\n",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "World",
|
||||
original: "World",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
]
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
---
|
||||
source: src/tokenizer/line_tokenizer.rs
|
||||
expression: "line_tokenizer(\"Hello\\nWorld\\n\")"
|
||||
snapshot_kind: text
|
||||
---
|
||||
[
|
||||
Token {
|
||||
normalised: "Hello",
|
||||
original: "Hello",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "\n",
|
||||
original: "\n",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "World",
|
||||
original: "World",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "\n",
|
||||
original: "\n",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
]
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
---
|
||||
source: src/tokenizer/line_tokenizer.rs
|
||||
expression: "line_tokenizer(\"Line 1\\r\\nLine 2\")"
|
||||
snapshot_kind: text
|
||||
---
|
||||
[
|
||||
Token {
|
||||
normalised: "Line 1",
|
||||
original: "Line 1",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "\r\n",
|
||||
original: "\r\n",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "Line 2",
|
||||
original: "Line 2",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
]
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
---
|
||||
source: src/tokenizer/line_tokenizer.rs
|
||||
expression: "line_tokenizer(\"Multi\\nLine\\nText\\nHere\")"
|
||||
snapshot_kind: text
|
||||
---
|
||||
[
|
||||
Token {
|
||||
normalised: "Multi",
|
||||
original: "Multi",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "\n",
|
||||
original: "\n",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "Line",
|
||||
original: "Line",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "\n",
|
||||
original: "\n",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "Text",
|
||||
original: "Text",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "\n",
|
||||
original: "\n",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "Here",
|
||||
original: "Here",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
]
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
---
|
||||
source: src/tokenizer/line_tokenizer.rs
|
||||
expression: "line_tokenizer(\"\\n\")"
|
||||
snapshot_kind: text
|
||||
---
|
||||
[
|
||||
Token {
|
||||
normalised: "\n",
|
||||
original: "\n",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
]
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
---
|
||||
source: src/tokenizer/line_tokenizer.rs
|
||||
expression: "line_tokenizer(\"\\n\\n\")"
|
||||
snapshot_kind: text
|
||||
---
|
||||
[
|
||||
Token {
|
||||
normalised: "\n",
|
||||
original: "\n",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "\n",
|
||||
original: "\n",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
]
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
---
|
||||
source: src/tokenizer/line_tokenizer.rs
|
||||
expression: "line_tokenizer(\"Start\\n\\nEnd\")"
|
||||
snapshot_kind: text
|
||||
---
|
||||
[
|
||||
Token {
|
||||
normalised: "Start",
|
||||
original: "Start",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "\n",
|
||||
original: "\n",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "\n",
|
||||
original: "\n",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
Token {
|
||||
normalised: "End",
|
||||
original: "End",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
]
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
---
|
||||
source: src/tokenizer/line_tokenizer.rs
|
||||
expression: "line_tokenizer(\"\")"
|
||||
snapshot_kind: text
|
||||
---
|
||||
[]
|
||||
Loading…
Add table
Add a link
Reference in a new issue