From 446bbdfe5dec0956b80d3666fdc6390bcd9f85de Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Wed, 11 Mar 2026 20:39:04 +0000 Subject: [PATCH 1/5] Implement markdown tokeniser --- src/tokenizer/markdown_tokenizer.rs | 290 ++++++++++++++++++++++++++++ src/tokenizer/word_tokenizer.rs | 32 +-- 2 files changed, 310 insertions(+), 12 deletions(-) create mode 100644 src/tokenizer/markdown_tokenizer.rs diff --git a/src/tokenizer/markdown_tokenizer.rs b/src/tokenizer/markdown_tokenizer.rs new file mode 100644 index 0000000..b69df8f --- /dev/null +++ b/src/tokenizer/markdown_tokenizer.rs @@ -0,0 +1,290 @@ +use super::{token::Token, word_tokenizer::split_words}; + +/// Splits markdown text into tokens that respect markdown formatting structure +/// +/// Builds on word-level tokenization with markdown-specific handling: +/// - Newlines are non-joinable tokens (preserves block structure) +/// - Block-level prefixes (headings, list markers, blockquotes) attach to the +/// first word of their line so they can't be split apart during merge +/// - Intra-line whitespace uses the same normalization as the word tokenizer +/// +/// This prevents merges from breaking lists, headings, or other structural +/// markdown elements. Inline formatting like `**bold**` is already preserved +/// by word-level splitting since formatting markers contain no whitespace. 
+/// +/// ## Example +/// +/// ```not_rust +/// "# Hello\n- item" -> ["# Hello", "\n", "- item"] +/// ``` +pub fn markdown_tokenizer(text: &str) -> Vec> { + let mut result = Vec::new(); + let segments = split_preserving_newlines(text); + + for segment in &segments { + if *segment == "\n" || *segment == "\r\n" { + let s = (*segment).to_owned(); + result.push(Token::new(s.clone(), s, false, false)); + continue; + } + + let prefix_len = block_prefix_len(segment); + let mut line_tokens = split_words(&segment[prefix_len..]); + + if prefix_len > 0 { + let prefix = &segment[..prefix_len]; + if line_tokens.is_empty() { + let s = prefix.to_owned(); + result.push(Token::new(s.clone(), s, false, false)); + } else { + let first = &line_tokens[0]; + let combined_original = format!("{prefix}{}", first.original()); + let combined_normalized = format!("{prefix}{}", first.normalized()); + line_tokens[0] = Token::new( + combined_normalized, + combined_original, + false, + first.is_right_joinable, + ); + } + } + + result.extend(line_tokens); + } + + // Normalize non-newline whitespace tokens by appending the next token's + // original text (same trick as the word tokenizer so each space is unique + // in the diff based on what follows it) + if !result.is_empty() { + for i in 0..result.len() - 1 { + if result[i] + .original() + .chars() + .all(|c| c.is_whitespace() && c != '\n' && c != '\r') + { + let normalized = result[i].normalized().to_owned() + result[i + 1].original(); + result[i].set_normalized(normalized); + } + } + } + + result +} + +/// Splits text into alternating segments of line content and newline separators +fn split_preserving_newlines(text: &str) -> Vec<&str> { + let mut segments = Vec::new(); + let mut line_start = 0; + let bytes = text.as_bytes(); + let mut i = 0; + + while i < bytes.len() { + if bytes[i] == b'\r' && i + 1 < bytes.len() && bytes[i + 1] == b'\n' { + if i > line_start { + segments.push(&text[line_start..i]); + } + segments.push(&text[i..i + 2]); + i 
+= 2; + line_start = i; + } else if bytes[i] == b'\n' { + if i > line_start { + segments.push(&text[line_start..i]); + } + segments.push(&text[i..=i]); + i += 1; + line_start = i; + } else { + i += 1; + } + } + + if line_start < text.len() { + segments.push(&text[line_start..]); + } + + segments +} + +/// Returns the byte length of a markdown block-level prefix at the start of a +/// line, or 0 if none is found +/// +/// All recognized prefix characters are ASCII, so byte offsets are always +/// valid UTF-8 boundaries. +/// +/// Recognized prefixes: +/// - ATX headings: `# ` through `###### ` +/// - Blockquotes: `> ` (single level) +/// - Unordered lists: `- `, `* `, `+ ` (with optional leading whitespace) +/// - Ordered lists: `1. `, `2) ` etc (with optional leading whitespace) +/// - Task lists: `- [ ] `, `- [x] `, `- [X] ` etc (checkbox included in prefix) +fn block_prefix_len(line: &str) -> usize { + let trimmed = line.trim_start_matches([' ', '\t']); + let indent_len = line.len() - trimmed.len(); + + // ATX heading: #{1,6} followed by a space + if trimmed.starts_with('#') { + let hash_count = trimmed.bytes().take_while(|&b| b == b'#').count(); + if hash_count <= 6 && trimmed.as_bytes().get(hash_count) == Some(&b' ') { + return indent_len + hash_count + 1; + } + } + + // Blockquote: > followed by optional space + if trimmed.starts_with("> ") { + return indent_len + 2; + } + if trimmed.starts_with('>') && (trimmed.len() == 1 || trimmed.as_bytes()[1] == b'>') { + return indent_len + 1; + } + + // Unordered list: [-*+] followed by a space, optionally with task checkbox + if trimmed.len() >= 2 { + let first_byte = trimmed.as_bytes()[0]; + if matches!(first_byte, b'-' | b'*' | b'+') && trimmed.as_bytes()[1] == b' ' { + return indent_len + 2 + task_checkbox_len(&line[indent_len + 2..]); + } + } + + // Ordered list: digits followed by [.)] and a space, optionally with task + // checkbox + let digit_count = trimmed.bytes().take_while(u8::is_ascii_digit).count(); + if 
digit_count > 0 && indent_len + digit_count + 2 <= line.len() { + let after_digits = trimmed.as_bytes()[digit_count]; + let after_marker = trimmed.as_bytes().get(digit_count + 1); + if matches!(after_digits, b'.' | b')') && after_marker == Some(&b' ') { + return indent_len + + digit_count + + 2 + + task_checkbox_len(&line[indent_len + digit_count + 2..]); + } + } + + 0 +} + +/// Returns the byte length of a task list checkbox (`[ ] `, `[x] `, `[X] `) +/// at the start of `rest`, or 0 if none is found +fn task_checkbox_len(rest: &str) -> usize { + if rest.len() >= 4 + && rest.as_bytes()[0] == b'[' + && matches!(rest.as_bytes()[1], b' ' | b'x' | b'X') + && rest.as_bytes()[2] == b']' + && rest.as_bytes()[3] == b' ' + { + 4 + } else { + 0 + } +} + +#[cfg(test)] +mod tests { + use insta::assert_debug_snapshot; + + use super::*; + + #[test] + fn test_plain_text() { + assert_debug_snapshot!(markdown_tokenizer("Hello world")); + } + + #[test] + fn test_empty() { + assert_debug_snapshot!(markdown_tokenizer("")); + } + + #[test] + fn test_headings() { + assert_debug_snapshot!(markdown_tokenizer("# Hello world")); + assert_debug_snapshot!(markdown_tokenizer("## Sub heading")); + assert_debug_snapshot!(markdown_tokenizer("###### Deep heading")); + } + + #[test] + fn test_unordered_list() { + assert_debug_snapshot!(markdown_tokenizer("- item one\n- item two\n- item three")); + } + + #[test] + fn test_ordered_list() { + assert_debug_snapshot!(markdown_tokenizer("1. first\n2. second\n3. 
third")); + } + + #[test] + fn test_blockquote() { + assert_debug_snapshot!(markdown_tokenizer("> quoted text\n> more quoted")); + } + + #[test] + fn test_inline_formatting() { + assert_debug_snapshot!(markdown_tokenizer("Some **bold** and *italic* text")); + } + + #[test] + fn test_mixed_content() { + assert_debug_snapshot!(markdown_tokenizer( + "# Title\n\nSome text with **bold**.\n\n- list item\n- another item" + )); + } + + #[test] + fn test_indented_list() { + assert_debug_snapshot!(markdown_tokenizer(" - nested item\n - deeper")); + } + + #[test] + fn test_crlf() { + assert_debug_snapshot!(markdown_tokenizer("Line 1\r\nLine 2")); + } + + #[test] + fn test_code_fence() { + assert_debug_snapshot!(markdown_tokenizer("```rust\nlet x = 1;\n```")); + } + + #[test] + fn test_heading_only() { + assert_debug_snapshot!(markdown_tokenizer("# ")); + } + + #[test] + fn test_link() { + assert_debug_snapshot!(markdown_tokenizer("Click [here](https://example.com) now")); + } + + #[test] + fn test_multiline_paragraph() { + assert_debug_snapshot!(markdown_tokenizer( + "First line\nSecond line\n\nNew paragraph" + )); + } + + #[test] + fn test_list_with_star_marker() { + assert_debug_snapshot!(markdown_tokenizer("* item one\n* item two")); + } + + #[test] + fn test_bold_not_confused_with_list() { + assert_debug_snapshot!(markdown_tokenizer("**bold text**")); + } + + #[test] + fn test_task_list() { + assert_debug_snapshot!(markdown_tokenizer( + "- [ ] todo\n- [x] done\n- [X] also done" + )); + } + + #[test] + fn test_ordered_task_list() { + assert_debug_snapshot!(markdown_tokenizer("1. [ ] first task\n2. 
[x] second task")); + } + + #[test] + fn test_unicode() { + assert_debug_snapshot!(markdown_tokenizer( + "# \u{1F600} Héllo\n- \u{00E9}lément\n> \u{4F60}\u{597D} world" + )); + } +} diff --git a/src/tokenizer/word_tokenizer.rs b/src/tokenizer/word_tokenizer.rs index e486825..f3f146b 100644 --- a/src/tokenizer/word_tokenizer.rs +++ b/src/tokenizer/word_tokenizer.rs @@ -9,6 +9,26 @@ use super::token::Token; /// "Hi there!" -> ["Hi", " ", "there!"] /// ``` pub fn word_tokenizer(text: &str) -> Vec> { + let mut result = split_words(text); + + if result.is_empty() { + return result; + } + + // normalize whitespace tokens by concatenating with the following token + for i in 0..result.len() - 1 { + if result[i].original().chars().all(char::is_whitespace) { + let normalized = result[i].normalized().to_owned() + result[i + 1].original(); + result[i].set_normalized(normalized); + } + } + + result +} + +/// Splits text into alternating word and whitespace tokens without any +/// normalization. Shared by `word_tokenizer` and `markdown_tokenizer`. 
+pub(super) fn split_words(text: &str) -> Vec> { let mut result = Vec::new(); let mut previous_boundary_index = 0; @@ -28,18 +48,6 @@ pub fn word_tokenizer(text: &str) -> Vec> { result.push(text[previous_boundary_index..].into()); } - if result.is_empty() { - return result; - } - - // normalize whitespace tokens by concatenating with the following token - for i in 0..result.len() - 1 { - if result[i].original().chars().all(char::is_whitespace) { - let normalized = result[i].normalized().to_owned() + result[i + 1].original(); - result[i].set_normalized(normalized); - } - } - result } From 9da5bf6e3e5d150fbd4629ea8ccf6e03e129ebea Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Wed, 11 Mar 2026 20:39:14 +0000 Subject: [PATCH 2/5] Add snapshots --- ...e_tokenizer__tests__with_snapshots-10.snap | 36 ++++++ ...markdown_tokenizer__tests__blockquote.snap | 48 +++++++ ...r__tests__bold_not_confused_with_list.snap | 24 ++++ ...markdown_tokenizer__tests__code_fence.snap | 72 +++++++++++ ...izer__markdown_tokenizer__tests__crlf.snap | 48 +++++++ ...zer__markdown_tokenizer__tests__empty.snap | 5 + ...rkdown_tokenizer__tests__heading_only.snap | 12 ++ ...__markdown_tokenizer__tests__headings.snap | 24 ++++ ...kdown_tokenizer__tests__indented_list.snap | 36 ++++++ ...n_tokenizer__tests__inline_formatting.snap | 60 +++++++++ ...izer__markdown_tokenizer__tests__link.snap | 36 ++++++ ...kenizer__tests__list_with_star_marker.snap | 48 +++++++ ...kdown_tokenizer__tests__mixed_content.snap | 120 ++++++++++++++++++ ...tokenizer__tests__multiline_paragraph.snap | 78 ++++++++++++ ...rkdown_tokenizer__tests__ordered_list.snap | 36 ++++++ ...n_tokenizer__tests__ordered_task_list.snap | 48 +++++++ ...markdown_tokenizer__tests__plain_text.snap | 24 ++++ ..._markdown_tokenizer__tests__task_list.snap | 48 +++++++ ...r__markdown_tokenizer__tests__unicode.snap | 60 +++++++++ ...down_tokenizer__tests__unordered_list.snap | 72 +++++++++++ 20 files changed, 935 insertions(+) create mode 
100644 src/tokenizer/snapshots/reconcile_text__tokenizer__line_tokenizer__tests__with_snapshots-10.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__blockquote.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__bold_not_confused_with_list.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__code_fence.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__crlf.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__empty.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__heading_only.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__headings.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__indented_list.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__inline_formatting.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__link.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__list_with_star_marker.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__mixed_content.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__multiline_paragraph.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__ordered_list.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__ordered_task_list.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__plain_text.snap create mode 100644 
src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__task_list.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__unicode.snap create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__unordered_list.snap diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__line_tokenizer__tests__with_snapshots-10.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__line_tokenizer__tests__with_snapshots-10.snap new file mode 100644 index 0000000..22daf9f --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__line_tokenizer__tests__with_snapshots-10.snap @@ -0,0 +1,36 @@ +--- +source: src/tokenizer/line_tokenizer.rs +expression: "line_tokenizer(\"Old\\rMac\\rStyle\")" +--- +[ + Token { + normalized: "Old", + original: "Old", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\r", + original: "\r", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "Mac", + original: "Mac", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\r", + original: "\r", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "Style", + original: "Style", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__blockquote.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__blockquote.snap new file mode 100644 index 0000000..6bbf317 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__blockquote.snap @@ -0,0 +1,48 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\"> quoted text\\n> more quoted\")" +--- +[ + Token { + normalized: "> quoted", + original: "> quoted", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + 
normalized: " text", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "text", + original: "text", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "> more", + original: "> more", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: " quoted", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "quoted", + original: "quoted", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__bold_not_confused_with_list.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__bold_not_confused_with_list.snap new file mode 100644 index 0000000..92a0ffa --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__bold_not_confused_with_list.snap @@ -0,0 +1,24 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\"**bold text**\")" +--- +[ + Token { + normalized: "**bold", + original: "**bold", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " text**", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "text**", + original: "text**", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__code_fence.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__code_fence.snap new file mode 100644 index 0000000..a951f58 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__code_fence.snap @@ -0,0 +1,72 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: 
"markdown_tokenizer(\"```rust\\nlet x = 1;\\n```\")" +--- +[ + Token { + normalized: "```rust", + original: "```rust", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "let", + original: "let", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " x", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "x", + original: "x", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " =", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "=", + original: "=", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " 1;", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "1;", + original: "1;", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "```", + original: "```", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__crlf.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__crlf.snap new file mode 100644 index 0000000..d1817c2 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__crlf.snap @@ -0,0 +1,48 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\"Line 1\\r\\nLine 2\")" +--- +[ + Token { + normalized: "Line", + original: "Line", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " 1", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "1", + original: "1", + is_left_joinable: 
true, + is_right_joinable: true, + }, + Token { + normalized: "\r\n", + original: "\r\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "Line", + original: "Line", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " 2", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "2", + original: "2", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__empty.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__empty.snap new file mode 100644 index 0000000..aa65c19 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__empty.snap @@ -0,0 +1,5 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\"\")" +--- +[] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__heading_only.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__heading_only.snap new file mode 100644 index 0000000..0d10e0f --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__heading_only.snap @@ -0,0 +1,12 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\"# \")" +--- +[ + Token { + normalized: "# ", + original: "# ", + is_left_joinable: false, + is_right_joinable: false, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__headings.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__headings.snap new file mode 100644 index 0000000..0c3e98e --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__headings.snap @@ -0,0 +1,24 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\"# Hello 
world\")" +--- +[ + Token { + normalized: "# Hello", + original: "# Hello", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: " world", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "world", + original: "world", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__indented_list.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__indented_list.snap new file mode 100644 index 0000000..5ea77f7 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__indented_list.snap @@ -0,0 +1,36 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\" - nested item\\n - deeper\")" +--- +[ + Token { + normalized: " - nested", + original: " - nested", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: " item", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "item", + original: "item", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: " - deeper", + original: " - deeper", + is_left_joinable: false, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__inline_formatting.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__inline_formatting.snap new file mode 100644 index 0000000..6744a2b --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__inline_formatting.snap @@ -0,0 +1,60 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\"Some **bold** and *italic* text\")" +--- +[ + Token { + normalized: 
"Some", + original: "Some", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " **bold**", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "**bold**", + original: "**bold**", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " and", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "and", + original: "and", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " *italic*", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "*italic*", + original: "*italic*", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " text", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "text", + original: "text", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__link.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__link.snap new file mode 100644 index 0000000..79e3afb --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__link.snap @@ -0,0 +1,36 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\"Click [here](https://example.com) now\")" +--- +[ + Token { + normalized: "Click", + original: "Click", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " [here](https://example.com)", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "[here](https://example.com)", + original: "[here](https://example.com)", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " now", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + 
normalized: "now", + original: "now", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__list_with_star_marker.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__list_with_star_marker.snap new file mode 100644 index 0000000..3273a24 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__list_with_star_marker.snap @@ -0,0 +1,48 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\"* item one\\n* item two\")" +--- +[ + Token { + normalized: "* item", + original: "* item", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: " one", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "one", + original: "one", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "* item", + original: "* item", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: " two", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "two", + original: "two", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__mixed_content.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__mixed_content.snap new file mode 100644 index 0000000..78458af --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__mixed_content.snap @@ -0,0 +1,120 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\"# Title\\n\\nSome text with **bold**.\\n\\n- list item\\n- another item\")" +--- +[ + Token { + normalized: "# Title", + 
original: "# Title", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "Some", + original: "Some", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " text", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "text", + original: "text", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " with", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "with", + original: "with", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " **bold**.", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "**bold**.", + original: "**bold**.", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "- list", + original: "- list", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: " item", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "item", + original: "item", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "- another", + original: "- another", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: " item", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "item", + 
original: "item", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__multiline_paragraph.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__multiline_paragraph.snap new file mode 100644 index 0000000..244f515 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__multiline_paragraph.snap @@ -0,0 +1,78 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\"First line\\nSecond line\\n\\nNew paragraph\")" +--- +[ + Token { + normalized: "First", + original: "First", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " line", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "line", + original: "line", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "Second", + original: "Second", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " line", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "line", + original: "line", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "New", + original: "New", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " paragraph", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "paragraph", + original: "paragraph", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git 
a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__ordered_list.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__ordered_list.snap new file mode 100644 index 0000000..c465053 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__ordered_list.snap @@ -0,0 +1,36 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\"1. first\\n2. second\\n3. third\")" +--- +[ + Token { + normalized: "1. first", + original: "1. first", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "2. second", + original: "2. second", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "3. third", + original: "3. third", + is_left_joinable: false, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__ordered_task_list.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__ordered_task_list.snap new file mode 100644 index 0000000..55a8001 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__ordered_task_list.snap @@ -0,0 +1,48 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\"1. [ ] first task\\n2. [x] second task\")" +--- +[ + Token { + normalized: "1. [ ] first", + original: "1. 
[ ] first", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: " task", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "task", + original: "task", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "2. [x] second", + original: "2. [x] second", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: " task", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "task", + original: "task", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__plain_text.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__plain_text.snap new file mode 100644 index 0000000..0785a2f --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__plain_text.snap @@ -0,0 +1,24 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\"Hello world\")" +--- +[ + Token { + normalized: "Hello", + original: "Hello", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: " world", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "world", + original: "world", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__task_list.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__task_list.snap new file mode 100644 index 0000000..9789b13 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__task_list.snap @@ -0,0 +1,48 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs 
+expression: "markdown_tokenizer(\"- [ ] todo\\n- [x] done\\n- [X] also done\")" +--- +[ + Token { + normalized: "- [ ] todo", + original: "- [ ] todo", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "- [x] done", + original: "- [x] done", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "- [X] also", + original: "- [X] also", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: " done", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "done", + original: "done", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__unicode.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__unicode.snap new file mode 100644 index 0000000..e5d1beb --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__unicode.snap @@ -0,0 +1,60 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\"# \\u{1F600} Héllo\\n- \\u{00E9}lément\\n> \\u{4F60}\\u{597D} world\")" +--- +[ + Token { + normalized: "# 😀", + original: "# 😀", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: " Héllo", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "Héllo", + original: "Héllo", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "- élément", + original: "- élément", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + 
normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "> 你好", + original: "> 你好", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: " world", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "world", + original: "world", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__unordered_list.snap b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__unordered_list.snap new file mode 100644 index 0000000..b50f920 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__unordered_list.snap @@ -0,0 +1,72 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +expression: "markdown_tokenizer(\"- item one\\n- item two\\n- item three\")" +--- +[ + Token { + normalized: "- item", + original: "- item", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: " one", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "one", + original: "one", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "- item", + original: "- item", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: " two", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "two", + original: "two", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\n", + original: "\n", + is_left_joinable: false, + is_right_joinable: false, + }, + Token { + normalized: "- item", + original: "- item", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: " three", + original: " ", 
+ is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "three", + original: "three", + is_left_joinable: true, + is_right_joinable: true, + }, +] From bbe3b7573ad77762dec811964cf0e978401fc6b2 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Wed, 11 Mar 2026 20:39:36 +0000 Subject: [PATCH 3/5] Update website --- examples/website/src/index.html | 22 ++++++++++------------ examples/website/src/index.ts | 16 ++++++++++------ reconcile-js/src/index.ts | 2 +- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/examples/website/src/index.html b/examples/website/src/index.html index 03c7b08..2bfe8a6 100644 --- a/examples/website/src/index.html +++ b/examples/website/src/index.html @@ -108,13 +108,7 @@ diff --git a/examples/website/src/index.ts b/examples/website/src/index.ts index 9ce99a5..01edfd8 100644 --- a/examples/website/src/index.ts +++ b/examples/website/src/index.ts @@ -10,7 +10,11 @@ const tokenizerRadios = document.querySelectorAll( 'input[name="tokenizer"]' ) as NodeListOf; -const sampleText = `The "reconcile-text" Rust library is embedded on this page as a WASM module and powers these text boxes. Experiment with changing the "Original", "First user's edit", and "Second user's edit" text boxes to see competing changes get merged in real-time within the "Merged result" box. Here, you will see color-coded tokens marking the origin of each token, including ones that got deleted. The result highly depends on the tokenisation strategy, for example, deciding how casing or whitespace is taken into account.`; +const sampleText = `The reconcile-text library is embedded on this page as a WASM module and powers these text boxes. Experiment with changing the "Original", "First user's edit", and "Second user's edit" text boxes to see competing changes get merged in real-time within the "Merged result" box. + +Here, you will see color-coded tokens marking the origin of each token, including ones that got deleted. 
The result highly depends on the tokenisation strategy which may be: +- Character-based +- Word-based`; let pendingUpdate: number | null = null; function scheduleUpdate(): void { @@ -52,10 +56,10 @@ function loadSample(): void { originalTextArea.value = sampleText; leftTextArea.value = sampleText.replace('color', 'colour') + - " Check out what's the most complex conflict you can come up with!"; - rightTextArea.value = sampleText - .replace(', for example,', ' such as') - .replace('WASM', 'WebAssembly'); + "\n- Line-based\n\nCheck out what's the most complex conflict you can come up with!"; + rightTextArea.value = + sampleText.replace(', for example,', ' such as').replace('WASM', 'WebAssembly') + + '\n- Or your custom tokeniser'; } function updateMergedText(): void { @@ -191,7 +195,7 @@ function createSelectionOverlay(isLeft: boolean, isSelection: boolean): HTMLSpan function getSelectedTokenizer(): BuiltinTokenizer { const selectedRadio = Array.from(tokenizerRadios).find((radio) => radio.checked); - return (selectedRadio?.value ?? 'Word') as BuiltinTokenizer; + return (selectedRadio?.value ?? 
'Markdown') as BuiltinTokenizer; } function resizeTextAreas(): void { diff --git a/reconcile-js/src/index.ts b/reconcile-js/src/index.ts index 08edfc4..d00051c 100644 --- a/reconcile-js/src/index.ts +++ b/reconcile-js/src/index.ts @@ -12,7 +12,7 @@ import { import wasmBytes from 'reconcile-text/reconcile_text_bg.wasm'; // Define the enum values as const arrays to avoid duplication -const BUILTIN_TOKENIZERS = ['Character', 'Line', 'Word'] as const; +const BUILTIN_TOKENIZERS = ['Character', 'Line', 'Markdown', 'Word'] as const; const HISTORY_VALUES = [ 'Unchanged', 'AddedFromLeft', From 5978f73c971fdb29a33fd2c920029fa11eb09e5b Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Wed, 11 Mar 2026 20:43:34 +0000 Subject: [PATCH 4/5] Minimise allocations --- src/operation_transformation/edited_text.rs | 177 ++++++++++++-------- src/operation_transformation/operation.rs | 148 ++++++++++------ 2 files changed, 211 insertions(+), 114 deletions(-) diff --git a/src/operation_transformation/edited_text.rs b/src/operation_transformation/edited_text.rs index 602ae74..2a98259 100644 --- a/src/operation_transformation/edited_text.rs +++ b/src/operation_transformation/edited_text.rs @@ -1,10 +1,10 @@ -use std::{fmt::Debug, vec}; +use std::fmt::Debug; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::{ - BuiltinTokenizer, CursorPosition, TextWithCursors, + BuiltinTokenizer, CursorPosition, TextWithCursors, Token, operation_transformation::{ DiffError, Operation, utils::{cook_operations::cook_operations, elongate_operations::elongate_operations}, @@ -55,6 +55,7 @@ where { /// Create an `EditedText` from the given original and updated strings /// using the provided tokenizer + #[must_use] pub fn from_strings_with_tokenizer( original: &'a str, updated: &TextWithCursors, @@ -134,24 +135,21 @@ where let mut last_right_op = None; loop { - let (side, operation, mut last_other_op) = - match (maybe_left_op.clone(), maybe_right_op.clone()) { - (Some(left_op), 
Some(right_op)) => { - if left_op - .get_sort_key(seen_left_length) - .partial_cmp(&right_op.get_sort_key(seen_right_length)) - == Some(std::cmp::Ordering::Less) - { - (Side::Left, left_op, last_right_op.clone()) - } else { - (Side::Right, right_op, last_left_op.clone()) - } + let (side, operation) = match (maybe_left_op.as_ref(), maybe_right_op.as_ref()) { + (Some(left_op), Some(right_op)) => { + if left_op.cmp_priority(seen_left_length, right_op, seen_right_length) + == std::cmp::Ordering::Less + { + (Side::Left, maybe_left_op.take().unwrap()) + } else { + (Side::Right, maybe_right_op.take().unwrap()) } + } - (Some(left_op), None) => (Side::Left, left_op, last_right_op.clone()), - (None, Some(right_op)) => (Side::Right, right_op, last_left_op.clone()), - (None, None) => break, - }; + (Some(_), None) => (Side::Left, maybe_left_op.take().unwrap()), + (None, Some(_)) => (Side::Right, maybe_right_op.take().unwrap()), + (None, None) => break, + }; let is_advancing_operation = matches!( operation, @@ -161,7 +159,7 @@ where let original_length = operation.len(); let (side, result) = match side { Side::Left => { - let result = operation.merge_operations(&mut last_other_op); + let result = operation.merge_operations(last_right_op.as_ref()); if let ref op @ (Operation::Insert { .. } | Operation::Equal { .. }) = result { let merged_length_signed = isize::try_from(merged_length) @@ -195,7 +193,7 @@ where (Side::Left, result) } Side::Right => { - let result = operation.merge_operations(&mut last_other_op); + let result = operation.merge_operations(last_left_op.as_ref()); if let ref op @ (Operation::Insert { .. } | Operation::Equal { .. 
}) = result { let merged_length_signed = isize::try_from(merged_length) @@ -304,6 +302,7 @@ where /// ``` #[must_use] pub fn apply_with_history(&self) -> Vec { + let chars: Vec = self.text.chars().collect(); let mut builder: StringBuilder<'_> = StringBuilder::new(self.text); let mut history = Vec::with_capacity(self.operations.len()); @@ -315,34 +314,26 @@ where Operation::Equal { .. } => { history.push(SpanWithHistory::new(builder.take(), History::Unchanged)); } - Operation::Insert { .. } => match side { - Side::Left => { - history.push(SpanWithHistory::new(builder.take(), History::AddedFromLeft)); - } - Side::Right => history.push(SpanWithHistory::new( - builder.take(), - History::AddedFromRight, - )), - }, + Operation::Insert { .. } => { + let h = match side { + Side::Left => History::AddedFromLeft, + Side::Right => History::AddedFromRight, + }; + history.push(SpanWithHistory::new(builder.take(), h)); + } Operation::Delete { deleted_character_count, order, .. } => { - let deleted: String = self - .text - .chars() - .skip(*order) - .take(*deleted_character_count) + let deleted: String = chars[*order..*order + *deleted_character_count] + .iter() .collect(); - match side { - Side::Left => { - history.push(SpanWithHistory::new(deleted, History::RemovedFromLeft)); - } - Side::Right => { - history.push(SpanWithHistory::new(deleted, History::RemovedFromRight)); - } - } + let h = match side { + Side::Left => History::RemovedFromLeft, + Side::Right => History::RemovedFromRight, + }; + history.push(SpanWithHistory::new(deleted, h)); } } } @@ -350,6 +341,56 @@ where history } + /// Apply the operations and return both the merged text with cursors and + /// the provenance history in a single pass + #[must_use] + pub fn apply_with_all(&self) -> (TextWithCursors, Vec) { + let chars: Vec = self.text.chars().collect(); + let mut builder: StringBuilder<'_> = StringBuilder::new(self.text); + let mut history = Vec::with_capacity(self.operations.len()); + let mut full_text = 
String::new(); + + for (operation, side) in self.operations.iter().zip(self.operation_sides.iter()) { + builder = operation.apply(builder); + + match operation { + Operation::Equal { .. } => { + let span = builder.take(); + full_text.push_str(&span); + history.push(SpanWithHistory::new(span, History::Unchanged)); + } + Operation::Insert { .. } => { + let span = builder.take(); + full_text.push_str(&span); + let h = match side { + Side::Left => History::AddedFromLeft, + Side::Right => History::AddedFromRight, + }; + history.push(SpanWithHistory::new(span, h)); + } + Operation::Delete { + deleted_character_count, + order, + .. + } => { + let deleted: String = chars[*order..*order + *deleted_character_count] + .iter() + .collect(); + let h = match side { + Side::Left => History::RemovedFromLeft, + Side::Right => History::RemovedFromRight, + }; + history.push(SpanWithHistory::new(deleted, h)); + } + } + } + + ( + TextWithCursors::new(full_text, self.cursors.clone()), + history, + ) + } + /// Convert the `EditedText` into a terse representation ready for /// serialization. The result omits cursor positions and the original text. /// This is useful for sending text diffs over the network if there's a @@ -358,11 +399,11 @@ where /// Inserts are strings, deletes are negative integers (character count), /// and retained spans are positive integers (character count). /// - /// # Panics + /// # Errors /// - /// Panics if there's an integer overflow in i64. - #[must_use] - pub fn to_diff(&self) -> Vec { + /// Returns `DiffError::IntegerOverflow` if a character count exceeds + /// `i64::MAX`. + pub fn to_diff(&self) -> Result, DiffError> { let mut result: Vec = Vec::with_capacity(self.operations.len()); let mut previous_equal: Option = None; @@ -378,16 +419,14 @@ where Operation::Insert { text, .. 
} => { if let Some(prev_length) = previous_equal { - result.push(NumberOrText::Number( - i64::try_from(prev_length).expect("prev_length must fit in i64"), - )); + result + .push(NumberOrText::Number(i64::try_from(prev_length).map_err( + |_| DiffError::IntegerOverflow { value: prev_length }, + )?)); previous_equal = None; } - let text: String = text - .iter() - .map(super::super::tokenizer::token::Token::original) - .collect(); + let text: String = text.iter().map(Token::original).collect(); result.push(NumberOrText::Text(text)); } @@ -396,26 +435,31 @@ where .. } => { if let Some(prev_length) = previous_equal { - result.push(NumberOrText::Number( - i64::try_from(prev_length).expect("prev_length must fit in i64"), - )); + result + .push(NumberOrText::Number(i64::try_from(prev_length).map_err( + |_| DiffError::IntegerOverflow { value: prev_length }, + )?)); previous_equal = None; } - let count = i64::try_from(*deleted_character_count) - .expect("deleted_character_count must fit in i64"); + let count = i64::try_from(*deleted_character_count).map_err(|_| { + DiffError::IntegerOverflow { + value: *deleted_character_count, + } + })?; result.push(NumberOrText::Number(-count)); } } } if let Some(prev_length) = previous_equal { - result.push(NumberOrText::Number( - i64::try_from(prev_length).expect("prev_length must fit in i64"), - )); + result + .push(NumberOrText::Number(i64::try_from(prev_length).map_err( + |_| DiffError::IntegerOverflow { value: prev_length }, + )?)); } - result + Ok(result) } /// Reconstruct an `EditedText` from a diff and the original text. 
@@ -435,7 +479,8 @@ where ) -> Result, DiffError> { let mut operations: Vec> = Vec::with_capacity(diff.len()); let mut order = 0; - let text_length = original_text.chars().count(); + let chars: Vec = original_text.chars().collect(); + let text_length = chars.len(); for item in diff { match item { @@ -453,7 +498,7 @@ where } let original_characters: String = - original_text.chars().skip(order).take(length).collect(); + chars[order..order + length].iter().collect(); let original_tokens = tokenizer(&original_characters); for token in original_tokens { @@ -590,7 +635,7 @@ mod tests { let original = "Merging text is hard!"; let changes = "Merging text is easy with reconcile!"; let result = EditedText::from_strings(original, &changes.into()); - let serialized = serde_yaml::to_string(&result.to_diff()).unwrap(); + let serialized = serde_yaml::to_string(&result.to_diff().unwrap()).unwrap(); let expected = concat!("- 15\n", "- -6\n", "- ' easy with reconcile!'\n",); assert_eq!(serialized, expected); @@ -622,7 +667,7 @@ mod tests { let edited_text = EditedText::from_strings(original, &updated.into()); - let changes = edited_text.to_diff(); + let changes = edited_text.to_diff().unwrap(); let deserialized_edited_text = EditedText::from_diff(original, changes, &*BuiltinTokenizer::Word).unwrap(); diff --git a/src/operation_transformation/operation.rs b/src/operation_transformation/operation.rs index 28409f7..9d06639 100644 --- a/src/operation_transformation/operation.rs +++ b/src/operation_transformation/operation.rs @@ -104,28 +104,55 @@ where } } - pub fn get_sort_key(&self, insertion_index: usize) -> (usize, usize, usize, String) { - ( - self.order(), - match self { - Operation::Delete { .. } => 1, - Operation::Insert { .. } => 2, - Operation::Equal { .. } => 3, - }, - insertion_index, - // Make sure that the ordering is deterministic regardless of which text - // is left or right. - match self { - Operation::Equal { length, .. 
} => length.to_string(), - Operation::Insert { text, .. } => { - text.iter().map(Token::original).collect::() - } + fn type_priority(&self) -> u8 { + match self { + Operation::Delete { .. } => 1, + Operation::Insert { .. } => 2, + Operation::Equal { .. } => 3, + } + } + + /// Compare two operations for processing order during merging. Uses + /// (order, type, `insertion_index`) with a deterministic content + /// tiebreaker that avoids allocating. + pub fn cmp_priority( + &self, + self_index: usize, + other: &Self, + other_index: usize, + ) -> std::cmp::Ordering { + self.order() + .cmp(&other.order()) + .then_with(|| self.type_priority().cmp(&other.type_priority())) + .then_with(|| self_index.cmp(&other_index)) + .then_with(|| self.deterministic_content_cmp(other)) + } + + /// Deterministic tiebreaker based on operation content, so that merge + /// results are identical regardless of which side is left vs right + fn deterministic_content_cmp(&self, other: &Self) -> std::cmp::Ordering { + match (self, other) { + (Operation::Insert { text: t1, .. }, Operation::Insert { text: t2, .. }) => { + let s1 = t1.iter().flat_map(|t| t.original().chars()); + let s2 = t2.iter().flat_map(|t| t.original().chars()); + s1.cmp(s2) + } + (Operation::Equal { length: l1, .. }, Operation::Equal { length: l2, .. }) => { + l1.cmp(l2) + } + ( Operation::Delete { - deleted_character_count, + deleted_character_count: c1, .. - } => deleted_character_count.to_string(), - }, - ) + }, + Operation::Delete { + deleted_character_count: c2, + .. + }, + ) => c1.cmp(c2), + // Different types are already ordered by type_priority + _ => std::cmp::Ordering::Equal, + } } /// Applies the operation to the given `StringBuilder`, returning the @@ -193,10 +220,9 @@ where } /// Adjusts this operation based on `previous_operation` from the other side - /// to avoid duplicating or conflicting changes. Updates - /// `previous_operation` in-place. 
+ /// to avoid duplicating or conflicting changes #[allow(clippy::too_many_lines)] - pub fn merge_operations(self, previous_operation: &mut Option) -> Operation { + pub fn merge_operations(self, previous_operation: Option<&Self>) -> Operation { let operation = self; match (operation, previous_operation) { @@ -295,14 +321,36 @@ where } ( - ref operation @ Operation::Equal { ref order, .. }, + ref operation @ Operation::Equal { + ref order, + #[cfg(debug_assertions)] + ref text, + .. + }, Some(Operation::Equal { order: last_equal_order, length: last_equal_length, + #[cfg(debug_assertions)] + text: last_equal_text, .. }), ) => { if operation.len() == *last_equal_length && *order == *last_equal_order { + // Both sides retained the same span from the original text, + // so we deduplicate by zeroing one out. This is safe because + // both EditedTexts are derived from the same original, and + // matching (order, length) means they cover the same substring + #[cfg(debug_assertions)] + debug_assert_eq!( + text, last_equal_text, + "Equal operations with same order and length should have the same text, \ + but got {operation:?} vs {:?}", + Operation::::Equal { + order: *last_equal_order, + length: *last_equal_length, + text: last_equal_text.clone(), + }, + ); Operation::create_equal(*order, 0) } else { operation.clone() @@ -329,18 +377,20 @@ where .. } => { #[cfg(debug_assertions)] - write!( - f, - "", - text.as_ref() - .map(|text| format!("'{}'", text.replace('\n', "\\n"))) - .unwrap_or(format!("{length} characters")), - )?; + { + write!( + f, + "", + text.as_ref() + .map(|text| format!("'{}'", text.replace('\n', "\\n"))) + .unwrap_or(format!("{length} characters")), + ) + } #[cfg(not(debug_assertions))] - write!(f, "")?; - - Ok(()) + { + write!(f, "") + } } Operation::Insert { order, text, .. } => { write!( @@ -361,22 +411,24 @@ where .. 
} => { #[cfg(debug_assertions)] - write!( - f, - "", - deleted_text - .as_ref() - .map(|text| format!("'{}'", text.replace('\n', "\\n"))) - .unwrap_or(format!("{deleted_character_count} characters")), - )?; + { + write!( + f, + "", + deleted_text + .as_ref() + .map(|text| format!("'{}'", text.replace('\n', "\\n"))) + .unwrap_or(format!("{deleted_character_count} characters")), + ) + } #[cfg(not(debug_assertions))] - write!( - f, - "", - )?; - - Ok(()) + { + write!( + f, + "", + ) + } } } } From 09b5c606ea1a75cf3153524b71c035db2add1e08 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Wed, 11 Mar 2026 20:43:41 +0000 Subject: [PATCH 5/5] Update tests --- ...kenizer__tests__with_snapshots-11.snap.new | 37 +++++++++++++++++++ ...down_tokenizer__tests__headings-2.snap.new | 25 +++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__line_tokenizer__tests__with_snapshots-11.snap.new create mode 100644 src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__headings-2.snap.new diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__line_tokenizer__tests__with_snapshots-11.snap.new b/src/tokenizer/snapshots/reconcile_text__tokenizer__line_tokenizer__tests__with_snapshots-11.snap.new new file mode 100644 index 0000000..36f12f9 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__line_tokenizer__tests__with_snapshots-11.snap.new @@ -0,0 +1,37 @@ +--- +source: src/tokenizer/line_tokenizer.rs +assertion_line: 78 +expression: "line_tokenizer(\"Mixed\\r\\nand\\rbare\")" +--- +[ + Token { + normalized: "Mixed", + original: "Mixed", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\r\n", + original: "\r\n", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "and", + original: "and", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "\r", + original: "\r", + 
is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "bare", + original: "bare", + is_left_joinable: true, + is_right_joinable: true, + }, +] diff --git a/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__headings-2.snap.new b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__headings-2.snap.new new file mode 100644 index 0000000..0f35315 --- /dev/null +++ b/src/tokenizer/snapshots/reconcile_text__tokenizer__markdown_tokenizer__tests__headings-2.snap.new @@ -0,0 +1,25 @@ +--- +source: src/tokenizer/markdown_tokenizer.rs +assertion_line: 199 +expression: "markdown_tokenizer(\"## Sub heading\")" +--- +[ + Token { + normalized: "## Sub", + original: "## Sub", + is_left_joinable: false, + is_right_joinable: true, + }, + Token { + normalized: " heading", + original: " ", + is_left_joinable: true, + is_right_joinable: true, + }, + Token { + normalized: "heading", + original: "heading", + is_left_joinable: true, + is_right_joinable: true, + }, +]