Compare commits

...
Sign in to create a new pull request.

5 commits

Author SHA1 Message Date
09b5c606ea Update tests 2026-03-11 20:43:41 +00:00
5978f73c97 Minimise allocations 2026-03-11 20:43:34 +00:00
bbe3b7573a Update website 2026-03-11 20:39:36 +00:00
9da5bf6e3e Add snapshots 2026-03-11 20:39:14 +00:00
446bbdfe5d Implement markdown tokeniser 2026-03-11 20:39:04 +00:00
29 changed files with 1539 additions and 145 deletions

View file

@ -108,13 +108,7 @@
</div>
</label>
<label class="radio-option">
<input
type="radio"
name="tokenizer"
value="Word"
id="tokenizer-word"
checked
/>
<input type="radio" name="tokenizer" value="Word" id="tokenizer-word" />
<span class="radio-custom" aria-hidden="true"></span>
<div class="radio-content">
<span class="radio-label">Word</span>
@ -122,13 +116,17 @@
</div>
</label>
<label class="radio-option">
<input type="radio" name="tokenizer" value="Line" id="tokenizer-line" />
<input
type="radio"
name="tokenizer"
value="Markdown"
id="tokenizer-markdown"
checked
/>
<span class="radio-custom" aria-hidden="true"></span>
<div class="radio-content">
<span class="radio-label">Line</span>
<span class="radio-description"
>Line-by-line, like <code>git merge</code></span
>
<span class="radio-label">Markdown</span>
<span class="radio-description">Preserve formatting</span>
</div>
</label>
</div>

View file

@ -10,7 +10,11 @@ const tokenizerRadios = document.querySelectorAll(
'input[name="tokenizer"]'
) as NodeListOf<HTMLInputElement>;
const sampleText = `The "reconcile-text" Rust library is embedded on this page as a WASM module and powers these text boxes. Experiment with changing the "Original", "First user's edit", and "Second user's edit" text boxes to see competing changes get merged in real-time within the "Merged result" box. Here, you will see color-coded tokens marking the origin of each token, including ones that got deleted. The result highly depends on the tokenisation strategy, for example, deciding how casing or whitespace is taken into account.`;
const sampleText = `The reconcile-text library is embedded on this page as a WASM module and powers these text boxes. Experiment with changing the "Original", "First user's edit", and "Second user's edit" text boxes to see competing changes get merged in real-time within the "Merged result" box.
Here, you will see color-coded tokens marking the origin of each token, including ones that got deleted. The result highly depends on the tokenisation strategy which may be:
- Character-based
- Word-based`;
let pendingUpdate: number | null = null;
function scheduleUpdate(): void {
@ -52,10 +56,10 @@ function loadSample(): void {
originalTextArea.value = sampleText;
leftTextArea.value =
sampleText.replace('color', 'colour') +
" Check out what's the most complex conflict you can come up with!";
rightTextArea.value = sampleText
.replace(', for example,', ' such as')
.replace('WASM', 'WebAssembly');
"\n- Line-based\n\nCheck out what's the most complex conflict you can come up with!";
rightTextArea.value =
sampleText.replace(', for example,', ' such as').replace('WASM', 'WebAssembly') +
'\n- Or your custom tokeniser';
}
function updateMergedText(): void {
@ -191,7 +195,7 @@ function createSelectionOverlay(isLeft: boolean, isSelection: boolean): HTMLSpan
function getSelectedTokenizer(): BuiltinTokenizer {
const selectedRadio = Array.from(tokenizerRadios).find((radio) => radio.checked);
return (selectedRadio?.value ?? 'Word') as BuiltinTokenizer;
return (selectedRadio?.value ?? 'Markdown') as BuiltinTokenizer;
}
function resizeTextAreas(): void {

View file

@ -12,7 +12,7 @@ import {
import wasmBytes from 'reconcile-text/reconcile_text_bg.wasm';
// Define the enum values as const arrays to avoid duplication
const BUILTIN_TOKENIZERS = ['Character', 'Line', 'Word'] as const;
const BUILTIN_TOKENIZERS = ['Character', 'Line', 'Markdown', 'Word'] as const;
const HISTORY_VALUES = [
'Unchanged',
'AddedFromLeft',

View file

@ -1,10 +1,10 @@
use std::{fmt::Debug, vec};
use std::fmt::Debug;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use crate::{
BuiltinTokenizer, CursorPosition, TextWithCursors,
BuiltinTokenizer, CursorPosition, TextWithCursors, Token,
operation_transformation::{
DiffError, Operation,
utils::{cook_operations::cook_operations, elongate_operations::elongate_operations},
@ -55,6 +55,7 @@ where
{
/// Create an `EditedText` from the given original and updated strings
/// using the provided tokenizer
#[must_use]
pub fn from_strings_with_tokenizer(
original: &'a str,
updated: &TextWithCursors,
@ -134,24 +135,21 @@ where
let mut last_right_op = None;
loop {
let (side, operation, mut last_other_op) =
match (maybe_left_op.clone(), maybe_right_op.clone()) {
(Some(left_op), Some(right_op)) => {
if left_op
.get_sort_key(seen_left_length)
.partial_cmp(&right_op.get_sort_key(seen_right_length))
== Some(std::cmp::Ordering::Less)
{
(Side::Left, left_op, last_right_op.clone())
} else {
(Side::Right, right_op, last_left_op.clone())
}
let (side, operation) = match (maybe_left_op.as_ref(), maybe_right_op.as_ref()) {
(Some(left_op), Some(right_op)) => {
if left_op.cmp_priority(seen_left_length, right_op, seen_right_length)
== std::cmp::Ordering::Less
{
(Side::Left, maybe_left_op.take().unwrap())
} else {
(Side::Right, maybe_right_op.take().unwrap())
}
}
(Some(left_op), None) => (Side::Left, left_op, last_right_op.clone()),
(None, Some(right_op)) => (Side::Right, right_op, last_left_op.clone()),
(None, None) => break,
};
(Some(_), None) => (Side::Left, maybe_left_op.take().unwrap()),
(None, Some(_)) => (Side::Right, maybe_right_op.take().unwrap()),
(None, None) => break,
};
let is_advancing_operation = matches!(
operation,
@ -161,7 +159,7 @@ where
let original_length = operation.len();
let (side, result) = match side {
Side::Left => {
let result = operation.merge_operations(&mut last_other_op);
let result = operation.merge_operations(last_right_op.as_ref());
if let ref op @ (Operation::Insert { .. } | Operation::Equal { .. }) = result {
let merged_length_signed = isize::try_from(merged_length)
@ -195,7 +193,7 @@ where
(Side::Left, result)
}
Side::Right => {
let result = operation.merge_operations(&mut last_other_op);
let result = operation.merge_operations(last_left_op.as_ref());
if let ref op @ (Operation::Insert { .. } | Operation::Equal { .. }) = result {
let merged_length_signed = isize::try_from(merged_length)
@ -304,6 +302,7 @@ where
/// ```
#[must_use]
pub fn apply_with_history(&self) -> Vec<SpanWithHistory> {
let chars: Vec<char> = self.text.chars().collect();
let mut builder: StringBuilder<'_> = StringBuilder::new(self.text);
let mut history = Vec::with_capacity(self.operations.len());
@ -315,34 +314,26 @@ where
Operation::Equal { .. } => {
history.push(SpanWithHistory::new(builder.take(), History::Unchanged));
}
Operation::Insert { .. } => match side {
Side::Left => {
history.push(SpanWithHistory::new(builder.take(), History::AddedFromLeft));
}
Side::Right => history.push(SpanWithHistory::new(
builder.take(),
History::AddedFromRight,
)),
},
Operation::Insert { .. } => {
let h = match side {
Side::Left => History::AddedFromLeft,
Side::Right => History::AddedFromRight,
};
history.push(SpanWithHistory::new(builder.take(), h));
}
Operation::Delete {
deleted_character_count,
order,
..
} => {
let deleted: String = self
.text
.chars()
.skip(*order)
.take(*deleted_character_count)
let deleted: String = chars[*order..*order + *deleted_character_count]
.iter()
.collect();
match side {
Side::Left => {
history.push(SpanWithHistory::new(deleted, History::RemovedFromLeft));
}
Side::Right => {
history.push(SpanWithHistory::new(deleted, History::RemovedFromRight));
}
}
let h = match side {
Side::Left => History::RemovedFromLeft,
Side::Right => History::RemovedFromRight,
};
history.push(SpanWithHistory::new(deleted, h));
}
}
}
@ -350,6 +341,56 @@ where
history
}
/// Run every operation once and return the merged text (with cursors)
/// together with the provenance of each produced span
///
/// Retained and inserted spans are appended to the merged text and recorded
/// in the history. Deleted spans are recorded in the history only; their text
/// is recovered from the original by character offset.
///
/// # Panics
///
/// Panics if a delete operation's character range lies outside the original
/// text, because the range is sliced directly.
#[must_use]
pub fn apply_with_all(&self) -> (TextWithCursors, Vec<SpanWithHistory>) {
    // Collected once so that deleted ranges can be sliced by character index
    let original_chars: Vec<char> = self.text.chars().collect();
    let mut builder: StringBuilder<'_> = StringBuilder::new(self.text);
    let mut spans = Vec::with_capacity(self.operations.len());
    let mut merged = String::new();

    let steps = self.operations.iter().zip(self.operation_sides.iter());
    for (operation, side) in steps {
        builder = operation.apply(builder);

        let provenance = match (operation, side) {
            (Operation::Equal { .. }, _) => History::Unchanged,
            (Operation::Insert { .. }, Side::Left) => History::AddedFromLeft,
            (Operation::Insert { .. }, Side::Right) => History::AddedFromRight,
            (Operation::Delete { .. }, Side::Left) => History::RemovedFromLeft,
            (Operation::Delete { .. }, Side::Right) => History::RemovedFromRight,
        };

        if let Operation::Delete {
            deleted_character_count,
            order,
            ..
        } = operation
        {
            // Deleted text never reaches the merged output
            let removed: String = original_chars[*order..*order + *deleted_character_count]
                .iter()
                .collect();
            spans.push(SpanWithHistory::new(removed, provenance));
        } else {
            let produced = builder.take();
            merged.push_str(&produced);
            spans.push(SpanWithHistory::new(produced, provenance));
        }
    }

    (TextWithCursors::new(merged, self.cursors.clone()), spans)
}
/// Convert the `EditedText` into a terse representation ready for
/// serialization. The result omits cursor positions and the original text.
/// This is useful for sending text diffs over the network if there's a
@ -358,11 +399,11 @@ where
/// Inserts are strings, deletes are negative integers (character count),
/// and retained spans are positive integers (character count).
///
/// # Panics
/// # Errors
///
/// Panics if there's an integer overflow in i64.
#[must_use]
pub fn to_diff(&self) -> Vec<NumberOrText> {
/// Returns `DiffError::IntegerOverflow` if a character count exceeds
/// `i64::MAX`.
pub fn to_diff(&self) -> Result<Vec<NumberOrText>, DiffError> {
let mut result: Vec<NumberOrText> = Vec::with_capacity(self.operations.len());
let mut previous_equal: Option<usize> = None;
@ -378,16 +419,14 @@ where
Operation::Insert { text, .. } => {
if let Some(prev_length) = previous_equal {
result.push(NumberOrText::Number(
i64::try_from(prev_length).expect("prev_length must fit in i64"),
));
result
.push(NumberOrText::Number(i64::try_from(prev_length).map_err(
|_| DiffError::IntegerOverflow { value: prev_length },
)?));
previous_equal = None;
}
let text: String = text
.iter()
.map(super::super::tokenizer::token::Token::original)
.collect();
let text: String = text.iter().map(Token::original).collect();
result.push(NumberOrText::Text(text));
}
@ -396,26 +435,31 @@ where
..
} => {
if let Some(prev_length) = previous_equal {
result.push(NumberOrText::Number(
i64::try_from(prev_length).expect("prev_length must fit in i64"),
));
result
.push(NumberOrText::Number(i64::try_from(prev_length).map_err(
|_| DiffError::IntegerOverflow { value: prev_length },
)?));
previous_equal = None;
}
let count = i64::try_from(*deleted_character_count)
.expect("deleted_character_count must fit in i64");
let count = i64::try_from(*deleted_character_count).map_err(|_| {
DiffError::IntegerOverflow {
value: *deleted_character_count,
}
})?;
result.push(NumberOrText::Number(-count));
}
}
}
if let Some(prev_length) = previous_equal {
result.push(NumberOrText::Number(
i64::try_from(prev_length).expect("prev_length must fit in i64"),
));
result
.push(NumberOrText::Number(i64::try_from(prev_length).map_err(
|_| DiffError::IntegerOverflow { value: prev_length },
)?));
}
result
Ok(result)
}
/// Reconstruct an `EditedText` from a diff and the original text.
@ -435,7 +479,8 @@ where
) -> Result<EditedText<'a, T>, DiffError> {
let mut operations: Vec<Operation<T>> = Vec::with_capacity(diff.len());
let mut order = 0;
let text_length = original_text.chars().count();
let chars: Vec<char> = original_text.chars().collect();
let text_length = chars.len();
for item in diff {
match item {
@ -453,7 +498,7 @@ where
}
let original_characters: String =
original_text.chars().skip(order).take(length).collect();
chars[order..order + length].iter().collect();
let original_tokens = tokenizer(&original_characters);
for token in original_tokens {
@ -590,7 +635,7 @@ mod tests {
let original = "Merging text is hard!";
let changes = "Merging text is easy with reconcile!";
let result = EditedText::from_strings(original, &changes.into());
let serialized = serde_yaml::to_string(&result.to_diff()).unwrap();
let serialized = serde_yaml::to_string(&result.to_diff().unwrap()).unwrap();
let expected = concat!("- 15\n", "- -6\n", "- ' easy with reconcile!'\n",);
assert_eq!(serialized, expected);
@ -622,7 +667,7 @@ mod tests {
let edited_text = EditedText::from_strings(original, &updated.into());
let changes = edited_text.to_diff();
let changes = edited_text.to_diff().unwrap();
let deserialized_edited_text =
EditedText::from_diff(original, changes, &*BuiltinTokenizer::Word).unwrap();

View file

@ -104,28 +104,55 @@ where
}
}
pub fn get_sort_key(&self, insertion_index: usize) -> (usize, usize, usize, String) {
(
self.order(),
match self {
Operation::Delete { .. } => 1,
Operation::Insert { .. } => 2,
Operation::Equal { .. } => 3,
},
insertion_index,
// Make sure that the ordering is deterministic regardless of which text
// is left or right.
match self {
Operation::Equal { length, .. } => length.to_string(),
Operation::Insert { text, .. } => {
text.iter().map(Token::original).collect::<String>()
}
fn type_priority(&self) -> u8 {
match self {
Operation::Delete { .. } => 1,
Operation::Insert { .. } => 2,
Operation::Equal { .. } => 3,
}
}
/// Decide which of two operations should be processed first while merging
///
/// The comparison is lexicographic over (order, kind rank, index), where
/// `self_index` and `other_index` are the index values supplied by the caller
/// for each operation. Only when all three are equal does it fall back to a
/// content-based tiebreaker, which compares without building any strings.
pub fn cmp_priority(
    &self,
    self_index: usize,
    other: &Self,
    other_index: usize,
) -> std::cmp::Ordering {
    let own_key = (self.order(), self.type_priority(), self_index);
    let other_key = (other.order(), other.type_priority(), other_index);

    match own_key.cmp(&other_key) {
        std::cmp::Ordering::Equal => self.deterministic_content_cmp(other),
        decided => decided,
    }
}
/// Deterministic tiebreaker based on operation content, so that merge
/// results are identical regardless of which side is left vs right
fn deterministic_content_cmp(&self, other: &Self) -> std::cmp::Ordering {
match (self, other) {
(Operation::Insert { text: t1, .. }, Operation::Insert { text: t2, .. }) => {
let s1 = t1.iter().flat_map(|t| t.original().chars());
let s2 = t2.iter().flat_map(|t| t.original().chars());
s1.cmp(s2)
}
(Operation::Equal { length: l1, .. }, Operation::Equal { length: l2, .. }) => {
l1.cmp(l2)
}
(
Operation::Delete {
deleted_character_count,
deleted_character_count: c1,
..
} => deleted_character_count.to_string(),
},
)
},
Operation::Delete {
deleted_character_count: c2,
..
},
) => c1.cmp(c2),
// Different types are already ordered by type_priority
_ => std::cmp::Ordering::Equal,
}
}
/// Applies the operation to the given `StringBuilder`, returning the
@ -193,10 +220,9 @@ where
}
/// Adjusts this operation based on `previous_operation` from the other side
/// to avoid duplicating or conflicting changes. Updates
/// `previous_operation` in-place.
/// to avoid duplicating or conflicting changes
#[allow(clippy::too_many_lines)]
pub fn merge_operations(self, previous_operation: &mut Option<Self>) -> Operation<T> {
pub fn merge_operations(self, previous_operation: Option<&Self>) -> Operation<T> {
let operation = self;
match (operation, previous_operation) {
@ -295,14 +321,36 @@ where
}
(
ref operation @ Operation::Equal { ref order, .. },
ref operation @ Operation::Equal {
ref order,
#[cfg(debug_assertions)]
ref text,
..
},
Some(Operation::Equal {
order: last_equal_order,
length: last_equal_length,
#[cfg(debug_assertions)]
text: last_equal_text,
..
}),
) => {
if operation.len() == *last_equal_length && *order == *last_equal_order {
// Both sides retained the same span from the original text,
// so we deduplicate by zeroing one out. This is safe because
// both EditedTexts are derived from the same original, and
// matching (order, length) means they cover the same substring
#[cfg(debug_assertions)]
debug_assert_eq!(
text, last_equal_text,
"Equal operations with same order and length should have the same text, \
but got {operation:?} vs {:?}",
Operation::<T>::Equal {
order: *last_equal_order,
length: *last_equal_length,
text: last_equal_text.clone(),
},
);
Operation::create_equal(*order, 0)
} else {
operation.clone()
@ -329,18 +377,20 @@ where
..
} => {
#[cfg(debug_assertions)]
write!(
f,
"<equal {} from {order}>",
text.as_ref()
.map(|text| format!("'{}'", text.replace('\n', "\\n")))
.unwrap_or(format!("{length} characters")),
)?;
{
write!(
f,
"<equal {} from {order}>",
text.as_ref()
.map(|text| format!("'{}'", text.replace('\n', "\\n")))
.unwrap_or(format!("{length} characters")),
)
}
#[cfg(not(debug_assertions))]
write!(f, "<equal {length} from {order}>")?;
Ok(())
{
write!(f, "<equal {length} from {order}>")
}
}
Operation::Insert { order, text, .. } => {
write!(
@ -361,22 +411,24 @@ where
..
} => {
#[cfg(debug_assertions)]
write!(
f,
"<delete {} from {order}>",
deleted_text
.as_ref()
.map(|text| format!("'{}'", text.replace('\n', "\\n")))
.unwrap_or(format!("{deleted_character_count} characters")),
)?;
{
write!(
f,
"<delete {} from {order}>",
deleted_text
.as_ref()
.map(|text| format!("'{}'", text.replace('\n', "\\n")))
.unwrap_or(format!("{deleted_character_count} characters")),
)
}
#[cfg(not(debug_assertions))]
write!(
f,
"<delete {deleted_character_count} characters from {order}>",
)?;
Ok(())
{
write!(
f,
"<delete {deleted_character_count} characters from {order}>",
)
}
}
}
}

View file

@ -0,0 +1,290 @@
use super::{token::Token, word_tokenizer::split_words};
/// Tokenizes markdown so that its block structure survives a merge
///
/// The text is first cut into line bodies and line terminators. Then:
/// - Every `\n` / `\r\n` terminator becomes its own token with both joinable
///   flags set to `false`
/// - A block-level prefix (heading, list marker, blockquote, task checkbox)
///   is glued onto the first word token of its line, so the marker and the
///   word travel together; the glued token is not left-joinable and keeps the
///   right-joinable flag of the word it absorbed
/// - A prefix with nothing after it becomes a single token with both flags
///   `false`
/// - The rest of the line is split by the word-level splitter
///
/// Finally, every token made only of non-newline whitespace gets the original
/// text of the following token appended to its normalized form, which makes
/// each space distinguishable in the diff by what follows it. The last token
/// has no follower and is left as is.
///
/// ## Example
///
/// ```text
/// "# Hello\n- item" -> ["# Hello", "\n", "- item"]
/// ```
pub fn markdown_tokenizer(text: &str) -> Vec<Token<String>> {
    let mut tokens: Vec<Token<String>> = Vec::new();

    for piece in split_preserving_newlines(text) {
        if matches!(piece, "\n" | "\r\n") {
            tokens.push(Token::new(piece.to_owned(), piece.to_owned(), false, false));
            continue;
        }

        let (marker, content) = piece.split_at(block_prefix_len(piece));
        let mut words = split_words(content);

        if !marker.is_empty() {
            // Build the glued token first so the borrow of `words` has ended
            // before the vector is modified
            let glued = words.first().map(|head| {
                Token::new(
                    format!("{marker}{}", head.normalized()),
                    format!("{marker}{}", head.original()),
                    false,
                    head.is_right_joinable,
                )
            });
            match glued {
                Some(token) => words[0] = token,
                None => {
                    tokens.push(Token::new(marker.to_owned(), marker.to_owned(), false, false));
                }
            }
        }

        tokens.extend(words);
    }

    // Same trick as the word tokenizer: an inline whitespace token is keyed
    // by the text that follows it
    for next in 1..tokens.len() {
        let (head, tail) = tokens.split_at_mut(next);
        let current = &mut head[next - 1];
        let has_other_content = current
            .original()
            .chars()
            .any(|c| !c.is_whitespace() || c == '\n' || c == '\r');
        if !has_other_content {
            let mut widened = current.normalized().to_owned();
            widened.push_str(tail[0].original());
            current.set_normalized(widened);
        }
    }

    tokens
}
/// Cuts `text` into line bodies and the line terminators between them, in
/// order of appearance
///
/// Each `\n` or `\r\n` becomes a segment of its own. A `\r` that is not
/// directly followed by `\n` stays inside the surrounding line body. Empty
/// line bodies are omitted, so consecutive terminators yield consecutive
/// terminator segments.
fn split_preserving_newlines(text: &str) -> Vec<&str> {
    let mut pieces = Vec::new();

    // Every chunk holds at most one terminator, and only at its very end
    for chunk in text.split_inclusive('\n') {
        let body = chunk
            .strip_suffix("\r\n")
            .or_else(|| chunk.strip_suffix('\n'))
            .unwrap_or(chunk);
        let terminator = &chunk[body.len()..];

        if !body.is_empty() {
            pieces.push(body);
        }
        if !terminator.is_empty() {
            pieces.push(terminator);
        }
    }

    pieces
}
/// Measures the markdown block-level prefix at the start of `line`
///
/// Returns the prefix length in bytes, leading spaces and tabs included, or
/// 0 when the line does not start with a recognized prefix. Indentation is
/// spaces/tabs and every marker byte is ASCII, so the returned offset always
/// falls on a UTF-8 character boundary.
///
/// Recognized prefixes (after optional indentation):
/// - ATX headings: one to six `#` followed by a space
/// - Blockquotes: `> `; a `>` that is alone on the line or directly followed
///   by another `>` counts as a one-byte prefix
/// - Unordered lists: `- `, `* `, `+ `
/// - Ordered lists: digits, then `.` or `)`, then a space
/// - Task lists: a `[ ] `, `[x] ` or `[X] ` checkbox right after a list
///   marker is included in the prefix
fn block_prefix_len(line: &str) -> usize {
    let body = line.trim_start_matches([' ', '\t']);
    let indent = line.len() - body.len();
    let raw = body.as_bytes();

    // ATX heading: #{1,6} then a space
    let hashes = raw.iter().take_while(|&&b| b == b'#').count();
    if (1..=6).contains(&hashes) && raw.get(hashes) == Some(&b' ') {
        return indent + hashes + 1;
    }

    // Blockquote and bullet markers are told apart by the first two bytes
    match raw {
        [b'>', b' ', ..] => return indent + 2,
        [b'>'] | [b'>', b'>', ..] => return indent + 1,
        [b'-' | b'*' | b'+', b' ', ..] => {
            return indent + 2 + task_checkbox_len(&body[2..]);
        }
        _ => {}
    }

    // Ordered list: digits, `.` or `)`, a space, then an optional checkbox
    let digits = raw.iter().take_while(|b| b.is_ascii_digit()).count();
    if digits == 0 {
        return 0;
    }
    match (raw.get(digits), raw.get(digits + 1)) {
        (Some(b'.' | b')'), Some(b' ')) => {
            let marker_end = digits + 2;
            indent + marker_end + task_checkbox_len(&body[marker_end..])
        }
        _ => 0,
    }
}
/// Measures a task-list checkbox at the very start of `rest`
///
/// Returns 4 when `rest` begins with `[ ] `, `[x] ` or `[X] ` (the trailing
/// space is required and counted), otherwise 0.
fn task_checkbox_len(rest: &str) -> usize {
    match rest.as_bytes() {
        [b'[', b' ' | b'x' | b'X', b']', b' ', ..] => 4,
        _ => 0,
    }
}
// Snapshot tests for `markdown_tokenizer`. Each case records the debug
// representation of the produced token list with `insta`.
#[cfg(test)]
mod tests {
use insta::assert_debug_snapshot;
use super::*;
// A single line with no block prefix and no newline
#[test]
fn test_plain_text() {
assert_debug_snapshot!(markdown_tokenizer("Hello world"));
}
// Empty input
#[test]
fn test_empty() {
assert_debug_snapshot!(markdown_tokenizer(""));
}
// ATX headings at levels 1, 2 and 6
#[test]
fn test_headings() {
assert_debug_snapshot!(markdown_tokenizer("# Hello world"));
assert_debug_snapshot!(markdown_tokenizer("## Sub heading"));
assert_debug_snapshot!(markdown_tokenizer("###### Deep heading"));
}
// `- ` bullets on consecutive lines
#[test]
fn test_unordered_list() {
assert_debug_snapshot!(markdown_tokenizer("- item one\n- item two\n- item three"));
}
// `N. ` numbered items on consecutive lines
#[test]
fn test_ordered_list() {
assert_debug_snapshot!(markdown_tokenizer("1. first\n2. second\n3. third"));
}
// `> ` blockquote lines
#[test]
fn test_blockquote() {
assert_debug_snapshot!(markdown_tokenizer("> quoted text\n> more quoted"));
}
// Inline emphasis markers inside a line that has no block prefix
#[test]
fn test_inline_formatting() {
assert_debug_snapshot!(markdown_tokenizer("Some **bold** and *italic* text"));
}
// Heading, blank lines, paragraph and list in one document
#[test]
fn test_mixed_content() {
assert_debug_snapshot!(markdown_tokenizer(
"# Title\n\nSome text with **bold**.\n\n- list item\n- another item"
));
}
// List markers preceded by leading spaces
#[test]
fn test_indented_list() {
assert_debug_snapshot!(markdown_tokenizer(" - nested item\n - deeper"));
}
// `\r\n` line terminator
#[test]
fn test_crlf() {
assert_debug_snapshot!(markdown_tokenizer("Line 1\r\nLine 2"));
}
// Fenced code block; backtick fences are not among the recognized prefixes
#[test]
fn test_code_fence() {
assert_debug_snapshot!(markdown_tokenizer("```rust\nlet x = 1;\n```"));
}
// A heading prefix with nothing after it
#[test]
fn test_heading_only() {
assert_debug_snapshot!(markdown_tokenizer("# "));
}
// Inline link syntax surrounded by plain words
#[test]
fn test_link() {
assert_debug_snapshot!(markdown_tokenizer("Click [here](https://example.com) now"));
}
// Single and double newlines between plain lines
#[test]
fn test_multiline_paragraph() {
assert_debug_snapshot!(markdown_tokenizer(
"First line\nSecond line\n\nNew paragraph"
));
}
// `* ` bullets
#[test]
fn test_list_with_star_marker() {
assert_debug_snapshot!(markdown_tokenizer("* item one\n* item two"));
}
// A line starting with `**`: the first `*` is not followed by a space, so
// it does not match the list-marker rule
#[test]
fn test_bold_not_confused_with_list() {
assert_debug_snapshot!(markdown_tokenizer("**bold text**"));
}
// Bullets followed by `[ ]`, `[x]` and `[X]` checkboxes
#[test]
fn test_task_list() {
assert_debug_snapshot!(markdown_tokenizer(
"- [ ] todo\n- [x] done\n- [X] also done"
));
}
// Numbered items followed by checkboxes
#[test]
fn test_ordered_task_list() {
assert_debug_snapshot!(markdown_tokenizer("1. [ ] first task\n2. [x] second task"));
}
// Non-ASCII text directly after ASCII block prefixes
#[test]
fn test_unicode() {
assert_debug_snapshot!(markdown_tokenizer(
"# \u{1F600} Héllo\n- \u{00E9}lément\n> \u{4F60}\u{597D} world"
));
}
}

View file

@ -0,0 +1,36 @@
---
source: src/tokenizer/line_tokenizer.rs
expression: "line_tokenizer(\"Old\\rMac\\rStyle\")"
---
[
Token {
normalized: "Old",
original: "Old",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\r",
original: "\r",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "Mac",
original: "Mac",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\r",
original: "\r",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "Style",
original: "Style",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,37 @@
---
source: src/tokenizer/line_tokenizer.rs
assertion_line: 78
expression: "line_tokenizer(\"Mixed\\r\\nand\\rbare\")"
---
[
Token {
normalized: "Mixed",
original: "Mixed",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\r\n",
original: "\r\n",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "and",
original: "and",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\r",
original: "\r",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "bare",
original: "bare",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,48 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"> quoted text\\n> more quoted\")"
---
[
Token {
normalized: "> quoted",
original: "> quoted",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " text",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "text",
original: "text",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "> more",
original: "> more",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " quoted",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "quoted",
original: "quoted",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,24 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"**bold text**\")"
---
[
Token {
normalized: "**bold",
original: "**bold",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " text**",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "text**",
original: "text**",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,72 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"```rust\\nlet x = 1;\\n```\")"
---
[
Token {
normalized: "```rust",
original: "```rust",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "let",
original: "let",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " x",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "x",
original: "x",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " =",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "=",
original: "=",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " 1;",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "1;",
original: "1;",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "```",
original: "```",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,48 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"Line 1\\r\\nLine 2\")"
---
[
Token {
normalized: "Line",
original: "Line",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " 1",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "1",
original: "1",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\r\n",
original: "\r\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "Line",
original: "Line",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " 2",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "2",
original: "2",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,5 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"\")"
---
[]

View file

@ -0,0 +1,12 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"# \")"
---
[
Token {
normalized: "# ",
original: "# ",
is_left_joinable: false,
is_right_joinable: false,
},
]

View file

@ -0,0 +1,25 @@
---
source: src/tokenizer/markdown_tokenizer.rs
assertion_line: 199
expression: "markdown_tokenizer(\"## Sub heading\")"
---
[
Token {
normalized: "## Sub",
original: "## Sub",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " heading",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "heading",
original: "heading",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,24 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"# Hello world\")"
---
[
Token {
normalized: "# Hello",
original: "# Hello",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " world",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "world",
original: "world",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,36 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\" - nested item\\n - deeper\")"
---
[
Token {
normalized: " - nested",
original: " - nested",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " item",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "item",
original: "item",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: " - deeper",
original: " - deeper",
is_left_joinable: false,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,60 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"Some **bold** and *italic* text\")"
---
[
Token {
normalized: "Some",
original: "Some",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " **bold**",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "**bold**",
original: "**bold**",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " and",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "and",
original: "and",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " *italic*",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "*italic*",
original: "*italic*",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " text",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "text",
original: "text",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,36 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"Click [here](https://example.com) now\")"
---
[
Token {
normalized: "Click",
original: "Click",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " [here](https://example.com)",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "[here](https://example.com)",
original: "[here](https://example.com)",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " now",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "now",
original: "now",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,48 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"* item one\\n* item two\")"
---
[
Token {
normalized: "* item",
original: "* item",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " one",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "one",
original: "one",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "* item",
original: "* item",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " two",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "two",
original: "two",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,120 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"# Title\\n\\nSome text with **bold**.\\n\\n- list item\\n- another item\")"
---
[
Token {
normalized: "# Title",
original: "# Title",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "Some",
original: "Some",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " text",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "text",
original: "text",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " with",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "with",
original: "with",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " **bold**.",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "**bold**.",
original: "**bold**.",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "- list",
original: "- list",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " item",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "item",
original: "item",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "- another",
original: "- another",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " item",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "item",
original: "item",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,78 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"First line\\nSecond line\\n\\nNew paragraph\")"
---
[
Token {
normalized: "First",
original: "First",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " line",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "line",
original: "line",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "Second",
original: "Second",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " line",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "line",
original: "line",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "New",
original: "New",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " paragraph",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "paragraph",
original: "paragraph",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,36 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"1. first\\n2. second\\n3. third\")"
---
[
Token {
normalized: "1. first",
original: "1. first",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "2. second",
original: "2. second",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "3. third",
original: "3. third",
is_left_joinable: false,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,48 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"1. [ ] first task\\n2. [x] second task\")"
---
[
Token {
normalized: "1. [ ] first",
original: "1. [ ] first",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " task",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "task",
original: "task",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "2. [x] second",
original: "2. [x] second",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " task",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "task",
original: "task",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,24 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"Hello world\")"
---
[
Token {
normalized: "Hello",
original: "Hello",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: " world",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "world",
original: "world",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,48 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"- [ ] todo\\n- [x] done\\n- [X] also done\")"
---
[
Token {
normalized: "- [ ] todo",
original: "- [ ] todo",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "- [x] done",
original: "- [x] done",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "- [X] also",
original: "- [X] also",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " done",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "done",
original: "done",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,60 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"# \\u{1F600} Héllo\\n- \\u{00E9}lément\\n> \\u{4F60}\\u{597D} world\")"
---
[
Token {
normalized: "# 😀",
original: "# 😀",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " Héllo",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "Héllo",
original: "Héllo",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "- élément",
original: "- élément",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "> 你好",
original: "> 你好",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " world",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "world",
original: "world",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,72 @@
---
source: src/tokenizer/markdown_tokenizer.rs
expression: "markdown_tokenizer(\"- item one\\n- item two\\n- item three\")"
---
[
Token {
normalized: "- item",
original: "- item",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " one",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "one",
original: "one",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "- item",
original: "- item",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " two",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "two",
original: "two",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "\n",
original: "\n",
is_left_joinable: false,
is_right_joinable: false,
},
Token {
normalized: "- item",
original: "- item",
is_left_joinable: false,
is_right_joinable: true,
},
Token {
normalized: " three",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalized: "three",
original: "three",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -9,6 +9,26 @@ use super::token::Token;
/// "Hi there!" -> ["Hi", " ", "there!"]
/// ```
pub fn word_tokenizer(text: &str) -> Vec<Token<String>> {
    let mut tokens = split_words(text);

    // Visit every token that has a successor. The range `1..len` is empty when
    // there are zero or one tokens, so no separate emptiness guard is needed.
    for next in 1..tokens.len() {
        let current = next - 1;

        let is_whitespace = tokens[current]
            .original()
            .chars()
            .all(char::is_whitespace);
        if !is_whitespace {
            continue;
        }

        // A whitespace token's normalized form becomes its own normalized text
        // followed by the original text of the token that comes after it.
        let mut merged = tokens[current].normalized().to_owned();
        merged.push_str(tokens[next].original());
        tokens[current].set_normalized(merged);
    }

    tokens
}
/// Splits text into alternating word and whitespace tokens without any
/// normalization. Shared by `word_tokenizer` and `markdown_tokenizer`.
pub(super) fn split_words(text: &str) -> Vec<Token<String>> {
let mut result = Vec::new();
let mut previous_boundary_index = 0;
@ -28,18 +48,6 @@ pub fn word_tokenizer(text: &str) -> Vec<Token<String>> {
result.push(text[previous_boundary_index..].into());
}
if result.is_empty() {
return result;
}
// normalize whitespace tokens by concatenating with the following token
for i in 0..result.len() - 1 {
if result[i].original().chars().all(char::is_whitespace) {
let normalized = result[i].normalized().to_owned() + result[i + 1].original();
result[i].set_normalized(normalized);
}
}
result
}