Update tests

Minimise allocations
Update website
2026-03-11 20:43:41 +00:00 · 2026-03-11 20:43:34 +00:00 · 2026-03-11 20:39:36 +00:00 · 2026-03-11 20:39:14 +00:00 · 2026-03-11 20:39:04 +00:00
29 changed files with 1539 additions and 145 deletions
--- a/examples/website/src/index.html
+++ b/examples/website/src/index.html
@ -108,13 +108,7 @@
                </div>
              </label>
              <label class="radio-option">
-                <input
-                  type="radio"
-                  name="tokenizer"
-                  value="Word"
-                  id="tokenizer-word"
-                  checked
-                />
+                <input type="radio" name="tokenizer" value="Word" id="tokenizer-word" />
                <span class="radio-custom" aria-hidden="true"></span>
                <div class="radio-content">
                  <span class="radio-label">Word</span>
@ -122,13 +116,17 @@
                </div>
              </label>
              <label class="radio-option">
-                <input type="radio" name="tokenizer" value="Line" id="tokenizer-line" />
+                <input
+                  type="radio"
+                  name="tokenizer"
+                  value="Markdown"
+                  id="tokenizer-markdown"
+                  checked
+                />
                <span class="radio-custom" aria-hidden="true"></span>
                <div class="radio-content">
-                  <span class="radio-label">Line</span>
-                  <span class="radio-description"
-                    >Line-by-line, like <code>git merge</code></span
-                  >
+                  <span class="radio-label">Markdown</span>
+                  <span class="radio-description">Preserve formatting</span>
                </div>
              </label>
            </div>
--- a/examples/website/src/index.ts
+++ b/examples/website/src/index.ts
@ -10,7 +10,11 @@ const tokenizerRadios = document.querySelectorAll(
  'input[name="tokenizer"]'
 ) as NodeListOf<HTMLInputElement>;

-const sampleText = `The "reconcile-text" Rust library is embedded on this page as a WASM module and powers these text boxes. Experiment with changing the "Original", "First user's edit", and "Second user's edit" text boxes to see competing changes get merged in real-time within the "Merged result" box. Here, you will see color-coded tokens marking the origin of each token, including ones that got deleted. The result highly depends on the tokenisation strategy, for example, deciding how casing or whitespace is taken into account.`;
+const sampleText = `The reconcile-text library is embedded on this page as a WASM module and powers these text boxes. Experiment with changing the "Original", "First user's edit", and "Second user's edit" text boxes to see competing changes get merged in real-time within the "Merged result" box. 
+
+Here, you will see color-coded tokens marking the origin of each token, including ones that got deleted. The result highly depends on the tokenisation strategy which may be:
+- Character-based
+- Word-based`;

 let pendingUpdate: number | null = null;
 function scheduleUpdate(): void {
@ -52,10 +56,10 @@ function loadSample(): void {
  originalTextArea.value = sampleText;
  leftTextArea.value =
    sampleText.replace('color', 'colour') +
-    " Check out what's the most complex conflict you can come up with!";
-  rightTextArea.value = sampleText
-    .replace(', for example,', ' such as')
-    .replace('WASM', 'WebAssembly');
+    "\n- Line-based\n\nCheck out what's the most complex conflict you can come up with!";
+  // The sample text no longer contains ", for example,", so replacing it was a
+  // no-op; the right-hand edit is the WASM rename plus an extra bullet
+  rightTextArea.value =
+    sampleText.replace('WASM', 'WebAssembly') + '\n- Or your custom tokeniser';
 }

 function updateMergedText(): void {
@ -191,7 +195,7 @@ function createSelectionOverlay(isLeft: boolean, isSelection: boolean): HTMLSpan

+/**
+ * Returns the value of the tokenizer radio button that is currently checked,
+ * falling back to 'Markdown' when none of the radios is checked.
+ */
 function getSelectedTokenizer(): BuiltinTokenizer {
  const selectedRadio = Array.from(tokenizerRadios).find((radio) => radio.checked);
-  return (selectedRadio?.value ?? 'Word') as BuiltinTokenizer;
+  return (selectedRadio?.value ?? 'Markdown') as BuiltinTokenizer;
 }

 function resizeTextAreas(): void {
--- a/reconcile-js/src/index.ts
+++ b/reconcile-js/src/index.ts
@ -12,7 +12,7 @@ import {
 import wasmBytes from 'reconcile-text/reconcile_text_bg.wasm';

 // Define the enum values as const arrays to avoid duplication
-const BUILTIN_TOKENIZERS = ['Character', 'Line', 'Word'] as const;
+const BUILTIN_TOKENIZERS = ['Character', 'Line', 'Markdown', 'Word'] as const;
 const HISTORY_VALUES = [
  'Unchanged',
  'AddedFromLeft',
--- a/src/operation_transformation/edited_text.rs
+++ b/src/operation_transformation/edited_text.rs
@ -1,10 +1,10 @@
-use std::{fmt::Debug, vec};
+use std::fmt::Debug;

 #[cfg(feature = "serde")]
 use serde::{Deserialize, Serialize};

 use crate::{
-    BuiltinTokenizer, CursorPosition, TextWithCursors,
+    BuiltinTokenizer, CursorPosition, TextWithCursors, Token,
    operation_transformation::{
        DiffError, Operation,
        utils::{cook_operations::cook_operations, elongate_operations::elongate_operations},
@ -55,6 +55,7 @@ where
 {
    /// Create an `EditedText` from the given original and updated strings
    /// using the provided tokenizer
+    #[must_use]
    pub fn from_strings_with_tokenizer(
        original: &'a str,
        updated: &TextWithCursors,
@ -134,24 +135,21 @@ where
        let mut last_right_op = None;

        loop {
-            let (side, operation, mut last_other_op) =
-                match (maybe_left_op.clone(), maybe_right_op.clone()) {
-                    (Some(left_op), Some(right_op)) => {
-                        if left_op
-                            .get_sort_key(seen_left_length)
-                            .partial_cmp(&right_op.get_sort_key(seen_right_length))
-                            == Some(std::cmp::Ordering::Less)
-                        {
-                            (Side::Left, left_op, last_right_op.clone())
-                        } else {
-                            (Side::Right, right_op, last_left_op.clone())
-                        }
+            let (side, operation) = match (maybe_left_op.as_ref(), maybe_right_op.as_ref()) {
+                (Some(left_op), Some(right_op)) => {
+                    if left_op.cmp_priority(seen_left_length, right_op, seen_right_length)
+                        == std::cmp::Ordering::Less
+                    {
+                        (Side::Left, maybe_left_op.take().unwrap())
+                    } else {
+                        (Side::Right, maybe_right_op.take().unwrap())
                    }
+                }

-                    (Some(left_op), None) => (Side::Left, left_op, last_right_op.clone()),
-                    (None, Some(right_op)) => (Side::Right, right_op, last_left_op.clone()),
-                    (None, None) => break,
-                };
+                (Some(_), None) => (Side::Left, maybe_left_op.take().unwrap()),
+                (None, Some(_)) => (Side::Right, maybe_right_op.take().unwrap()),
+                (None, None) => break,
+            };

            let is_advancing_operation = matches!(
                operation,
@ -161,7 +159,7 @@ where
            let original_length = operation.len();
            let (side, result) = match side {
                Side::Left => {
-                    let result = operation.merge_operations(&mut last_other_op);
+                    let result = operation.merge_operations(last_right_op.as_ref());

                    if let ref op @ (Operation::Insert { .. } | Operation::Equal { .. }) = result {
                        let merged_length_signed = isize::try_from(merged_length)
@ -195,7 +193,7 @@ where
                    (Side::Left, result)
                }
                Side::Right => {
-                    let result = operation.merge_operations(&mut last_other_op);
+                    let result = operation.merge_operations(last_left_op.as_ref());

                    if let ref op @ (Operation::Insert { .. } | Operation::Equal { .. }) = result {
                        let merged_length_signed = isize::try_from(merged_length)
@ -304,6 +302,7 @@ where
    /// ```
    #[must_use]
    pub fn apply_with_history(&self) -> Vec<SpanWithHistory> {
+        let chars: Vec<char> = self.text.chars().collect();
        let mut builder: StringBuilder<'_> = StringBuilder::new(self.text);

        let mut history = Vec::with_capacity(self.operations.len());
@ -315,34 +314,26 @@ where
                Operation::Equal { .. } => {
                    history.push(SpanWithHistory::new(builder.take(), History::Unchanged));
                }
-                Operation::Insert { .. } => match side {
-                    Side::Left => {
-                        history.push(SpanWithHistory::new(builder.take(), History::AddedFromLeft));
-                    }
-                    Side::Right => history.push(SpanWithHistory::new(
-                        builder.take(),
-                        History::AddedFromRight,
-                    )),
-                },
+                Operation::Insert { .. } => {
+                    let h = match side {
+                        Side::Left => History::AddedFromLeft,
+                        Side::Right => History::AddedFromRight,
+                    };
+                    history.push(SpanWithHistory::new(builder.take(), h));
+                }
                Operation::Delete {
                    deleted_character_count,
                    order,
                    ..
                } => {
-                    let deleted: String = self
-                        .text
-                        .chars()
-                        .skip(*order)
-                        .take(*deleted_character_count)
+                    let deleted: String = chars[*order..*order + *deleted_character_count]
+                        .iter()
                        .collect();
-                    match side {
-                        Side::Left => {
-                            history.push(SpanWithHistory::new(deleted, History::RemovedFromLeft));
-                        }
-                        Side::Right => {
-                            history.push(SpanWithHistory::new(deleted, History::RemovedFromRight));
-                        }
-                    }
+                    let h = match side {
+                        Side::Left => History::RemovedFromLeft,
+                        Side::Right => History::RemovedFromRight,
+                    };
+                    history.push(SpanWithHistory::new(deleted, h));
                }
            }
        }
@ -350,6 +341,56 @@ where
        history
    }

+    /// Apply the operations and return both the merged text with cursors and
+    /// the provenance history in a single pass
+    ///
+    /// Every `Equal` and `Insert` span is appended to the merged text and
+    /// recorded in the history. `Delete` spans only appear in the history,
+    /// carrying the characters that were removed from the original text.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a `Delete` operation covers a character range that reaches
+    /// past the end of the original text.
+    #[must_use]
+    pub fn apply_with_all(&self) -> (TextWithCursors, Vec<SpanWithHistory>) {
+        // Collected once so that every delete can slice by character index
+        let chars: Vec<char> = self.text.chars().collect();
+        let mut builder: StringBuilder<'_> = StringBuilder::new(self.text);
+        let mut history = Vec::with_capacity(self.operations.len());
+        // Reserve up front instead of regrowing the buffer span by span; the
+        // original's byte length is only a starting estimate
+        let mut full_text = String::with_capacity(self.text.len());
+
+        for (operation, side) in self.operations.iter().zip(self.operation_sides.iter()) {
+            builder = operation.apply(builder);
+
+            match operation {
+                Operation::Equal { .. } => {
+                    let span = builder.take();
+                    full_text.push_str(&span);
+                    history.push(SpanWithHistory::new(span, History::Unchanged));
+                }
+                Operation::Insert { .. } => {
+                    let span = builder.take();
+                    full_text.push_str(&span);
+                    let h = match side {
+                        Side::Left => History::AddedFromLeft,
+                        Side::Right => History::AddedFromRight,
+                    };
+                    history.push(SpanWithHistory::new(span, h));
+                }
+                Operation::Delete {
+                    deleted_character_count,
+                    order,
+                    ..
+                } => {
+                    // Deleted text never reaches the merged output, so it is
+                    // recovered from the original characters for the history
+                    let deleted: String = chars[*order..*order + *deleted_character_count]
+                        .iter()
+                        .collect();
+                    let h = match side {
+                        Side::Left => History::RemovedFromLeft,
+                        Side::Right => History::RemovedFromRight,
+                    };
+                    history.push(SpanWithHistory::new(deleted, h));
+                }
+            }
+        }
+
+        (
+            TextWithCursors::new(full_text, self.cursors.clone()),
+            history,
+        )
+    }
+
    /// Convert the `EditedText` into a terse representation ready for
    /// serialization. The result omits cursor positions and the original text.
    /// This is useful for sending text diffs over the network if there's a
@ -358,11 +399,11 @@ where
    /// Inserts are strings, deletes are negative integers (character count),
    /// and retained spans are positive integers (character count).
    ///
-    /// # Panics
+    /// # Errors
    ///
-    /// Panics if there's an integer overflow in i64.
-    #[must_use]
-    pub fn to_diff(&self) -> Vec<NumberOrText> {
+    /// Returns `DiffError::IntegerOverflow` if a character count exceeds
+    /// `i64::MAX`.
+    pub fn to_diff(&self) -> Result<Vec<NumberOrText>, DiffError> {
        let mut result: Vec<NumberOrText> = Vec::with_capacity(self.operations.len());
        let mut previous_equal: Option<usize> = None;

@ -378,16 +419,14 @@ where

                Operation::Insert { text, .. } => {
                    if let Some(prev_length) = previous_equal {
-                        result.push(NumberOrText::Number(
-                            i64::try_from(prev_length).expect("prev_length must fit in i64"),
-                        ));
+                        result
+                            .push(NumberOrText::Number(i64::try_from(prev_length).map_err(
+                                |_| DiffError::IntegerOverflow { value: prev_length },
+                            )?));
                        previous_equal = None;
                    }

-                    let text: String = text
-                        .iter()
-                        .map(super::super::tokenizer::token::Token::original)
-                        .collect();
+                    let text: String = text.iter().map(Token::original).collect();
                    result.push(NumberOrText::Text(text));
                }

@ -396,26 +435,31 @@ where
                    ..
                } => {
                    if let Some(prev_length) = previous_equal {
-                        result.push(NumberOrText::Number(
-                            i64::try_from(prev_length).expect("prev_length must fit in i64"),
-                        ));
+                        result
+                            .push(NumberOrText::Number(i64::try_from(prev_length).map_err(
+                                |_| DiffError::IntegerOverflow { value: prev_length },
+                            )?));
                        previous_equal = None;
                    }

-                    let count = i64::try_from(*deleted_character_count)
-                        .expect("deleted_character_count must fit in i64");
+                    let count = i64::try_from(*deleted_character_count).map_err(|_| {
+                        DiffError::IntegerOverflow {
+                            value: *deleted_character_count,
+                        }
+                    })?;
                    result.push(NumberOrText::Number(-count));
                }
            }
        }

        if let Some(prev_length) = previous_equal {
-            result.push(NumberOrText::Number(
-                i64::try_from(prev_length).expect("prev_length must fit in i64"),
-            ));
+            result
+                .push(NumberOrText::Number(i64::try_from(prev_length).map_err(
+                    |_| DiffError::IntegerOverflow { value: prev_length },
+                )?));
        }

-        result
+        Ok(result)
    }

    /// Reconstruct an `EditedText` from a diff and the original text.
@ -435,7 +479,8 @@ where
    ) -> Result<EditedText<'a, T>, DiffError> {
        let mut operations: Vec<Operation<T>> = Vec::with_capacity(diff.len());
        let mut order = 0;
-        let text_length = original_text.chars().count();
+        let chars: Vec<char> = original_text.chars().collect();
+        let text_length = chars.len();

        for item in diff {
            match item {
@ -453,7 +498,7 @@ where
                        }

                        let original_characters: String =
-                            original_text.chars().skip(order).take(length).collect();
+                            chars[order..order + length].iter().collect();

                        let original_tokens = tokenizer(&original_characters);
                        for token in original_tokens {
@ -590,7 +635,7 @@ mod tests {
        let original = "Merging text is hard!";
        let changes = "Merging text is easy with reconcile!";
        let result = EditedText::from_strings(original, &changes.into());
-        let serialized = serde_yaml::to_string(&result.to_diff()).unwrap();
+        let serialized = serde_yaml::to_string(&result.to_diff().unwrap()).unwrap();

        let expected = concat!("- 15\n", "- -6\n", "- ' easy with reconcile!'\n",);
        assert_eq!(serialized, expected);
@ -622,7 +667,7 @@ mod tests {

        let edited_text = EditedText::from_strings(original, &updated.into());

-        let changes = edited_text.to_diff();
+        let changes = edited_text.to_diff().unwrap();
        let deserialized_edited_text =
            EditedText::from_diff(original, changes, &*BuiltinTokenizer::Word).unwrap();

--- a/src/operation_transformation/operation.rs
+++ b/src/operation_transformation/operation.rs
@ -104,28 +104,55 @@ where
        }
    }

-    pub fn get_sort_key(&self, insertion_index: usize) -> (usize, usize, usize, String) {
-        (
-            self.order(),
-            match self {
-                Operation::Delete { .. } => 1,
-                Operation::Insert { .. } => 2,
-                Operation::Equal { .. } => 3,
-            },
-            insertion_index,
-            // Make sure that the ordering is deterministic regardless of which text
-            // is left or right.
-            match self {
-                Operation::Equal { length, .. } => length.to_string(),
-                Operation::Insert { text, .. } => {
-                    text.iter().map(Token::original).collect::<String>()
-                }
+    /// Rank of the operation kind, used to order operations that share the
+    /// same `order`: deletes (1) sort before inserts (2), which sort before
+    /// equals (3)
+    fn type_priority(&self) -> u8 {
+        match self {
+            Operation::Delete { .. } => 1,
+            Operation::Insert { .. } => 2,
+            Operation::Equal { .. } => 3,
+        }
+    }
+
+    /// Decide which of two operations the merge should process first
+    ///
+    /// Operations are ranked lexicographically by `order`, then by operation
+    /// kind, then by the caller-supplied index. When all three tie, the
+    /// operation contents are compared so the outcome stays deterministic
+    /// without allocating.
+    pub fn cmp_priority(
+        &self,
+        self_index: usize,
+        other: &Self,
+        other_index: usize,
+    ) -> std::cmp::Ordering {
+        let own_key = (self.order(), self.type_priority(), self_index);
+        let other_key = (other.order(), other.type_priority(), other_index);
+
+        match own_key.cmp(&other_key) {
+            std::cmp::Ordering::Equal => self.deterministic_content_cmp(other),
+            decided => decided,
+        }
+    }
+
+    /// Deterministic tiebreaker based on operation content, so that merge
+    /// results are identical regardless of which side is left vs right
+    ///
+    /// Inserts compare their text, equals compare their length and deletes
+    /// compare their deleted character count. Operations of different kinds
+    /// compare as equal here.
+    fn deterministic_content_cmp(&self, other: &Self) -> std::cmp::Ordering {
+        match (self, other) {
+            (Operation::Insert { text: t1, .. }, Operation::Insert { text: t2, .. }) => {
+                // Compare the inserted text character by character across
+                // token boundaries instead of concatenating it into a String
+                let s1 = t1.iter().flat_map(|t| t.original().chars());
+                let s2 = t2.iter().flat_map(|t| t.original().chars());
+                s1.cmp(s2)
+            }
+            (Operation::Equal { length: l1, .. }, Operation::Equal { length: l2, .. }) => {
+                l1.cmp(l2)
+            }
+            (
                 Operation::Delete {
-                    deleted_character_count,
+                    deleted_character_count: c1,
                     ..
-                } => deleted_character_count.to_string(),
-            },
-        )
+                },
+                Operation::Delete {
+                    deleted_character_count: c2,
+                    ..
+                },
+            ) => c1.cmp(c2),
+            // Different types are already ordered by type_priority
+            _ => std::cmp::Ordering::Equal,
+        }
     }

    /// Applies the operation to the given `StringBuilder`, returning the
@ -193,10 +220,9 @@ where
    }

    /// Adjusts this operation based on `previous_operation` from the other side
-    /// to avoid duplicating or conflicting changes. Updates
-    /// `previous_operation` in-place.
+    /// to avoid duplicating or conflicting changes
    #[allow(clippy::too_many_lines)]
-    pub fn merge_operations(self, previous_operation: &mut Option<Self>) -> Operation<T> {
+    pub fn merge_operations(self, previous_operation: Option<&Self>) -> Operation<T> {
        let operation = self;

        match (operation, previous_operation) {
@ -295,14 +321,36 @@ where
            }

            (
-                ref operation @ Operation::Equal { ref order, .. },
+                ref operation @ Operation::Equal {
+                    ref order,
+                    #[cfg(debug_assertions)]
+                    ref text,
+                    ..
+                },
                Some(Operation::Equal {
                    order: last_equal_order,
                    length: last_equal_length,
+                    #[cfg(debug_assertions)]
+                    text: last_equal_text,
                    ..
                }),
            ) => {
                if operation.len() == *last_equal_length && *order == *last_equal_order {
+                    // Both sides retained the same span from the original text,
+                    // so we deduplicate by zeroing one out. This is safe because
+                    // both EditedTexts are derived from the same original, and
+                    // matching (order, length) means they cover the same substring
+                    #[cfg(debug_assertions)]
+                    debug_assert_eq!(
+                        text, last_equal_text,
+                        "Equal operations with same order and length should have the same text, \
+                         but got {operation:?} vs {:?}",
+                        Operation::<T>::Equal {
+                            order: *last_equal_order,
+                            length: *last_equal_length,
+                            text: last_equal_text.clone(),
+                        },
+                    );
                    Operation::create_equal(*order, 0)
                } else {
                    operation.clone()
@ -329,18 +377,20 @@ where
                ..
            } => {
                #[cfg(debug_assertions)]
-                write!(
-                    f,
-                    "<equal {} from {order}>",
-                    text.as_ref()
-                        .map(|text| format!("'{}'", text.replace('\n', "\\n")))
-                        .unwrap_or(format!("{length} characters")),
-                )?;
+                {
+                    write!(
+                        f,
+                        "<equal {} from {order}>",
+                        text.as_ref()
+                            .map(|text| format!("'{}'", text.replace('\n', "\\n")))
+                            .unwrap_or(format!("{length} characters")),
+                    )
+                }

                #[cfg(not(debug_assertions))]
-                write!(f, "<equal {length} from {order}>")?;
-
-                Ok(())
+                {
+                    write!(f, "<equal {length} from {order}>")
+                }
            }
            Operation::Insert { order, text, .. } => {
                write!(
@ -361,22 +411,24 @@ where
                ..
            } => {
                #[cfg(debug_assertions)]
-                write!(
-                    f,
-                    "<delete {} from {order}>",
-                    deleted_text
-                        .as_ref()
-                        .map(|text| format!("'{}'", text.replace('\n', "\\n")))
-                        .unwrap_or(format!("{deleted_character_count} characters")),
-                )?;
+                {
+                    write!(
+                        f,
+                        "<delete {} from {order}>",
+                        deleted_text
+                            .as_ref()
+                            .map(|text| format!("'{}'", text.replace('\n', "\\n")))
+                            .unwrap_or(format!("{deleted_character_count} characters")),
+                    )
+                }

                #[cfg(not(debug_assertions))]
-                write!(
-                    f,
-                    "<delete {deleted_character_count} characters from {order}>",
-                )?;
-
-                Ok(())
+                {
+                    write!(
+                        f,
+                        "<delete {deleted_character_count} characters from {order}>",
+                    )
+                }
            }
        }
    }
--- a/src/tokenizer/markdown_tokenizer.rs
+++ b/src/tokenizer/markdown_tokenizer.rs
@ -0,0 +1,290 @@
+use super::{token::Token, word_tokenizer::split_words};
+
+/// Splits markdown text into tokens that respect markdown formatting structure
+///
+/// Builds on word-level tokenization with markdown-specific handling:
+/// - Newlines are non-joinable tokens (preserves block structure)
+/// - Block-level prefixes (headings, list markers, blockquotes) attach to the
+///   first word of their line so they can't be split apart during merge
+/// - Intra-line whitespace uses the same normalization as the word tokenizer
+///
+/// This prevents merges from breaking lists, headings, or other structural
+/// markdown elements. Inline formatting like `**bold**` is already preserved
+/// by word-level splitting since formatting markers contain no whitespace.
+///
+/// ## Example
+///
+/// ```not_rust
+/// "# Hello\n- item" -> ["# Hello", "\n", "- item"]
+/// ```
+pub fn markdown_tokenizer(text: &str) -> Vec<Token<String>> {
+    let mut result = Vec::new();
+    let segments = split_preserving_newlines(text);
+
+    for segment in &segments {
+        // Line separators become standalone tokens with both joinable flags
+        // cleared
+        if *segment == "\n" || *segment == "\r\n" {
+            let s = (*segment).to_owned();
+            result.push(Token::new(s.clone(), s, false, false));
+            continue;
+        }
+
+        // Only the text after the block prefix goes through word splitting
+        let prefix_len = block_prefix_len(segment);
+        let mut line_tokens = split_words(&segment[prefix_len..]);
+
+        if prefix_len > 0 {
+            let prefix = &segment[..prefix_len];
+            if line_tokens.is_empty() {
+                // The line consists of the prefix alone (e.g. "# "), so the
+                // prefix is emitted as its own token
+                let s = prefix.to_owned();
+                result.push(Token::new(s.clone(), s, false, false));
+            } else {
+                // Glue the prefix onto the first token of the line; the
+                // combined token keeps the first token's right-joinable flag
+                let first = &line_tokens[0];
+                let combined_original = format!("{prefix}{}", first.original());
+                let combined_normalized = format!("{prefix}{}", first.normalized());
+                line_tokens[0] = Token::new(
+                    combined_normalized,
+                    combined_original,
+                    false,
+                    first.is_right_joinable,
+                );
+            }
+        }
+
+        result.extend(line_tokens);
+    }
+
+    // Normalize non-newline whitespace tokens by appending the next token's
+    // original text (same trick as the word tokenizer so each space is unique
+    // in the diff based on what follows it)
+    // NOTE(review): this assumes `split_words` returns whitespace tokens that
+    // have not already had the following token appended - confirm, otherwise
+    // the suffix would be applied twice
+    if !result.is_empty() {
+        // The last token has no successor, so it is left untouched
+        for i in 0..result.len() - 1 {
+            if result[i]
+                .original()
+                .chars()
+                .all(|c| c.is_whitespace() && c != '\n' && c != '\r')
+            {
+                let normalized = result[i].normalized().to_owned() + result[i + 1].original();
+                result[i].set_normalized(normalized);
+            }
+        }
+    }
+
+    result
+}
+
+/// Cuts `text` at every line break, keeping the breaks themselves
+///
+/// The result alternates between line content and separators (`"\n"` or
+/// `"\r\n"`). Empty lines produce no content segment, and a `'\r'` that is not
+/// directly followed by `'\n'` stays inside the line content.
+fn split_preserving_newlines(text: &str) -> Vec<&str> {
+    let mut segments = Vec::new();
+    let mut line_start = 0;
+
+    for (newline_at, _) in text.match_indices('\n') {
+        // A '\r' directly before the '\n' belongs to the separator, provided
+        // it lies on the current line
+        let is_crlf = newline_at > line_start && text.as_bytes()[newline_at - 1] == b'\r';
+        let separator_start = if is_crlf { newline_at - 1 } else { newline_at };
+
+        if separator_start > line_start {
+            segments.push(&text[line_start..separator_start]);
+        }
+        segments.push(&text[separator_start..=newline_at]);
+        line_start = newline_at + 1;
+    }
+
+    // Trailing content after the last separator
+    if line_start < text.len() {
+        segments.push(&text[line_start..]);
+    }
+
+    segments
+}
+
+/// Returns the byte length of a markdown block-level prefix at the start of a
+/// line, or 0 if none is found
+///
+/// Leading spaces and tabs are counted as part of the prefix. All recognized
+/// prefix characters are ASCII, so byte offsets are always valid UTF-8
+/// boundaries.
+///
+/// Recognized prefixes:
+/// - ATX headings: `# ` through `###### `
+/// - Blockquotes: `>` with an optional following space (single level)
+/// - Unordered lists: `- `, `* `, `+ ` (with optional leading whitespace)
+/// - Ordered lists: `1. `, `2) ` etc (with optional leading whitespace)
+/// - Task lists: `- [ ] `, `- [x] `, `- [X] ` etc (checkbox included in prefix)
+fn block_prefix_len(line: &str) -> usize {
+    let trimmed = line.trim_start_matches([' ', '\t']);
+    let indent_len = line.len() - trimmed.len();
+    let bytes = trimmed.as_bytes();
+
+    // ATX heading: #{1,6} followed by a space
+    let hash_count = bytes.iter().take_while(|&&b| b == b'#').count();
+    if (1..=6).contains(&hash_count) && bytes.get(hash_count) == Some(&b' ') {
+        return indent_len + hash_count + 1;
+    }
+
+    // Blockquote: > followed by an optional space. A marker without the space
+    // (`>text`) is still a blockquote, so it is recognized as well.
+    if let Some(after_marker) = trimmed.strip_prefix('>') {
+        return indent_len + 1 + usize::from(after_marker.starts_with(' '));
+    }
+
+    // Unordered list: [-*+] followed by a space, optionally with task checkbox
+    if let [b'-' | b'*' | b'+', b' ', ..] = bytes {
+        return indent_len + 2 + task_checkbox_len(&trimmed[2..]);
+    }
+
+    // Ordered list: digits followed by [.)] and a space, optionally with task
+    // checkbox
+    let digit_count = bytes.iter().take_while(|b| b.is_ascii_digit()).count();
+    if digit_count > 0
+        && matches!(bytes.get(digit_count).copied(), Some(b'.' | b')'))
+        && bytes.get(digit_count + 1) == Some(&b' ')
+    {
+        let marker_len = digit_count + 2;
+        return indent_len + marker_len + task_checkbox_len(&trimmed[marker_len..]);
+    }
+
+    0
+}
+
+/// Returns the byte length of a task list checkbox (`[ ] `, `[x] `, `[X] `)
+/// at the start of `rest`, or 0 if none is found
+fn task_checkbox_len(rest: &str) -> usize {
+    match rest.as_bytes() {
+        [b'[', b' ' | b'x' | b'X', b']', b' ', ..] => 4,
+        _ => 0,
+    }
+}
+
+// Snapshot tests for `markdown_tokenizer`: each case records the Debug output
+// of the produced token list.
+// NOTE(review): the stored snapshot files appear to be named after the test
+// functions (with a numeric suffix when a test asserts more than once), so
+// renaming a test or reordering its assertions presumably requires
+// regenerating the snapshots - confirm against insta's naming rules
+#[cfg(test)]
+mod tests {
+    use insta::assert_debug_snapshot;
+
+    use super::*;
+
+    #[test]
+    fn test_plain_text() {
+        assert_debug_snapshot!(markdown_tokenizer("Hello world"));
+    }
+
+    #[test]
+    fn test_empty() {
+        assert_debug_snapshot!(markdown_tokenizer(""));
+    }
+
+    #[test]
+    fn test_headings() {
+        assert_debug_snapshot!(markdown_tokenizer("# Hello world"));
+        assert_debug_snapshot!(markdown_tokenizer("## Sub heading"));
+        assert_debug_snapshot!(markdown_tokenizer("###### Deep heading"));
+    }
+
+    #[test]
+    fn test_unordered_list() {
+        assert_debug_snapshot!(markdown_tokenizer("- item one\n- item two\n- item three"));
+    }
+
+    #[test]
+    fn test_ordered_list() {
+        assert_debug_snapshot!(markdown_tokenizer("1. first\n2. second\n3. third"));
+    }
+
+    #[test]
+    fn test_blockquote() {
+        assert_debug_snapshot!(markdown_tokenizer("> quoted text\n> more quoted"));
+    }
+
+    #[test]
+    fn test_inline_formatting() {
+        assert_debug_snapshot!(markdown_tokenizer("Some **bold** and *italic* text"));
+    }
+
+    #[test]
+    fn test_mixed_content() {
+        assert_debug_snapshot!(markdown_tokenizer(
+            "# Title\n\nSome text with **bold**.\n\n- list item\n- another item"
+        ));
+    }
+
+    #[test]
+    fn test_indented_list() {
+        assert_debug_snapshot!(markdown_tokenizer("  - nested item\n    - deeper"));
+    }
+
+    #[test]
+    fn test_crlf() {
+        assert_debug_snapshot!(markdown_tokenizer("Line 1\r\nLine 2"));
+    }
+
+    // Fence lines carry none of the recognised block prefixes
+    #[test]
+    fn test_code_fence() {
+        assert_debug_snapshot!(markdown_tokenizer("```rust\nlet x = 1;\n```"));
+    }
+
+    // A heading prefix with no text after it
+    #[test]
+    fn test_heading_only() {
+        assert_debug_snapshot!(markdown_tokenizer("# "));
+    }
+
+    #[test]
+    fn test_link() {
+        assert_debug_snapshot!(markdown_tokenizer("Click [here](https://example.com) now"));
+    }
+
+    #[test]
+    fn test_multiline_paragraph() {
+        assert_debug_snapshot!(markdown_tokenizer(
+            "First line\nSecond line\n\nNew paragraph"
+        ));
+    }
+
+    #[test]
+    fn test_list_with_star_marker() {
+        assert_debug_snapshot!(markdown_tokenizer("* item one\n* item two"));
+    }
+
+    // `**` is not followed by a space, so it is not a `* ` list marker
+    #[test]
+    fn test_bold_not_confused_with_list() {
+        assert_debug_snapshot!(markdown_tokenizer("**bold text**"));
+    }
+
+    #[test]
+    fn test_task_list() {
+        assert_debug_snapshot!(markdown_tokenizer(
+            "- [ ] todo\n- [x] done\n- [X] also done"
+        ));
+    }
+
+    #[test]
+    fn test_ordered_task_list() {
+        assert_debug_snapshot!(markdown_tokenizer("1. [ ] first task\n2. [x] second task"));
+    }
+
+    // Multi-byte characters directly after ASCII block prefixes
+    #[test]
+    fn test_unicode() {
+        assert_debug_snapshot!(markdown_tokenizer(
+            "# \u{1F600} Héllo\n- \u{00E9}lément\n> \u{4F60}\u{597D} world"
+        ));
+    }
+}
--- a/src/tokenizer/snapshots/reconcile_texttokenizerline_tokenizertestswith_snapshots-10.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizerline_tokenizertestswith_snapshots-10.snap
@ -0,0 +1,36 @@
+---
+source: src/tokenizer/line_tokenizer.rs
+expression: "line_tokenizer(\"Old\\rMac\\rStyle\")"
+---
+[
+    Token {
+        normalized: "Old",
+        original: "Old",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\r",
+        original: "\r",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "Mac",
+        original: "Mac",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\r",
+        original: "\r",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "Style",
+        original: "Style",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizerline_tokenizertestswith_snapshots-11.snap.new
+++ b/src/tokenizer/snapshots/reconcile_texttokenizerline_tokenizertestswith_snapshots-11.snap.new
@ -0,0 +1,37 @@
+---
+source: src/tokenizer/line_tokenizer.rs
+assertion_line: 78
+expression: "line_tokenizer(\"Mixed\\r\\nand\\rbare\")"
+---
+[
+    Token {
+        normalized: "Mixed",
+        original: "Mixed",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\r\n",
+        original: "\r\n",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "and",
+        original: "and",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\r",
+        original: "\r",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "bare",
+        original: "bare",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsblockquote.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsblockquote.snap
@ -0,0 +1,48 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"> quoted text\\n> more quoted\")"
+---
+[
+    Token {
+        normalized: "> quoted",
+        original: "> quoted",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " text",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "text",
+        original: "text",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "> more",
+        original: "> more",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " quoted",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "quoted",
+        original: "quoted",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsbold_not_confused_with_list.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsbold_not_confused_with_list.snap
@ -0,0 +1,24 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"**bold text**\")"
+---
+[
+    Token {
+        normalized: "**bold",
+        original: "**bold",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " text**",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "text**",
+        original: "text**",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestscode_fence.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestscode_fence.snap
@ -0,0 +1,72 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"```rust\\nlet x = 1;\\n```\")"
+---
+[
+    Token {
+        normalized: "```rust",
+        original: "```rust",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "let",
+        original: "let",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " x",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "x",
+        original: "x",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " =",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "=",
+        original: "=",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " 1;",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "1;",
+        original: "1;",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "```",
+        original: "```",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestscrlf.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestscrlf.snap
@ -0,0 +1,48 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"Line 1\\r\\nLine 2\")"
+---
+[
+    Token {
+        normalized: "Line",
+        original: "Line",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " 1",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "1",
+        original: "1",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\r\n",
+        original: "\r\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "Line",
+        original: "Line",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " 2",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "2",
+        original: "2",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsempty.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsempty.snap
@ -0,0 +1,5 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"\")"
+---
+[]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsheading_only.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsheading_only.snap
@ -0,0 +1,12 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"# \")"
+---
+[
+    Token {
+        normalized: "# ",
+        original: "# ",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsheadings-2.snap.new
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsheadings-2.snap.new
@ -0,0 +1,25 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+assertion_line: 199
+expression: "markdown_tokenizer(\"## Sub heading\")"
+---
+[
+    Token {
+        normalized: "## Sub",
+        original: "## Sub",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " heading",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "heading",
+        original: "heading",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsheadings.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsheadings.snap
@ -0,0 +1,24 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"# Hello world\")"
+---
+[
+    Token {
+        normalized: "# Hello",
+        original: "# Hello",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " world",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "world",
+        original: "world",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsindented_list.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsindented_list.snap
@ -0,0 +1,36 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"  - nested item\\n    - deeper\")"
+---
+[
+    Token {
+        normalized: "  - nested",
+        original: "  - nested",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " item",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "item",
+        original: "item",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "    - deeper",
+        original: "    - deeper",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsinline_formatting.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsinline_formatting.snap
@ -0,0 +1,60 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"Some **bold** and *italic* text\")"
+---
+[
+    Token {
+        normalized: "Some",
+        original: "Some",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " **bold**",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "**bold**",
+        original: "**bold**",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " and",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "and",
+        original: "and",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " *italic*",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "*italic*",
+        original: "*italic*",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " text",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "text",
+        original: "text",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestslink.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestslink.snap
@ -0,0 +1,36 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"Click [here](https://example.com) now\")"
+---
+[
+    Token {
+        normalized: "Click",
+        original: "Click",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " [here](https://example.com)",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "[here](https://example.com)",
+        original: "[here](https://example.com)",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " now",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "now",
+        original: "now",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestslist_with_star_marker.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestslist_with_star_marker.snap
@ -0,0 +1,48 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"* item one\\n* item two\")"
+---
+[
+    Token {
+        normalized: "* item",
+        original: "* item",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " one",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "one",
+        original: "one",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "* item",
+        original: "* item",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " two",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "two",
+        original: "two",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsmixed_content.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsmixed_content.snap
@ -0,0 +1,120 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"# Title\\n\\nSome text with **bold**.\\n\\n- list item\\n- another item\")"
+---
+[
+    Token {
+        normalized: "# Title",
+        original: "# Title",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "Some",
+        original: "Some",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " text",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "text",
+        original: "text",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " with",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "with",
+        original: "with",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " **bold**.",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "**bold**.",
+        original: "**bold**.",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "- list",
+        original: "- list",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " item",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "item",
+        original: "item",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "- another",
+        original: "- another",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " item",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "item",
+        original: "item",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsmultiline_paragraph.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsmultiline_paragraph.snap
@ -0,0 +1,78 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"First line\\nSecond line\\n\\nNew paragraph\")"
+---
+[
+    Token {
+        normalized: "First",
+        original: "First",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " line",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "line",
+        original: "line",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "Second",
+        original: "Second",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " line",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "line",
+        original: "line",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "New",
+        original: "New",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " paragraph",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "paragraph",
+        original: "paragraph",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsordered_list.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsordered_list.snap
@ -0,0 +1,36 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"1. first\\n2. second\\n3. third\")"
+---
+[
+    Token {
+        normalized: "1. first",
+        original: "1. first",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "2. second",
+        original: "2. second",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "3. third",
+        original: "3. third",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsordered_task_list.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsordered_task_list.snap
@ -0,0 +1,48 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"1. [ ] first task\\n2. [x] second task\")"
+---
+[
+    Token {
+        normalized: "1. [ ] first",
+        original: "1. [ ] first",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " task",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "task",
+        original: "task",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "2. [x] second",
+        original: "2. [x] second",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " task",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "task",
+        original: "task",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsplain_text.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsplain_text.snap
@ -0,0 +1,24 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"Hello world\")"
+---
+[
+    Token {
+        normalized: "Hello",
+        original: "Hello",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " world",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "world",
+        original: "world",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizerteststask_list.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizerteststask_list.snap
@ -0,0 +1,48 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"- [ ] todo\\n- [x] done\\n- [X] also done\")"
+---
+[
+    Token {
+        normalized: "- [ ] todo",
+        original: "- [ ] todo",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "- [x] done",
+        original: "- [x] done",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "- [X] also",
+        original: "- [X] also",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " done",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "done",
+        original: "done",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsunicode.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsunicode.snap
@ -0,0 +1,60 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"# \\u{1F600} Héllo\\n- \\u{00E9}lément\\n> \\u{4F60}\\u{597D} world\")"
+---
+[
+    Token {
+        normalized: "# 😀",
+        original: "# 😀",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " Héllo",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "Héllo",
+        original: "Héllo",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "- élément",
+        original: "- élément",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "> 你好",
+        original: "> 你好",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " world",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "world",
+        original: "world",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsunordered_list.snap
+++ b/src/tokenizer/snapshots/reconcile_texttokenizermarkdown_tokenizertestsunordered_list.snap
@ -0,0 +1,72 @@
+---
+source: src/tokenizer/markdown_tokenizer.rs
+expression: "markdown_tokenizer(\"- item one\\n- item two\\n- item three\")"
+---
+[
+    Token {
+        normalized: "- item",
+        original: "- item",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " one",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "one",
+        original: "one",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "- item",
+        original: "- item",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " two",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "two",
+        original: "two",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "\n",
+        original: "\n",
+        is_left_joinable: false,
+        is_right_joinable: false,
+    },
+    Token {
+        normalized: "- item",
+        original: "- item",
+        is_left_joinable: false,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: " three",
+        original: " ",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+    Token {
+        normalized: "three",
+        original: "three",
+        is_left_joinable: true,
+        is_right_joinable: true,
+    },
+]
--- a/src/tokenizer/word_tokenizer.rs
+++ b/src/tokenizer/word_tokenizer.rs
@ -9,6 +9,26 @@ use super::token::Token;
 /// "Hi there!" -> ["Hi", " ", "there!"]
 /// ```
 pub fn word_tokenizer(text: &str) -> Vec<Token<String>> {
+    let mut result = split_words(text);
+
+    if result.is_empty() { // guard: `result.len() - 1` below would underflow on an empty Vec
+        return result;
+    }
+
+    // append the next token's original text to each whitespace token's normalized form
+    for i in 0..result.len() - 1 { // the last token has no following token, so it is skipped
+        if result[i].original().chars().all(char::is_whitespace) {
+            let normalized = result[i].normalized().to_owned() + result[i + 1].original();
+            result[i].set_normalized(normalized);
+        }
+    }
+
+    result
+}
+
+/// Splits text into alternating word and whitespace tokens without any
+/// normalization. Shared by `word_tokenizer` and `markdown_tokenizer`.
+pub(super) fn split_words(text: &str) -> Vec<Token<String>> {
    let mut result = Vec::new();

    let mut previous_boundary_index = 0;
@ -28,18 +48,6 @@ pub fn word_tokenizer(text: &str) -> Vec<Token<String>> {
        result.push(text[previous_boundary_index..].into());
    }

-    if result.is_empty() {
-        return result;
-    }
-
-    // normalize whitespace tokens by concatenating with the following token
-    for i in 0..result.len() - 1 {
-        if result[i].original().chars().all(char::is_whitespace) {
-            let normalized = result[i].normalized().to_owned() + result[i + 1].original();
-            result[i].set_normalized(normalized);
-        }
-    }
-
    result
 }
Author	SHA1	Message	Date
Andras Schmelczer	09b5c606ea	Update tests	2026-03-11 20:43:41 +00:00
Andras Schmelczer	5978f73c97	Minimise allocations	2026-03-11 20:43:34 +00:00
Andras Schmelczer	bbe3b7573a	Update website	2026-03-11 20:39:36 +00:00
Andras Schmelczer	9da5bf6e3e	Add snapshots	2026-03-11 20:39:14 +00:00
Andras Schmelczer	446bbdfe5d	Implement makrdown tokeniser	2026-03-11 20:39:04 +00:00