Implement markdown tokeniser

2026-03-11 20:39:04 +00:00 · 2026-03-11 20:39:04 +00:00 · 9a82d6d8dd
commit 9a82d6d8dd
parent bd2ef79fb1
2 changed files with 310 additions and 12 deletions
--- a/src/tokenizer/markdown_tokenizer.rs
+++ b/src/tokenizer/markdown_tokenizer.rs
@ -0,0 +1,290 @@
+use super::{token::Token, word_tokenizer::split_words};
+
+/// Tokenizes markdown so that merges cannot tear structural elements apart
+///
+/// Each line is split with the shared word splitter and then adjusted for
+/// markdown:
+/// - Every `\n` / `\r\n` becomes its own non-joinable token, which keeps the
+///   block structure intact
+/// - A block-level prefix (heading, list marker, blockquote) is fused with
+///   the first word-level token that follows it, so the two can never be
+///   separated during a merge; a prefix with nothing after it becomes a
+///   standalone non-joinable token
+/// - Whitespace tokens inside a line are normalized with the same trick the
+///   word tokenizer uses
+///
+/// Inline formatting such as `**bold**` needs no special handling: the
+/// markers contain no whitespace, so word-level splitting keeps them whole.
+///
+/// ## Example
+///
+/// ```not_rust
+/// "# Hello\n- item" -> ["# Hello", "\n", "- item"]
+/// ```
+pub fn markdown_tokenizer(text: &str) -> Vec<Token<String>> {
+    let mut tokens: Vec<Token<String>> = Vec::new();
+
+    for piece in split_preserving_newlines(text) {
+        // Line terminators are emitted verbatim as standalone tokens
+        if matches!(piece, "\n" | "\r\n") {
+            tokens.push(Token::new(piece.to_owned(), piece.to_owned(), false, false));
+            continue;
+        }
+
+        let (prefix, content) = piece.split_at(block_prefix_len(piece));
+        let mut words = split_words(content);
+
+        if !prefix.is_empty() {
+            match words.first_mut() {
+                // Glue the prefix onto the first token of the line. The fused
+                // token keeps that token's right-joinable flag; the other
+                // flag (presumably left-joinable) is switched off.
+                Some(head) => {
+                    let normalized = format!("{prefix}{}", head.normalized());
+                    let original = format!("{prefix}{}", head.original());
+                    let right_joinable = head.is_right_joinable;
+                    *head = Token::new(normalized, original, false, right_joinable);
+                }
+                // The line consists of the prefix alone (e.g. `# `)
+                None => {
+                    tokens.push(Token::new(prefix.to_owned(), prefix.to_owned(), false, false));
+                }
+            }
+        }
+
+        tokens.extend(words);
+    }
+
+    // Make each inline whitespace token unique in the diff by appending the
+    // original text of the token that follows it to its normalized form.
+    // Newline tokens are left untouched. The range is empty when there are
+    // fewer than two tokens, so no separate emptiness check is needed.
+    for next in 1..tokens.len() {
+        let current = next - 1;
+        let is_inline_space = tokens[current]
+            .original()
+            .chars()
+            .all(|c| c.is_whitespace() && !matches!(c, '\n' | '\r'));
+        if is_inline_space {
+            let mut widened = tokens[current].normalized().to_owned();
+            widened.push_str(tokens[next].original());
+            tokens[current].set_normalized(widened);
+        }
+    }
+
+    tokens
+}
+
+/// Cuts `text` into line bodies interleaved with their line terminators
+///
+/// Every `\n` and `\r\n` is returned as its own slice; the text between two
+/// terminators is returned as a slice only when it is non-empty. A `\r` that
+/// is not immediately followed by `\n` stays inside the line body.
+/// Concatenating the returned slices in order reproduces `text`.
+fn split_preserving_newlines(text: &str) -> Vec<&str> {
+    let mut pieces = Vec::new();
+    let mut cursor = 0;
+
+    for (newline_at, _) in text.match_indices('\n') {
+        // A `\r` sitting directly before the `\n` is part of the terminator
+        let terminator_start = if text[..newline_at].ends_with('\r') {
+            newline_at - 1
+        } else {
+            newline_at
+        };
+
+        if terminator_start > cursor {
+            pieces.push(&text[cursor..terminator_start]);
+        }
+        pieces.push(&text[terminator_start..=newline_at]);
+        cursor = newline_at + 1;
+    }
+
+    // Trailing line without a terminator
+    if cursor < text.len() {
+        pieces.push(&text[cursor..]);
+    }
+
+    pieces
+}
+
+/// Measures the markdown block-level prefix that opens `line`
+///
+/// Returns the prefix length in bytes, counted from the start of `line` and
+/// therefore including any leading spaces or tabs, or 0 when the line does
+/// not open with a recognized prefix. Every byte that can belong to a prefix
+/// is ASCII, so the result is always a valid UTF-8 boundary.
+///
+/// Recognized prefixes (each may be preceded by spaces and tabs):
+/// - ATX headings: one to six `#` followed by a space
+/// - Blockquotes: `> `; a line that is only `>`, or a `>` directly followed
+///   by another `>`, counts as a one-byte prefix
+/// - Unordered lists: `- `, `* `, `+ `
+/// - Ordered lists: digits, then `.` or `)`, then a space (`1. `, `2) `)
+/// - Task lists: a list marker followed by `[ ] `, `[x] ` or `[X] `; the
+///   checkbox is counted as part of the prefix
+fn block_prefix_len(line: &str) -> usize {
+    let body = line.trim_start_matches([' ', '\t']);
+    let indent = line.len() - body.len();
+    let bytes = body.as_bytes();
+
+    // ATX heading: a run of one to six `#` and then a space
+    let hashes = bytes.iter().take_while(|&&b| b == b'#').count();
+    if (1..=6).contains(&hashes) && bytes.get(hashes) == Some(&b' ') {
+        return indent + hashes + 1;
+    }
+
+    match bytes {
+        // Blockquote marker together with its separating space
+        [b'>', b' ', ..] => return indent + 2,
+        // A bare `>` line, or the outer marker of `>>...`
+        [b'>'] | [b'>', b'>', ..] => return indent + 1,
+        // Bullet marker and space, plus a task checkbox when one follows
+        [b'-' | b'*' | b'+', b' ', ..] => {
+            let marker_end = indent + 2;
+            return marker_end + task_checkbox_len(&line[marker_end..]);
+        }
+        _ => {}
+    }
+
+    // Ordered list: digits, a `.` or `)` delimiter, a space, and optionally a
+    // task checkbox
+    let digits = bytes.iter().take_while(|b| b.is_ascii_digit()).count();
+    let delimiter = bytes.get(digits).copied();
+    let separator = bytes.get(digits + 1).copied();
+    if digits > 0 && matches!(delimiter, Some(b'.' | b')')) && separator == Some(b' ') {
+        let marker_end = indent + digits + 2;
+        return marker_end + task_checkbox_len(&line[marker_end..]);
+    }
+
+    0
+}
+
+/// Measures a task-list checkbox at the very start of `rest`
+///
+/// A checkbox is exactly `[`, one of ` ` / `x` / `X`, `]`, and a trailing
+/// space. Returns 4 (its byte length) when one is present and 0 otherwise.
+fn task_checkbox_len(rest: &str) -> usize {
+    match rest.as_bytes() {
+        [b'[', b' ' | b'x' | b'X', b']', b' ', ..] => 4,
+        _ => 0,
+    }
+}
+
+// Snapshot tests for `markdown_tokenizer`. Each case records the `Debug`
+// output of the returned tokens through insta's `assert_debug_snapshot!`.
+#[cfg(test)]
+mod tests {
+    use insta::assert_debug_snapshot;
+
+    use super::*;
+
+    // Input without any markdown structure
+    #[test]
+    fn test_plain_text() {
+        assert_debug_snapshot!(markdown_tokenizer("Hello world"));
+    }
+
+    #[test]
+    fn test_empty() {
+        assert_debug_snapshot!(markdown_tokenizer(""));
+    }
+
+    // Shallowest and deepest heading levels accepted as a prefix (1 and 6),
+    // plus one in between
+    #[test]
+    fn test_headings() {
+        assert_debug_snapshot!(markdown_tokenizer("# Hello world"));
+        assert_debug_snapshot!(markdown_tokenizer("## Sub heading"));
+        assert_debug_snapshot!(markdown_tokenizer("###### Deep heading"));
+    }
+
+    #[test]
+    fn test_unordered_list() {
+        assert_debug_snapshot!(markdown_tokenizer("- item one\n- item two\n- item three"));
+    }
+
+    #[test]
+    fn test_ordered_list() {
+        assert_debug_snapshot!(markdown_tokenizer("1. first\n2. second\n3. third"));
+    }
+
+    #[test]
+    fn test_blockquote() {
+        assert_debug_snapshot!(markdown_tokenizer("> quoted text\n> more quoted"));
+    }
+
+    // Inline markers contain no whitespace, so they stay inside word tokens
+    #[test]
+    fn test_inline_formatting() {
+        assert_debug_snapshot!(markdown_tokenizer("Some **bold** and *italic* text"));
+    }
+
+    // Heading, blank lines, paragraph and list in one document
+    #[test]
+    fn test_mixed_content() {
+        assert_debug_snapshot!(markdown_tokenizer(
+            "# Title\n\nSome text with **bold**.\n\n- list item\n- another item"
+        ));
+    }
+
+    // Leading spaces before the marker are counted as part of the prefix
+    #[test]
+    fn test_indented_list() {
+        assert_debug_snapshot!(markdown_tokenizer("  - nested item\n    - deeper"));
+    }
+
+    // `\r\n` must come out as a single newline token
+    #[test]
+    fn test_crlf() {
+        assert_debug_snapshot!(markdown_tokenizer("Line 1\r\nLine 2"));
+    }
+
+    #[test]
+    fn test_code_fence() {
+        assert_debug_snapshot!(markdown_tokenizer("```rust\nlet x = 1;\n```"));
+    }
+
+    // A prefix with nothing after it becomes a token of its own
+    #[test]
+    fn test_heading_only() {
+        assert_debug_snapshot!(markdown_tokenizer("# "));
+    }
+
+    #[test]
+    fn test_link() {
+        assert_debug_snapshot!(markdown_tokenizer("Click [here](https://example.com) now"));
+    }
+
+    // Single newlines inside a paragraph and a blank line between paragraphs
+    #[test]
+    fn test_multiline_paragraph() {
+        assert_debug_snapshot!(markdown_tokenizer(
+            "First line\nSecond line\n\nNew paragraph"
+        ));
+    }
+
+    #[test]
+    fn test_list_with_star_marker() {
+        assert_debug_snapshot!(markdown_tokenizer("* item one\n* item two"));
+    }
+
+    // `**` starts with `*` but is not followed by a space, so it must not be
+    // taken for a list marker
+    #[test]
+    fn test_bold_not_confused_with_list() {
+        assert_debug_snapshot!(markdown_tokenizer("**bold text**"));
+    }
+
+    // All three checkbox spellings recognized after a bullet marker
+    #[test]
+    fn test_task_list() {
+        assert_debug_snapshot!(markdown_tokenizer(
+            "- [ ] todo\n- [x] done\n- [X] also done"
+        ));
+    }
+
+    #[test]
+    fn test_ordered_task_list() {
+        assert_debug_snapshot!(markdown_tokenizer("1. [ ] first task\n2. [x] second task"));
+    }
+
+    // Multi-byte characters right after a prefix exercise the byte-offset
+    // slicing at the prefix boundary
+    #[test]
+    fn test_unicode() {
+        assert_debug_snapshot!(markdown_tokenizer(
+            "# \u{1F600} Héllo\n- \u{00E9}lément\n> \u{4F60}\u{597D} world"
+        ));
+    }
+}
--- a/src/tokenizer/word_tokenizer.rs
+++ b/src/tokenizer/word_tokenizer.rs
@ -9,6 +9,26 @@ use super::token::Token;
 /// "Hi there!" -> ["Hi", " ", "there!"]
 /// ```
 pub fn word_tokenizer(text: &str) -> Vec<Token<String>> {
+    let mut tokens = split_words(text);
+
+    // Normalize every whitespace token that has a successor by appending the
+    // successor's original text to its normalized form. Pairing each token
+    // with its predecessor yields an empty range for zero or one tokens, so
+    // no separate emptiness check is needed.
+    for next in 1..tokens.len() {
+        let current = next - 1;
+        if tokens[current].original().chars().all(char::is_whitespace) {
+            let mut widened = tokens[current].normalized().to_owned();
+            widened.push_str(tokens[next].original());
+            tokens[current].set_normalized(widened);
+        }
+    }
+
+    tokens
+}
+
+/// Splits text into alternating word and whitespace tokens without any
+/// normalization. Shared by `word_tokenizer` and `markdown_tokenizer`.
+pub(super) fn split_words(text: &str) -> Vec<Token<String>> {
    let mut result = Vec::new();

    let mut previous_boundary_index = 0;
@ -28,18 +48,6 @@ pub fn word_tokenizer(text: &str) -> Vec<Token<String>> {
        result.push(text[previous_boundary_index..].into());
    }

-    if result.is_empty() {
-        return result;
-    }
-
-    // normalize whitespace tokens by concatenating with the following token
-    for i in 0..result.len() - 1 {
-        if result[i].original().chars().all(char::is_whitespace) {
-            let normalized = result[i].normalized().to_owned() + result[i + 1].original();
-            result[i].set_normalized(normalized);
-        }
-    }
-
    result
 }