use super::{token::Token, word_tokenizer::split_words}; /// Splits markdown text into tokens that respect markdown formatting structure /// /// Builds on word-level tokenization with markdown-specific handling: /// - Newlines are non-joinable tokens (preserves block structure) /// - Block-level prefixes (headings, list markers, blockquotes) attach to the /// first word of their line so they can't be split apart during merge /// - Intra-line whitespace uses the same normalization as the word tokenizer /// /// This prevents merges from breaking lists, headings, or other structural /// markdown elements. Inline formatting like `**bold**` is already preserved /// by word-level splitting since formatting markers contain no whitespace. /// /// ## Example /// /// ```not_rust /// "# Hello\n- item" -> ["# Hello", "\n", "- item"] /// ``` pub fn markdown_tokenizer(text: &str) -> Vec> { let mut result = Vec::new(); let segments = split_preserving_newlines(text); for segment in &segments { if *segment == "\n" || *segment == "\r\n" { let s = (*segment).to_owned(); result.push(Token::new(s.clone(), s, false, false)); continue; } let prefix_len = block_prefix_len(segment); let mut line_tokens = split_words(&segment[prefix_len..]); if prefix_len > 0 { let prefix = &segment[..prefix_len]; if line_tokens.is_empty() { let s = prefix.to_owned(); result.push(Token::new(s.clone(), s, false, false)); } else { let first = &line_tokens[0]; let combined_original = format!("{prefix}{}", first.original()); let combined_normalized = format!("{prefix}{}", first.normalized()); line_tokens[0] = Token::new( combined_normalized, combined_original, false, first.is_right_joinable, ); } } result.extend(line_tokens); } // Normalize non-newline whitespace tokens by appending the next token's // original text (same trick as the word tokenizer so each space is unique // in the diff based on what follows it) if !result.is_empty() { for i in 0..result.len() - 1 { if result[i] .original() .chars() .all(|c| c.is_whitespace() && c != '\n' && c != '\r') { let normalized = result[i].normalized().to_owned() + result[i + 1].original(); result[i].set_normalized(normalized); } } } result } /// Splits text into alternating segments of line content and newline separators fn split_preserving_newlines(text: &str) -> Vec<&str> { let mut segments = Vec::new(); let mut line_start = 0; let bytes = text.as_bytes(); let mut i = 0; while i < bytes.len() { if bytes[i] == b'\r' && i + 1 < bytes.len() && bytes[i + 1] == b'\n' { if i > line_start { segments.push(&text[line_start..i]); } segments.push(&text[i..i + 2]); i += 2; line_start = i; } else if bytes[i] == b'\n' { if i > line_start { segments.push(&text[line_start..i]); } segments.push(&text[i..=i]); i += 1; line_start = i; } else { i += 1; } } if line_start < text.len() { segments.push(&text[line_start..]); } segments } /// Returns the byte length of a markdown block-level prefix at the start of a /// line, or 0 if none is found /// /// All recognized prefix characters are ASCII, so byte offsets are always /// valid UTF-8 boundaries. /// /// Recognized prefixes: /// - ATX headings: `# ` through `###### ` /// - Blockquotes: `> ` (single level) /// - Unordered lists: `- `, `* `, `+ ` (with optional leading whitespace) /// - Ordered lists: `1. `, `2) ` etc (with optional leading whitespace) /// - Task lists: `- [ ] `, `- [x] `, `- [X] ` etc (checkbox included in prefix) fn block_prefix_len(line: &str) -> usize { let trimmed = line.trim_start_matches([' ', '\t']); let indent_len = line.len() - trimmed.len(); // ATX heading: #{1,6} followed by a space if trimmed.starts_with('#') { let hash_count = trimmed.bytes().take_while(|&b| b == b'#').count(); if hash_count <= 6 && trimmed.as_bytes().get(hash_count) == Some(&b' ') { return indent_len + hash_count + 1; } } // Blockquote: > followed by optional space if trimmed.starts_with("> ") { return indent_len + 2; } if trimmed.starts_with('>') && (trimmed.len() == 1 || trimmed.as_bytes()[1] == b'>') { return indent_len + 1; } // Unordered list: [-*+] followed by a space, optionally with task checkbox if trimmed.len() >= 2 { let first_byte = trimmed.as_bytes()[0]; if matches!(first_byte, b'-' | b'*' | b'+') && trimmed.as_bytes()[1] == b' ' { return indent_len + 2 + task_checkbox_len(&line[indent_len + 2..]); } } // Ordered list: digits followed by [.)] and a space, optionally with task // checkbox let digit_count = trimmed.bytes().take_while(u8::is_ascii_digit).count(); if digit_count > 0 && indent_len + digit_count + 2 <= line.len() { let after_digits = trimmed.as_bytes()[digit_count]; let after_marker = trimmed.as_bytes().get(digit_count + 1); if matches!(after_digits, b'.' | b')') && after_marker == Some(&b' ') { return indent_len + digit_count + 2 + task_checkbox_len(&line[indent_len + digit_count + 2..]); } } 0 } /// Returns the byte length of a task list checkbox (`[ ] `, `[x] `, `[X] `) /// at the start of `rest`, or 0 if none is found fn task_checkbox_len(rest: &str) -> usize { if rest.len() >= 4 && rest.as_bytes()[0] == b'[' && matches!(rest.as_bytes()[1], b' ' | b'x' | b'X') && rest.as_bytes()[2] == b']' && rest.as_bytes()[3] == b' ' { 4 } else { 0 } } #[cfg(test)] mod tests { use insta::assert_debug_snapshot; use super::*; #[test] fn test_plain_text() { assert_debug_snapshot!(markdown_tokenizer("Hello world")); } #[test] fn test_empty() { assert_debug_snapshot!(markdown_tokenizer("")); } #[test] fn test_headings() { assert_debug_snapshot!(markdown_tokenizer("# Hello world")); assert_debug_snapshot!(markdown_tokenizer("## Sub heading")); assert_debug_snapshot!(markdown_tokenizer("###### Deep heading")); } #[test] fn test_unordered_list() { assert_debug_snapshot!(markdown_tokenizer("- item one\n- item two\n- item three")); } #[test] fn test_ordered_list() { assert_debug_snapshot!(markdown_tokenizer("1. first\n2. second\n3. third")); } #[test] fn test_blockquote() { assert_debug_snapshot!(markdown_tokenizer("> quoted text\n> more quoted")); } #[test] fn test_inline_formatting() { assert_debug_snapshot!(markdown_tokenizer("Some **bold** and *italic* text")); } #[test] fn test_mixed_content() { assert_debug_snapshot!(markdown_tokenizer( "# Title\n\nSome text with **bold**.\n\n- list item\n- another item" )); } #[test] fn test_indented_list() { assert_debug_snapshot!(markdown_tokenizer(" - nested item\n - deeper")); } #[test] fn test_crlf() { assert_debug_snapshot!(markdown_tokenizer("Line 1\r\nLine 2")); } #[test] fn test_code_fence() { assert_debug_snapshot!(markdown_tokenizer("```rust\nlet x = 1;\n```")); } #[test] fn test_heading_only() { assert_debug_snapshot!(markdown_tokenizer("# ")); } #[test] fn test_link() { assert_debug_snapshot!(markdown_tokenizer("Click [here](https://example.com) now")); } #[test] fn test_multiline_paragraph() { assert_debug_snapshot!(markdown_tokenizer( "First line\nSecond line\n\nNew paragraph" )); } #[test] fn test_list_with_star_marker() { assert_debug_snapshot!(markdown_tokenizer("* item one\n* item two")); } #[test] fn test_bold_not_confused_with_list() { assert_debug_snapshot!(markdown_tokenizer("**bold text**")); } #[test] fn test_task_list() { assert_debug_snapshot!(markdown_tokenizer( "- [ ] todo\n- [x] done\n- [X] also done" )); } #[test] fn test_ordered_task_list() { assert_debug_snapshot!(markdown_tokenizer("1. [ ] first task\n2. [x] second task")); } #[test] fn test_unicode() { assert_debug_snapshot!(markdown_tokenizer( "# \u{1F600} Héllo\n- \u{00E9}lément\n> \u{4F60}\u{597D} world" )); } }