diff --git a/src/tokenizer/markdown_tokenizer.rs b/src/tokenizer/markdown_tokenizer.rs new file mode 100644 index 0000000..b69df8f --- /dev/null +++ b/src/tokenizer/markdown_tokenizer.rs @@ -0,0 +1,290 @@ +use super::{token::Token, word_tokenizer::split_words}; + +/// Splits markdown text into tokens that respect markdown formatting structure +/// +/// Builds on word-level tokenization with markdown-specific handling: +/// - Newlines are non-joinable tokens (preserves block structure) +/// - Block-level prefixes (headings, list markers, blockquotes) attach to the +/// first word of their line so they can't be split apart during merge +/// - Intra-line whitespace uses the same normalization as the word tokenizer +/// +/// This prevents merges from breaking lists, headings, or other structural +/// markdown elements. Inline formatting like `**bold**` is already preserved +/// by word-level splitting since formatting markers contain no whitespace. +/// +/// ## Example +/// +/// ```not_rust +/// "# Hello\n- item" -> ["# Hello", "\n", "- item"] +/// ``` +pub fn markdown_tokenizer(text: &str) -> Vec> { + let mut result = Vec::new(); + let segments = split_preserving_newlines(text); + + for segment in &segments { + if *segment == "\n" || *segment == "\r\n" { + let s = (*segment).to_owned(); + result.push(Token::new(s.clone(), s, false, false)); + continue; + } + + let prefix_len = block_prefix_len(segment); + let mut line_tokens = split_words(&segment[prefix_len..]); + + if prefix_len > 0 { + let prefix = &segment[..prefix_len]; + if line_tokens.is_empty() { + let s = prefix.to_owned(); + result.push(Token::new(s.clone(), s, false, false)); + } else { + let first = &line_tokens[0]; + let combined_original = format!("{prefix}{}", first.original()); + let combined_normalized = format!("{prefix}{}", first.normalized()); + line_tokens[0] = Token::new( + combined_normalized, + combined_original, + false, + first.is_right_joinable, + ); + } + } + + result.extend(line_tokens); + } + + // Normalize non-newline whitespace tokens by appending the next token's + // original text (same trick as the word tokenizer so each space is unique + // in the diff based on what follows it) + if !result.is_empty() { + for i in 0..result.len() - 1 { + if result[i] + .original() + .chars() + .all(|c| c.is_whitespace() && c != '\n' && c != '\r') + { + let normalized = result[i].normalized().to_owned() + result[i + 1].original(); + result[i].set_normalized(normalized); + } + } + } + + result +} + +/// Splits text into alternating segments of line content and newline separators +fn split_preserving_newlines(text: &str) -> Vec<&str> { + let mut segments = Vec::new(); + let mut line_start = 0; + let bytes = text.as_bytes(); + let mut i = 0; + + while i < bytes.len() { + if bytes[i] == b'\r' && i + 1 < bytes.len() && bytes[i + 1] == b'\n' { + if i > line_start { + segments.push(&text[line_start..i]); + } + segments.push(&text[i..i + 2]); + i += 2; + line_start = i; + } else if bytes[i] == b'\n' { + if i > line_start { + segments.push(&text[line_start..i]); + } + segments.push(&text[i..=i]); + i += 1; + line_start = i; + } else { + i += 1; + } + } + + if line_start < text.len() { + segments.push(&text[line_start..]); + } + + segments +} + +/// Returns the byte length of a markdown block-level prefix at the start of a +/// line, or 0 if none is found +/// +/// All recognized prefix characters are ASCII, so byte offsets are always +/// valid UTF-8 boundaries. +/// +/// Recognized prefixes: +/// - ATX headings: `# ` through `###### ` +/// - Blockquotes: `> ` (single level) +/// - Unordered lists: `- `, `* `, `+ ` (with optional leading whitespace) +/// - Ordered lists: `1. `, `2) ` etc (with optional leading whitespace) +/// - Task lists: `- [ ] `, `- [x] `, `- [X] ` etc (checkbox included in prefix) +fn block_prefix_len(line: &str) -> usize { + let trimmed = line.trim_start_matches([' ', '\t']); + let indent_len = line.len() - trimmed.len(); + + // ATX heading: #{1,6} followed by a space + if trimmed.starts_with('#') { + let hash_count = trimmed.bytes().take_while(|&b| b == b'#').count(); + if hash_count <= 6 && trimmed.as_bytes().get(hash_count) == Some(&b' ') { + return indent_len + hash_count + 1; + } + } + + // Blockquote: > followed by optional space + if trimmed.starts_with("> ") { + return indent_len + 2; + } + if trimmed.starts_with('>') && (trimmed.len() == 1 || trimmed.as_bytes()[1] == b'>') { + return indent_len + 1; + } + + // Unordered list: [-*+] followed by a space, optionally with task checkbox + if trimmed.len() >= 2 { + let first_byte = trimmed.as_bytes()[0]; + if matches!(first_byte, b'-' | b'*' | b'+') && trimmed.as_bytes()[1] == b' ' { + return indent_len + 2 + task_checkbox_len(&line[indent_len + 2..]); + } + } + + // Ordered list: digits followed by [.)] and a space, optionally with task + // checkbox + let digit_count = trimmed.bytes().take_while(u8::is_ascii_digit).count(); + if digit_count > 0 && indent_len + digit_count + 2 <= line.len() { + let after_digits = trimmed.as_bytes()[digit_count]; + let after_marker = trimmed.as_bytes().get(digit_count + 1); + if matches!(after_digits, b'.' | b')') && after_marker == Some(&b' ') { + return indent_len + + digit_count + + 2 + + task_checkbox_len(&line[indent_len + digit_count + 2..]); + } + } + + 0 +} + +/// Returns the byte length of a task list checkbox (`[ ] `, `[x] `, `[X] `) +/// at the start of `rest`, or 0 if none is found +fn task_checkbox_len(rest: &str) -> usize { + if rest.len() >= 4 + && rest.as_bytes()[0] == b'[' + && matches!(rest.as_bytes()[1], b' ' | b'x' | b'X') + && rest.as_bytes()[2] == b']' + && rest.as_bytes()[3] == b' ' + { + 4 + } else { + 0 + } +} + +#[cfg(test)] +mod tests { + use insta::assert_debug_snapshot; + + use super::*; + + #[test] + fn test_plain_text() { + assert_debug_snapshot!(markdown_tokenizer("Hello world")); + } + + #[test] + fn test_empty() { + assert_debug_snapshot!(markdown_tokenizer("")); + } + + #[test] + fn test_headings() { + assert_debug_snapshot!(markdown_tokenizer("# Hello world")); + assert_debug_snapshot!(markdown_tokenizer("## Sub heading")); + assert_debug_snapshot!(markdown_tokenizer("###### Deep heading")); + } + + #[test] + fn test_unordered_list() { + assert_debug_snapshot!(markdown_tokenizer("- item one\n- item two\n- item three")); + } + + #[test] + fn test_ordered_list() { + assert_debug_snapshot!(markdown_tokenizer("1. first\n2. second\n3. third")); + } + + #[test] + fn test_blockquote() { + assert_debug_snapshot!(markdown_tokenizer("> quoted text\n> more quoted")); + } + + #[test] + fn test_inline_formatting() { + assert_debug_snapshot!(markdown_tokenizer("Some **bold** and *italic* text")); + } + + #[test] + fn test_mixed_content() { + assert_debug_snapshot!(markdown_tokenizer( + "# Title\n\nSome text with **bold**.\n\n- list item\n- another item" + )); + } + + #[test] + fn test_indented_list() { + assert_debug_snapshot!(markdown_tokenizer(" - nested item\n - deeper")); + } + + #[test] + fn test_crlf() { + assert_debug_snapshot!(markdown_tokenizer("Line 1\r\nLine 2")); + } + + #[test] + fn test_code_fence() { + assert_debug_snapshot!(markdown_tokenizer("```rust\nlet x = 1;\n```")); + } + + #[test] + fn test_heading_only() { + assert_debug_snapshot!(markdown_tokenizer("# ")); + } + + #[test] + fn test_link() { + assert_debug_snapshot!(markdown_tokenizer("Click [here](https://example.com) now")); + } + + #[test] + fn test_multiline_paragraph() { + assert_debug_snapshot!(markdown_tokenizer( + "First line\nSecond line\n\nNew paragraph" + )); + } + + #[test] + fn test_list_with_star_marker() { + assert_debug_snapshot!(markdown_tokenizer("* item one\n* item two")); + } + + #[test] + fn test_bold_not_confused_with_list() { + assert_debug_snapshot!(markdown_tokenizer("**bold text**")); + } + + #[test] + fn test_task_list() { + assert_debug_snapshot!(markdown_tokenizer( + "- [ ] todo\n- [x] done\n- [X] also done" + )); + } + + #[test] + fn test_ordered_task_list() { + assert_debug_snapshot!(markdown_tokenizer("1. [ ] first task\n2. [x] second task")); + } + + #[test] + fn test_unicode() { + assert_debug_snapshot!(markdown_tokenizer( + "# \u{1F600} Héllo\n- \u{00E9}lément\n> \u{4F60}\u{597D} world" + )); + } +} diff --git a/src/tokenizer/word_tokenizer.rs b/src/tokenizer/word_tokenizer.rs index e486825..f3f146b 100644 --- a/src/tokenizer/word_tokenizer.rs +++ b/src/tokenizer/word_tokenizer.rs @@ -9,6 +9,26 @@ use super::token::Token; /// "Hi there!" -> ["Hi", " ", "there!"] /// ``` pub fn word_tokenizer(text: &str) -> Vec> { + let mut result = split_words(text); + + if result.is_empty() { + return result; + } + + // normalize whitespace tokens by concatenating with the following token + for i in 0..result.len() - 1 { + if result[i].original().chars().all(char::is_whitespace) { + let normalized = result[i].normalized().to_owned() + result[i + 1].original(); + result[i].set_normalized(normalized); + } + } + + result +} + +/// Splits text into alternating word and whitespace tokens without any +/// normalization. Shared by `word_tokenizer` and `markdown_tokenizer`. +pub(super) fn split_words(text: &str) -> Vec> { let mut result = Vec::new(); let mut previous_boundary_index = 0; @@ -28,18 +48,6 @@ pub fn word_tokenizer(text: &str) -> Vec> { result.push(text[previous_boundary_index..].into()); } - if result.is_empty() { - return result; - } - - // normalize whitespace tokens by concatenating with the following token - for i in 0..result.len() - 1 { - if result[i].original().chars().all(char::is_whitespace) { - let normalized = result[i].normalized().to_owned() + result[i + 1].original(); - result[i].set_normalized(normalized); - } - } - result }