290 lines
8.9 KiB
Rust
290 lines
8.9 KiB
Rust
use super::{token::Token, word_tokenizer::split_words};
|
|
|
|
/// Splits markdown text into tokens that respect markdown formatting structure
|
|
///
|
|
/// Builds on word-level tokenization with markdown-specific handling:
|
|
/// - Newlines are non-joinable tokens (preserves block structure)
|
|
/// - Block-level prefixes (headings, list markers, blockquotes) attach to the
|
|
/// first word of their line so they can't be split apart during merge
|
|
/// - Intra-line whitespace uses the same normalization as the word tokenizer
|
|
///
|
|
/// This prevents merges from breaking lists, headings, or other structural
|
|
/// markdown elements. Inline formatting like `**bold**` is already preserved
|
|
/// by word-level splitting since formatting markers contain no whitespace.
|
|
///
|
|
/// ## Example
|
|
///
|
|
/// ```not_rust
|
|
/// "# Hello\n- item" -> ["# Hello", "\n", "- item"]
|
|
/// ```
|
|
pub fn markdown_tokenizer(text: &str) -> Vec<Token<String>> {
|
|
let mut result = Vec::new();
|
|
let segments = split_preserving_newlines(text);
|
|
|
|
for segment in &segments {
|
|
if *segment == "\n" || *segment == "\r\n" {
|
|
let s = (*segment).to_owned();
|
|
result.push(Token::new(s.clone(), s, false, false));
|
|
continue;
|
|
}
|
|
|
|
let prefix_len = block_prefix_len(segment);
|
|
let mut line_tokens = split_words(&segment[prefix_len..]);
|
|
|
|
if prefix_len > 0 {
|
|
let prefix = &segment[..prefix_len];
|
|
if line_tokens.is_empty() {
|
|
let s = prefix.to_owned();
|
|
result.push(Token::new(s.clone(), s, false, false));
|
|
} else {
|
|
let first = &line_tokens[0];
|
|
let combined_original = format!("{prefix}{}", first.original());
|
|
let combined_normalized = format!("{prefix}{}", first.normalized());
|
|
line_tokens[0] = Token::new(
|
|
combined_normalized,
|
|
combined_original,
|
|
false,
|
|
first.is_right_joinable,
|
|
);
|
|
}
|
|
}
|
|
|
|
result.extend(line_tokens);
|
|
}
|
|
|
|
// Normalize non-newline whitespace tokens by appending the next token's
|
|
// original text (same trick as the word tokenizer so each space is unique
|
|
// in the diff based on what follows it)
|
|
if !result.is_empty() {
|
|
for i in 0..result.len() - 1 {
|
|
if result[i]
|
|
.original()
|
|
.chars()
|
|
.all(|c| c.is_whitespace() && c != '\n' && c != '\r')
|
|
{
|
|
let normalized = result[i].normalized().to_owned() + result[i + 1].original();
|
|
result[i].set_normalized(normalized);
|
|
}
|
|
}
|
|
}
|
|
|
|
result
|
|
}
|
|
|
|
/// Splits text into alternating segments of line content and newline separators
|
|
fn split_preserving_newlines(text: &str) -> Vec<&str> {
|
|
let mut segments = Vec::new();
|
|
let mut line_start = 0;
|
|
let bytes = text.as_bytes();
|
|
let mut i = 0;
|
|
|
|
while i < bytes.len() {
|
|
if bytes[i] == b'\r' && i + 1 < bytes.len() && bytes[i + 1] == b'\n' {
|
|
if i > line_start {
|
|
segments.push(&text[line_start..i]);
|
|
}
|
|
segments.push(&text[i..i + 2]);
|
|
i += 2;
|
|
line_start = i;
|
|
} else if bytes[i] == b'\n' {
|
|
if i > line_start {
|
|
segments.push(&text[line_start..i]);
|
|
}
|
|
segments.push(&text[i..=i]);
|
|
i += 1;
|
|
line_start = i;
|
|
} else {
|
|
i += 1;
|
|
}
|
|
}
|
|
|
|
if line_start < text.len() {
|
|
segments.push(&text[line_start..]);
|
|
}
|
|
|
|
segments
|
|
}
|
|
|
|
/// Returns the byte length of a markdown block-level prefix at the start of a
|
|
/// line, or 0 if none is found
|
|
///
|
|
/// All recognized prefix characters are ASCII, so byte offsets are always
|
|
/// valid UTF-8 boundaries.
|
|
///
|
|
/// Recognized prefixes:
|
|
/// - ATX headings: `# ` through `###### `
|
|
/// - Blockquotes: `> ` (single level)
|
|
/// - Unordered lists: `- `, `* `, `+ ` (with optional leading whitespace)
|
|
/// - Ordered lists: `1. `, `2) ` etc (with optional leading whitespace)
|
|
/// - Task lists: `- [ ] `, `- [x] `, `- [X] ` etc (checkbox included in prefix)
|
|
fn block_prefix_len(line: &str) -> usize {
|
|
let trimmed = line.trim_start_matches([' ', '\t']);
|
|
let indent_len = line.len() - trimmed.len();
|
|
|
|
// ATX heading: #{1,6} followed by a space
|
|
if trimmed.starts_with('#') {
|
|
let hash_count = trimmed.bytes().take_while(|&b| b == b'#').count();
|
|
if hash_count <= 6 && trimmed.as_bytes().get(hash_count) == Some(&b' ') {
|
|
return indent_len + hash_count + 1;
|
|
}
|
|
}
|
|
|
|
// Blockquote: > followed by optional space
|
|
if trimmed.starts_with("> ") {
|
|
return indent_len + 2;
|
|
}
|
|
if trimmed.starts_with('>') && (trimmed.len() == 1 || trimmed.as_bytes()[1] == b'>') {
|
|
return indent_len + 1;
|
|
}
|
|
|
|
// Unordered list: [-*+] followed by a space, optionally with task checkbox
|
|
if trimmed.len() >= 2 {
|
|
let first_byte = trimmed.as_bytes()[0];
|
|
if matches!(first_byte, b'-' | b'*' | b'+') && trimmed.as_bytes()[1] == b' ' {
|
|
return indent_len + 2 + task_checkbox_len(&line[indent_len + 2..]);
|
|
}
|
|
}
|
|
|
|
// Ordered list: digits followed by [.)] and a space, optionally with task
|
|
// checkbox
|
|
let digit_count = trimmed.bytes().take_while(u8::is_ascii_digit).count();
|
|
if digit_count > 0 && indent_len + digit_count + 2 <= line.len() {
|
|
let after_digits = trimmed.as_bytes()[digit_count];
|
|
let after_marker = trimmed.as_bytes().get(digit_count + 1);
|
|
if matches!(after_digits, b'.' | b')') && after_marker == Some(&b' ') {
|
|
return indent_len
|
|
+ digit_count
|
|
+ 2
|
|
+ task_checkbox_len(&line[indent_len + digit_count + 2..]);
|
|
}
|
|
}
|
|
|
|
0
|
|
}
|
|
|
|
/// Returns the byte length of a task list checkbox (`[ ] `, `[x] `, `[X] `)
|
|
/// at the start of `rest`, or 0 if none is found
|
|
fn task_checkbox_len(rest: &str) -> usize {
|
|
if rest.len() >= 4
|
|
&& rest.as_bytes()[0] == b'['
|
|
&& matches!(rest.as_bytes()[1], b' ' | b'x' | b'X')
|
|
&& rest.as_bytes()[2] == b']'
|
|
&& rest.as_bytes()[3] == b' '
|
|
{
|
|
4
|
|
} else {
|
|
0
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use insta::assert_debug_snapshot;
|
|
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_plain_text() {
|
|
assert_debug_snapshot!(markdown_tokenizer("Hello world"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_empty() {
|
|
assert_debug_snapshot!(markdown_tokenizer(""));
|
|
}
|
|
|
|
#[test]
|
|
fn test_headings() {
|
|
assert_debug_snapshot!(markdown_tokenizer("# Hello world"));
|
|
assert_debug_snapshot!(markdown_tokenizer("## Sub heading"));
|
|
assert_debug_snapshot!(markdown_tokenizer("###### Deep heading"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_unordered_list() {
|
|
assert_debug_snapshot!(markdown_tokenizer("- item one\n- item two\n- item three"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_ordered_list() {
|
|
assert_debug_snapshot!(markdown_tokenizer("1. first\n2. second\n3. third"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_blockquote() {
|
|
assert_debug_snapshot!(markdown_tokenizer("> quoted text\n> more quoted"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_inline_formatting() {
|
|
assert_debug_snapshot!(markdown_tokenizer("Some **bold** and *italic* text"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_mixed_content() {
|
|
assert_debug_snapshot!(markdown_tokenizer(
|
|
"# Title\n\nSome text with **bold**.\n\n- list item\n- another item"
|
|
));
|
|
}
|
|
|
|
#[test]
|
|
fn test_indented_list() {
|
|
assert_debug_snapshot!(markdown_tokenizer(" - nested item\n - deeper"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_crlf() {
|
|
assert_debug_snapshot!(markdown_tokenizer("Line 1\r\nLine 2"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_code_fence() {
|
|
assert_debug_snapshot!(markdown_tokenizer("```rust\nlet x = 1;\n```"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_heading_only() {
|
|
assert_debug_snapshot!(markdown_tokenizer("# "));
|
|
}
|
|
|
|
#[test]
|
|
fn test_link() {
|
|
assert_debug_snapshot!(markdown_tokenizer("Click [here](https://example.com) now"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_multiline_paragraph() {
|
|
assert_debug_snapshot!(markdown_tokenizer(
|
|
"First line\nSecond line\n\nNew paragraph"
|
|
));
|
|
}
|
|
|
|
#[test]
|
|
fn test_list_with_star_marker() {
|
|
assert_debug_snapshot!(markdown_tokenizer("* item one\n* item two"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_bold_not_confused_with_list() {
|
|
assert_debug_snapshot!(markdown_tokenizer("**bold text**"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_task_list() {
|
|
assert_debug_snapshot!(markdown_tokenizer(
|
|
"- [ ] todo\n- [x] done\n- [X] also done"
|
|
));
|
|
}
|
|
|
|
#[test]
|
|
fn test_ordered_task_list() {
|
|
assert_debug_snapshot!(markdown_tokenizer("1. [ ] first task\n2. [x] second task"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_unicode() {
|
|
assert_debug_snapshot!(markdown_tokenizer(
|
|
"# \u{1F600} Héllo\n- \u{00E9}lément\n> \u{4F60}\u{597D} world"
|
|
));
|
|
}
|
|
}
|